Coverage for mlprodict/onnxrt/validate/validate_summary.py: 95%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2@file
3@brief Summarizes results produces by function in *validate.py*.
4"""
5import decimal
6import json
7import numpy
8import pandas
9from sklearn import __all__ as sklearn__all__, __version__ as sklearn_version
10from ... import __version__ as ort_version
13def _clean_values_optim(val):
14 if not isinstance(val, str):
15 return val
16 if '/' in val:
17 spl = val.split('/')
18 return "/".join(_clean_values_optim(v) for v in spl)
19 if "'>=" in val:
20 val = val.split("'>=")
21 if len(val) == 2:
22 val = val[-1]
23 rep = {
24 "{'optim': 'cdist'}": "cdist"
25 }
26 for k, v in rep.items():
27 val = val.replace(k, v)
28 return val
31def _summary_report_indices(df, add_cols=None, add_index=None):
32 if 'opset' not in df.columns:
33 raise RuntimeError( # pragma: no cover
34 "Unable to create summary (opset missing)\n{}\n--\n{}".format(
35 df.columns, df.head()))
37 col_values = ["available"]
38 for col in ['problem', 'scenario', 'opset', 'optim']:
39 if col not in df.columns:
40 df[col] = '' if col != 'opset' else numpy.nan
41 indices = ["name", "problem", "scenario", 'optim', 'method_name',
42 'output_index', 'conv_options', 'inst']
43 indices = [i for i in indices if i in df.columns]
44 df["optim"] = df["optim"].fillna('')
45 for c in ['n_features', 'runtime']:
46 if c in df.columns:
47 indices.append(c)
48 if c == 'runtime':
49 df[c].fillna('-', inplace=True)
50 for c in df.columns:
51 if c.startswith('opset') or c in {'available'}:
52 df[c].fillna('?', inplace=True)
54 # Adds information about the models in the index
55 indices2 = []
56 for c in df.columns:
57 if (isinstance(c, str) and len(c) >= 5 and (
58 c.startswith("onx_") or c.startswith("skl_"))):
59 if c in {'onx_domain', 'onx_doc_string', 'onx_ir_version',
60 'onx_model_version'}:
61 continue
62 if df[c].dtype in (numpy.float32, numpy.float64, float,
63 int, numpy.int32, numpy.int64):
64 defval = -1
65 else:
66 defval = ''
67 df[c].fillna(defval, inplace=True)
68 if c.startswith('skl_'):
69 indices.append(c)
70 else:
71 indices2.append(c)
73 columns = ['opset']
74 indices = indices + indices2
75 if add_index is not None:
76 for i in add_index: # pragma: no cover
77 if i not in indices:
78 indices.append(i)
79 return columns, indices, col_values
82class _MyEncoder(json.JSONEncoder):
83 def default(self, o): # pylint: disable=E0202
84 if hasattr(o, 'get_params'):
85 obj = dict(clsname=o.__class__.__name__)
86 obj.update(o.get_params())
87 return json.dumps(obj, sort_keys=True)
88 return json.dumps(o, sort_keys=True) # pragma: no cover
def _jsonify(x):
    """
    Converts *x* into a json string. Dictionary keys which are types
    are replaced by their name, NaN becomes an empty string.

    @param      x       value to serialize
    @return             json string
    """
    def _key(k):
        # Types cannot be json keys, their name is used instead.
        return k.__name__ if isinstance(k, type) else k

    if isinstance(x, dict):
        x = {str(_key(k)): v for k, v in x.items()}
        try:
            return json.dumps(x, sort_keys=True, cls=_MyEncoder)
        except TypeError:  # pragma: no cover
            # Keys cannot be sorted.
            return json.dumps(x, cls=_MyEncoder)

    try:
        if numpy.isnan(x):
            x = ''
    except (ValueError, TypeError):
        # Not a number (string, array...), kept as is.
        pass
    try:
        return json.dumps(x, cls=_MyEncoder)
    except TypeError:  # pragma: no cover
        # Cannot sort.
        return json.dumps(x, cls=_MyEncoder)
def summary_report(df, add_cols=None, add_index=None):
    """
    Finalizes the results computed by function
    @see fn enumerate_validated_operator_opsets.

    @param      df          dataframe
    @param      add_cols    additional columns to take into account
                            as values
    @param      add_index   additional columns to take into accound
                            as index
    @return                 pivoted dataframe

    The outcome can be seen at page about :ref:`l-onnx-pyrun`.
    """
    df = df.copy()
    # Columns holding python objects are serialized into json strings
    # so that they can be used as pivot index values.
    if 'inst' in df.columns:
        df['inst'] = df['inst'].apply(_jsonify)
    if 'conv_options' in df.columns:
        df['conv_options'] = df['conv_options'].apply(_jsonify)
    num_types = (int, float, decimal.Decimal, numpy.number)

    def aggfunc(values):
        # Aggregation used by the pivot: numeric groups collapse into a
        # "[min,max]" interval (or a single value), other groups are
        # joined with " // " when they disagree.
        if len(values) != 1:
            if all(map(lambda x: isinstance(x, num_types),
                       values)):
                mi, ma = min(values), max(values)
                if numpy.isnan(mi) and numpy.isnan(ma):
                    return ""
                if mi == ma:
                    return mi
                return '[{},{}]'.format(mi, ma)
            values = [str(_).replace("\n", " ").replace('\r', '').strip(" ")
                      for _ in values]
            values = [_ for _ in values if _]
            vals = set(values)
            if len(vals) != 1:
                return " // ".join(map(str, values))
        # Single value left: return it as a string, NaN becomes "".
        val = values.iloc[0] if not isinstance(values, list) else values[0]
        if isinstance(val, float) and numpy.isnan(val):
            return ""
        return str(val)

    columns, indices, col_values = _summary_report_indices(
        df, add_cols=add_cols, add_index=add_index)
    try:
        piv = pandas.pivot_table(df, values=col_values,
                                 index=indices, columns=columns,
                                 aggfunc=aggfunc).reset_index(drop=False)
    except (KeyError, TypeError) as e:  # pragma: no cover
        raise RuntimeError(
            "Issue with keys={}, values={}\namong {}.".format(
                indices, col_values, df.columns)) from e

    cols = list(piv.columns)
    # Pivoted columns are tuples (value, opset); numeric second members
    # identify the opset columns.
    opsets = [c[1] for c in cols if isinstance(c[1], (int, float))]

    versions = ["opset%d" % i for i in opsets]
    last = piv.columns[-1]
    # ('available', '?') means some runs failed before any opset was known.
    if isinstance(last, tuple) and last == ('available', '?'):
        versions.append('FAIL')
    nbvalid = len(indices + versions)
    if len(piv.columns) != nbvalid:
        raise RuntimeError(  # pragma: no cover
            "Mismatch between {} != {}\n{}\n{}\n---\n{}\n{}\n{}".format(
                len(piv.columns), len(indices + versions),
                piv.columns, indices + versions,
                df.columns, indices, col_values))
    # Flattens the MultiIndex and shows the most recent opsets first.
    piv.columns = indices + versions
    piv = piv[indices + list(reversed(versions))].copy()
    for c in versions:
        piv[c].fillna('-', inplace=True)

    if "available-ERROR" in df.columns:

        from skl2onnx.common.exceptions import MissingShapeCalculator  # delayed

        def replace_msg(text):
            # Maps well-known error messages to short labels.
            if isinstance(text, MissingShapeCalculator):
                return "NO CONVERTER"  # pragma: no cover
            if str(text).startswith("Unable to find a shape calculator for type '"):
                return "NO CONVERTER"
            if str(text).startswith("Unable to find problem for model '"):
                return "NO PROBLEM"  # pragma: no cover
            if "not implemented for float64" in str(text):
                return "NO RUNTIME 64"  # pragma: no cover
            return str(text)

        piv2 = pandas.pivot_table(
            df, values="available-ERROR", index=indices,
            columns='opset', aggfunc=aggfunc).reset_index(drop=False)

        # Keeps the message of the last (most recent) opset column only.
        col = piv2.iloc[:, piv2.shape[1] - 1]
        piv["ERROR-msg"] = col.apply(replace_msg)

    if any('time-ratio-' in c for c in df.columns):
        cols = [c for c in df.columns if c.startswith('time-ratio')]
        cols.sort()

        # Averages the timing ratios per index and merges them back.
        df_sub = df[indices + cols]
        piv2 = df_sub.groupby(indices).mean()
        piv = piv.merge(piv2, on=indices, how='left')

        def rep(c):
            # Only the N=1 batch size keeps an explicit RT/SKL- prefix.
            if 'N=1' in c and 'N=10' not in c:
                return c.replace("time-ratio-", "RT/SKL-")
            else:
                return c.replace("time-ratio-", "")
        cols = [rep(c) for c in piv.columns]
        piv.columns = cols

        # min, max: pairs *-min / *-max columns and moves them to the end.
        mins = [c for c in piv.columns if c.endswith('-min')]
        maxs = [c for c in piv.columns if c.endswith('-max')]
        combined = []
        for mi, ma in zip(mins, maxs):
            combined.append(mi)
            combined.append(ma)
        first = [c for c in piv.columns if c not in combined]
        piv = piv[first + combined]

    def clean_values(value):
        # Shortens the status strings shown in the opset columns.
        if not isinstance(value, str):
            return value  # pragma: no cover
        if "ERROR->=1000000" in value:
            value = "big-diff"
        elif "ERROR" in value:
            value = value.replace("ERROR-_", "")
            value = value.replace("_exc", "")
            value = "ERR: " + value
        elif "OK-" in value:
            value = value.replace("OK-", "OK ")
        elif "e<" in value:
            value = value.replace("-", " ")
        return value

    for c in piv.columns:
        if "opset" in c:
            piv[c] = piv[c].apply(clean_values)
        if 'optim' in c:
            piv[c] = piv[c].apply(_clean_values_optim)

    # adding versions: v_* columns are expected to be constant and are
    # propagated as scalar columns of the summary.
    def keep_values(x):
        if isinstance(x, float) and numpy.isnan(x):
            return False  # pragma: no cover
        return True

    col_versions = [c for c in df.columns if c.startswith("v_")]
    if len(col_versions) > 0:
        for c in col_versions:
            vals = set(filter(keep_values, df[c]))
            if len(vals) != 1:
                raise RuntimeError(  # pragma: no cover
                    "Columns '{}' has multiple values {}.".format(c, vals))
            piv[c] = list(vals)[0]

    return piv
def merge_benchmark(dfs, column='runtime', baseline=None, suffix='-base'):
    """
    Merges several benchmarks run with command line
    :ref:`validate_runtime <l-cmd-validate_runtime>`.

    @param      dfs         dictionary *{'prefix': dataframe}*
    @param      column      every value from this column is prefixed
                            by the given key in *dfs*
    @param      baseline    add baseline
    @param      suffix      suffix to add when comparing to the baseline
    @return                 merged dataframe
    """
    def add_prefix(prefix, v):
        # Only string values are prefixed; NaN and others are kept.
        if isinstance(v, str):
            return prefix + v
        return v  # pragma: no cover

    # Concatenates every dataframe after tagging its *column* values
    # with the dictionary key.
    conc = []
    for k, df in dfs.items():
        if column not in df.columns:
            raise ValueError(
                "Unable to find column '{}' in {} (key='{}')".format(
                    column, df.columns, k))
        df = df.copy()
        df[column] = df[column].apply(lambda x: add_prefix(k, x))
        if 'inst' in df.columns:
            df['inst'] = df['inst'].fillna('')
        else:
            df['inst'] = ''
        conc.append(df)
    merged = pandas.concat(conc).reset_index(drop=True)
    if baseline is not None:
        def get_key(index):
            # Builds a hashable key from an index tuple, dropping NaN.
            k = []
            for v in index:
                try:
                    if numpy.isnan(v):
                        continue  # pragma: no cover
                except (ValueError, TypeError):
                    pass
                k.append(v)
            return tuple(k)

        columns, indices, _ = _summary_report_indices(merged)
        indices = list(_ for _ in (indices + columns) if _ != 'runtime')
        try:
            # Rows of the baseline runtime, indexed by everything else;
            # verify_integrity ensures the key is unique per row.
            bdata = merged[merged.runtime == baseline].drop(
                'runtime', axis=1).set_index(indices, verify_integrity=True)
        except ValueError as e:
            # Duplicated keys: builds a detailed error message showing
            # the most duplicated group and a couple of offending rows.
            bdata2 = merged[indices + ['runtime']].copy()
            bdata2['count'] = 1
            n_rows = bdata2['count'].sum()
            gr = bdata2.groupby(indices + ['runtime'], as_index=False).sum(
                ).sort_values('count', ascending=False)
            n_rows2 = gr['count'].sum()
            one = gr.head()[:1]
            rows = merged.merge(one, on=indices + ['runtime'])[:2]
            for c in ['init-types', 'bench-skl', 'bench-batch', 'init_types', 'cl']:
                if c in rows.columns:
                    rows = rows.drop(c, axis=1)
            srows = rows.T.to_string(min_rows=100)
            raise ValueError(
                "(n_rows={}, n_rows2={}) Unable to group by {}.\n{}\n-------\n{}".format(
                    n_rows, n_rows2, indices, gr.T, srows)) from e
        if bdata.shape[0] == 0:
            raise RuntimeError(  # pragma: no cover
                "No result for baseline '{}'.".format(baseline))
        # Maps each baseline key to its time-ratio columns.
        ratios = [c for c in merged.columns if c.startswith('time-ratio-')]
        indexed = {}
        for index in bdata.index:
            row = bdata.loc[index, :]
            key = get_key(index)
            indexed[key] = row[ratios]

        # Divides each row's ratios by the matching baseline ratios and
        # stores the result in new columns suffixed with *suffix*.
        for i in range(merged.shape[0]):
            key = get_key(tuple(merged.loc[i, indices]))
            if key not in indexed:
                continue  # pragma: no cover
            value = indexed[key]
            for r in ratios:
                if r.endswith('-min') or r.endswith('-max'):
                    continue
                value2 = merged.loc[i, r]
                new_r = value2 / value[r]
                new_col = r + suffix
                if new_col not in merged.columns:
                    merged[new_col] = numpy.nan
                merged.loc[i, new_col] = new_r

    return merged