Coverage for mlprodict/onnxrt/validate/validate_summary.py: 95%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

238 statements  

1""" 

2@file 

3@brief Summarizes results produced by functions in *validate.py*. 

4""" 

5import decimal 

6import json 

7import numpy 

8import pandas 

9from sklearn import __all__ as sklearn__all__, __version__ as sklearn_version 

10from ... import __version__ as ort_version 

11 

12 

def _clean_values_optim(val):
    """
    Shortens a string describing the conversion or optimisation
    options so that it fits in a summary table.

    @param      val     value to clean, returned unchanged when it
                        is not a string
    @return             cleaned string (or *val* as is)
    """
    if not isinstance(val, str):
        return val
    if '/' in val:
        # Cleans every part separately, keeps the same separator.
        spl = val.split('/')
        return "/".join(_clean_values_optim(v) for v in spl)
    if "'>=" in val:
        # Keeps only what follows "'>=", typically the value part of a
        # repr such as "<class 'X'>=value".
        # fix: the original rebound *val* to the list returned by split,
        # so the replace loop below raised AttributeError whenever the
        # separator appeared more than once; split into a local instead.
        spl = val.split("'>=")
        if len(spl) == 2:
            val = spl[-1]
    rep = {
        "{'optim': 'cdist'}": "cdist"
    }
    for k, v in rep.items():
        val = val.replace(k, v)
    return val

29 

30 

def _summary_report_indices(df, add_cols=None, add_index=None):
    """
    Builds the column/index layout used to pivot a validation dataframe.

    @param      df          validation results, **modified in place**:
                            missing columns are added, NaN are filled
    @param      add_cols    unused, kept for interface symmetry with
                            @see fn summary_report
    @param      add_index   additional columns appended to the index
    @return                 tuple *(columns, indices, col_values)* for
                            :epkg:`pandas:pivot_table`
    """
    if 'opset' not in df.columns:
        raise RuntimeError(  # pragma: no cover
            "Unable to create summary (opset missing)\n{}\n--\n{}".format(
                df.columns, df.head()))

    col_values = ["available"]
    # Guarantees the pivot keys exist, 'opset' stays numeric (NaN),
    # the others default to an empty string.
    for col in ['problem', 'scenario', 'opset', 'optim']:
        if col not in df.columns:
            df[col] = '' if col != 'opset' else numpy.nan
    indices = ["name", "problem", "scenario", 'optim', 'method_name',
               'output_index', 'conv_options', 'inst']
    indices = [i for i in indices if i in df.columns]
    df["optim"] = df["optim"].fillna('')
    for c in ['n_features', 'runtime']:
        if c in df.columns:
            indices.append(c)
            if c == 'runtime':
                # Column assignment instead of Series.fillna(inplace=True):
                # the inplace form on a column selection is deprecated in
                # recent pandas and may stop propagating to *df*.
                df[c] = df[c].fillna('-')
    for c in df.columns:
        if c.startswith('opset') or c in {'available'}:
            df[c] = df[c].fillna('?')

    # Adds information about the models in the index.
    indices2 = []
    for c in df.columns:
        if (isinstance(c, str) and len(c) >= 5 and (
                c.startswith("onx_") or c.startswith("skl_"))):
            # Purely informational ONNX metadata, not useful as index.
            if c in {'onx_domain', 'onx_doc_string', 'onx_ir_version',
                     'onx_model_version'}:
                continue
            # Numeric columns get -1 as "missing", others an empty string.
            if df[c].dtype in (numpy.float32, numpy.float64, float,
                               int, numpy.int32, numpy.int64):
                defval = -1
            else:
                defval = ''
            df[c] = df[c].fillna(defval)
            if c.startswith('skl_'):
                indices.append(c)
            else:
                indices2.append(c)

    columns = ['opset']
    # skl_* columns come before onx_* columns in the final index.
    indices = indices + indices2
    if add_index is not None:
        for i in add_index:  # pragma: no cover
            if i not in indices:
                indices.append(i)
    return columns, indices, col_values

80 

81 

class _MyEncoder(json.JSONEncoder):
    """JSON encoder which serializes scikit-learn like estimators
    (any object exposing *get_params*) as a sorted JSON string."""

    def default(self, o):  # pylint: disable=E0202
        # Objects without get_params fall back to plain serialization.
        if not hasattr(o, 'get_params'):
            return json.dumps(o, sort_keys=True)  # pragma: no cover
        params = dict(clsname=o.__class__.__name__)
        params.update(o.get_params())
        return json.dumps(params, sort_keys=True)

89 

90 

def _jsonify(x):
    """Converts *x* into a JSON string, using class names for type
    keys and mapping NaN to an empty string."""

    def _key(obj):
        # Dictionary keys which are classes become their name.
        return obj.__name__ if isinstance(obj, type) else obj

    if isinstance(x, dict):
        converted = {str(_key(k)): v for k, v in x.items()}
        try:
            return json.dumps(converted, sort_keys=True, cls=_MyEncoder)
        except TypeError:  # pragma: no cover
            # Cannot sort.
            return json.dumps(converted, cls=_MyEncoder)
    try:
        if numpy.isnan(x):
            x = ''
    except (ValueError, TypeError):
        # x is not a scalar numpy.isnan understands.
        pass
    try:
        return json.dumps(x, cls=_MyEncoder)
    except TypeError:  # pragma: no cover
        # Cannot sort.
        return json.dumps(x, cls=_MyEncoder)

115 

116 

def summary_report(df, add_cols=None, add_index=None):
    """
    Finalizes the results computed by function
    @see fn enumerate_validated_operator_opsets.

    @param      df          dataframe
    @param      add_cols    additional columns to take into account
                            as values
    @param      add_index   additional columns to take into account
                            as index
    @return                 pivoted dataframe

    The outcome can be seen at page about :ref:`l-onnx-pyrun`.
    """
    df = df.copy()
    # Complex cells (estimator instances, conversion options) are turned
    # into JSON strings so they can be used as pivot index values.
    if 'inst' in df.columns:
        df['inst'] = df['inst'].apply(_jsonify)
    if 'conv_options' in df.columns:
        df['conv_options'] = df['conv_options'].apply(_jsonify)
    num_types = (int, float, decimal.Decimal, numpy.number)

    def aggfunc(values):
        # Aggregates a group of cells into a single displayable value:
        # numeric ranges become "[min,max]", distinct strings are joined
        # with " // ", NaN becomes an empty string.
        if len(values) != 1:
            if all(map(lambda x: isinstance(x, num_types),
                       values)):
                mi, ma = min(values), max(values)
                if numpy.isnan(mi) and numpy.isnan(ma):
                    return ""
                if mi == ma:
                    return mi
                return '[{},{}]'.format(mi, ma)
            values = [str(_).replace("\n", " ").replace('\r', '').strip(" ")
                      for _ in values]
            values = [_ for _ in values if _]
            vals = set(values)
            if len(vals) != 1:
                return " // ".join(map(str, values))
        # *values* may be a pandas Series or a plain list at this point.
        val = values.iloc[0] if not isinstance(values, list) else values[0]
        if isinstance(val, float) and numpy.isnan(val):
            return ""
        return str(val)

    columns, indices, col_values = _summary_report_indices(
        df, add_cols=add_cols, add_index=add_index)
    try:
        piv = pandas.pivot_table(df, values=col_values,
                                 index=indices, columns=columns,
                                 aggfunc=aggfunc).reset_index(drop=False)
    except (KeyError, TypeError) as e:  # pragma: no cover
        raise RuntimeError(
            "Issue with keys={}, values={}\namong {}.".format(
                indices, col_values, df.columns)) from e

    cols = list(piv.columns)
    # Pivot columns are ('available', opset) tuples; the numeric second
    # member is the opset the column was computed for.
    opsets = [c[1] for c in cols if isinstance(c[1], (int, float))]

    versions = ["opset%d" % i for i in opsets]
    last = piv.columns[-1]
    # ('available', '?') gathers the rows where validation failed.
    if isinstance(last, tuple) and last == ('available', '?'):
        versions.append('FAIL')
    nbvalid = len(indices + versions)
    if len(piv.columns) != nbvalid:
        raise RuntimeError(  # pragma: no cover
            "Mismatch between {} != {}\n{}\n{}\n---\n{}\n{}\n{}".format(
                len(piv.columns), len(indices + versions),
                piv.columns, indices + versions,
                df.columns, indices, col_values))
    # Flattens the MultiIndex columns and shows the most recent opset first.
    piv.columns = indices + versions
    piv = piv[indices + list(reversed(versions))].copy()
    for c in versions:
        piv[c].fillna('-', inplace=True)

    if "available-ERROR" in df.columns:

        from skl2onnx.common.exceptions import MissingShapeCalculator  # delayed

        def replace_msg(text):
            # Maps well-known error messages to short labels.
            if isinstance(text, MissingShapeCalculator):
                return "NO CONVERTER"  # pragma: no cover
            if str(text).startswith("Unable to find a shape calculator for type '"):
                return "NO CONVERTER"
            if str(text).startswith("Unable to find problem for model '"):
                return "NO PROBLEM"  # pragma: no cover
            if "not implemented for float64" in str(text):
                return "NO RUNTIME 64"  # pragma: no cover
            return str(text)

        piv2 = pandas.pivot_table(
            df, values="available-ERROR", index=indices,
            columns='opset', aggfunc=aggfunc).reset_index(drop=False)

        # Last column corresponds to the latest opset: its error message
        # is kept for display.
        col = piv2.iloc[:, piv2.shape[1] - 1]
        piv["ERROR-msg"] = col.apply(replace_msg)

    if any('time-ratio-' in c for c in df.columns):
        cols = [c for c in df.columns if c.startswith('time-ratio')]
        cols.sort()

        # Averages the benchmark ratios per index and joins them back.
        df_sub = df[indices + cols]
        piv2 = df_sub.groupby(indices).mean()
        piv = piv.merge(piv2, on=indices, how='left')

        def rep(c):
            # N=1 (and not N=10, N=100...) is the single-observation
            # benchmark, renamed into the RT/SKL ratio column.
            if 'N=1' in c and 'N=10' not in c:
                return c.replace("time-ratio-", "RT/SKL-")
            else:
                return c.replace("time-ratio-", "")
        cols = [rep(c) for c in piv.columns]
        piv.columns = cols

        # Moves the -min/-max columns to the end, interleaved by pairs.
        mins = [c for c in piv.columns if c.endswith('-min')]
        maxs = [c for c in piv.columns if c.endswith('-max')]
        combined = []
        for mi, ma in zip(mins, maxs):
            combined.append(mi)
            combined.append(ma)
        first = [c for c in piv.columns if c not in combined]
        piv = piv[first + combined]

    def clean_values(value):
        # Shortens availability / error cells for display.
        if not isinstance(value, str):
            return value  # pragma: no cover
        if "ERROR->=1000000" in value:
            value = "big-diff"
        elif "ERROR" in value:
            value = value.replace("ERROR-_", "")
            value = value.replace("_exc", "")
            value = "ERR: " + value
        elif "OK-" in value:
            value = value.replace("OK-", "OK ")
        elif "e<" in value:
            value = value.replace("-", " ")
        return value

    for c in piv.columns:
        if "opset" in c:
            piv[c] = piv[c].apply(clean_values)
        if 'optim' in c:
            piv[c] = piv[c].apply(_clean_values_optim)

    # Adding versions: v_* columns must hold a single value over the
    # whole benchmark, copied verbatim into the summary.
    def keep_values(x):
        if isinstance(x, float) and numpy.isnan(x):
            return False  # pragma: no cover
        return True

    col_versions = [c for c in df.columns if c.startswith("v_")]
    if len(col_versions) > 0:
        for c in col_versions:
            vals = set(filter(keep_values, df[c]))
            if len(vals) != 1:
                raise RuntimeError(  # pragma: no cover
                    "Columns '{}' has multiple values {}.".format(c, vals))
            piv[c] = list(vals)[0]

    return piv

274 

275 

def merge_benchmark(dfs, column='runtime', baseline=None, suffix='-base'):
    """
    Merges several benchmarks run with command line
    :ref:`validate_runtime <l-cmd-validate_runtime>`.

    @param      dfs         dictionary *{'prefix': dataframe}*
    @param      column      every value from this column is prefixed
                            by the given key in *dfs*
    @param      baseline    add baseline
    @param      suffix      suffix to add when comparing to the baseline
    @return                 merged dataframe
    """
    def add_prefix(prefix, v):
        # Only string cells are prefixed; NaN and other values pass through.
        if isinstance(v, str):
            return prefix + v
        return v  # pragma: no cover

    conc = []
    for k, df in dfs.items():
        if column not in df.columns:
            raise ValueError(
                "Unable to find column '{}' in {} (key='{}')".format(
                    column, df.columns, k))
        df = df.copy()
        # The lambda is applied eagerly inside this iteration, so the
        # late-binding of *k* is not an issue here.
        df[column] = df[column].apply(lambda x: add_prefix(k, x))
        if 'inst' in df.columns:
            df['inst'] = df['inst'].fillna('')
        else:
            df['inst'] = ''
        conc.append(df)
    merged = pandas.concat(conc).reset_index(drop=True)
    if baseline is not None:
        def get_key(index):
            # Builds a hashable key from an index tuple, skipping NaN
            # so partially filled indices still match.
            k = []
            for v in index:
                try:
                    if numpy.isnan(v):
                        continue  # pragma: no cover
                except (ValueError, TypeError):
                    pass
                k.append(v)
            return tuple(k)

        columns, indices, _ = _summary_report_indices(merged)
        indices = list(_ for _ in (indices + columns) if _ != 'runtime')
        try:
            # Baseline rows indexed by everything but the runtime;
            # verify_integrity raises if the index is not unique.
            bdata = merged[merged.runtime == baseline].drop(
                'runtime', axis=1).set_index(indices, verify_integrity=True)
        except ValueError as e:
            # Builds a readable report about the duplicated keys before
            # re-raising.
            bdata2 = merged[indices + ['runtime']].copy()
            bdata2['count'] = 1
            n_rows = bdata2['count'].sum()
            gr = bdata2.groupby(indices + ['runtime'], as_index=False).sum(
                ).sort_values('count', ascending=False)
            n_rows2 = gr['count'].sum()
            one = gr.head()[:1]
            rows = merged.merge(one, on=indices + ['runtime'])[:2]
            for c in ['init-types', 'bench-skl', 'bench-batch', 'init_types', 'cl']:
                if c in rows.columns:
                    rows = rows.drop(c, axis=1)
            srows = rows.T.to_string(min_rows=100)
            raise ValueError(
                "(n_rows={}, n_rows2={}) Unable to group by {}.\n{}\n-------\n{}".format(
                    n_rows, n_rows2, indices, gr.T, srows)) from e
        if bdata.shape[0] == 0:
            raise RuntimeError(  # pragma: no cover
                "No result for baseline '{}'.".format(baseline))
        ratios = [c for c in merged.columns if c.startswith('time-ratio-')]
        # Maps the NaN-stripped key of every baseline row to its ratios.
        indexed = {}
        for index in bdata.index:
            row = bdata.loc[index, :]
            key = get_key(index)
            indexed[key] = row[ratios]

        # Divides each row's ratios by the matching baseline row's ratios
        # and stores the result in new "<ratio><suffix>" columns.
        for i in range(merged.shape[0]):
            key = get_key(tuple(merged.loc[i, indices]))
            if key not in indexed:
                continue  # pragma: no cover
            value = indexed[key]
            for r in ratios:
                # min/max columns are bounds, not means: no baseline ratio.
                if r.endswith('-min') or r.endswith('-max'):
                    continue
                value2 = merged.loc[i, r]
                new_r = value2 / value[r]
                new_col = r + suffix
                if new_col not in merged.columns:
                    merged[new_col] = numpy.nan
                merged.loc[i, new_col] = new_r

    return merged