Coverage for mlprodict/onnx_conv/convert.py: 91%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

161 statements  

1# -*- encoding: utf-8 -*- 

2""" 

3@file 

4@brief Overloads a conversion function. 

5""" 

6import pprint 

7from collections import OrderedDict 

8import logging 

9import numpy 

10import pandas 

11try: 

12 from sklearn.metrics._scorer import _PredictScorer 

13except ImportError: # pragma: no cover 

14 # scikit-learn < 0.22 

15 from sklearn.metrics.scorer import _PredictScorer 

16from sklearn import __all__ as sklearn__all__, __version__ as sklearn_version 

17from skl2onnx.common.data_types import ( 

18 FloatTensorType, DoubleTensorType, DataType, guess_numpy_type, 

19 StringTensorType, Int64TensorType) 

20from skl2onnx import convert_sklearn 

21from skl2onnx.algebra.onnx_operator_mixin import OnnxOperatorMixin 

22from skl2onnx.algebra.type_helper import _guess_type 

23from ..onnx_tools.onnx_manipulations import onnx_rename_names 

24from .register_rewritten_converters import register_rewritten_operators 

25from .register import register_converters 

26from .scorers import CustomScorerTransform 

27 

28 

# Logger shared by the functions of this module, it is registered under
# the fixed name 'mlprodict' (not ``__name__``).
logger = logging.getLogger('mlprodict')

30 

31 

def _fix_opset_skl2onnx():
    """
    Aligns ``skl2onnx.__max_supported_opset__`` with the value
    ``__max_supported_opset__`` exposed by the parent package.
    The attribute of :epkg:`sklearn-onnx` is only overwritten
    when both values differ.
    """
    import skl2onnx as skl2onnx_module
    from .. import __max_supported_opset__ as package_opset

    if skl2onnx_module.__max_supported_opset__ == package_opset:
        # Nothing to do, both packages already agree.
        return
    skl2onnx_module.__max_supported_opset__ = package_opset

37 

38 

def convert_scorer(fct, initial_types, name=None,
                   target_opset=None, options=None,
                   custom_conversion_functions=None,
                   custom_shape_calculators=None,
                   custom_parsers=None, white_op=None,
                   black_op=None, final_types=None,
                   verbose=0):
    """
    Converts a scorer into :epkg:`ONNX` assuming
    there exists a converter associated to it.
    The function wraps the function into a custom
    transformer, then calls function *convert_sklearn*
    from :epkg:`sklearn-onnx`.

    :param fct: function to convert (or a scorer from :epkg:`scikit-learn`),
        a scorer is detected through its attribute ``_score_func``,
        its keyword arguments are then read from ``_kwargs``
    :param initial_types: types information
    :param name: name of the produced model, if None, the name is
        ``mlprodict_fct_ONNX(<function name>)``
    :param target_opset: to do it with a different target opset
    :param options: additional parameters for the conversion
    :param custom_conversion_functions: a dictionary for specifying the user
        customized conversion function, it takes precedence over
        registered converters
    :param custom_shape_calculators: a dictionary for specifying the user
        customized shape calculator it takes precedence over registered
        shape calculators.
    :param custom_parsers: parsers determine which outputs is expected
        for which particular task, default parsers are
        defined for classifiers, regressors, pipeline but
        they can be rewritten, *custom_parsers* is a dictionary
        ``{ type: fct_parser(scope, model, inputs,
        custom_parsers=None) }``
    :param white_op: white list of ONNX nodes allowed
        while converting a pipeline, if empty, all are allowed
    :param black_op: black list of ONNX nodes not allowed
        while converting a pipeline, if empty, none are blacklisted
    :param final_types: a python list. Works the same way as
        initial_types but not mandatory, it is used
        to overwrite the type (if type is not None)
        and the name of every output.
    :param verbose: displays information while converting
    :return: :epkg:`ONNX` graph
    """
    if hasattr(fct, '_score_func'):
        # A scorer: unwrap the scoring function and its keyword arguments.
        kwargs = fct._kwargs
        fct = fct._score_func
    else:
        kwargs = None  # pragma: no cover
    if name is None:
        name = "mlprodict_fct_ONNX(%s)" % fct.__name__
    tr = CustomScorerTransform(fct.__name__, fct, kwargs)
    _fix_opset_skl2onnx()
    # *name* used to be computed and then dropped: it is now forwarded
    # so that the produced model gets the documented name.
    return convert_sklearn(
        tr, initial_types=initial_types, name=name,
        target_opset=target_opset, options=options,
        custom_conversion_functions=custom_conversion_functions,
        custom_shape_calculators=custom_shape_calculators,
        custom_parsers=custom_parsers, white_op=white_op,
        black_op=black_op, final_types=final_types,
        verbose=verbose)

98 

99 

def guess_initial_types(X, initial_types):
    """
    Infers the initial types from data unless they are already known.

    @param      X               array or dataframe, an array or a dataframe
                                is reduced to its first row before guessing
    @param      initial_types   hints about *X*, returned as is
                                when not None
    @return                     data types, list of tuples *(name, type)*

    Every column of a dataframe becomes a named input of shape
    ``[None, 1]``, anything else is mapped to a single input
    called ``'X'``. The function raises *NotImplementedError*
    if both *X* and *initial_types* are None.
    """
    if initial_types is not None:
        return initial_types
    if X is None:
        raise NotImplementedError(  # pragma: no cover
            "Initial types must be specified.")

    if isinstance(X, (numpy.ndarray, pandas.DataFrame)):
        # One row is enough to guess the types.
        X = X[:1]
    if not isinstance(X, pandas.DataFrame):
        return [('X', _guess_type(X))]

    guessed = []
    for col in X.columns:
        values = X[col].values
        if isinstance(values[0], (str, numpy.str_)):
            col_type = StringTensorType()
        else:
            col_type = _guess_type(values)
        col_type.shape = [None, 1]
        guessed.append((col, col_type))
    return guessed

127 

128 

129def _replace_tensor_type(schema, tensor_type): 

130 res = [] 

131 for name, ty in schema: 

132 cl = ty.__class__ 

133 if cl in (FloatTensorType, DoubleTensorType) and cl != tensor_type: 

134 ty = tensor_type(ty.shape) 

135 res.append((name, ty)) 

136 return res 

137 

138 

def guess_schema_from_data(X, tensor_type=None, schema=None):
    """
    Builds a schema (initial types) from a dataset.

    @param      X               dataset (dataframe, array)
    @param      tensor_type     if not None, replaces every
                                *FloatTensorType* or *DoubleTensorType*
                                by this one
    @param      schema          known schema
    @return                     schema (list of typed and named columns)

    When every column has a two dimensional shape with an undefined
    first dimension and all columns share the same type, they are
    merged into a single input ``'X'`` whose second dimension
    is the sum of the second dimensions of the columns.
    Otherwise, the schema is returned with one entry per column.
    """
    init = guess_initial_types(X, schema)
    if tensor_type is not None:
        init = _replace_tensor_type(init, tensor_type)

    # Grouping columns: only possible if all of them look alike.
    seen = set()
    for _, col in init:
        shape = col.shape
        mergeable = len(shape) == 2 and shape[0] is None
        if not mergeable:
            return init  # pragma: no cover
        kind = col.__class__
        if seen and kind not in seen:
            return init  # pragma: no cover
        seen.add(kind)

    merged_cls = list(seen)[0]
    n_columns = sum(entry[1].shape[1] for entry in init)
    return [('X', merged_cls([None, n_columns]))]

165 

166 

def get_inputs_from_data(X, schema=None):
    """
    Builds the dictionary of inputs an *onnx* runtime expects.

    @param      X           data, a numpy array or a dataframe
    @param      schema      schema, if None, the schema is guessed with
                            @see fn guess_schema_from_data
    @return                 input data, dictionary ``{ name: array }``

    An array is cast to the type of the unique entry of the schema.
    Every column of a dataframe is cast to the type of the matching
    entry of the schema and reshaped into a single column matrix.

    NOTE(review): when the schema is guessed from a dataframe whose
    columns all share the same type, @see fn guess_schema_from_data
    appears to merge them into a single input, which then fails the
    column count check below. Confirm this is intended.
    """
    def _cast_data(data, col_type):
        # The first matching tensor type gives the numpy type.
        casts = (
            (FloatTensorType, numpy.float32),
            (DoubleTensorType, numpy.float64),
            (StringTensorType, numpy.str_),
            (Int64TensorType, numpy.int64),
        )
        for tensor_cls, np_dtype in casts:
            if isinstance(col_type, tensor_cls):
                return data.astype(np_dtype)
        raise RuntimeError(  # pragma: no cover
            "Unexpected column type {} for type {}."
            "".format(col_type, type(data)))

    if schema is None:
        schema = guess_schema_from_data(X)

    if isinstance(X, numpy.ndarray):
        if len(schema) != 1:
            raise RuntimeError(  # pragma: no cover
                "More than one column but input is an array.")
        unique = schema[0]
        return {unique[0]: _cast_data(X, unique[1])}

    if isinstance(X, pandas.DataFrame):
        n_columns = X.shape[1]
        if len(schema) != n_columns:
            raise RuntimeError(  # pragma: no cover
                "Mismatch between onnx columns {} and DataFrame columns {}"
                "".format(len(schema), n_columns))
        feeds = {}
        for entry, col in zip(schema, X.columns):
            casted = _cast_data(X[col].values, entry[1])
            feeds[entry[0]] = casted.reshape((-1, 1))
        return feeds

    raise TypeError(  # pragma: no cover
        "Unexpected type {}, expecting an array or a dataframe."
        "".format(type(X)))

206 

207 

def guess_schema_from_model(model, tensor_type=None, schema=None):
    """
    Guesses initial types from a model.

    @param      model           model, the schema is guessed from
                                attribute ``coef_`` if it exists, or from
                                key ``'feature_names'`` of the dictionary
                                returned by method ``dump_model`` if it exists
    @param      tensor_type     if not None, replaces every
                                *FloatTensorType* or *DoubleTensorType*
                                by this one, None keeps the types unchanged
    @param      schema          known schema, if specified, it is returned
                                after checking it has as many entries as
                                the guessed one (when one can be guessed)
    @return                     schema (list of typed and named columns)

    The function raises *NotImplementedError* if no schema is given and
    none can be guessed, *RuntimeError* if the given schema and the
    guessed one do not have the same length.
    """
    def _apply_tensor_type(sch):
        # *tensor_type* is optional, the replacement helper is only
        # called with an actual tensor class.
        if tensor_type is None:
            return list(sch)
        return _replace_tensor_type(sch, tensor_type)

    if schema is not None:
        try:
            guessed = guess_schema_from_model(model)
        except NotImplementedError:  # pragma: no cover
            # Nothing to compare to, the given schema is trusted.
            return _apply_tensor_type(schema)
        if len(guessed) != len(schema):
            raise RuntimeError(  # pragma: no cover
                "Given schema and guessed schema are not the same:"
                "\nGOT: {}\n-----\nGUESSED:\n{}".format(schema, guessed))
        return _apply_tensor_type(schema)

    dumped = None
    has_dump = hasattr(model, 'dump_model')
    if hasattr(model, 'coef_'):
        # linear model, the last dimension is used so that both
        # 1D and 2D coefficients are supported
        n_features = model.coef_.shape[-1]
        init = [('X', FloatTensorType([None, n_features]))]
        return _apply_tensor_type(init)
    if has_dump:
        dumped = model.dump_model()
        if isinstance(dumped, dict) and 'feature_names' in dumped:
            names = dumped['feature_names']
            init = [(name, FloatTensorType([None, 1])) for name in names]
            return _apply_tensor_type(init)

    # Failure: gathers as much information as possible about the model.
    data = pprint.pformat(getattr(model, '__dict__', {}))
    dirs = pprint.pformat(dir(model))
    if has_dump:  # pragma: no cover
        # The dump computed above is reused instead of dumping again.
        try:
            keys = sorted(dumped)
        except TypeError:
            keys = []
        last = pprint.pformat([keys, dumped])
        if len(last) >= 200000:
            last = last[:200000] + "\n..."
    else:
        last = ""
    raise NotImplementedError(  # pragma: no cover
        "Unable to guess schema for model {}\n{}\n----\n{}\n------\n{}".format(
            model.__class__, data, dirs, last))

254 

255 

def to_onnx(model, X=None, name=None, initial_types=None,
            target_opset=None, options=None, rewrite_ops=False,
            white_op=None, black_op=None, final_types=None,
            rename_strategy=None, verbose=0):
    """
    Converts a model using :epkg:`sklearn-onnx`.

    :param model: model to convert or a function
        wrapped into :epkg:`_PredictScorer` with
        function :epkg:`make_scorer`
    :param X: training set (at least one row),
        can be None, it is used to infer the
        input types (*initial_types*), it must be
        an *OrderedDict* if *model* is a scorer
    :param initial_types: if *X* is None, then *initial_types*
        must be defined
    :param name: name of the produced model
    :param target_opset: to do it with a different target opset
    :param options: additional parameters for the conversion
    :param rewrite_ops: rewrites some existing converters,
        the changes are permanent
    :param white_op: white list of ONNX nodes allowed
        while converting a pipeline, if empty, all are allowed
    :param black_op: black list of ONNX nodes not allowed
        while converting a pipeline, if empty,
        none are blacklisted
    :param final_types: a python list. Works the same way as
        initial_types but not mandatory, it is used
        to overwrite the type (if type is not None)
        and the name of every output.
    :param rename_strategy: rename any name in the graph, select shorter
        names, see @see fn onnx_rename_names
    :param verbose: display information while converting the model,
        it is not forwarded when *model* is an *OnnxOperatorMixin*
    :return: converted model

    The function rewrites function *to_onnx* from :epkg:`sklearn-onnx`
    but may change a few converters if *rewrite_ops* is True.
    For example, :epkg:`ONNX` only supports *TreeEnsembleRegressor*
    for float but not for double. It becomes available
    if ``rewrite_ops=True``.

    .. faqref::
        :title: How to deal with a dataframe as input?

        Each column of the dataframe is considered as a named input.
        The first step is to make sure that every column type is correct.
        :epkg:`pandas` tends to select the least generic type to
        hold the content of one column. :epkg:`ONNX` does not automatically
        cast the data it receives. The data must have the same type when
        the model is converted and when the converted model receives
        the data to predict.

        .. runpython::
            :showcode:
            :warningout: DeprecationWarning

            from io import StringIO
            from textwrap import dedent
            import numpy
            import pandas
            from pyquickhelper.pycode import ExtTestCase
            from sklearn.preprocessing import OneHotEncoder
            from sklearn.pipeline import Pipeline
            from sklearn.compose import ColumnTransformer
            from mlprodict.onnx_conv import to_onnx
            from mlprodict.onnxrt import OnnxInference

            text = dedent('''
                __SCHEMA__
                7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
                7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
                7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
                11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
                ''')
            text = text.replace(
                "__SCHEMA__",
                "fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,"
                "free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,"
                "alcohol,quality,color")

            X_train = pandas.read_csv(StringIO(text))
            for c in X_train.columns:
                if c != 'color':
                    X_train[c] = X_train[c].astype(numpy.float32)
            numeric_features = [c for c in X_train if c != 'color']

            pipe = Pipeline([
                ("prep", ColumnTransformer([
                    ("color", Pipeline([
                        ('one', OneHotEncoder()),
                        ('select', ColumnTransformer(
                            [('sel1', 'passthrough', [0])]))
                    ]), ['color']),
                    ("others", "passthrough", numeric_features)
                ])),
            ])

            pipe.fit(X_train)
            pred = pipe.transform(X_train)
            print(pred)

            model_onnx = to_onnx(pipe, X_train, target_opset=12)
            oinf = OnnxInference(model_onnx)

            # The dataframe is converted into a dictionary,
            # each key is a column name, each value is a numpy array.
            inputs = {c: X_train[c].values for c in X_train.columns}
            inputs = {c: v.reshape((v.shape[0], 1)) for c, v in inputs.items()}

            onxp = oinf.run(inputs)
            print(onxp)

    .. versionchanged:: 0.7
        Parameter *rename_strategy* was added.
    """
    logger.debug("to_onnx(%s, X=%r, initial_types=%r, target_opset=%r, "
                 "options=%r, rewrite_ops=%r, white_op=%r, black_op=%r, "
                 "final_types=%r)",
                 model.__class__.__name__, type(X), initial_types,
                 target_opset, options, rewrite_ops, white_op, black_op,
                 final_types)

    if isinstance(model, OnnxOperatorMixin):
        # The model knows how to convert itself.
        if not hasattr(model, 'op_version'):
            raise RuntimeError(  # pragma: no cover
                "Missing attribute 'op_version' for type '{}'.".format(
                    type(model)))
        _fix_opset_skl2onnx()
        return model.to_onnx(
            X=X, name=name, options=options, black_op=black_op,
            white_op=white_op, final_types=final_types,
            target_opset=target_opset)

    if rewrite_ops:
        old_values, old_shapes = register_rewritten_operators()
        register_converters()
    else:
        old_values, old_shapes = {}, {}

    def _guess_type_(X, itype, dtype):
        # Returns the initial types, the dtype and a numpy dtype
        # checked against the list of supported real types.
        initial_types = guess_initial_types(X, itype)
        if dtype is None:
            if hasattr(X, 'dtypes'):  # DataFrame
                dtype = numpy.float32
            elif hasattr(X, 'dtype'):
                dtype = X.dtype
            elif hasattr(X, 'type'):
                dtype = guess_numpy_type(X.type)
            elif initial_types is not None:
                dtype = guess_numpy_type(initial_types[0][1])
            else:
                raise RuntimeError(  # pragma: no cover
                    "dtype cannot be guessed: {}".format(
                        type(X)))
            # A guessed dtype is either double or float.
            if dtype != numpy.float64:
                dtype = numpy.float32
        if dtype is None:
            raise RuntimeError("dtype cannot be None")  # pragma: no cover
        if isinstance(dtype, FloatTensorType):
            dtype = numpy.float32  # pragma: no cover
        elif isinstance(dtype, DoubleTensorType):
            dtype = numpy.float64  # pragma: no cover
        new_dtype = dtype
        if isinstance(dtype, numpy.ndarray):
            new_dtype = dtype.dtype  # pragma: no cover
        elif isinstance(dtype, DataType):
            new_dtype = numpy.float32  # pragma: no cover
        if new_dtype not in (numpy.float32, numpy.float64, numpy.int64,
                             numpy.int32, numpy.float16):
            raise NotImplementedError(  # pragma: no cover
                "dtype should be real not {} ({})".format(new_dtype, dtype))
        return initial_types, dtype, new_dtype

    try:
        if isinstance(model, _PredictScorer):
            if X is not None and not isinstance(X, OrderedDict):
                raise ValueError("For a scorer, parameter X should be a OrderedDict not {}."
                                 "".format(type(X)))
            if initial_types is None:
                if X is None:
                    # Fails with an explicit message instead of an
                    # AttributeError on ``X.items()``.
                    raise ValueError(
                        "For a scorer, parameter X or initial_types "
                        "must be specified.")
                dts = []
                initial_types = []
                for k, v in X.items():
                    if hasattr(v, 'dtype'):
                        dtype = guess_numpy_type(v.dtype)
                    else:
                        dtype = v  # pragma: no cover
                    it, _, ndt = _guess_type_(v, None, dtype)
                    # Every guessed type is named after the key in X.
                    initial_types.extend((k, entry[1]) for entry in it)
                    dts.append(ndt)
                ndt = set(dts)
                if len(ndt) != 1:
                    raise RuntimeError(  # pragma: no cover
                        "Multiple dtype is not efficient {}.".format(ndt))
            res = convert_scorer(model, initial_types, name=name,
                                 target_opset=target_opset, options=options,
                                 black_op=black_op, white_op=white_op,
                                 final_types=final_types, verbose=verbose)
        else:
            if name is None:
                name = "mlprodict_ONNX(%s)" % model.__class__.__name__

            initial_types, _, _ = _guess_type_(X, initial_types, None)

            _fix_opset_skl2onnx()
            res = convert_sklearn(model, initial_types=initial_types, name=name,
                                  target_opset=target_opset, options=options,
                                  black_op=black_op, white_op=white_op,
                                  final_types=final_types, verbose=verbose)
    finally:
        # Presumably restores the converters saved before the conversion
        # (TODO confirm); it now also runs when the conversion fails.
        register_rewritten_operators(old_values, old_shapes)

    # optimisation
    if rename_strategy is not None:
        res = onnx_rename_names(res, strategy=rename_strategy)
    return res