Coverage for mlprodict/onnx_conv/convert.py: 91%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- encoding: utf-8 -*-
2"""
3@file
4@brief Overloads a conversion function.
5"""
6import pprint
7from collections import OrderedDict
8import logging
9import numpy
10import pandas
11try:
12 from sklearn.metrics._scorer import _PredictScorer
13except ImportError: # pragma: no cover
14 # scikit-learn < 0.22
15 from sklearn.metrics.scorer import _PredictScorer
16from sklearn import __all__ as sklearn__all__, __version__ as sklearn_version
17from skl2onnx.common.data_types import (
18 FloatTensorType, DoubleTensorType, DataType, guess_numpy_type,
19 StringTensorType, Int64TensorType)
20from skl2onnx import convert_sklearn
21from skl2onnx.algebra.onnx_operator_mixin import OnnxOperatorMixin
22from skl2onnx.algebra.type_helper import _guess_type
23from ..onnx_tools.onnx_manipulations import onnx_rename_names
24from .register_rewritten_converters import register_rewritten_operators
25from .register import register_converters
26from .scorers import CustomScorerTransform
29logger = logging.getLogger('mlprodict')
def _fix_opset_skl2onnx():
    """
    Overwrites ``skl2onnx.__max_supported_opset__`` with the value
    declared by this package whenever both values differ.
    """
    import skl2onnx
    from .. import __max_supported_opset__ as package_opset
    if skl2onnx.__max_supported_opset__ == package_opset:
        # Already aligned, nothing to patch.
        return
    skl2onnx.__max_supported_opset__ = package_opset
def convert_scorer(fct, initial_types, name=None,
                   target_opset=None, options=None,
                   custom_conversion_functions=None,
                   custom_shape_calculators=None,
                   custom_parsers=None, white_op=None,
                   black_op=None, final_types=None,
                   verbose=0):
    """
    Converts a scorer into :epkg:`ONNX` assuming
    there exists a converter associated to it.
    The function wraps the function into a custom
    transformer, then calls function *convert_sklearn*
    from :epkg:`sklearn-onnx`.

    :param fct: function to convert (or a scorer from :epkg:`scikit-learn`)
    :param initial_types: types information
    :param name: name of the produced model, if None, a name is
        built from the name of the scoring function
    :param target_opset: to do it with a different target opset
    :param options: additional parameters for the conversion
    :param custom_conversion_functions: a dictionary for specifying the user
        customized conversion function, it takes precedence over
        registered converters
    :param custom_shape_calculators: a dictionary for specifying the user
        customized shape calculator it takes precedence over registered
        shape calculators.
    :param custom_parsers: parsers determine which outputs is expected
        for which particular task, default parsers are
        defined for classifiers, regressors, pipeline but
        they can be rewritten, *custom_parsers* is a dictionary
        ``{ type: fct_parser(scope, model, inputs,
        custom_parsers=None) }``
    :param white_op: white list of ONNX nodes allowed
        while converting a pipeline, if empty, all are allowed
    :param black_op: black list of ONNX nodes not allowed
        while converting a pipeline, if empty, none are blacklisted
    :param final_types: a python list. Works the same way as
        initial_types but not mandatory, it is used
        to overwrites the type (if type is not None)
        and the name of every output.
    :param verbose: displays information while converting
    :return: :epkg:`ONNX` graph
    """
    if hasattr(fct, '_score_func'):
        # A scorer object: unwrap the scoring function and the
        # keyword arguments stored next to it.
        kwargs = fct._kwargs
        fct = fct._score_func
    else:
        kwargs = None  # pragma: no cover
    if name is None:
        name = "mlprodict_fct_ONNX(%s)" % fct.__name__
    tr = CustomScorerTransform(fct.__name__, fct, kwargs)
    _fix_opset_skl2onnx()
    # *name* was previously computed but never forwarded, so the
    # requested model name was ignored.
    return convert_sklearn(
        tr, initial_types=initial_types, name=name,
        target_opset=target_opset, options=options,
        custom_conversion_functions=custom_conversion_functions,
        custom_shape_calculators=custom_shape_calculators,
        custom_parsers=custom_parsers, white_op=white_op,
        black_op=black_op, final_types=final_types,
        verbose=verbose)
def guess_initial_types(X, initial_types):
    """
    Returns *initial_types* when it is given, otherwise
    guesses it from an array or a dataframe.

    @param      X               array or dataframe
    @param      initial_types   hints about X
    @return                     data types
    """
    if initial_types is not None:
        # Explicit hints always win over guessing.
        return initial_types
    if X is None:
        raise NotImplementedError(  # pragma: no cover
            "Initial types must be specified.")

    if isinstance(X, (numpy.ndarray, pandas.DataFrame)):
        # Only the first row is used to guess the types.
        X = X[:1]

    if not isinstance(X, pandas.DataFrame):
        return [('X', _guess_type(X))]

    # One named input per dataframe column, each of shape [None, 1].
    guessed = []
    for col in X.columns:
        values = X[col].values
        if isinstance(values[0], (str, numpy.str_)):
            col_type = StringTensorType()
        else:
            col_type = _guess_type(values)
        col_type.shape = [None, 1]
        guessed.append((col, col_type))
    return guessed
129def _replace_tensor_type(schema, tensor_type):
130 res = []
131 for name, ty in schema:
132 cl = ty.__class__
133 if cl in (FloatTensorType, DoubleTensorType) and cl != tensor_type:
134 ty = tensor_type(ty.shape)
135 res.append((name, ty))
136 return res
def guess_schema_from_data(X, tensor_type=None, schema=None):
    """
    Guesses initial types from a dataset.

    @param      X               dataset (dataframe, array)
    @param      tensor_type     if not None, replaces every
                                *FloatTensorType* or *DoubleTensorType*
                                by this one
    @param      schema          known schema
    @return                     schema (list of typed and named columns)

    When every column is a two-dimensional tensor of the same type
    with an unknown first dimension, the columns are merged into
    a single input named ``'X'`` whose second dimension is the sum
    of the second dimension of every column.
    """
    init = guess_initial_types(X, schema)
    if tensor_type is not None:
        init = _replace_tensor_type(init, tensor_type)
    if not init:
        # No column at all: there is nothing to merge.
        return init

    # Grouping columns: only possible if all of them share
    # the same type and have shape [None, n].
    common = init[0][1].__class__
    for _, col in init:
        if len(col.shape) != 2:
            return init  # pragma: no cover
        if col.shape[0] is not None:
            return init  # pragma: no cover
        if col.__class__ is not common:
            return init  # pragma: no cover
    width = sum(col.shape[1] for _, col in init)
    return [('X', common([None, width]))]
def get_inputs_from_data(X, schema=None):
    """
    Builds the dictionary of inputs expected by an *onnx* runtime.

    @param      X           data, a numpy array or a dataframe
    @param      schema      schema, if None, the schema is guessed with
                            @see fn guess_schema_from_data
    @return                 input data, a dictionary ``{ name: array }``
    """
    def _cast_data(values, col_type):
        # Maps a tensor type to the numpy type the values are cast to,
        # the first matching entry wins.
        conversions = (
            (FloatTensorType, numpy.float32),
            (DoubleTensorType, numpy.float64),
            (StringTensorType, numpy.str_),
            (Int64TensorType, numpy.int64),
        )
        for tensor_cls, np_type in conversions:
            if isinstance(col_type, tensor_cls):
                return values.astype(np_type)
        raise RuntimeError(  # pragma: no cover
            "Unexpected column type {} for type {}."
            "".format(col_type, type(values)))

    if schema is None:
        schema = guess_schema_from_data(X)

    if isinstance(X, numpy.ndarray):
        # An array maps to exactly one input.
        if len(schema) != 1:
            raise RuntimeError(  # pragma: no cover
                "More than one column but input is an array.")
        input_name, input_type = schema[0][0], schema[0][1]
        return {input_name: _cast_data(X, input_type)}

    if isinstance(X, pandas.DataFrame):
        if len(schema) != X.shape[1]:
            raise RuntimeError(  # pragma: no cover
                "Mismatch between onnx columns {} and DataFrame columns {}"
                "".format(len(schema), X.shape[1]))
        # Schema entries and dataframe columns are paired by position,
        # every column becomes a (n, 1) array.
        feeds = {}
        for entry, column in zip(schema, X.columns):
            casted = _cast_data(X[column].values, entry[1])
            feeds[entry[0]] = casted.reshape((-1, 1))
        return feeds

    raise TypeError(  # pragma: no cover
        "Unexpected type {}, expecting an array or a dataframe."
        "".format(type(X)))
def guess_schema_from_model(model, tensor_type=None, schema=None):
    """
    Guesses initial types from a model.

    @param      model           model
    @param      tensor_type     if not None, replaces every
                                *FloatTensorType* or *DoubleTensorType*
                                by this one
    @param      schema          known schema
    @return                     schema (list of typed and named columns)

    The schema is guessed from attribute ``coef_`` (one input ``'X'``
    whose width is the last dimension of ``coef_``) or from key
    ``'feature_names'`` of the dictionary returned by method
    ``dump_model`` (one input per feature). A *NotImplementedError*
    is raised when none of them is available. If *schema* is given,
    it is returned after checking its length against the guessed one,
    a *RuntimeError* is raised if they differ.
    """
    def _retype(columns):
        # tensor_type=None means "keep the types": the helper must not
        # be asked to instantiate None.
        if tensor_type is None:
            return [(name, ty) for name, ty in columns]
        return _replace_tensor_type(columns, tensor_type)

    if schema is not None:
        try:
            guessed = guess_schema_from_model(model)
        except NotImplementedError:  # pragma: no cover
            # Nothing to compare with, the given schema is trusted.
            return _retype(schema)
        if len(guessed) != len(schema):
            raise RuntimeError(  # pragma: no cover
                "Given schema and guessed schema are not the same:\nGOT: {}\n-----\nGUESSED:\n{}".format(
                    schema, guessed))
        return _retype(schema)

    if hasattr(model, 'coef_'):
        # linear model, coef_ is (n_features,) or (n_outputs, n_features):
        # the last dimension is the number of features in both cases.
        n_features = model.coef_.shape[-1]
        return _retype([('X', FloatTensorType([None, n_features]))])

    has_dump = hasattr(model, 'dump_model')
    dumped = None
    if has_dump:
        dumped = model.dump_model()
        if isinstance(dumped, dict) and 'feature_names' in dumped:
            names = dumped['feature_names']
            return _retype(
                [(name, FloatTensorType([None, 1])) for name in names])

    # Nothing worked, the exception gives as much context as possible.
    data = pprint.pformat(getattr(model, '__dict__', {}))
    dirs = pprint.pformat(dir(model))
    if has_dump:  # pragma: no cover
        # The dump computed above is reused instead of dumping twice.
        keys = sorted(dumped)
        last = pprint.pformat([keys, dumped])
        if len(last) >= 200000:
            last = last[:200000] + "\n..."
    else:
        last = ""
    raise NotImplementedError(  # pragma: no cover
        "Unable to guess schema for model {}\n{}\n----\n{}\n------\n{}".format(
            model.__class__, data, dirs, last))
def to_onnx(model, X=None, name=None, initial_types=None,
            target_opset=None, options=None, rewrite_ops=False,
            white_op=None, black_op=None, final_types=None,
            rename_strategy=None, verbose=0):
    """
    Converts a model using :epkg:`sklearn-onnx`.

    :param model: model to convert or a function
        wrapped into :epkg:`_PredictScorer` with
        function :epkg:`make_scorer`
    :param X: training set (at least one row),
        can be None, it is used to infer the
        input types (*initial_types*), it must be
        an *OrderedDict* if *model* is a scorer
    :param initial_types: if *X* is None, then *initial_types*
        must be defined
    :param name: name of the produced model
    :param target_opset: to do it with a different target opset
    :param options: additional parameters for the conversion
    :param rewrite_ops: rewrites some existing converters,
        the changes are permanent
    :param white_op: white list of ONNX nodes allowed
        while converting a pipeline, if empty, all are allowed
    :param black_op: black list of ONNX nodes not allowed
        while converting a pipeline, if empty,
        none are blacklisted
    :param final_types: a python list. Works the same way as
        initial_types but not mandatory, it is used
        to overwrites the type (if type is not None)
        and the name of every output.
    :param rename_strategy: rename any name in the graph, select shorter
        names, see @see fn onnx_rename_names
    :param verbose: display information while converting the model
    :return: converted model

    The function rewrites function *to_onnx* from :epkg:`sklearn-onnx`
    but may change a few converters if *rewrite_ops* is True.
    For example, :epkg:`ONNX` only supports *TreeEnsembleRegressor*
    for float but not for double. It becomes available
    if ``rewrite_ops=True``.

    .. faqref::
        :title: How to deal with a dataframe as input?

        Each column of the dataframe is considered as a named input.
        The first step is to make sure that every column type is correct.
        :epkg:`pandas` tends to select the least generic type to
        hold the content of one column. :epkg:`ONNX` does not automatically
        cast the data it receives. The data must have the same type when
        the model is converted and when the converted model receives
        the data to predict.

        .. runpython::
            :showcode:
            :warningout: DeprecationWarning

            from io import StringIO
            from textwrap import dedent
            import numpy
            import pandas
            from pyquickhelper.pycode import ExtTestCase
            from sklearn.preprocessing import OneHotEncoder
            from sklearn.pipeline import Pipeline
            from sklearn.compose import ColumnTransformer
            from mlprodict.onnx_conv import to_onnx
            from mlprodict.onnxrt import OnnxInference

            text = dedent('''
                __SCHEMA__
                7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
                7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
                7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
                11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
                ''')
            text = text.replace(
                "__SCHEMA__",
                "fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,"
                "free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,"
                "alcohol,quality,color")

            X_train = pandas.read_csv(StringIO(text))
            for c in X_train.columns:
                if c != 'color':
                    X_train[c] = X_train[c].astype(numpy.float32)
            numeric_features = [c for c in X_train if c != 'color']

            pipe = Pipeline([
                ("prep", ColumnTransformer([
                    ("color", Pipeline([
                        ('one', OneHotEncoder()),
                        ('select', ColumnTransformer(
                            [('sel1', 'passthrough', [0])]))
                    ]), ['color']),
                    ("others", "passthrough", numeric_features)
                ])),
            ])

            pipe.fit(X_train)
            pred = pipe.transform(X_train)
            print(pred)

            model_onnx = to_onnx(pipe, X_train, target_opset=12)
            oinf = OnnxInference(model_onnx)

            # The dataframe is converted into a dictionary,
            # each key is a column name, each value is a numpy array.
            inputs = {c: X_train[c].values for c in X_train.columns}
            inputs = {c: v.reshape((v.shape[0], 1)) for c, v in inputs.items()}

            onxp = oinf.run(inputs)
            print(onxp)

    .. versionchanged:: 0.7
        Parameter *rename_strategy* was added.
    """
    logger.debug("to_onnx(%s, X=%r, initial_types=%r, target_opset=%r, "
                 "options=%r, rewrite_ops=%r, white_op=%r, black_op=%r, "
                 "final_types=%r)",
                 model.__class__.__name__, type(X), initial_types,
                 target_opset, options, rewrite_ops, white_op, black_op,
                 final_types)

    if isinstance(model, OnnxOperatorMixin):
        # The model knows how to convert itself.
        if not hasattr(model, 'op_version'):
            raise RuntimeError(  # pragma: no cover
                "Missing attribute 'op_version' for type '{}'.".format(
                    type(model)))
        _fix_opset_skl2onnx()
        # NOTE: initial_types, rename_strategy and verbose
        # are not forwarded on this path.
        return model.to_onnx(
            X=X, name=name, options=options, black_op=black_op,
            white_op=white_op, final_types=final_types,
            target_opset=target_opset)

    if rewrite_ops:
        old_values, old_shapes = register_rewritten_operators()
        register_converters()
    else:
        old_values, old_shapes = {}, {}

    def _guess_type_(X, itype, dtype):
        # Returns (initial_types, dtype, new_dtype) where *new_dtype*
        # is the numpy type checked against the supported real types.
        initial_types = guess_initial_types(X, itype)
        if dtype is None:
            if hasattr(X, 'dtypes'):  # DataFrame
                dtype = numpy.float32
            elif hasattr(X, 'dtype'):
                dtype = X.dtype
            elif hasattr(X, 'type'):
                dtype = guess_numpy_type(X.type)
            elif initial_types is not None:
                dtype = guess_numpy_type(initial_types[0][1])
            else:
                raise RuntimeError(  # pragma: no cover
                    "dtype cannot be guessed: {}".format(
                        type(X)))
            # A guessed type is either double or float.
            if dtype != numpy.float64:
                dtype = numpy.float32
        if dtype is None:
            raise RuntimeError("dtype cannot be None")  # pragma: no cover
        if isinstance(dtype, FloatTensorType):
            dtype = numpy.float32  # pragma: no cover
        elif isinstance(dtype, DoubleTensorType):
            dtype = numpy.float64  # pragma: no cover
        new_dtype = dtype
        if isinstance(dtype, numpy.ndarray):
            new_dtype = dtype.dtype  # pragma: no cover
        elif isinstance(dtype, DataType):
            new_dtype = numpy.float32  # pragma: no cover
        if new_dtype not in (numpy.float32, numpy.float64, numpy.int64,
                             numpy.int32, numpy.float16):
            raise NotImplementedError(  # pragma: no cover
                "dtype should be real not {} ({})".format(new_dtype, dtype))
        return initial_types, dtype, new_dtype

    try:
        if isinstance(model, _PredictScorer):
            if X is not None and not isinstance(X, OrderedDict):
                raise ValueError("For a scorer, parameter X should be a OrderedDict not {}."
                                 "".format(type(X)))
            if initial_types is None:
                if X is None:
                    # Without this check, the failure is an obscure
                    # AttributeError on ``X.items()``.
                    raise ValueError(
                        "For a scorer, X or initial_types must be specified.")
                dts = []
                initial_types = []
                for k, v in X.items():
                    if hasattr(v, 'dtype'):
                        dtype = guess_numpy_type(v.dtype)
                    else:
                        dtype = v  # pragma: no cover
                    it, _, ndt = _guess_type_(v, None, dtype)
                    # Every guessed input takes the name of its key in X.
                    initial_types.extend((k, item[1]) for item in it)
                    dts.append(ndt)
                unique_dtypes = set(dts)
                if len(unique_dtypes) != 1:
                    raise RuntimeError(  # pragma: no cover
                        "Multiple dtype is not efficient {}.".format(
                            unique_dtypes))
            res = convert_scorer(model, initial_types, name=name,
                                 target_opset=target_opset, options=options,
                                 black_op=black_op, white_op=white_op,
                                 final_types=final_types, verbose=verbose)
        else:
            if name is None:
                name = "mlprodict_ONNX(%s)" % model.__class__.__name__

            initial_types, _, _ = _guess_type_(X, initial_types, None)

            _fix_opset_skl2onnx()
            res = convert_sklearn(model, initial_types=initial_types, name=name,
                                  target_opset=target_opset, options=options,
                                  black_op=black_op, white_op=white_op,
                                  final_types=final_types, verbose=verbose)
    finally:
        # Same call as on the success path (presumably puts back the
        # converters saved above -- confirm against
        # register_rewritten_operators); it now also runs when the
        # conversion fails.
        register_rewritten_operators(old_values, old_shapes)

    # optimisation
    if rename_strategy is not None:
        res = onnx_rename_names(res, strategy=rename_strategy)
    return res