Coverage for mlprodict/onnx_conv/convert.py: 91%

1# -*- encoding: utf-8 -*-

2"""

3@file

4@brief Overloads a conversion function.

5"""

6import pprint

7from collections import OrderedDict

8import logging

9import numpy

10import pandas

11try:

12 from sklearn.metrics._scorer import _PredictScorer

13except ImportError: # pragma: no cover

14 # scikit-learn < 0.22

15 from sklearn.metrics.scorer import _PredictScorer

16from sklearn import __all__ as sklearn__all__, __version__ as sklearn_version

17from skl2onnx.common.data_types import (

18 FloatTensorType, DoubleTensorType, DataType, guess_numpy_type,

19 StringTensorType, Int64TensorType)

20from skl2onnx import convert_sklearn

21from skl2onnx.algebra.onnx_operator_mixin import OnnxOperatorMixin

22from skl2onnx.algebra.type_helper import _guess_type

23from ..onnx_tools.onnx_manipulations import onnx_rename_names

24from .register_rewritten_converters import register_rewritten_operators

25from .register import register_converters

26from .scorers import CustomScorerTransform

29logger = logging.getLogger('mlprodict')

def _fix_opset_skl2onnx():
    """
    Aligns ``skl2onnx.__max_supported_opset__`` with the value
    ``__max_supported_opset__`` exposed by the parent package.
    The attribute is only written when both values differ.
    """
    import skl2onnx
    from .. import __max_supported_opset__ as expected_opset
    if skl2onnx.__max_supported_opset__ == expected_opset:
        return
    skl2onnx.__max_supported_opset__ = expected_opset

def convert_scorer(fct, initial_types, name=None,
                   target_opset=None, options=None,
                   custom_conversion_functions=None,
                   custom_shape_calculators=None,
                   custom_parsers=None, white_op=None,
                   black_op=None, final_types=None,
                   verbose=0):
    """
    Converts a scorer into :epkg:`ONNX` assuming
    there exists a converter associated to it.
    The function wraps the function into a custom
    transformer, then calls function *convert_sklearn*
    from :epkg:`sklearn-onnx`.

    :param fct: function to convert (or a scorer from :epkg:`scikit-learn`),
        if it has an attribute *_score_func*, that function and
        the attribute *_kwargs* are used
    :param initial_types: types information
    :param name: name of the produced model, if None, it is
        built from the name of the scoring function
    :param target_opset: to do it with a different target opset
    :param options: additional parameters for the conversion
    :param custom_conversion_functions: a dictionary for specifying the user
        customized conversion function, it takes precedence over
        registered converters
    :param custom_shape_calculators: a dictionary for specifying the user
        customized shape calculator it takes precedence over registered
        shape calculators.
    :param custom_parsers: parsers determine which outputs is expected
        for which particular task, default parsers are
        defined for classifiers, regressors, pipeline but
        they can be rewritten, *custom_parsers* is a dictionary
        ``{ type: fct_parser(scope, model, inputs,
        custom_parsers=None) }``
    :param white_op: white list of ONNX nodes allowed
        while converting a pipeline, if empty, all are allowed
    :param black_op: black list of ONNX nodes disallowed
        while converting a pipeline, if empty, none are blacklisted
    :param final_types: a python list. Works the same way as
        initial_types but not mandatory, it is used
        to overwrites the type (if type is not None)
        and the name of every output.
    :param verbose: displays information while converting
    :return: :epkg:`ONNX` graph
    """
    if hasattr(fct, '_score_func'):
        # a scikit-learn scorer: unwrap the scoring function
        # and the keyword arguments it was built with
        kwargs = fct._kwargs
        fct = fct._score_func
    else:
        kwargs = None  # pragma: no cover
    if name is None:
        name = "mlprodict_fct_ONNX(%s)" % fct.__name__
    tr = CustomScorerTransform(fct.__name__, fct, kwargs)
    _fix_opset_skl2onnx()
    # name is forwarded, otherwise the value computed above is lost
    return convert_sklearn(
        tr, initial_types=initial_types, name=name,
        target_opset=target_opset, options=options,
        custom_conversion_functions=custom_conversion_functions,
        custom_shape_calculators=custom_shape_calculators,
        custom_parsers=custom_parsers, white_op=white_op,
        black_op=black_op, final_types=final_types,
        verbose=verbose)

def guess_initial_types(X, initial_types):
    """
    Guesses initial types from an array or a dataframe.

    @param      X               array or dataframe
    @param      initial_types   hints about X, returned as is
                                when not None
    @return                     data types

    A dataframe produces one entry per column with shape
    ``[None, 1]``, anything else produces a single entry named ``'X'``.
    """
    if initial_types is not None:
        return initial_types
    if X is None:
        raise NotImplementedError(  # pragma: no cover
            "Initial types must be specified.")

    # the first row is enough to guess the types
    if isinstance(X, (numpy.ndarray, pandas.DataFrame)):
        X = X[:1]
    if not isinstance(X, pandas.DataFrame):
        return [('X', _guess_type(X))]

    guessed = []
    for col in X.columns:
        values = X[col].values
        if isinstance(values[0], (str, numpy.str_)):
            col_type = StringTensorType()
        else:
            col_type = _guess_type(values)
        col_type.shape = [None, 1]
        guessed.append((col, col_type))
    return guessed

127

128

129def _replace_tensor_type(schema, tensor_type):

130 res = []

131 for name, ty in schema:

132 cl = ty.__class__

133 if cl in (FloatTensorType, DoubleTensorType) and cl != tensor_type:

134 ty = tensor_type(ty.shape)

135 res.append((name, ty))

136 return res

137

138

def guess_schema_from_data(X, tensor_type=None, schema=None):
    """
    Guesses initial types from a dataset.

    @param      X               dataset (dataframe, array)
    @param      tensor_type     if not None, replaces every
                                *FloatTensorType* or *DoubleTensorType*
                                by this one
    @param      schema          known schema
    @return                     schema (list of typed and named columns)

    When every column is two-dimensional, has an unknown first
    dimension, a known second dimension and the same type,
    the columns are merged into a single input named ``'X'``.
    Otherwise the schema is returned with one entry per column.
    """
    init = guess_initial_types(X, schema)
    if tensor_type is not None:
        init = _replace_tensor_type(init, tensor_type)
    if not init:
        # no column, nothing to group
        return init

    # Grouping column
    kinds = set()
    n_features = 0
    for _, col in init:
        shape = col.shape
        if len(shape) != 2:
            return init  # pragma: no cover
        if shape[0] is not None:
            return init  # pragma: no cover
        if shape[1] is None:
            # unknown number of features, widths cannot be added
            return init
        if kinds and col.__class__ not in kinds:
            return init  # pragma: no cover
        kinds.add(col.__class__)
        n_features += shape[1]
    # the loop guarantees a single type at this point
    kind = next(iter(kinds))
    return [('X', kind([None, n_features]))]

165

166

def get_inputs_from_data(X, schema=None):
    """
    Produces input data for *onnx* runtime.

    @param      X           data, a numpy array or a dataframe
    @param      schema      schema if None, schema is guessed with
                            @see fn guess_schema_from_data
    @return                 input data, a dictionary
                            ``{ input name: numpy array }``
    """
    def _cast_data(X, ct):
        # first matching tensor type decides the numpy type
        casts = (
            (FloatTensorType, numpy.float32),
            (DoubleTensorType, numpy.float64),
            (StringTensorType, numpy.str_),
            (Int64TensorType, numpy.int64),
        )
        for tensor_cls, np_type in casts:
            if isinstance(ct, tensor_cls):
                return X.astype(np_type)
        raise RuntimeError(  # pragma: no cover
            "Unexpected column type {} for type {}."
            "".format(ct, type(X)))

    if schema is None:
        schema = guess_schema_from_data(X)

    if isinstance(X, numpy.ndarray):
        # an array maps to a single input
        if len(schema) != 1:
            raise RuntimeError(  # pragma: no cover
                "More than one column but input is an array.")
        first = schema[0]
        return {first[0]: _cast_data(X, first[1])}

    if isinstance(X, pandas.DataFrame):
        # one input per column, columns and schema are paired by position
        if len(schema) != X.shape[1]:
            raise RuntimeError(  # pragma: no cover
                "Mismatch between onnx columns {} and DataFrame columns {}"
                "".format(len(schema), X.shape[1]))
        feeds = {}
        for sch, c in zip(schema, X.columns):
            feeds[sch[0]] = _cast_data(X[c].values, sch[1]).reshape((-1, 1))
        return feeds

    raise TypeError(  # pragma: no cover
        "Unexpected type {}, expecting an array or a dataframe."
        "".format(type(X)))

206

207

def guess_schema_from_model(model, tensor_type=None, schema=None):
    """
    Guesses initial types from a model.

    @param      model           model
    @param      tensor_type     if not None, replaces every
                                *FloatTensorType* or *DoubleTensorType*
                                by this one
    @param      schema          known schema
    @return                     schema (list of typed and named columns)

    The schema is guessed from attribute *coef_* (last dimension
    is the number of features) or from the key ``'feature_names'``
    of the dictionary returned by method *dump_model*.
    A *NotImplementedError* is raised if none of them is available
    and no schema is given.
    """
    def _retype(sch):
        # same guard as in guess_schema_from_data:
        # nothing is replaced when tensor_type is None
        if tensor_type is None:
            return list(sch)
        return _replace_tensor_type(sch, tensor_type)

    if schema is not None:
        try:
            guessed = guess_schema_from_model(model)
        except NotImplementedError:  # pragma: no cover
            return _retype(schema)
        if len(guessed) != len(schema):
            raise RuntimeError(  # pragma: no cover
                "Given schema and guessed schema are not the same:\n"
                "GOT: {}\n-----\nGUESSED:\n{}".format(schema, guessed))
        return _retype(schema)

    if hasattr(model, 'coef_'):
        # linear model, coef_ may have one or two dimensions,
        # the last one is used as the number of features
        init = [('X', FloatTensorType([None, model.coef_.shape[-1]]))]
        return _retype(init)

    has_dump = hasattr(model, 'dump_model')
    dumped = None
    if has_dump:
        dumped = model.dump_model()
        if isinstance(dumped, dict) and 'feature_names' in dumped:
            names = dumped['feature_names']
            init = [(name, FloatTensorType([None, 1])) for name in names]
            return _retype(init)

    # failure: the exception message describes the model
    data = pprint.pformat(model.__dict__)
    dirs = pprint.pformat(dir(model))
    if has_dump:  # pragma: no cover
        # the dump obtained above is reused
        keys = list(sorted(dumped))
        last = pprint.pformat([keys, dumped])
        if len(last) >= 200000:
            last = last[:200000] + "\n..."
    else:
        last = ""
    raise NotImplementedError(  # pragma: no cover
        "Unable to guess schema for model {}\n{}\n----\n{}\n------\n{}".format(
            model.__class__, data, dirs, last))

254

255

def to_onnx(model, X=None, name=None, initial_types=None,
            target_opset=None, options=None, rewrite_ops=False,
            white_op=None, black_op=None, final_types=None,
            rename_strategy=None, verbose=0):
    """
    Converts a model using :epkg:`sklearn-onnx`.

    :param model: model to convert or a function
        wrapped into :epkg:`_PredictScorer` with
        function :epkg:`make_scorer`
    :param X: training set (at least one row),
        can be None, it is used to infer the
        input types (*initial_types*), for a scorer,
        it must be an *OrderedDict*
    :param initial_types: if *X* is None, then *initial_types*
        must be defined
    :param name: name of the produced model
    :param target_opset: to do it with a different target opset
    :param options: additional parameters for the conversion
    :param rewrite_ops: rewrites some existing converters
        with *register_rewritten_operators*, the values this
        function returns are handed back to it once the
        conversion is done or has failed; nothing in this function
        undoes the call to *register_converters*
    :param white_op: white list of ONNX nodes allowed
        while converting a pipeline, if empty, all are allowed
    :param black_op: black list of ONNX nodes disallowed
        while converting a pipeline, if empty,
        none are blacklisted
    :param final_types: a python list. Works the same way as
        initial_types but not mandatory, it is used
        to overwrites the type (if type is not None)
        and the name of every output.
    :param rename_strategy: rename any name in the graph, select shorter
        names, see @see fn onnx_rename_names
    :param verbose: display information while converting the model
    :return: converted model

    The function rewrites function *to_onnx* from :epkg:`sklearn-onnx`
    but may change a few converters if *rewrite_ops* is True.
    For example, :epkg:`ONNX` only supports *TreeEnsembleRegressor*
    for float but not for double. It becomes available
    if ``rewrite_ops=True``.

    .. faqref::
        :title: How to deal with a dataframe as input?

        Each column of the dataframe is considered as a named input.
        The first step is to make sure that every column type is correct.
        :epkg:`pandas` tends to select the least generic type to
        hold the content of one column. :epkg:`ONNX` does not automatically
        cast the data it receives. The data must have the same type when
        the model is converted and when the converted model receives
        the data to predict.

        .. runpython::
            :showcode:
            :warningout: DeprecationWarning

            from io import StringIO
            from textwrap import dedent
            import numpy
            import pandas
            from pyquickhelper.pycode import ExtTestCase
            from sklearn.preprocessing import OneHotEncoder
            from sklearn.pipeline import Pipeline
            from sklearn.compose import ColumnTransformer
            from mlprodict.onnx_conv import to_onnx
            from mlprodict.onnxrt import OnnxInference

            text = dedent('''
                __SCHEMA__
                7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
                7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
                7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
                11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
            ''')
            text = text.replace(
                "__SCHEMA__",
                "fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,"
                "free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,"
                "alcohol,quality,color")

            X_train = pandas.read_csv(StringIO(text))
            for c in X_train.columns:
                if c != 'color':
                    X_train[c] = X_train[c].astype(numpy.float32)
            numeric_features = [c for c in X_train if c != 'color']

            pipe = Pipeline([
                ("prep", ColumnTransformer([
                    ("color", Pipeline([
                        ('one', OneHotEncoder()),
                        ('select', ColumnTransformer(
                            [('sel1', 'passthrough', [0])]))
                    ]), ['color']),
                    ("others", "passthrough", numeric_features)
                ])),
            ])

            pipe.fit(X_train)
            pred = pipe.transform(X_train)
            print(pred)

            model_onnx = to_onnx(pipe, X_train, target_opset=12)
            oinf = OnnxInference(model_onnx)

            # The dataframe is converted into a dictionary,
            # each key is a column name, each value is a numpy array.
            inputs = {c: X_train[c].values for c in X_train.columns}
            inputs = {c: v.reshape((v.shape[0], 1)) for c, v in inputs.items()}

            onxp = oinf.run(inputs)
            print(onxp)

    .. versionchanged:: 0.7
        Parameter *rename_strategy* was added.
    """
    logger.debug("to_onnx(%s, X=%r, initial_types=%r, target_opset=%r, "
                 "options=%r, rewrite_ops=%r, white_op=%r, black_op=%r, "
                 "final_types=%r)",
                 model.__class__.__name__, type(X), initial_types,
                 target_opset, options, rewrite_ops, white_op, black_op,
                 final_types)

    if isinstance(model, OnnxOperatorMixin):
        # the model knows how to convert itself
        if not hasattr(model, 'op_version'):
            raise RuntimeError(  # pragma: no cover
                "Missing attribute 'op_version' for type '{}'.".format(
                    type(model)))
        _fix_opset_skl2onnx()
        return model.to_onnx(
            X=X, name=name, options=options, black_op=black_op,
            white_op=white_op, final_types=final_types,
            target_opset=target_opset)

    if rewrite_ops:
        old_values, old_shapes = register_rewritten_operators()
        register_converters()
    else:
        old_values, old_shapes = {}, {}

    def _guess_type_(X, itype, dtype):
        # returns the initial types, the dtype and a numpy dtype
        # checked against the supported real types
        initial_types = guess_initial_types(X, itype)
        if dtype is None:
            if hasattr(X, 'dtypes'):  # DataFrame
                dtype = numpy.float32
            elif hasattr(X, 'dtype'):
                dtype = X.dtype
            elif hasattr(X, 'type'):
                dtype = guess_numpy_type(X.type)
            elif initial_types is not None:
                dtype = guess_numpy_type(initial_types[0][1])
            else:
                raise RuntimeError(  # pragma: no cover
                    "dtype cannot be guessed: {}".format(
                        type(X)))
            # a guessed dtype is either float64 or float32
            if dtype != numpy.float64:
                dtype = numpy.float32
        if dtype is None:
            raise RuntimeError("dtype cannot be None")  # pragma: no cover
        if isinstance(dtype, FloatTensorType):
            dtype = numpy.float32  # pragma: no cover
        elif isinstance(dtype, DoubleTensorType):
            dtype = numpy.float64  # pragma: no cover
        new_dtype = dtype
        if isinstance(dtype, numpy.ndarray):
            new_dtype = dtype.dtype  # pragma: no cover
        elif isinstance(dtype, DataType):
            new_dtype = numpy.float32  # pragma: no cover
        if new_dtype not in (numpy.float32, numpy.float64, numpy.int64,
                             numpy.int32, numpy.float16):
            raise NotImplementedError(  # pragma: no cover
                "dtype should be real not {} ({})".format(new_dtype, dtype))
        return initial_types, dtype, new_dtype

    try:
        if isinstance(model, _PredictScorer):
            if X is not None and not isinstance(X, OrderedDict):
                raise ValueError("For a scorer, parameter X should be a OrderedDict not {}."
                                 "".format(type(X)))
            if initial_types is None:
                if X is None:
                    # explicit error instead of failing on None.items()
                    raise ValueError(
                        "For a scorer, X or initial_types must be specified.")
                dts = []
                initial_types = []
                for k, v in X.items():
                    if hasattr(v, 'dtype'):
                        dtype = guess_numpy_type(v.dtype)
                    else:
                        dtype = v  # pragma: no cover
                    it, _, ndt = _guess_type_(v, None, dtype)
                    # every guessed type takes the name of the input
                    initial_types.extend((k, typ[1]) for typ in it)
                    dts.append(ndt)
                ndt = set(dts)
                if len(ndt) != 1:
                    raise RuntimeError(  # pragma: no cover
                        "Multiple dtype is not efficient {}.".format(ndt))
            res = convert_scorer(model, initial_types, name=name,
                                 target_opset=target_opset, options=options,
                                 black_op=black_op, white_op=white_op,
                                 final_types=final_types, verbose=verbose)
        else:
            if name is None:
                name = "mlprodict_ONNX(%s)" % model.__class__.__name__

            initial_types, _, _ = _guess_type_(X, initial_types, None)

            _fix_opset_skl2onnx()
            res = convert_sklearn(model, initial_types=initial_types, name=name,
                                  target_opset=target_opset, options=options,
                                  black_op=black_op, white_op=white_op,
                                  final_types=final_types, verbose=verbose)
    finally:
        # hands the previous converters back even if the conversion failed
        register_rewritten_operators(old_values, old_shapes)

    # optimisation
    if rename_strategy is not None:
        res = onnx_rename_names(res, strategy=rename_strategy)
    return res