Coverage for mlprodict/onnxrt/validate/validate_problems.py: 99%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2@file
3@brief Validates runtime for many :scikit-learn: operators.
4The submodule relies on :epkg:`onnxconverter_common`,
5:epkg:`sklearn-onnx`.
6"""
7import numpy
8from sklearn.base import (
9 ClusterMixin, BiclusterMixin, OutlierMixin,
10 RegressorMixin, ClassifierMixin)
11from sklearn.calibration import CalibratedClassifierCV
12from sklearn.cross_decomposition import PLSSVD
13from sklearn.datasets import load_iris
14from sklearn.decomposition import LatentDirichletAllocation, NMF
15from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
16from sklearn.ensemble import (
17 AdaBoostRegressor, GradientBoostingRegressor, AdaBoostClassifier,
18 BaggingClassifier, VotingClassifier, GradientBoostingClassifier,
19 RandomForestClassifier)
20try:
21 from sklearn.ensemble import StackingClassifier, StackingRegressor
22except ImportError: # pragma: no cover
23 # new in 0.22
24 StackingClassifier, StackingRegressor = None, None
25from sklearn.feature_extraction import DictVectorizer, FeatureHasher
26from sklearn.feature_extraction.text import (
27 CountVectorizer, TfidfVectorizer, TfidfTransformer)
28from sklearn.ensemble import (
29 HistGradientBoostingRegressor,
30 HistGradientBoostingClassifier)
31from sklearn.feature_selection import (
32 RFE, RFECV, GenericUnivariateSelect,
33 SelectPercentile, SelectFwe, SelectKBest,
34 SelectFdr, SelectFpr, SelectFromModel)
35from sklearn.gaussian_process import GaussianProcessClassifier, GaussianProcessRegressor
36from sklearn.isotonic import IsotonicRegression
37from sklearn.linear_model import (
38 ARDRegression, ElasticNetCV,
39 LarsCV, LassoCV, LassoLarsCV, LassoLarsIC,
40 SGDRegressor, OrthogonalMatchingPursuitCV,
41 TheilSenRegressor, BayesianRidge, MultiTaskElasticNet,
42 MultiTaskElasticNetCV, MultiTaskLassoCV, MultiTaskLasso,
43 PassiveAggressiveClassifier, RidgeClassifier,
44 RidgeClassifierCV, PassiveAggressiveRegressor,
45 HuberRegressor, LogisticRegression, SGDClassifier,
46 LogisticRegressionCV, Perceptron)
47from sklearn.mixture._base import BaseMixture
48from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
49from sklearn.multiclass import (
50 OneVsRestClassifier, OneVsOneClassifier, OutputCodeClassifier)
51from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier
52from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB, ComplementNB
53from sklearn.neighbors import (
54 NearestCentroid, RadiusNeighborsClassifier,
55 NeighborhoodComponentsAnalysis)
56from sklearn.preprocessing import (
57 LabelBinarizer, LabelEncoder,
58 OneHotEncoder, PowerTransformer)
59from sklearn.semi_supervised import LabelPropagation, LabelSpreading
60from sklearn.svm import LinearSVC, LinearSVR, NuSVR, SVR, SVC, NuSVC
61from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, ExtraTreeClassifier
62from sklearn.utils import shuffle
63from ._validate_problems_helper import (
64 _noshapevar, _1d_problem, text_alpha_num)
67def _modify_dimension(X, n_features, seed=19):
68 """
69 Modifies the number of features to increase
70 or reduce the number of features.
72 @param X features matrix
73 @param n_features number of features
74 @param seed random seed (to get the same
75 dataset at each call)
76 @return new featurs matrix
77 """
78 if n_features is None or n_features == X.shape[1]:
79 return X
80 if n_features < X.shape[1]:
81 return X[:, :n_features]
82 rstate = numpy.random.RandomState(seed) # pylint: disable=E1101
83 res = numpy.empty((X.shape[0], n_features), dtype=X.dtype)
84 res[:, :X.shape[1]] = X[:, :]
85 div = max((n_features // X.shape[1]) + 1, 2)
86 for i in range(X.shape[1], res.shape[1]):
87 j = i % X.shape[1]
88 col = X[:, j]
89 if X.dtype in (numpy.float32, numpy.float64):
90 sigma = numpy.var(col) ** 0.5
91 rnd = rstate.randn(len(col)) * sigma / div
92 col2 = col + rnd
93 res[:, j] -= col2 / div
94 res[:, i] = col2
95 elif X.dtype in (numpy.int32, numpy.int64):
96 perm = rstate.permutation(col)
97 h = rstate.randint(0, div) % X.shape[0]
98 col2 = col.copy()
99 col2[h::div] = perm[h::div] # pylint: disable=E1136
100 res[:, i] = col2
101 h = (h + 1) % X.shape[0]
102 res[h, j] = perm[h] # pylint: disable=E1136
103 else: # pragma: no cover
104 raise NotImplementedError( # pragma: no cover
105 "Unable to add noise to a feature for this type {}".format(X.dtype))
106 return res
109###########
110# datasets
111###########
def _problem_for_predictor_binary_classification(
        dtype=numpy.float32, n_features=None, add_nan=False):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    binary classification problem.
    It is based on Iris dataset, class 2 is merged into class 1.

    @param      dtype       type of the features
    @param      n_features  number of features, None keeps
                            the columns of the dataset
    @param      add_nan     replaces some values by NaN
    @return                 tuple of six elements, the method is
                            `predict_proba`
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    y = data.target
    y[y == 2] = 1
    if add_nan:
        # The seeded generator makes the NaN positions reproducible.
        # The upper bound of randint is exclusive: the full shape is
        # needed to reach the last row and column, and it also keeps
        # a single-feature matrix from raising an exception.
        nb_nan = X.shape[0] // 3
        rows = state.randint(0, X.shape[0], nb_nan)
        cols = state.randint(0, X.shape[1], nb_nan)
        X[rows, cols] = numpy.nan
    X = X.astype(dtype)
    y = y.astype(numpy.int64)
    return (X, y, [('X', X[:1].astype(dtype))],
            'predict_proba', 1, X.astype(dtype))
def _problem_for_predictor_multi_classification(dtype=numpy.float32, n_features=None):
    """
    Builds a multi-class classification problem from the Iris dataset
    after adding seeded gaussian noise to the features.

    @param      dtype       type of the features
    @param      n_features  number of features, None keeps
                            the columns of the dataset
    @return                 *X, y, initial_types, method, node name, X runtime*,
                            the method is `predict_proba`
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    labels = iris.target.astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, labels, initial_types,
            'predict_proba', 1, features.astype(dtype))
def _problem_for_mixture(dtype=numpy.float32, n_features=None):
    """
    Builds a problem without target from the Iris dataset
    after adding seeded gaussian noise to the features.
    The tested method is `predict_proba`.

    @param      dtype       type of the features
    @param      n_features  number of features, None keeps
                            the columns of the dataset
    @return                 *X, None, initial_types, method, node name, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    # No target is returned: the second element is None.
    return (features, None, initial_types,
            'predict_proba', 1, features.astype(dtype))
def _problem_for_predictor_multi_classification_label(dtype=numpy.float32, n_features=None):
    """
    Builds a multi-label classification problem from the Iris dataset
    after adding seeded gaussian noise to the features.
    The target is a matrix with three columns, one per class;
    one row out of five gets a second label.

    @param      dtype       type of the features
    @param      n_features  number of features, None keeps
                            the columns of the dataset
    @return                 *X, y, initial_types, method, node name, X runtime*,
                            the method is `predict_proba`
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features)
    target = iris.target
    n_obs = target.shape[0]

    labels = numpy.zeros((n_obs, 3), dtype=numpy.int64)
    # One-hot encoding of the class.
    labels[numpy.arange(n_obs), target] = 1
    # Every fifth observation also belongs to the next class.
    extra = numpy.arange(0, n_obs, 5)
    labels[extra, (target[extra] + 1) % 3] = 1

    features = features.astype(dtype)
    labels = labels.astype(numpy.int64)
    return (features, labels, [('X', features[:1].astype(dtype))],
            'predict_proba', 1, features.astype(dtype))
def _problem_for_predictor_regression(many_output=False, options=None,
                                      n_features=None, nbrows=None,
                                      dtype=numpy.float32, add_nan=False,
                                      **kwargs):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    regression problem.
    It is based on Iris dataset.

    @param      many_output returns ``'all'`` as output name instead of 0
    @param      options     if not None, the initial types are returned
                            as a pair *(initial_types, options)*
    @param      n_features  number of features, None keeps
                            the columns of the dataset
    @param      nbrows      keeps only the first *nbrows* rows
    @param      dtype       type of the features and of the target
    @param      add_nan     replaces some values by NaN
    @param      kwargs      additional parameters returned with the
                            method name as ``('predict', kwargs)``
    @return                 tuple of six elements
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    y = data.target + numpy.arange(len(data.target)) / 100
    # NOTE(review): *kwargs* comes from ``**kwargs``, it is a dictionary
    # and never None, the method is then always returned as a tuple.
    # Left unchanged because the callers are not visible from here.
    meth = 'predict' if kwargs is None else ('predict', kwargs)
    if n_features is not None:
        X = X[:, :n_features]
    if nbrows is not None:
        X = X[:nbrows, :]
        y = y[:nbrows]
    # The initial types are taken from the final matrix, before any
    # NaN is inserted.
    itt = [('X', X[:1].astype(dtype))]
    if options is not None:
        itt = itt, options
    if add_nan:
        # The seeded generator makes the NaN positions reproducible.
        # The upper bound of randint is exclusive: the full shape is
        # needed to reach the last row and column, and it also keeps
        # a single-feature matrix from raising an exception.
        nb_nan = X.shape[0] // 3
        rows = state.randint(0, X.shape[0], nb_nan)
        cols = state.randint(0, X.shape[1], nb_nan)
        X[rows, cols] = numpy.nan
    X = X.astype(dtype)
    y = y.astype(dtype)
    return (X, y, itt,
            meth, 'all' if many_output else 0, X.astype(dtype))
def _problem_for_predictor_multi_regression(many_output=False, options=None,
                                            n_features=None, nbrows=None,
                                            dtype=numpy.float32, **kwargs):
    """
    Builds a regression problem with two targets from the Iris dataset
    after adding seeded gaussian noise to the features.
    The second target is the first one shifted by 0.5.

    @param      many_output returns ``'all'`` as output name instead of 0
    @param      options     if not None, the initial types are returned
                            as a pair *(initial_types, options)*
    @param      n_features  number of features, None keeps
                            the columns of the dataset
    @param      nbrows      keeps only the first *nbrows* rows
    @param      dtype       type of the features and of the targets
    @param      kwargs      additional parameters returned with the
                            method name as ``('predict', kwargs)``
    @return                 *X, y, initial_types, method, name, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features)
    target = iris.target.astype(float) + numpy.arange(len(iris.target)) / 100
    # NOTE(review): *kwargs* is always a dictionary, never None,
    # the method is then always returned as a tuple.
    method = 'predict' if kwargs is None else ('predict', kwargs)

    if n_features is not None:
        features = features[:, :n_features]
    if nbrows is not None:
        features = features[:nbrows, :]
        target = target[:nbrows]

    initial_types = [('X', features[:1].astype(dtype))]
    if options is not None:
        initial_types = initial_types, options

    targets = numpy.empty((target.shape[0], 2))
    targets[:, 0] = target
    targets[:, 1] = target + 0.5

    features = features.astype(dtype)
    targets = targets.astype(dtype)
    output = 'all' if many_output else 0
    return (features, targets, initial_types,
            method, output, features.astype(dtype))
def _problem_for_numerical_transform(dtype=numpy.float32, n_features=None):
    """
    Builds a transformation problem without target from the Iris
    dataset after adding seeded gaussian noise to the features.

    @param      dtype       type of the features
    @param      n_features  number of features, None keeps
                            the columns of the dataset
    @return                 *X, None, initial_types, method, name, X runtime*,
                            the method is `transform`
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    # NOTE(review): the runtime input is always converted into float32
    # whatever *dtype* is - confirm this is intended.
    runtime_input = features.astype(dtype=numpy.float32)
    return (features, None, initial_types,
            'transform', 0, runtime_input)
def _problem_for_numerical_transform_positive(dtype=numpy.float32, n_features=None):
    """
    Builds a transformation problem without target from the Iris
    dataset. Seeded gaussian noise is added to the features and
    the absolute value is taken before the number of features
    is modified.

    @param      dtype       type of the features
    @param      n_features  number of features, None keeps
                            the columns of the dataset
    @return                 *X, None, initial_types, method, name, X runtime*,
                            the method is `transform`
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    positive = numpy.abs(iris.data + noise)
    features = _modify_dimension(positive, n_features).astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    # NOTE(review): the runtime input is always converted into float32
    # whatever *dtype* is - confirm this is intended.
    runtime_input = features.astype(dtype=numpy.float32)
    return (features, None, initial_types,
            'transform', 0, runtime_input)
def _problem_for_numerical_trainable_transform(dtype=numpy.float32, n_features=None):
    """
    Builds a transformation problem with a continuous target from
    the Iris dataset after adding seeded gaussian noise to the features.

    @param      dtype       type of the features and of the target
    @param      n_features  number of features, None keeps
                            the columns of the dataset
    @return                 *X, y, initial_types, method, name, X runtime*,
                            the method is `transform`
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features)
    # The class is made continuous with a small increasing shift.
    target = iris.target + numpy.arange(len(iris.target)) / 100
    features = features.astype(dtype)
    target = target.astype(dtype)
    return (features, target, [('X', features[:1].astype(dtype))],
            'transform', 0, features.astype(dtype))
def _problem_for_numerical_trainable_transform_cl(dtype=numpy.float32, n_features=None):
    """
    Builds a transformation problem with classes as a target from
    the Iris dataset after adding seeded gaussian noise to the features.

    @param      dtype       type of the features
    @param      n_features  number of features, None keeps
                            the columns of the dataset
    @return                 *X, y, initial_types, method, name, X runtime*,
                            the method is `transform`
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    labels = iris.target.astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, labels, initial_types,
            'transform', 0, features.astype(dtype))
def _problem_for_clustering(dtype=numpy.float32, n_features=None):
    """
    Builds a clustering problem without target from the Iris dataset
    after adding seeded gaussian noise to the features.

    @param      dtype       type of the features
    @param      n_features  number of features, None keeps
                            the columns of the dataset
    @return                 *X, None, initial_types, method, name, X runtime*,
                            the method is `predict`
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, None, initial_types,
            'predict', 0, features.astype(dtype))
def _problem_for_clustering_scores(dtype=numpy.float32, n_features=None):
    """
    Builds a clustering problem without target from the Iris dataset
    after adding seeded gaussian noise to the features.
    It checks the score part (method `transform`), not the cluster.

    @param      dtype       type of the features
    @param      n_features  number of features, None keeps
                            the columns of the dataset
    @return                 *X, None, initial_types, method, name, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, None, initial_types,
            'transform', 1, features.astype(dtype))
def _problem_for_outlier(dtype=numpy.float32, n_features=None):
    """
    Builds an outlier detection problem without target from the Iris
    dataset after adding seeded gaussian noise to the features.

    @param      dtype       type of the features
    @param      n_features  number of features, None keeps
                            the columns of the dataset
    @return                 *X, None, initial_types, method, name, X runtime*,
                            the method is `predict`
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, None, initial_types,
            'predict', 0, features.astype(dtype))
def _problem_for_numerical_scoring(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    scoring problem.
    It is based on Iris dataset. The target is continuous and
    divided by its maximum.

    @param      dtype       type of the features and of the target
    @param      n_features  number of features, None keeps
                            the columns of the dataset
    @return                 tuple of six elements, the method is `score`
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    # Parameter n_features must be taken into account
    # like the other problems do.
    X = _modify_dimension(X, n_features)
    y = data.target.astype(dtype) + numpy.arange(len(data.target)) / 100
    y /= numpy.max(y)
    X = X.astype(dtype)
    y = y.astype(dtype)
    return (X, y, [('X', X[:1].astype(dtype))],
            'score', 0, X.astype(dtype))
def _problem_for_clnoproba(dtype=numpy.float32, n_features=None):
    """
    Builds a multi-class classification problem from the Iris dataset
    after adding seeded gaussian noise to the features.
    The tested method is `predict`, not `predict_proba`.

    @param      dtype       type of the features
    @param      n_features  number of features, None keeps
                            the columns of the dataset
    @return                 *X, y, initial_types, method, name, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    labels = iris.target.astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, labels, initial_types,
            'predict', 0, features.astype(dtype))
def _problem_for_clnoproba_binary(dtype=numpy.float32, n_features=None, add_nan=False):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    binary classification problem checked with method `predict`.
    It is based on Iris dataset, class 2 is merged into class 1.

    @param      dtype       type of the features
    @param      n_features  number of features, None keeps
                            the columns of the dataset
    @param      add_nan     replaces some values by NaN
    @return                 tuple of six elements
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    y = data.target
    y[y == 2] = 1
    if add_nan:
        # The seeded generator makes the NaN positions reproducible.
        # The upper bound of randint is exclusive: the full shape is
        # needed to reach the last row and column, and it also keeps
        # a single-feature matrix from raising an exception.
        nb_nan = X.shape[0] // 3
        rows = state.randint(0, X.shape[0], nb_nan)
        cols = state.randint(0, X.shape[1], nb_nan)
        X[rows, cols] = numpy.nan
    X = X.astype(dtype)
    y = y.astype(numpy.int64)
    return (X, y, [('X', X[:1].astype(dtype))],
            'predict', 0, X.astype(dtype))
def _problem_for_cl_decision_function(dtype=numpy.float32, n_features=None):
    """
    Builds a multi-class classification problem from the Iris dataset
    after adding seeded gaussian noise to the features.
    The tested method is `decision_function`.

    @param      dtype       type of the features
    @param      n_features  number of features, None keeps
                            the columns of the dataset
    @return                 *X, y, initial_types, method, name, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    labels = iris.target.astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, labels, initial_types,
            'decision_function', 1, features.astype(dtype))
def _problem_for_cl_decision_function_binary(dtype=numpy.float32, n_features=None):
    """
    Builds a binary classification problem from the Iris dataset
    after adding seeded gaussian noise to the features,
    class 2 is merged into class 1.
    The tested method is `decision_function`.

    @param      dtype       type of the features
    @param      n_features  number of features, None keeps
                            the columns of the dataset
    @return                 *X, y, initial_types, method, name, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    labels = iris.target.copy()
    labels[labels == 2] = 1
    labels = labels.astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, labels, initial_types,
            'decision_function', 1, features.astype(dtype))
def _problem_for_label_encoder(dtype=numpy.int64, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:preprocessing:LabelEncoder`.
    The input is the target of the Iris dataset,
    parameter *n_features* is not used.

    @param      dtype       type of the labels
    @param      n_features  unused
    @return                 *X, None, initial_types, method, name, X runtime*,
                            the method is `transform`
    """
    target = load_iris().target.astype(dtype)
    initial_types = [('X', target[:1].astype(dtype))]
    labels = target.astype(dtype)
    # The same array is used as training and runtime input.
    return (labels, None, initial_types, 'transform', 0, labels)
def _problem_for_dict_vectorizer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:DictVectorizer`.
    Every observation is a dictionary with a single key, the class
    of the Iris dataset, except the first one which receives
    a second key. Parameter *n_features* is not used.

    @param      dtype       type of the values, the initial types use
                            floats for float32, doubles otherwise
    @param      n_features  unused
    @return                 *X, y, initial_types, method, name, X runtime*,
                            the method is `transform`
    """
    from skl2onnx.common.data_types import (  # delayed
        FloatTensorType, DoubleTensorType, StringTensorType, DictionaryType)
    target = load_iris().target
    records = [{label: dtype(1000 + index)}
               for index, label in enumerate(target)]
    records[0][2] = -2
    if dtype == numpy.float32:
        value_type = FloatTensorType
    else:
        value_type = DoubleTensorType
    initial_types = [
        ("X", DictionaryType(StringTensorType([1]), value_type([1])))]
    records = numpy.array(records)
    target = target.astype(numpy.int64)
    return (records, target, initial_types, 'transform', 0, records)
def _problem_for_tfidf_vectorizer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:text:TfidfVectorizer`.
    The data comes from *text_alpha_num*: the first element of every
    item is the text, the second one is the target.
    Parameter *n_features* is not used.

    @param      dtype       type of the target
    @param      n_features  unused
    @return                 *X, y, initial_types, method, name, X runtime*,
                            the method is `transform`
    """
    from skl2onnx.common.data_types import (  # delayed
        StringTensorType)
    texts = numpy.array([item[0] for item in text_alpha_num])
    target = numpy.array([item[1] for item in text_alpha_num], dtype=dtype)
    initial_types = [("X", StringTensorType([None]))]
    return (texts, target, initial_types, 'transform', 0, texts)
def _problem_for_tfidf_transformer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:text:TfidfTransformer`.
    The texts of *text_alpha_num* are first converted with
    a *CountVectorizer*. Parameter *n_features* is not used.

    @param      dtype       type of the counts and of the target,
                            the initial types use floats for float32,
                            doubles otherwise
    @param      n_features  unused
    @return                 *X, y, initial_types, method, name, X runtime*,
                            the method is `transform`
    """
    from skl2onnx.common.data_types import (  # delayed
        FloatTensorType, DoubleTensorType)
    texts = numpy.array([item[0] for item in text_alpha_num])
    target = numpy.array([item[1] for item in text_alpha_num], dtype=dtype)
    counts = CountVectorizer().fit_transform(texts).astype(dtype)
    if dtype == numpy.float32:
        tensor_type = FloatTensorType
    else:
        tensor_type = DoubleTensorType
    initial_types = [("X", tensor_type([None, counts.shape[1]]))]
    return (counts, target, initial_types, 'transform', 0, counts)
def _problem_for_feature_hasher(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:FeatureHasher`.
    Every observation is a dictionary with a single string key built
    from the class of the Iris dataset, except the first one which
    receives a second key. Parameter *n_features* is not used.

    @param      dtype       type of the values, the initial types use
                            floats for float32, doubles otherwise
    @param      n_features  unused
    @return                 *X, y, initial_types, method, name, X runtime*,
                            the method is `transform`
    """
    from skl2onnx.common.data_types import (  # delayed
        FloatTensorType, DoubleTensorType, StringTensorType, DictionaryType)
    target = load_iris().target
    records = [{("cl%d" % label): dtype(1000 + index)}
               for index, label in enumerate(target)]
    records[0]["cl2"] = -2
    if dtype == numpy.float32:
        value_type = FloatTensorType
    else:
        value_type = DoubleTensorType
    initial_types = [
        ("X", DictionaryType(StringTensorType([1]), value_type([1])))]
    records = numpy.array(records)
    return (records, target, initial_types, 'transform', 0, records)
def _problem_for_one_hot_encoder(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:preprocessing:OneHotEncoder`.
    The noisy features of the Iris dataset are truncated into
    integers, shuffled, and only the first column is returned.

    @param      dtype       type of the features
    @param      n_features  number of features built before
                            the first column is selected
    @return                 *X, y, initial_types, method, name, X runtime*,
                            the method is `transform`
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features)
    # Conversion into integers to get a small number of categories.
    features = features.astype(numpy.int32).astype(dtype)
    features, target = shuffle(features, iris.target, random_state=1)
    # NOTE(review): the initial types are built from all the columns
    # whereas only the first one is returned - confirm this is intended.
    initial_types = [('X', features[:1].astype(dtype))]
    first_column = features[:, :1]
    return (first_column, target, initial_types, 'transform', 0,
            features[:, :1].astype(dtype))
def find_suitable_problem(model):
    """
    Determines problems suitable for a given
    :epkg:`scikit-learn` operator. It may be

    * `b-cl`: binary classification
    * `m-cl`: m-cl classification
    * `m-label`: classification m-label
      (multiple labels possible at the same time)
    * `reg`: regression
    * `m-reg`: regression multi-output
    * `num-tr`: transform numerical features
    * `num-tr-pos`: transform numerical positive features
    * `scoring`: transform numerical features, target is usually needed
    * `outlier`: outlier prediction
    * `linearsvc`: classifier without *predict_proba*
    * `cluster`: similar to transform
    * `num+y-tr`: similar to transform with targets
    * `num+y-tr-cl`: similar to transform with classes
    * `num-tr-clu`: similar to cluster, but returns
      scores or distances instead of cluster
    * `key-col`: list of dictionaries
    * `text-col`: one column of text

    Suffix `nofit` indicates the predictions happens
    without the model being fitted. This is the case
    for :epkg:`sklearn:gaussian_process:GaussianProcessRegressor`.
    The suffix `-cov` indicates the method `predict` was called
    with parameter ``return_cov=True``, `-std` tells
    method `predict` was called with parameter ``return_std=True``.
    The suffix ``-NSV`` creates an input variable
    like the following ``[('X', FloatTensorType([None, None]))]``.
    That's a way to bypass :epkg:`onnxruntime` shape checking
    as one part of the graph is designed to handle any
    kind of dimensions but apparently, if the input shape is
    precise, every part of the graph has to be precise. The strings
    used variables which means it is at the same time precise
    and unprecise. Suffix ``'-64'`` means the model will
    do double computations. Suffix ``-nop`` means the classifier
    does not implement method *predict_proba*. Suffix ``-1d``
    means a one dimension problem (one feature). Suffix ``-dec``
    checks method `decision_function`.

    The following script gives the list of :epkg:`scikit-learn`
    models and the problem they can be fitted on.

    .. runpython::
        :showcode:
        :warningout: DeprecationWarning
        :rst:

        from mlprodict.onnxrt.validate.validate import (
            sklearn_operators, find_suitable_problem)
        from pyquickhelper.pandashelper import df2rst
        from pandas import DataFrame
        res = sklearn_operators()
        rows = []
        for model in res[:20]:
            name = model['name']
            row = dict(name=name)
            try:
                prob = find_suitable_problem(model['cl'])
                if prob is None:
                    continue
                for p in prob:
                    row[p] = 'X'
            except RuntimeError:
                pass
            rows.append(row)
        df = DataFrame(rows).set_index('name')
        df = df.sort_index()
        print(df2rst(df, index=True))

    The list is truncated. The full list can be found at
    :ref:`l-model-problem-list`.

    @param      model       model class (not an instance)
    @return                 list of problem names, every name
                            is a key of dictionary *_problems*

    The function raises *RuntimeError* if no problem was found
    for the model and *ValueError* if one of the selected names
    is not a key of *_problems*.
    """
    # delayed import
    from ...onnx_conv.validate_scenarios import find_suitable_problem as ext_find_suitable_problem

    def _internal(model):  # pylint: disable=R0911
        # The tests below are ordered, the first match wins.

        # checks that this model is not overwritten by this module
        ext = ext_find_suitable_problem(model)
        if ext is not None:
            return ext

        # Exceptions
        if model in {GaussianProcessRegressor}:
            # m-reg causes MemoryError on some machine.
            return ['~b-reg-NF-64',  # '~m-reg-NF-64',
                    '~b-reg-NF-cov-64',  # '~m-reg-NF-cov-64',
                    '~b-reg-NF-std-64',  # '~m-reg-NF-std-64',
                    '~b-reg-NSV-64',  # '~m-reg-NSV-64',
                    '~b-reg-cov-64',  # '~m-reg-cov-64',
                    '~b-reg-std-NSV-64',  # '~m-reg-std-NSV-64',
                    'b-reg', '~b-reg-64',  # 'm-reg'
                    ]

        if model in {DictVectorizer}:
            return ['key-int-col']

        if model in {TfidfVectorizer, CountVectorizer}:
            return ['text-col']

        if model in {TfidfTransformer}:
            return ['bow']

        if model in {FeatureHasher}:
            return ['key-str-col']

        if model in {OneHotEncoder}:
            return ['one-hot']

        if model in {LabelBinarizer, LabelEncoder}:
            return ['int-col']

        if model in {NuSVC, SVC, SGDClassifier,
                     HistGradientBoostingClassifier}:
            return ['b-cl', 'm-cl', '~b-cl-64', '~b-cl-nan']

        if model in {GaussianProcessClassifier}:
            return ['b-cl', 'm-cl', '~b-cl-64']

        if model in {BaggingClassifier, BernoulliNB, CalibratedClassifierCV,
                     ComplementNB, GaussianNB,
                     GradientBoostingClassifier, LabelPropagation, LabelSpreading,
                     LinearDiscriminantAnalysis, LogisticRegressionCV,
                     MultinomialNB, QuadraticDiscriminantAnalysis,
                     RandomizedSearchCV}:
            return ['b-cl', 'm-cl']

        if model in {Perceptron}:
            return ['~b-cl-nop', '~m-cl-nop', '~b-cl-dec', '~m-cl-dec']

        if model in {AdaBoostRegressor}:
            return ['b-reg', '~b-reg-64']

        if model in {HistGradientBoostingRegressor}:
            return ['b-reg', '~b-reg-64', '~b-reg-nan', '~b-reg-nan-64']

        if model in {LinearSVC, NearestCentroid}:
            return ['~b-cl-nop', '~b-cl-nop-64']

        if model in {RFE, RFECV}:
            return ['num+y-tr']

        if model in {GridSearchCV}:
            return ['b-cl', 'm-cl',
                    'b-reg', 'm-reg',
                    '~b-reg-64', '~b-cl-64',
                    'cluster', 'outlier', '~m-label']

        if model in {VotingClassifier}:
            return ['b-cl', 'm-cl']

        # Both stacking classes are None if the import failed
        # (they were introduced in scikit-learn 0.22).
        if StackingClassifier is not None and model in {StackingClassifier}:
            return ['b-cl']

        if StackingRegressor is not None and model in {StackingRegressor}:
            return ['b-reg']

        # specific scenarios
        if model in {IsotonicRegression}:
            return ['~num+y-tr-1d', '~b-reg-1d']

        if model in {ARDRegression, BayesianRidge, ElasticNetCV,
                     GradientBoostingRegressor,
                     LarsCV, LassoCV, LassoLarsCV, LassoLarsIC,
                     LinearSVR, NuSVR, OrthogonalMatchingPursuitCV,
                     PassiveAggressiveRegressor, SGDRegressor,
                     TheilSenRegressor, HuberRegressor, SVR}:
            return ['b-reg', '~b-reg-64']

        if model in {MultiOutputClassifier}:
            return ['m-cl', '~m-label']

        if model in {MultiOutputRegressor, MultiTaskElasticNet,
                     MultiTaskElasticNetCV, MultiTaskLassoCV,
                     MultiTaskLasso}:
            return ['m-reg']

        if model in {OneVsOneClassifier, OutputCodeClassifier,
                     PassiveAggressiveClassifier, RadiusNeighborsClassifier}:
            return ['~b-cl-nop', '~m-cl-nop']

        if model in {RidgeClassifier, RidgeClassifierCV}:
            return ['~b-cl-nop', '~m-cl-nop', '~m-label']

        # trainable transform
        if model in {GenericUnivariateSelect,
                     NeighborhoodComponentsAnalysis,
                     PLSSVD, SelectKBest,
                     SelectPercentile, SelectFromModel}:
            return ["num+y-tr"]

        if model in {SelectFwe, SelectFdr, SelectFpr}:
            return ["num+y-tr-cl"]

        # no m-label
        if model in {AdaBoostClassifier}:
            return ['b-cl', '~b-cl-64', 'm-cl']

        if model in {LogisticRegression}:
            return ['b-cl', '~b-cl-64', 'm-cl', '~b-cl-dec', '~m-cl-dec']

        if model in {RandomForestClassifier}:
            return ['b-cl', '~b-cl-64', 'm-cl', '~m-label']

        if model in {DecisionTreeClassifier, ExtraTreeClassifier}:
            return ['b-cl', '~b-cl-64', 'm-cl', '~b-cl-f100', '~m-label']

        if model in {DecisionTreeRegressor}:
            return ['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64', '~b-reg-f100']

        if model in {LatentDirichletAllocation, NMF, PowerTransformer}:
            return ['num-tr-pos']

        # Models with a method predict are selected by looking for
        # 'Classifier' or 'Regressor' in the string representation
        # of the class.
        if hasattr(model, 'predict'):
            if "Classifier" in str(model):
                return ['b-cl', '~b-cl-64', 'm-cl', '~m-label']
            elif "Regressor" in str(model):
                return ['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64']

        # Generic case.
        # The problems are accumulated from the methods the model
        # implements and from the mixins it inherits from.
        res = []
        if hasattr(model, 'transform'):
            if issubclass(model, (RegressorMixin, ClassifierMixin)):
                res.extend(['num+y-tr'])
            elif issubclass(model, (ClusterMixin, BiclusterMixin)):
                res.extend(['~num-tr-clu', '~num-tr-clu-64'])
            else:
                res.extend(['num-tr'])

        if hasattr(model, 'predict') and issubclass(model, (ClusterMixin, BiclusterMixin)):
            res.extend(['cluster', '~b-clu-64'])

        if issubclass(model, (OutlierMixin)):
            res.extend(['outlier'])

        if issubclass(model, ClassifierMixin):
            # OneVsRestClassifier returns immediately and drops
            # what was accumulated so far.
            if model is OneVsRestClassifier:
                return ['m-cl', '~m-label']
            res.extend(['b-cl', '~b-cl-64', 'm-cl', '~m-label'])
        if issubclass(model, RegressorMixin):
            res.extend(['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64'])
        if issubclass(model, BaseMixture):
            res.extend(['mix', '~mix-64'])

        if len(res) > 0:
            return res

        raise RuntimeError("Unable to find problem for model '{}' - {}."
                           "".format(model.__name__, model.__bases__))

    res = _internal(model)
    # Every selected problem must be registered in _problems.
    for r in res:
        if r not in _problems:
            raise ValueError(  # pragma: no cover
                "Unrecognized problem '{}' in\n{}".format(
                    r, "\n".join(sorted(_problems))))
    return res
# Maps a problem name to the function which builds its dataset.
# Function find_suitable_problem only returns keys of this dictionary.
# Names starting with '~' are variations of the standard problems.
_problems = {
    # standard
    "b-cl": _problem_for_predictor_binary_classification,
    "m-cl": _problem_for_predictor_multi_classification,
    "b-reg": _problem_for_predictor_regression,
    "m-reg": _problem_for_predictor_multi_regression,
    "num-tr": _problem_for_numerical_transform,
    "num-tr-pos": _problem_for_numerical_transform_positive,
    'outlier': _problem_for_outlier,
    'cluster': _problem_for_clustering,
    'num+y-tr': _problem_for_numerical_trainable_transform,
    'num+y-tr-cl': _problem_for_numerical_trainable_transform_cl,
    'mix': _problem_for_mixture,
    # others
    '~num-tr-clu': _problem_for_clustering_scores,
    "~m-label": _problem_for_predictor_multi_classification_label,
    "~scoring": _problem_for_numerical_scoring,
    '~b-cl-nop': _problem_for_clnoproba_binary,
    '~m-cl-nop': _problem_for_clnoproba,
    '~b-cl-dec': _problem_for_cl_decision_function_binary,
    '~m-cl-dec': _problem_for_cl_decision_function,
    # nan: some values are replaced by NaN (add_nan=True)
    "~b-reg-nan": lambda n_features=None: _problem_for_predictor_regression(
        n_features=n_features, add_nan=True),
    "~b-reg-nan-64": lambda n_features=None: _problem_for_predictor_regression(
        dtype=numpy.float64, n_features=n_features, add_nan=True),
    "~b-cl-nan": lambda dtype=numpy.float32, n_features=None: _problem_for_predictor_binary_classification(
        dtype=dtype, n_features=n_features, add_nan=True),
    # 100 features
    "~b-reg-f100": lambda n_features=100: _problem_for_predictor_regression(
        n_features=n_features or 100),
    "~b-cl-f100": lambda n_features=100: _problem_for_predictor_binary_classification(
        n_features=n_features or 100),
    # 64: same problems with double features (numpy.float64)
    "~b-cl-64": lambda n_features=None: _problem_for_predictor_binary_classification(
        dtype=numpy.float64, n_features=n_features),
    "~b-reg-64": lambda n_features=None: _problem_for_predictor_regression(
        dtype=numpy.float64, n_features=n_features),
    # NOTE(review): the key says binary ('b-cl') but the function is
    # the multi-class _problem_for_clnoproba whereas '~b-cl-nop' uses
    # _problem_for_clnoproba_binary - confirm this is intended.
    '~b-cl-nop-64': lambda n_features=None: _problem_for_clnoproba(
        dtype=numpy.float64, n_features=n_features),
    '~b-clu-64': lambda n_features=None: _problem_for_clustering(
        dtype=numpy.float64, n_features=n_features),
    '~b-cl-dec-64': lambda n_features=None: _problem_for_cl_decision_function_binary(
        dtype=numpy.float64, n_features=n_features),
    '~num-tr-clu-64': lambda n_features=None: _problem_for_clustering_scores(
        dtype=numpy.float64, n_features=n_features),
    "~m-reg-64": lambda n_features=None: _problem_for_predictor_multi_regression(
        dtype=numpy.float64, n_features=n_features),
    "~num-tr-64": lambda n_features=None: _problem_for_numerical_transform(
        dtype=numpy.float64, n_features=n_features),
    '~mix-64': lambda n_features=None: _problem_for_mixture(
        dtype=numpy.float64, n_features=n_features),
    # NF: a seventh element, False, is appended to the returned tuple
    "~b-cl-NF": (lambda n_features=None: _problem_for_predictor_binary_classification(
        n_features=n_features) + (False, )),
    "~m-cl-NF": (lambda n_features=None: _problem_for_predictor_multi_classification(
        n_features=n_features) + (False, )),
    "~b-reg-NF": (lambda n_features=None: _problem_for_predictor_regression(
        n_features=n_features) + (False, )),
    "~m-reg-NF": (lambda n_features=None: _problem_for_predictor_multi_regression(
        n_features=n_features) + (False, )),
    # NF with double features
    "~b-cl-NF-64": (lambda n_features=None: _problem_for_predictor_binary_classification(
        dtype=numpy.float64, n_features=n_features) + (False, )),
    "~m-cl-NF-64": (lambda n_features=None: _problem_for_predictor_multi_classification(
        dtype=numpy.float64, n_features=n_features) + (False, )),
    "~b-reg-NF-64": (lambda n_features=None: _problem_for_predictor_regression(
        dtype=numpy.float64, n_features=n_features) + (False, )),
    "~m-reg-NF-64": (lambda n_features=None: _problem_for_predictor_multi_regression(
        dtype=numpy.float64, n_features=n_features) + (False, )),
    # GaussianProcess
    # The first positional argument True is many_output,
    # return_cov and return_std end up in the kwargs given to predict.
    "~b-reg-NF-cov-64": (lambda n_features=None: _problem_for_predictor_regression(
        True, options={GaussianProcessRegressor: {"return_cov": True}},
        return_cov=True, dtype=numpy.float64, n_features=n_features) + (False, )),
    "~m-reg-NF-cov-64": (lambda n_features=None: _problem_for_predictor_multi_regression(
        True, options={GaussianProcessRegressor: {"return_cov": True}},
        return_cov=True, dtype=numpy.float64, n_features=n_features) + (False, )),
    #
    "~b-reg-NF-std-64": (lambda n_features=None: _problem_for_predictor_regression(
        True, options={GaussianProcessRegressor: {"return_std": True}},
        return_std=True, dtype=numpy.float64, n_features=n_features) + (False, )),
    "~m-reg-NF-std-64": (lambda n_features=None: _problem_for_predictor_multi_regression(
        True, options={GaussianProcessRegressor: {"return_std": True}},
        return_std=True, dtype=numpy.float64, n_features=n_features) + (False, )),
    #
    "~b-reg-cov-64": (lambda n_features=None: _problem_for_predictor_regression(
        True, options={GaussianProcessRegressor: {"return_cov": True}},
        return_cov=True, dtype=numpy.float64, n_features=n_features)),
    "~m-reg-cov-64": (lambda n_features=None: _problem_for_predictor_multi_regression(
        True, options={GaussianProcessRegressor: {"return_cov": True}},
        return_cov=True, dtype=numpy.float64, n_features=n_features)),
    # NOTE(review): this key has no 'b-' prefix unlike '~b-reg-cov-64',
    # confirm whether '~b-reg-std-64' was intended.
    "~reg-std-64": (lambda n_features=None: _problem_for_predictor_regression(
        True, options={GaussianProcessRegressor: {"return_std": True}},
        return_std=True, dtype=numpy.float64, n_features=n_features)),
    "~m-reg-std-64": (lambda n_features=None: _problem_for_predictor_multi_regression(
        True, options={GaussianProcessRegressor: {"return_std": True}},
        return_std=True, dtype=numpy.float64, n_features=n_features)),
    # NSV: the problem is wrapped with _noshapevar
    '~b-reg-NSV-64': _noshapevar(lambda n_features=None: _problem_for_predictor_regression(
        dtype=numpy.float64, n_features=n_features)),
    '~m-reg-NSV-64': _noshapevar(lambda n_features=None: _problem_for_predictor_multi_regression(
        dtype=numpy.float64, n_features=n_features)),
    "~b-reg-std-NSV-64": (_noshapevar(lambda n_features=None: _problem_for_predictor_regression(
        True, options={GaussianProcessRegressor: {"return_std": True}},
        return_std=True, dtype=numpy.float64, n_features=n_features))),
    "~m-reg-std-NSV-64": (_noshapevar(lambda n_features=None: _problem_for_predictor_multi_regression(
        True, options={GaussianProcessRegressor: {"return_std": True}},
        return_std=True, dtype=numpy.float64, n_features=n_features))),
    # isotonic: the problem is wrapped with _1d_problem
    "~b-reg-1d": _1d_problem(_problem_for_predictor_regression),
    '~num+y-tr-1d': _1d_problem(_problem_for_numerical_trainable_transform),
    # text
    "key-int-col": _problem_for_dict_vectorizer,
    "key-str-col": _problem_for_feature_hasher,
    "int-col": _problem_for_label_encoder,
    "one-hot": _problem_for_one_hot_encoder,
    'text-col': _problem_for_tfidf_vectorizer,
    'bow': _problem_for_tfidf_transformer,
}