Coverage for mlprodict/onnxrt/validate/validate

1"""

2@file

3@brief Validates runtime for many :scikit-learn: operators.

4The submodule relies on :epkg:`onnxconverter_common`,

5:epkg:`sklearn-onnx`.

6"""

7import numpy

8from sklearn.base import (

9 ClusterMixin, BiclusterMixin, OutlierMixin,

10 RegressorMixin, ClassifierMixin)

11from sklearn.calibration import CalibratedClassifierCV

12from sklearn.cross_decomposition import PLSSVD

13from sklearn.datasets import load_iris

14from sklearn.decomposition import LatentDirichletAllocation, NMF

15from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

16from sklearn.ensemble import (

17 AdaBoostRegressor, GradientBoostingRegressor, AdaBoostClassifier,

18 BaggingClassifier, VotingClassifier, GradientBoostingClassifier,

19 RandomForestClassifier)

20try:

21 from sklearn.ensemble import StackingClassifier, StackingRegressor

22except ImportError: # pragma: no cover

23 # new in 0.22

24 StackingClassifier, StackingRegressor = None, None

25from sklearn.feature_extraction import DictVectorizer, FeatureHasher

26from sklearn.feature_extraction.text import (

27 CountVectorizer, TfidfVectorizer, TfidfTransformer)

28from sklearn.ensemble import (

29 HistGradientBoostingRegressor,

30 HistGradientBoostingClassifier)

31from sklearn.feature_selection import (

32 RFE, RFECV, GenericUnivariateSelect,

33 SelectPercentile, SelectFwe, SelectKBest,

34 SelectFdr, SelectFpr, SelectFromModel)

35from sklearn.gaussian_process import GaussianProcessClassifier, GaussianProcessRegressor

36from sklearn.isotonic import IsotonicRegression

37from sklearn.linear_model import (

38 ARDRegression, ElasticNetCV,

39 LarsCV, LassoCV, LassoLarsCV, LassoLarsIC,

40 SGDRegressor, OrthogonalMatchingPursuitCV,

41 TheilSenRegressor, BayesianRidge, MultiTaskElasticNet,

42 MultiTaskElasticNetCV, MultiTaskLassoCV, MultiTaskLasso,

43 PassiveAggressiveClassifier, RidgeClassifier,

44 RidgeClassifierCV, PassiveAggressiveRegressor,

45 HuberRegressor, LogisticRegression, SGDClassifier,

46 LogisticRegressionCV, Perceptron)

47from sklearn.mixture._base import BaseMixture

48from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

49from sklearn.multiclass import (

50 OneVsRestClassifier, OneVsOneClassifier, OutputCodeClassifier)

51from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier

52from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB, ComplementNB

53from sklearn.neighbors import (

54 NearestCentroid, RadiusNeighborsClassifier,

55 NeighborhoodComponentsAnalysis)

56from sklearn.preprocessing import (

57 LabelBinarizer, LabelEncoder,

58 OneHotEncoder, PowerTransformer)

59from sklearn.semi_supervised import LabelPropagation, LabelSpreading

60from sklearn.svm import LinearSVC, LinearSVR, NuSVR, SVR, SVC, NuSVC

61from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, ExtraTreeClassifier

62from sklearn.utils import shuffle

63from ._validate_problems_helper import (

64 _noshapevar, _1d_problem, text_alpha_num)

def _modify_dimension(X, n_features, seed=19):
    """
    Builds a matrix with exactly *n_features* columns out of *X*.

    Extra columns are dropped when *n_features* is smaller than the
    current number of features. When it is larger, new columns are
    appended: noisy copies of the existing columns for float matrices,
    partially permuted copies for integer matrices. The existing
    columns are slightly altered as well in that case.

    @param      X           features matrix
    @param      n_features  expected number of features, *None* leaves
                            *X* unchanged
    @param      seed        random seed (to get the same
                            dataset at each call)
    @return                 new features matrix, *X* itself when it
                            already has the expected number of features
    """
    if n_features is None:
        return X
    n_cols = X.shape[1]
    if n_features == n_cols:
        return X
    if n_features < n_cols:
        return X[:, :n_features]

    n_rows = X.shape[0]
    generator = numpy.random.RandomState(seed)  # pylint: disable=E1101
    extended = numpy.empty((n_rows, n_features), dtype=X.dtype)
    extended[:, :n_cols] = X[:, :]
    div = max((n_features // n_cols) + 1, 2)
    float_types = (numpy.float32, numpy.float64)
    int_types = (numpy.int32, numpy.int64)

    for new_index in range(n_cols, extended.shape[1]):
        # Every new column derives from one of the original columns.
        src_index = new_index % n_cols
        source = X[:, src_index]

        if X.dtype in float_types:
            # Gaussian noise proportional to the column standard deviation.
            sigma = numpy.var(source) ** 0.5
            noise = generator.randn(len(source)) * sigma / div
            noisy = source + noise
            extended[:, src_index] -= noisy / div
            extended[:, new_index] = noisy
            continue

        if X.dtype in int_types:
            # Integer columns cannot receive noise: one value out of *div*
            # is replaced by a value taken from a permutation of the column.
            shuffled = generator.permutation(source)
            start = generator.randint(0, div) % n_rows
            mixed = source.copy()
            mixed[start::div] = shuffled[start::div]  # pylint: disable=E1136
            extended[:, new_index] = mixed
            start = (start + 1) % n_rows
            extended[start, src_index] = shuffled[start]  # pylint: disable=E1136
            continue

        raise NotImplementedError(  # pragma: no cover
            "Unable to add noise to a feature for this type {}".format(X.dtype))
    return extended

107

108

109###########

110# datasets

111###########

112

113

def _problem_for_predictor_binary_classification(
        dtype=numpy.float32, n_features=None, add_nan=False):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    binary classification problem.
    It is based on Iris dataset: gaussian noise is added to the
    features and classes 1 and 2 are merged into a single class.

    @param      dtype       type of the returned features
    @param      n_features  number of features, forwarded to
                            *_modify_dimension*
    @param      add_nan     if True, replaces some values of the
                            features by nan
    @return                 tuple *(X, y, initial_types, 'predict_proba',
                            1, X runtime)*
    """
    data = load_iris()
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    X = data.data + state.randn(*data.data.shape) / 3
    X = _modify_dimension(X, n_features)
    y = data.target
    y[y == 2] = 1
    if add_nan:
        # The seeded generator keeps the dataset identical at each call.
        # The upper bound of randint is exclusive: the full shape is used
        # so that every cell can be selected and a single-feature matrix
        # does not raise an exception.
        n_missing = X.shape[0] // 3
        rows = state.randint(0, X.shape[0], n_missing)
        cols = state.randint(0, X.shape[1], n_missing)
        X[rows, cols] = numpy.nan
    X = X.astype(dtype)
    y = y.astype(numpy.int64)
    return (X, y, [('X', X[:1].astype(dtype))],
            'predict_proba', 1, X.astype(dtype))

137

138

def _problem_for_predictor_multi_classification(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    multi-class classification problem.
    It is based on Iris dataset, gaussian noise is added to the features.

    @param      dtype       type of the returned features
    @param      n_features  number of features, forwarded to
                            *_modify_dimension*
    @return                 tuple *(X, y, initial_types, 'predict_proba',
                            1, X runtime)*
    """
    iris = load_iris()
    generator = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    features = iris.data + generator.randn(*iris.data.shape) / 3
    features = _modify_dimension(features, n_features).astype(dtype)
    labels = iris.target.astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    runtime_features = features.astype(dtype)
    return (features, labels, initial_types,
            'predict_proba', 1, runtime_features)

156

157

def _problem_for_mixture(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    mixture model. No target is returned (*y* is None).
    It is based on Iris dataset, gaussian noise is added to the features.

    @param      dtype       type of the returned features
    @param      n_features  number of features, forwarded to
                            *_modify_dimension*
    @return                 tuple *(X, None, initial_types,
                            'predict_proba', 1, X runtime)*
    """
    data = load_iris()
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    X = data.data + state.randn(*data.data.shape) / 3
    X = _modify_dimension(X, n_features)
    X = X.astype(dtype)
    # The target is not part of this problem: it is neither computed
    # nor returned.
    return (X, None, [('X', X[:1].astype(dtype))],
            'predict_proba', 1, X.astype(dtype))

175

176

def _problem_for_predictor_multi_classification_label(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    multi-label classification problem.
    It is based on Iris dataset, gaussian noise is added to the features.
    The target is a matrix with three columns (one per class), every
    fifth observation receives a second label.

    @param      dtype       type of the returned features
    @param      n_features  number of features, forwarded to
                            *_modify_dimension*
    @return                 tuple *(X, y, initial_types, 'predict_proba',
                            1, X runtime)*
    """
    iris = load_iris()
    generator = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    features = iris.data + generator.randn(*iris.data.shape) / 3
    features = _modify_dimension(features, n_features)

    classes = iris.target
    n_obs = classes.shape[0]
    indicators = numpy.zeros((n_obs, 3), dtype=numpy.int64)
    # One label per observation: its own class...
    indicators[numpy.arange(n_obs), classes] = 1
    # ...and the next class as well for one observation out of five.
    doubled = numpy.arange(0, n_obs, 5)
    indicators[doubled, (classes[doubled] + 1) % 3] = 1

    features = features.astype(dtype)
    indicators = indicators.astype(numpy.int64)
    return (features, indicators, [('X', features[:1].astype(dtype))],
            'predict_proba', 1, features.astype(dtype))

199

200

def _problem_for_predictor_regression(many_output=False, options=None,
                                      n_features=None, nbrows=None,
                                      dtype=numpy.float32, add_nan=False,
                                      **kwargs):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    regression problem.
    It is based on Iris dataset, gaussian noise is added to the features
    and the target is the class plus a small increasing term.

    @param      many_output if True, the returned output name is
                            ``'all'`` instead of 0
    @param      options     if not None, *initial_types* becomes the
                            tuple *(initial_types, options)*
    @param      n_features  number of features, forwarded to
                            *_modify_dimension*
    @param      nbrows      if not None, keeps the first *nbrows* rows
    @param      dtype       type of the returned features and target
    @param      add_nan     if True, replaces some values of the
                            features by nan
    @param      kwargs      additional parameters returned with the
                            method name
    @return                 tuple *(X, y, initial_types, method, name,
                            X runtime)*
    """
    data = load_iris()
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    X = data.data + state.randn(*data.data.shape) / 3
    X = _modify_dimension(X, n_features)
    y = data.target + numpy.arange(len(data.target)) / 100
    # NOTE: kwargs is always a dictionary (never None), the method is
    # therefore always returned as a tuple ('predict', kwargs). This is
    # kept as is because callers may rely on it.
    meth = 'predict' if kwargs is None else ('predict', kwargs)
    if n_features is not None:
        X = X[:, :n_features]
    if nbrows is not None:
        X = X[:nbrows, :]
        y = y[:nbrows]
    # The initial types are computed once the final shape is known and
    # before any nan is inserted.
    itt = [('X', X[:1].astype(dtype))]
    if options is not None:
        itt = itt, options
    if add_nan:
        # The seeded generator keeps the dataset identical at each call.
        # The upper bound of randint is exclusive: the full shape is used
        # so that every cell can be selected and a single-feature matrix
        # does not raise an exception.
        n_missing = X.shape[0] // 3
        rows = state.randint(0, X.shape[0], n_missing)
        cols = state.randint(0, X.shape[1], n_missing)
        X[rows, cols] = numpy.nan
    X = X.astype(dtype)
    y = y.astype(dtype)
    return (X, y, itt,
            meth, 'all' if many_output else 0, X.astype(dtype))

236

237

def _problem_for_predictor_multi_regression(many_output=False, options=None,
                                            n_features=None, nbrows=None,
                                            dtype=numpy.float32, **kwargs):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    multi-output regression problem.
    It is based on Iris dataset, gaussian noise is added to the features.
    The target has two columns, the second one is the first one
    shifted by 0.5.

    @param      many_output if True, the returned output name is
                            ``'all'`` instead of 0
    @param      options     if not None, *initial_types* becomes the
                            tuple *(initial_types, options)*
    @param      n_features  number of features, forwarded to
                            *_modify_dimension*
    @param      nbrows      if not None, keeps the first *nbrows* rows
    @param      dtype       type of the returned features and target
    @param      kwargs      additional parameters returned with the
                            method name
    @return                 tuple *(X, y, initial_types, method, name,
                            X runtime)*
    """
    iris = load_iris()
    generator = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    features = iris.data + generator.randn(*iris.data.shape) / 3
    features = _modify_dimension(features, n_features)
    target = iris.target.astype(float) + numpy.arange(len(iris.target)) / 100

    # kwargs is a dictionary, never None: the method is always a tuple.
    method = 'predict' if kwargs is None else ('predict', kwargs)

    if n_features is not None:
        features = features[:, :n_features]
    if nbrows is not None:
        features = features[:nbrows, :]
        target = target[:nbrows]

    # Initial types are inferred from the first row of the final matrix.
    initial_types = [('X', features[:1].astype(dtype))]
    if options is not None:
        initial_types = initial_types, options

    targets = numpy.column_stack((target, target + 0.5))
    features = features.astype(dtype)
    targets = targets.astype(dtype)
    output_name = 'all' if many_output else 0
    return (features, targets, initial_types,
            method, output_name, features.astype(dtype))

271

272

def _problem_for_numerical_transform(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    transformation problem, *y* is None.
    It is based on Iris dataset, gaussian noise is added to the features.

    @param      dtype       type of the returned features
    @param      n_features  number of features, forwarded to
                            *_modify_dimension*
    @return                 tuple *(X, None, initial_types, 'transform',
                            0, X runtime)*
    """
    iris = load_iris()
    generator = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    features = iris.data + generator.randn(*iris.data.shape) / 3
    features = _modify_dimension(features, n_features).astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    # The runtime matrix is always float32, whatever *dtype* is.
    runtime_features = features.astype(dtype=numpy.float32)
    return (features, None, initial_types,
            'transform', 0, runtime_features)

288

289

def _problem_for_numerical_transform_positive(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    transformation problem with positive features, *y* is None.
    It is based on Iris dataset: gaussian noise is added to the
    features and the absolute value is taken.

    @param      dtype       type of the returned features
    @param      n_features  number of features, forwarded to
                            *_modify_dimension*
    @return                 tuple *(X, None, initial_types, 'transform',
                            0, X runtime)*
    """
    iris = load_iris()
    generator = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = generator.randn(*iris.data.shape) / 3
    positive = _modify_dimension(numpy.abs(iris.data + noise), n_features)
    positive = positive.astype(dtype)
    initial_types = [('X', positive[:1].astype(dtype))]
    # The runtime matrix is always float32, whatever *dtype* is.
    return (positive, None, initial_types,
            'transform', 0, positive.astype(dtype=numpy.float32))

304

305

def _problem_for_numerical_trainable_transform(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    transformation problem which needs a continuous target to be trained.
    It is based on Iris dataset, gaussian noise is added to the features
    and the target is the class plus a small increasing term.

    @param      dtype       type of the returned features and target
    @param      n_features  number of features, forwarded to
                            *_modify_dimension*
    @return                 tuple *(X, y, initial_types, 'transform',
                            0, X runtime)*
    """
    iris = load_iris()
    generator = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    features = iris.data + generator.randn(*iris.data.shape) / 3
    features = _modify_dimension(features, n_features)
    drift = numpy.arange(len(iris.target)) / 100
    target = (iris.target + drift).astype(dtype)
    features = features.astype(dtype)
    return (features, target, [('X', features[:1].astype(dtype))],
            'transform', 0, features.astype(dtype))

323

324

def _problem_for_numerical_trainable_transform_cl(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    transformation problem which needs class labels to be trained.
    It is based on Iris dataset, gaussian noise is added to the features.

    @param      dtype       type of the returned features
    @param      n_features  number of features, forwarded to
                            *_modify_dimension*
    @return                 tuple *(X, y, initial_types, 'transform',
                            0, X runtime)*
    """
    iris = load_iris()
    generator = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noisy = iris.data + generator.randn(*iris.data.shape) / 3
    noisy = _modify_dimension(noisy, n_features).astype(dtype)
    classes = iris.target.astype(numpy.int64)
    first_row = noisy[:1].astype(dtype)
    return (noisy, classes, [('X', first_row)],
            'transform', 0, noisy.astype(dtype))

342

343

def _problem_for_clustering(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    clustering problem, *y* is None.
    It is based on Iris dataset, gaussian noise is added to the features.

    @param      dtype       type of the returned features
    @param      n_features  number of features, forwarded to
                            *_modify_dimension*
    @return                 tuple *(X, None, initial_types, 'predict',
                            0, X runtime)*
    """
    iris = load_iris()
    generator = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    points = iris.data + generator.randn(*iris.data.shape) / 3
    points = _modify_dimension(points, n_features).astype(dtype)
    initial_types = [('X', points[:1].astype(dtype))]
    return (points, None, initial_types,
            'predict', 0, points.astype(dtype))

359

360

def _problem_for_clustering_scores(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    clustering problem, the score part (method *transform*),
    not the cluster. *y* is None.
    It is based on Iris dataset, gaussian noise is added to the features.

    @param      dtype       type of the returned features
    @param      n_features  number of features, forwarded to
                            *_modify_dimension*
    @return                 tuple *(X, None, initial_types, 'transform',
                            1, X runtime)*
    """
    iris = load_iris()
    generator = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = generator.randn(*iris.data.shape) / 3
    points = _modify_dimension(iris.data + noise, n_features)
    points = points.astype(dtype)
    result = (points, None, [('X', points[:1].astype(dtype))],
              'transform', 1, points.astype(dtype))
    return result

376

377

def _problem_for_outlier(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for an
    outlier detection problem, *y* is None.
    It is based on Iris dataset, gaussian noise is added to the features.

    @param      dtype       type of the returned features
    @param      n_features  number of features, forwarded to
                            *_modify_dimension*
    @return                 tuple *(X, None, initial_types, 'predict',
                            0, X runtime)*
    """
    iris = load_iris()
    generator = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    observations = iris.data + generator.randn(*iris.data.shape) / 3
    observations = _modify_dimension(observations, n_features)
    observations = observations.astype(dtype)
    first_row = observations[:1].astype(dtype)
    return (observations, None, [('X', first_row)],
            'predict', 0, observations.astype(dtype))

393

394

def _problem_for_numerical_scoring(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    scoring problem.
    It is based on Iris dataset, gaussian noise is added to the features.
    The target is the class plus a small increasing term, divided
    by its maximum.

    @param      dtype       type of the returned features and target
    @param      n_features  number of features, forwarded to
                            *_modify_dimension*
    @return                 tuple *(X, y, initial_types, 'score',
                            0, X runtime)*
    """
    data = load_iris()
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    X = data.data + state.randn(*data.data.shape) / 3
    # n_features is honoured the same way as in the other problems
    # (it has no effect when it is None).
    X = _modify_dimension(X, n_features)
    y = data.target.astype(dtype) + numpy.arange(len(data.target)) / 100
    y /= numpy.max(y)
    X = X.astype(dtype)
    y = y.astype(dtype)
    return (X, y, [('X', X[:1].astype(dtype))],
            'score', 0, X.astype(dtype))

412

413

def _problem_for_clnoproba(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    multi-class classification problem checked through method
    *predict* (no probabilities).
    It is based on Iris dataset, gaussian noise is added to the features.

    @param      dtype       type of the returned features
    @param      n_features  number of features, forwarded to
                            *_modify_dimension*
    @return                 tuple *(X, y, initial_types, 'predict',
                            0, X runtime)*
    """
    iris = load_iris()
    generator = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    features = iris.data + generator.randn(*iris.data.shape) / 3
    features = _modify_dimension(features, n_features)
    features = features.astype(dtype)
    labels = iris.target.astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, labels, initial_types,
            'predict', 0, features.astype(dtype))

431

432

def _problem_for_clnoproba_binary(dtype=numpy.float32, n_features=None, add_nan=False):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    binary classification problem checked through method *predict*
    (no probabilities).
    It is based on Iris dataset: gaussian noise is added to the
    features and classes 1 and 2 are merged into a single class.

    @param      dtype       type of the returned features
    @param      n_features  number of features, forwarded to
                            *_modify_dimension*
    @param      add_nan     if True, replaces some values of the
                            features by nan
    @return                 tuple *(X, y, initial_types, 'predict',
                            0, X runtime)*
    """
    data = load_iris()
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    X = data.data + state.randn(*data.data.shape) / 3
    X = _modify_dimension(X, n_features)
    y = data.target
    y[y == 2] = 1
    if add_nan:
        # The seeded generator keeps the dataset identical at each call.
        # The upper bound of randint is exclusive: the full shape is used
        # so that every cell can be selected and a single-feature matrix
        # does not raise an exception.
        n_missing = X.shape[0] // 3
        rows = state.randint(0, X.shape[0], n_missing)
        cols = state.randint(0, X.shape[1], n_missing)
        X[rows, cols] = numpy.nan
    X = X.astype(dtype)
    y = y.astype(numpy.int64)
    return (X, y, [('X', X[:1].astype(dtype))],
            'predict', 0, X.astype(dtype))

455

456

def _problem_for_cl_decision_function(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    multi-class classification problem checked through method
    *decision_function*.
    It is based on Iris dataset, gaussian noise is added to the features.

    @param      dtype       type of the returned features
    @param      n_features  number of features, forwarded to
                            *_modify_dimension*
    @return                 tuple *(X, y, initial_types,
                            'decision_function', 1, X runtime)*
    """
    iris = load_iris()
    generator = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = generator.randn(*iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features)
    features = features.astype(dtype)
    labels = iris.target.astype(numpy.int64)
    return (features, labels, [('X', features[:1].astype(dtype))],
            'decision_function', 1, features.astype(dtype))

474

475

def _problem_for_cl_decision_function_binary(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    binary classification problem checked through method
    *decision_function*.
    It is based on Iris dataset: gaussian noise is added to the
    features and classes 1 and 2 are merged into a single class.

    @param      dtype       type of the returned features
    @param      n_features  number of features, forwarded to
                            *_modify_dimension*
    @return                 tuple *(X, y, initial_types,
                            'decision_function', 1, X runtime)*
    """
    iris = load_iris()
    generator = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    features = iris.data + generator.randn(*iris.data.shape) / 3
    features = _modify_dimension(features, n_features)
    labels = iris.target
    # Classes 1 and 2 become a single class.
    labels[labels == 2] = 1
    features = features.astype(dtype)
    labels = labels.astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, labels, initial_types,
            'decision_function', 1, features.astype(dtype))

494

495

def _problem_for_label_encoder(dtype=numpy.int64, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:preprocessing:LabelEncoder`.
    The labels of Iris dataset are both the input and the runtime input,
    there is no target.

    @param      dtype       type of the labels
    @param      n_features  unused
    @return                 tuple *(labels, None, initial_types,
                            'transform', 0, labels)*
    """
    raw_labels = load_iris().target.astype(dtype)
    initial_types = [('X', raw_labels[:1].astype(dtype))]
    labels = raw_labels.astype(dtype)
    return (labels, None, initial_types, 'transform', 0, labels)

506

507

def _problem_for_dict_vectorizer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:DictVectorizer`.
    The input is an array of dictionaries with one key, the label of
    an observation of Iris dataset, mapped to ``1000 + index``.
    The first dictionary receives an additional key.

    @param      dtype       type of the values stored in the dictionaries
    @param      n_features  unused
    @return                 tuple *(dictionaries, labels, initial_types,
                            'transform', 0, dictionaries)*
    """
    from skl2onnx.common.data_types import (  # delayed
        FloatTensorType, DoubleTensorType, StringTensorType, DictionaryType)

    labels = load_iris().target
    records = []
    for index, label in enumerate(labels):
        records.append({label: dtype(1000 + index)})
    records[0][2] = -2

    if dtype == numpy.float32:
        value_type = FloatTensorType
    else:
        value_type = DoubleTensorType
    initial_types = [
        ("X", DictionaryType(StringTensorType([1]), value_type([1])))]

    records = numpy.array(records)
    labels = labels.astype(numpy.int64)
    return (records, labels, initial_types, 'transform', 0, records)

524

525

def _problem_for_tfidf_vectorizer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:text:TfidfVectorizer`.
    Texts and targets are the first and second elements of every
    item of *text_alpha_num*.

    @param      dtype       type of the target
    @param      n_features  unused
    @return                 tuple *(texts, y, initial_types,
                            'transform', 0, texts)*
    """
    from skl2onnx.common.data_types import (  # delayed
        StringTensorType)

    documents, targets = [], []
    for item in text_alpha_num:
        documents.append(item[0])
        targets.append(item[1])
    texts = numpy.array(documents)
    y = numpy.array(targets, dtype=dtype)
    initial_types = [("X", StringTensorType([None]))]
    return (texts, y, initial_types, 'transform', 0, texts)

536

537

def _problem_for_tfidf_transformer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:text:TfidfTransformer`.
    The features are the counts produced by a *CountVectorizer*
    fitted on the texts of *text_alpha_num*.

    @param      dtype       type of the counts and of the target
    @param      n_features  unused
    @return                 tuple *(counts, y, initial_types,
                            'transform', 0, counts)*
    """
    from skl2onnx.common.data_types import (  # delayed
        FloatTensorType, DoubleTensorType)

    documents, targets = [], []
    for item in text_alpha_num:
        documents.append(item[0])
        targets.append(item[1])
    texts = numpy.array(documents)
    y = numpy.array(targets, dtype=dtype)

    counts = CountVectorizer().fit_transform(texts).astype(dtype)
    if dtype == numpy.float32:
        tensor_type = FloatTensorType
    else:
        tensor_type = DoubleTensorType
    initial_types = [("X", tensor_type([None, counts.shape[1]]))]
    return (counts, y, initial_types, 'transform', 0, counts)

550

551

def _problem_for_feature_hasher(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:FeatureHasher`.
    The input is an array of dictionaries with one string key
    ``cl<label>``, built from the label of an observation of Iris
    dataset, mapped to ``1000 + index``.
    The first dictionary receives an additional key.

    @param      dtype       type of the values stored in the dictionaries
    @param      n_features  unused
    @return                 tuple *(dictionaries, labels, initial_types,
                            'transform', 0, dictionaries)*
    """
    from skl2onnx.common.data_types import (  # delayed
        FloatTensorType, DoubleTensorType, StringTensorType, DictionaryType)

    labels = load_iris().target
    records = []
    for index, label in enumerate(labels):
        records.append({("cl%d" % label): dtype(1000 + index)})
    records[0]["cl2"] = -2

    if dtype == numpy.float32:
        value_type = FloatTensorType
    else:
        value_type = DoubleTensorType
    initial_types = [
        ("X", DictionaryType(StringTensorType([1]), value_type([1])))]

    records = numpy.array(records)
    return (records, labels, initial_types, 'transform', 0, records)

567

568

def _problem_for_one_hot_encoder(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:preprocessing:OneHotEncoder`.
    It is based on Iris dataset: gaussian noise is added to the
    features, they are truncated to integers, shuffled, and only the
    first column is returned.

    @param      dtype       type of the returned features
    @param      n_features  number of features, forwarded to
                            *_modify_dimension*
    @return                 tuple *(X, y, initial_types, 'transform',
                            0, X runtime)*
    """
    iris = load_iris()
    generator = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = generator.randn(*iris.data.shape) / 3
    categories = _modify_dimension(iris.data + noise, n_features)
    # Going through int32 truncates the values: few distinct categories.
    categories = categories.astype(numpy.int32).astype(dtype)
    categories, labels = shuffle(categories, iris.target, random_state=1)
    # Initial types are built from a full row, the data only keeps
    # the first column.
    initial_types = [('X', categories[:1].astype(dtype))]
    first_column = categories[:, :1]
    runtime_column = categories[:, :1].astype(dtype)
    return (first_column, labels, initial_types,
            'transform', 0, runtime_column)

582

583

def find_suitable_problem(model):
    """
    Determines problems suitable for a given
    :epkg:`scikit-learn` operator. A problem is identified by a name,
    the main ones are:

    * `b-cl`: binary classification
    * `m-cl`: multi-class classification
    * `~m-label`: multi-label classification
      (multiple labels possible at the same time)
    * `b-reg`: regression
    * `m-reg`: regression multi-output
    * `num-tr`: transform numerical features
    * `num-tr-pos`: transform numerical positive features
    * `~scoring`: transform numerical features, target is usually needed
    * `outlier`: outlier prediction
    * `~b-cl-nop`, `~m-cl-nop`: classifier without *predict_proba*
    * `cluster`: similar to transform
    * `num+y-tr`: similar to transform with targets
    * `num+y-tr-cl`: similar to transform with classes
    * `~num-tr-clu`: similar to cluster, but returns
      scores or distances instead of cluster
    * `mix`: mixture models
    * `key-int-col`, `key-str-col`: list of dictionaries
    * `text-col`: one column of text
    * `bow`, `one-hot`, `int-col`: problems dedicated to
      *TfidfTransformer*, *OneHotEncoder*, *LabelBinarizer* and
      *LabelEncoder*

    Suffix `nofit` (it seems to be spelled ``-NF`` in the names
    returned by this function) indicates the predictions happens
    without the model being fitted. This is the case
    for :epkg:`sklearn:gaussian_process:GaussianProcessRegressor`.
    The suffix `-cov` indicates the method `predict` was called
    with parameter ``return_cov=True``, `-std` tells
    method `predict` was called with parameter ``return_std=True``.
    The suffix ``-NSV`` creates an input variable
    like the following ``[('X', FloatTensorType([None, None]))]``.
    That's a way to bypass :epkg:`onnxruntime` shape checking
    as one part of the graph is designed to handle any
    kind of dimensions but apparently, if the input shape is
    precise, every part of the graph has to be precise. The strings
    used variables which means it is at the same time precise
    and unprecise. Suffix ``'-64'`` means the model will
    do double computations. Suffix ``-nop`` means the classifier
    does not implement method *predict_proba*. Suffix ``-1d``
    means a one dimension problem (one feature). Suffix ``-dec``
    checks method `decision_function`. Suffix ``-nan`` means some
    values of the features are replaced by nan, suffix ``-f100``
    means the problem has 100 features by default.

    The following script gives the list of :epkg:`scikit-learn`
    models and the problem they can be fitted on.

    .. runpython::
        :showcode:
        :warningout: DeprecationWarning
        :rst:

        from mlprodict.onnxrt.validate.validate import (
            sklearn_operators, find_suitable_problem)
        from pyquickhelper.pandashelper import df2rst
        from pandas import DataFrame
        res = sklearn_operators()
        rows = []
        for model in res[:20]:
            name = model['name']
            row = dict(name=name)
            try:
                prob = find_suitable_problem(model['cl'])
                if prob is None:
                    continue
                for p in prob:
                    row[p] = 'X'
            except RuntimeError:
                pass
            rows.append(row)
        df = DataFrame(rows).set_index('name')
        df = df.sort_index()
        print(df2rst(df, index=True))

    The list is truncated. The full list can be found at
    :ref:`l-model-problem-list`.

    @param      model       :epkg:`scikit-learn` operator, a class
                            (the function relies on *issubclass*)
    @return                 list of problem names, every name is a key
                            of dictionary *_problems*

    The function raises *RuntimeError* if no problem can be associated
    to the model and *ValueError* if one of the selected names is not
    a key of *_problems*.
    """
    # Delayed import of the scenarios declared in subpackage onnx_conv.
    from ...onnx_conv.validate_scenarios import find_suitable_problem as ext_find_suitable_problem

    def _internal(model):  # pylint: disable=R0911
        # Returns the list of problem names for *model*. The rules are
        # evaluated in order and the first matching one wins:
        # external scenarios, explicit lists of models, name based rules,
        # then generic rules based on the mixin classes.

        # The problems returned by onnx_conv.validate_scenarios take
        # precedence over the rules defined below.
        ext = ext_find_suitable_problem(model)
        if ext is not None:
            return ext

        # Exceptions
        if model in {GaussianProcessRegressor}:
            # m-reg causes MemoryError on some machine.
            return ['~b-reg-NF-64',  # '~m-reg-NF-64',
                    '~b-reg-NF-cov-64',  # '~m-reg-NF-cov-64',
                    '~b-reg-NF-std-64',  # '~m-reg-NF-std-64',
                    '~b-reg-NSV-64',  # '~m-reg-NSV-64',
                    '~b-reg-cov-64',  # '~m-reg-cov-64',
                    '~b-reg-std-NSV-64',  # '~m-reg-std-NSV-64',
                    'b-reg', '~b-reg-64',  # 'm-reg'
                    ]

        if model in {DictVectorizer}:
            return ['key-int-col']

        if model in {TfidfVectorizer, CountVectorizer}:
            return ['text-col']

        if model in {TfidfTransformer}:
            return ['bow']

        if model in {FeatureHasher}:
            return ['key-str-col']

        if model in {OneHotEncoder}:
            return ['one-hot']

        if model in {LabelBinarizer, LabelEncoder}:
            return ['int-col']

        if model in {NuSVC, SVC, SGDClassifier,
                     HistGradientBoostingClassifier}:
            return ['b-cl', 'm-cl', '~b-cl-64', '~b-cl-nan']

        if model in {GaussianProcessClassifier}:
            return ['b-cl', 'm-cl', '~b-cl-64']

        if model in {BaggingClassifier, BernoulliNB, CalibratedClassifierCV,
                     ComplementNB, GaussianNB,
                     GradientBoostingClassifier, LabelPropagation, LabelSpreading,
                     LinearDiscriminantAnalysis, LogisticRegressionCV,
                     MultinomialNB, QuadraticDiscriminantAnalysis,
                     RandomizedSearchCV}:
            return ['b-cl', 'm-cl']

        if model in {Perceptron}:
            return ['~b-cl-nop', '~m-cl-nop', '~b-cl-dec', '~m-cl-dec']

        if model in {AdaBoostRegressor}:
            return ['b-reg', '~b-reg-64']

        if model in {HistGradientBoostingRegressor}:
            return ['b-reg', '~b-reg-64', '~b-reg-nan', '~b-reg-nan-64']

        if model in {LinearSVC, NearestCentroid}:
            return ['~b-cl-nop', '~b-cl-nop-64']

        if model in {RFE, RFECV}:
            return ['num+y-tr']

        if model in {GridSearchCV}:
            return ['b-cl', 'm-cl',
                    'b-reg', 'm-reg',
                    '~b-reg-64', '~b-cl-64',
                    'cluster', 'outlier', '~m-label']

        if model in {VotingClassifier}:
            return ['b-cl', 'm-cl']

        # StackingClassifier and StackingRegressor are None when the
        # installed version of scikit-learn does not implement them.
        if StackingClassifier is not None and model in {StackingClassifier}:
            return ['b-cl']

        if StackingRegressor is not None and model in {StackingRegressor}:
            return ['b-reg']

        # specific scenarios
        if model in {IsotonicRegression}:
            return ['~num+y-tr-1d', '~b-reg-1d']

        if model in {ARDRegression, BayesianRidge, ElasticNetCV,
                     GradientBoostingRegressor,
                     LarsCV, LassoCV, LassoLarsCV, LassoLarsIC,
                     LinearSVR, NuSVR, OrthogonalMatchingPursuitCV,
                     PassiveAggressiveRegressor, SGDRegressor,
                     TheilSenRegressor, HuberRegressor, SVR}:
            return ['b-reg', '~b-reg-64']

        if model in {MultiOutputClassifier}:
            return ['m-cl', '~m-label']

        if model in {MultiOutputRegressor, MultiTaskElasticNet,
                     MultiTaskElasticNetCV, MultiTaskLassoCV,
                     MultiTaskLasso}:
            return ['m-reg']

        if model in {OneVsOneClassifier, OutputCodeClassifier,
                     PassiveAggressiveClassifier, RadiusNeighborsClassifier}:
            return ['~b-cl-nop', '~m-cl-nop']

        if model in {RidgeClassifier, RidgeClassifierCV}:
            return ['~b-cl-nop', '~m-cl-nop', '~m-label']

        # trainable transform
        if model in {GenericUnivariateSelect,
                     NeighborhoodComponentsAnalysis,
                     PLSSVD, SelectKBest,
                     SelectPercentile, SelectFromModel}:
            return ["num+y-tr"]

        if model in {SelectFwe, SelectFdr, SelectFpr}:
            return ["num+y-tr-cl"]

        # no m-label
        if model in {AdaBoostClassifier}:
            return ['b-cl', '~b-cl-64', 'm-cl']

        if model in {LogisticRegression}:
            return ['b-cl', '~b-cl-64', 'm-cl', '~b-cl-dec', '~m-cl-dec']

        if model in {RandomForestClassifier}:
            return ['b-cl', '~b-cl-64', 'm-cl', '~m-label']

        if model in {DecisionTreeClassifier, ExtraTreeClassifier}:
            return ['b-cl', '~b-cl-64', 'm-cl', '~b-cl-f100', '~m-label']

        if model in {DecisionTreeRegressor}:
            return ['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64', '~b-reg-f100']

        if model in {LatentDirichletAllocation, NMF, PowerTransformer}:
            return ['num-tr-pos']

        # Name based rule: predictors not listed above are classified
        # according to the string representation of the model.
        if hasattr(model, 'predict'):
            if "Classifier" in str(model):
                return ['b-cl', '~b-cl-64', 'm-cl', '~m-label']
            elif "Regressor" in str(model):
                return ['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64']

        # Generic case.
        # The problems are accumulated, a model may be a transformer
        # and a predictor at the same time.
        res = []
        if hasattr(model, 'transform'):
            if issubclass(model, (RegressorMixin, ClassifierMixin)):
                res.extend(['num+y-tr'])
            elif issubclass(model, (ClusterMixin, BiclusterMixin)):
                res.extend(['~num-tr-clu', '~num-tr-clu-64'])
            else:
                res.extend(['num-tr'])

        if hasattr(model, 'predict') and issubclass(model, (ClusterMixin, BiclusterMixin)):
            res.extend(['cluster', '~b-clu-64'])

        if issubclass(model, (OutlierMixin)):
            res.extend(['outlier'])

        if issubclass(model, ClassifierMixin):
            # OneVsRestClassifier discards what was accumulated so far.
            if model is OneVsRestClassifier:
                return ['m-cl', '~m-label']
            res.extend(['b-cl', '~b-cl-64', 'm-cl', '~m-label'])
        if issubclass(model, RegressorMixin):
            res.extend(['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64'])
        if issubclass(model, BaseMixture):
            res.extend(['mix', '~mix-64'])

        if len(res) > 0:
            return res

        raise RuntimeError("Unable to find problem for model '{}' - {}."
                           "".format(model.__name__, model.__bases__))

    res = _internal(model)
    # Every selected name must be a known problem.
    for r in res:
        if r not in _problems:
            raise ValueError(  # pragma: no cover
                "Unrecognized problem '{}' in\n{}".format(
                    r, "\n".join(sorted(_problems))))
    return res

844

845

846_problems = {

847 # standard

848 "b-cl": _problem_for_predictor_binary_classification,

849 "m-cl": _problem_for_predictor_multi_classification,

850 "b-reg": _problem_for_predictor_regression,

851 "m-reg": _problem_for_predictor_multi_regression,

852 "num-tr": _problem_for_numerical_transform,

853 "num-tr-pos": _problem_for_numerical_transform_positive,

854 'outlier': _problem_for_outlier,

855 'cluster': _problem_for_clustering,

856 'num+y-tr': _problem_for_numerical_trainable_transform,

857 'num+y-tr-cl': _problem_for_numerical_trainable_transform_cl,

858 'mix': _problem_for_mixture,

859 # others

860 '~num-tr-clu': _problem_for_clustering_scores,

861 "~m-label": _problem_for_predictor_multi_classification_label,

862 "~scoring": _problem_for_numerical_scoring,

863 '~b-cl-nop': _problem_for_clnoproba_binary,

864 '~m-cl-nop': _problem_for_clnoproba,

865 '~b-cl-dec': _problem_for_cl_decision_function_binary,

866 '~m-cl-dec': _problem_for_cl_decision_function,

867 # nan

868 "~b-reg-nan": lambda n_features=None: _problem_for_predictor_regression(

869 n_features=n_features, add_nan=True),

870 "~b-reg-nan-64": lambda n_features=None: _problem_for_predictor_regression(

871 dtype=numpy.float64, n_features=n_features, add_nan=True),

872 "~b-cl-nan": lambda dtype=numpy.float32, n_features=None: _problem_for_predictor_binary_classification(

873 dtype=dtype, n_features=n_features, add_nan=True),

874 # 100 features

875 "~b-reg-f100": lambda n_features=100: _problem_for_predictor_regression(

876 n_features=n_features or 100),

877 "~b-cl-f100": lambda n_features=100: _problem_for_predictor_binary_classification(

878 n_features=n_features or 100),

879 # 64

880 "~b-cl-64": lambda n_features=None: _problem_for_predictor_binary_classification(

881 dtype=numpy.float64, n_features=n_features),

882 "~b-reg-64": lambda n_features=None: _problem_for_predictor_regression(

883 dtype=numpy.float64, n_features=n_features),

884 '~b-cl-nop-64': lambda n_features=None: _problem_for_clnoproba(

885 dtype=numpy.float64, n_features=n_features),

886 '~b-clu-64': lambda n_features=None: _problem_for_clustering(

887 dtype=numpy.float64, n_features=n_features),

888 '~b-cl-dec-64': lambda n_features=None: _problem_for_cl_decision_function_binary(

889 dtype=numpy.float64, n_features=n_features),

890 '~num-tr-clu-64': lambda n_features=None: _problem_for_clustering_scores(

891 dtype=numpy.float64, n_features=n_features),

892 "~m-reg-64": lambda n_features=None: _problem_for_predictor_multi_regression(

893 dtype=numpy.float64, n_features=n_features),

894 "~num-tr-64": lambda n_features=None: _problem_for_numerical_transform(

895 dtype=numpy.float64, n_features=n_features),

896 '~mix-64': lambda n_features=None: _problem_for_mixture(

897 dtype=numpy.float64, n_features=n_features),

898 #

899 "~b-cl-NF": (lambda n_features=None: _problem_for_predictor_binary_classification(

900 n_features=n_features) + (False, )),

901 "~m-cl-NF": (lambda n_features=None: _problem_for_predictor_multi_classification(

902 n_features=n_features) + (False, )),

903 "~b-reg-NF": (lambda n_features=None: _problem_for_predictor_regression(

904 n_features=n_features) + (False, )),

905 "~m-reg-NF": (lambda n_features=None: _problem_for_predictor_multi_regression(

906 n_features=n_features) + (False, )),

907 #

908 "~b-cl-NF-64": (lambda n_features=None: _problem_for_predictor_binary_classification(

909 dtype=numpy.float64, n_features=n_features) + (False, )),

910 "~m-cl-NF-64": (lambda n_features=None: _problem_for_predictor_multi_classification(

911 dtype=numpy.float64, n_features=n_features) + (False, )),

912 "~b-reg-NF-64": (lambda n_features=None: _problem_for_predictor_regression(

913 dtype=numpy.float64, n_features=n_features) + (False, )),

914 "~m-reg-NF-64": (lambda n_features=None: _problem_for_predictor_multi_regression(

915 dtype=numpy.float64, n_features=n_features) + (False, )),

916 # GaussianProcess

917 "~b-reg-NF-cov-64": (lambda n_features=None: _problem_for_predictor_regression(

918 True, options={GaussianProcessRegressor: {"return_cov": True}},

919 return_cov=True, dtype=numpy.float64, n_features=n_features) + (False, )),

920 "~m-reg-NF-cov-64": (lambda n_features=None: _problem_for_predictor_multi_regression(

921 True, options={GaussianProcessRegressor: {"return_cov": True}},

922 return_cov=True, dtype=numpy.float64, n_features=n_features) + (False, )),

923 #

924 "~b-reg-NF-std-64": (lambda n_features=None: _problem_for_predictor_regression(

925 True, options={GaussianProcessRegressor: {"return_std": True}},

926 return_std=True, dtype=numpy.float64, n_features=n_features) + (False, )),

927 "~m-reg-NF-std-64": (lambda n_features=None: _problem_for_predictor_multi_regression(

928 True, options={GaussianProcessRegressor: {"return_std": True}},

929 return_std=True, dtype=numpy.float64, n_features=n_features) + (False, )),

930 #

931 "~b-reg-cov-64": (lambda n_features=None: _problem_for_predictor_regression(

932 True, options={GaussianProcessRegressor: {"return_cov": True}},

933 return_cov=True, dtype=numpy.float64, n_features=n_features)),

934 "~m-reg-cov-64": (lambda n_features=None: _problem_for_predictor_multi_regression(

935 True, options={GaussianProcessRegressor: {"return_cov": True}},

936 return_cov=True, dtype=numpy.float64, n_features=n_features)),

937 #

938 "~reg-std-64": (lambda n_features=None: _problem_for_predictor_regression(

939 True, options={GaussianProcessRegressor: {"return_std": True}},

940 return_std=True, dtype=numpy.float64, n_features=n_features)),

941 "~m-reg-std-64": (lambda n_features=None: _problem_for_predictor_multi_regression(

942 True, options={GaussianProcessRegressor: {"return_std": True}},

943 return_std=True, dtype=numpy.float64, n_features=n_features)),

944 #

945 '~b-reg-NSV-64': _noshapevar(lambda n_features=None: _problem_for_predictor_regression(

946 dtype=numpy.float64, n_features=n_features)),

947 '~m-reg-NSV-64': _noshapevar(lambda n_features=None: _problem_for_predictor_multi_regression(

948 dtype=numpy.float64, n_features=n_features)),

949 "~b-reg-std-NSV-64": (_noshapevar(lambda n_features=None: _problem_for_predictor_regression(

950 True, options={GaussianProcessRegressor: {"return_std": True}},

951 return_std=True, dtype=numpy.float64, n_features=n_features))),

952 "~m-reg-std-NSV-64": (_noshapevar(lambda n_features=None: _problem_for_predictor_multi_regression(

953 True, options={GaussianProcessRegressor: {"return_std": True}},

954 return_std=True, dtype=numpy.float64, n_features=n_features))),

955 # isotonic

956 "~b-reg-1d": _1d_problem(_problem_for_predictor_regression),

957 '~num+y-tr-1d': _1d_problem(_problem_for_numerical_trainable_transform),

958 # text

959 "key-int-col": _problem_for_dict_vectorizer,

960 "key-str-col": _problem_for_feature_hasher,

961 "int-col": _problem_for_label_encoder,

962 "one-hot": _problem_for_one_hot_encoder,

963 'text-col': _problem_for_tfidf_vectorizer,

964 'bow': _problem_for_tfidf_transformer,

965}

Coverage for mlprodict/onnxrt/validate/validate_problems.py: 99%

443 statements