Coverage for mlprodict/onnxrt/validate/validate_problems.py: 99%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

443 statements  

1""" 

2@file 

3@brief Validates runtime for many :scikit-learn: operators. 

4The submodule relies on :epkg:`onnxconverter_common`, 

5:epkg:`sklearn-onnx`. 

6""" 

7import numpy 

8from sklearn.base import ( 

9 ClusterMixin, BiclusterMixin, OutlierMixin, 

10 RegressorMixin, ClassifierMixin) 

11from sklearn.calibration import CalibratedClassifierCV 

12from sklearn.cross_decomposition import PLSSVD 

13from sklearn.datasets import load_iris 

14from sklearn.decomposition import LatentDirichletAllocation, NMF 

15from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis 

16from sklearn.ensemble import ( 

17 AdaBoostRegressor, GradientBoostingRegressor, AdaBoostClassifier, 

18 BaggingClassifier, VotingClassifier, GradientBoostingClassifier, 

19 RandomForestClassifier) 

20try: 

21 from sklearn.ensemble import StackingClassifier, StackingRegressor 

22except ImportError: # pragma: no cover 

23 # new in 0.22 

24 StackingClassifier, StackingRegressor = None, None 

25from sklearn.feature_extraction import DictVectorizer, FeatureHasher 

26from sklearn.feature_extraction.text import ( 

27 CountVectorizer, TfidfVectorizer, TfidfTransformer) 

28from sklearn.ensemble import ( 

29 HistGradientBoostingRegressor, 

30 HistGradientBoostingClassifier) 

31from sklearn.feature_selection import ( 

32 RFE, RFECV, GenericUnivariateSelect, 

33 SelectPercentile, SelectFwe, SelectKBest, 

34 SelectFdr, SelectFpr, SelectFromModel) 

35from sklearn.gaussian_process import GaussianProcessClassifier, GaussianProcessRegressor 

36from sklearn.isotonic import IsotonicRegression 

37from sklearn.linear_model import ( 

38 ARDRegression, ElasticNetCV, 

39 LarsCV, LassoCV, LassoLarsCV, LassoLarsIC, 

40 SGDRegressor, OrthogonalMatchingPursuitCV, 

41 TheilSenRegressor, BayesianRidge, MultiTaskElasticNet, 

42 MultiTaskElasticNetCV, MultiTaskLassoCV, MultiTaskLasso, 

43 PassiveAggressiveClassifier, RidgeClassifier, 

44 RidgeClassifierCV, PassiveAggressiveRegressor, 

45 HuberRegressor, LogisticRegression, SGDClassifier, 

46 LogisticRegressionCV, Perceptron) 

47from sklearn.mixture._base import BaseMixture 

48from sklearn.model_selection import GridSearchCV, RandomizedSearchCV 

49from sklearn.multiclass import ( 

50 OneVsRestClassifier, OneVsOneClassifier, OutputCodeClassifier) 

51from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier 

52from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB, ComplementNB 

53from sklearn.neighbors import ( 

54 NearestCentroid, RadiusNeighborsClassifier, 

55 NeighborhoodComponentsAnalysis) 

56from sklearn.preprocessing import ( 

57 LabelBinarizer, LabelEncoder, 

58 OneHotEncoder, PowerTransformer) 

59from sklearn.semi_supervised import LabelPropagation, LabelSpreading 

60from sklearn.svm import LinearSVC, LinearSVR, NuSVR, SVR, SVC, NuSVC 

61from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, ExtraTreeClassifier 

62from sklearn.utils import shuffle 

63from ._validate_problems_helper import ( 

64 _noshapevar, _1d_problem, text_alpha_num) 

65 

66 

def _modify_dimension(X, n_features, seed=19):
    """
    Shrinks or enlarges a features matrix so that it
    ends up with *n_features* columns.

    @param      X           features matrix
    @param      n_features  expected number of features,
                            *None* leaves *X* untouched
    @param      seed        random seed (every call then builds
                            the same dataset)
    @return                 new features matrix

    When columns are missing, the new ones are noisy (floats)
    or partially permuted (integers) copies of the existing
    columns, visited in a round-robin way. Any other dtype
    raises *NotImplementedError*.
    """
    if n_features is None:
        return X
    n_cols = X.shape[1]
    if n_features == n_cols:
        return X
    if n_features < n_cols:
        # Fewer features requested: the leading columns are enough.
        return X[:, :n_features]

    n_rows = X.shape[0]
    rng = numpy.random.RandomState(seed)  # pylint: disable=E1101
    out = numpy.empty((n_rows, n_features), dtype=X.dtype)
    out[:, :n_cols] = X[:, :]
    ratio = max((n_features // n_cols) + 1, 2)
    is_float = X.dtype in (numpy.float32, numpy.float64)
    is_int = X.dtype in (numpy.int32, numpy.int64)

    for target in range(n_cols, out.shape[1]):
        source = target % n_cols
        base = X[:, source]
        if is_float:
            # New column = source column + scaled Gaussian noise,
            # a share of it is removed from the source column.
            sigma = numpy.var(base) ** 0.5
            noise = rng.randn(len(base)) * sigma / ratio
            noisy = base + noise
            out[:, source] -= noisy / ratio
            out[:, target] = noisy
        elif is_int:
            # New column = source column where one value out of
            # *ratio* comes from a permutation of that column.
            shuffled = rng.permutation(base)
            start = rng.randint(0, ratio) % n_rows
            mixed = base.copy()
            mixed[start::ratio] = shuffled[start::ratio]  # pylint: disable=E1136
            out[:, target] = mixed
            start = (start + 1) % n_rows
            out[start, source] = shuffled[start]  # pylint: disable=E1136
        else:  # pragma: no cover
            raise NotImplementedError(  # pragma: no cover
                "Unable to add noise to a feature for this type {}".format(X.dtype))
    return out

107 

108 

109########### 

110# datasets 

111########### 

112 

113 

def _problem_for_predictor_binary_classification(
        dtype=numpy.float32, n_features=None, add_nan=False):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    binary classification problem.
    It is based on Iris dataset, Gaussian noise is added to
    the features and classes 1 and 2 are merged.

    @param      dtype       type of the features
    @param      n_features  number of features, *None* keeps
                            the original ones
    @param      add_nan     replaces some of the values by *nan*
    @return                 tuple of six elements
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    y = data.target
    y[y == 2] = 1
    if add_nan:
        # The seeded generator keeps the dataset identical from one
        # call to another. The upper bound of randint is exclusive:
        # using the shape itself reaches the last row and the last
        # column and works with a single feature.
        n_nan = X.shape[0] // 3
        rows = state.randint(0, X.shape[0], n_nan)
        cols = state.randint(0, X.shape[1], n_nan)
        X[rows, cols] = numpy.nan
    X = X.astype(dtype)
    y = y.astype(numpy.int64)
    return (X, y, [('X', X[:1].astype(dtype))],
            'predict_proba', 1, X.astype(dtype))

137 

138 

def _problem_for_predictor_multi_classification(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    multi-class classification problem.
    It is based on Iris dataset with some Gaussian noise
    added to the features.

    @param      dtype       type of the features
    @param      n_features  number of features, *None* keeps
                            the original ones
    @return                 tuple of six elements
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    feats = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    labels = iris.target.astype(numpy.int64)
    init_types = [('X', feats[:1].astype(dtype))]
    return (feats, labels, init_types,
            'predict_proba', 1, feats.astype(dtype))

156 

157 

def _problem_for_mixture(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    mixture problem. No target is returned (*y* is None),
    the method is *predict_proba*.
    It is based on Iris dataset.

    @param      dtype       type of the features
    @param      n_features  number of features, *None* keeps
                            the original ones
    @return                 tuple of six elements
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    X = X.astype(dtype)
    # The target is not part of the problem, it is not computed.
    return (X, None, [('X', X[:1].astype(dtype))],
            'predict_proba', 1, X.astype(dtype))

175 

176 

def _problem_for_predictor_multi_classification_label(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    multi-label classification problem
    (several labels may be set for the same observation).
    It is based on Iris dataset.

    @param      dtype       type of the features
    @param      n_features  number of features, *None* keeps
                            the original ones
    @return                 tuple of six elements
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    feats = _modify_dimension(iris.data + noise, n_features)
    target = iris.target
    n_obs = target.shape[0]

    # One indicator column per class.
    labels = numpy.zeros((n_obs, 3), dtype=numpy.int64)
    labels[numpy.arange(n_obs), target] = 1
    # One observation out of five also receives the next class.
    extra = numpy.arange(0, n_obs, 5)
    labels[extra, (target[extra] + 1) % 3] = 1

    feats = feats.astype(dtype)
    labels = labels.astype(numpy.int64)
    return (feats, labels, [('X', feats[:1].astype(dtype))],
            'predict_proba', 1, feats.astype(dtype))

199 

200 

def _problem_for_predictor_regression(many_output=False, options=None,
                                      n_features=None, nbrows=None,
                                      dtype=numpy.float32, add_nan=False,
                                      **kwargs):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    regression problem.
    It is based on Iris dataset.

    @param      many_output     if True, the returned name is ``'all'``
                                instead of ``0``
    @param      options         if not None, the initial types are
                                returned as a pair *(initial_types, options)*
    @param      n_features      number of features, *None* keeps
                                the original ones
    @param      nbrows          if not None, only the first *nbrows*
                                observations are kept
    @param      dtype           type of the features and of the target
    @param      add_nan         replaces some of the values by *nan*
    @param      kwargs          additional parameters returned with
                                the method name
    @return                     tuple of six elements
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    y = data.target + numpy.arange(len(data.target)) / 100
    # kwargs is always a dictionary (possibly empty), never None:
    # the method is always returned as a pair (name, parameters).
    meth = ('predict', kwargs)
    if n_features is not None:
        X = X[:, :n_features]
    if nbrows is not None:
        X = X[:nbrows, :]
        y = y[:nbrows]
    # The initial types are built once, after the data was resized
    # and before any nan is inserted.
    itt = [('X', X[:1].astype(dtype))]
    if options is not None:
        itt = itt, options
    n_nan = X.shape[0] // 3
    if add_nan and n_nan > 0:
        # The seeded generator keeps the dataset identical from one
        # call to another. The upper bound of randint is exclusive:
        # using the shape itself reaches the last row and the last
        # column and works with a single feature.
        rows = state.randint(0, X.shape[0], n_nan)
        cols = state.randint(0, X.shape[1], n_nan)
        X[rows, cols] = numpy.nan
    X = X.astype(dtype)
    y = y.astype(dtype)
    return (X, y, itt,
            meth, 'all' if many_output else 0, X.astype(dtype))

236 

237 

def _problem_for_predictor_multi_regression(many_output=False, options=None,
                                            n_features=None, nbrows=None,
                                            dtype=numpy.float32, **kwargs):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    multi-output regression problem (two targets,
    the second one is the first one shifted by 0.5).
    It is based on Iris dataset.

    @param      many_output     if True, the returned name is ``'all'``
                                instead of ``0``
    @param      options         if not None, the initial types are
                                returned as a pair *(initial_types, options)*
    @param      n_features      number of features, *None* keeps
                                the original ones
    @param      nbrows          if not None, only the first *nbrows*
                                observations are kept
    @param      dtype           type of the features and of the targets
    @param      kwargs          additional parameters returned with
                                the method name
    @return                     tuple of six elements
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    feats = _modify_dimension(iris.data + noise, n_features)
    target = iris.target.astype(float) + numpy.arange(len(iris.target)) / 100
    # kwargs is a dictionary in every case, the method is a pair.
    method = ('predict', kwargs)

    if n_features is not None:
        feats = feats[:, :n_features]
    if nbrows is not None:
        feats = feats[:nbrows, :]
        target = target[:nbrows]

    init_types = [('X', feats[:1].astype(dtype))]
    if options is not None:
        init_types = init_types, options

    targets = numpy.column_stack([target, target + 0.5]).astype(dtype)
    feats = feats.astype(dtype)
    name = 'all' if many_output else 0
    return (feats, targets, init_types,
            method, name, feats.astype(dtype))

271 

272 

def _problem_for_numerical_transform(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    transformation problem (*y* is None).
    It is based on Iris dataset.

    @param      dtype       type of the features
    @param      n_features  number of features, *None* keeps
                            the original ones
    @return                 tuple of six elements
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    X = X.astype(dtype)
    # The runtime data uses the requested dtype, the same as the
    # training data and the initial types (it used to be float32
    # whatever *dtype* was).
    return (X, None, [('X', X[:1].astype(dtype))],
            'transform', 0, X.astype(dtype))

288 

289 

def _problem_for_numerical_transform_positive(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    transformation problem (*y* is None). The absolute value
    of the noisy features is taken before the number of
    features is modified.
    It is based on Iris dataset.

    @param      dtype       type of the features
    @param      n_features  number of features, *None* keeps
                            the original ones
    @return                 tuple of six elements
    """
    data = load_iris()
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*data.data.shape) / 3
    X = numpy.abs(data.data + rnd)
    X = _modify_dimension(X, n_features)
    X = X.astype(dtype)
    # The runtime data uses the requested dtype, the same as the
    # training data and the initial types (it used to be float32
    # whatever *dtype* was).
    return (X, None, [('X', X[:1].astype(dtype))],
            'transform', 0, X.astype(dtype))

304 

305 

def _problem_for_numerical_trainable_transform(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    transformation problem which comes with a continuous target.
    It is based on Iris dataset.

    @param      dtype       type of the features and of the target
    @param      n_features  number of features, *None* keeps
                            the original ones
    @return                 tuple of six elements
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    feats = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    target = iris.target + numpy.arange(len(iris.target)) / 100
    target = target.astype(dtype)
    init_types = [('X', feats[:1].astype(dtype))]
    return (feats, target, init_types,
            'transform', 0, feats.astype(dtype))

323 

324 

def _problem_for_numerical_trainable_transform_cl(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    transformation problem which comes with classes as a target.
    It is based on Iris dataset.

    @param      dtype       type of the features
    @param      n_features  number of features, *None* keeps
                            the original ones
    @return                 tuple of six elements
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    feats = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    labels = iris.target.astype(numpy.int64)
    init_types = [('X', feats[:1].astype(dtype))]
    return (feats, labels, init_types,
            'transform', 0, feats.astype(dtype))

342 

343 

def _problem_for_clustering(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    clustering problem (*y* is None, method is *predict*).
    It is based on Iris dataset.

    @param      dtype       type of the features
    @param      n_features  number of features, *None* keeps
                            the original ones
    @return                 tuple of six elements
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    feats = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    init_types = [('X', feats[:1].astype(dtype))]
    return (feats, None, init_types,
            'predict', 0, feats.astype(dtype))

359 

360 

def _problem_for_clustering_scores(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    clustering problem, the score part, not the cluster
    (*y* is None, method is *transform*).
    It is based on Iris dataset.

    @param      dtype       type of the features
    @param      n_features  number of features, *None* keeps
                            the original ones
    @return                 tuple of six elements
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    feats = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    init_types = [('X', feats[:1].astype(dtype))]
    return (feats, None, init_types,
            'transform', 1, feats.astype(dtype))

376 

377 

def _problem_for_outlier(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for an
    outlier detection problem (*y* is None, method is *predict*).
    It is based on Iris dataset.

    @param      dtype       type of the features
    @param      n_features  number of features, *None* keeps
                            the original ones
    @return                 tuple of six elements
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    feats = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    init_types = [('X', feats[:1].astype(dtype))]
    return (feats, None, init_types,
            'predict', 0, feats.astype(dtype))

393 

394 

def _problem_for_numerical_scoring(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    scoring problem. The target is scaled by its maximum.
    It is based on Iris dataset.

    @param      dtype       type of the features and of the target
    @param      n_features  number of features, *None* keeps
                            the original ones
    @return                 tuple of six elements
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    # n_features is taken into account like in the other problems
    # (it used to be ignored), None leaves the features unchanged.
    X = _modify_dimension(X, n_features)
    y = data.target.astype(dtype) + numpy.arange(len(data.target)) / 100
    y /= numpy.max(y)
    X = X.astype(dtype)
    y = y.astype(dtype)
    return (X, y, [('X', X[:1].astype(dtype))],
            'score', 0, X.astype(dtype))

412 

413 

def _problem_for_clnoproba(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    multi-class classification problem checked with
    method *predict* (no probabilities).
    It is based on Iris dataset.

    @param      dtype       type of the features
    @param      n_features  number of features, *None* keeps
                            the original ones
    @return                 tuple of six elements
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    feats = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    labels = iris.target.astype(numpy.int64)
    init_types = [('X', feats[:1].astype(dtype))]
    return (feats, labels, init_types,
            'predict', 0, feats.astype(dtype))

431 

432 

def _problem_for_clnoproba_binary(dtype=numpy.float32, n_features=None, add_nan=False):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    binary classification problem checked with
    method *predict* (no probabilities).
    It is based on Iris dataset, classes 1 and 2 are merged.

    @param      dtype       type of the features
    @param      n_features  number of features, *None* keeps
                            the original ones
    @param      add_nan     replaces some of the values by *nan*
    @return                 tuple of six elements
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    y = data.target
    y[y == 2] = 1
    if add_nan:
        # The seeded generator keeps the dataset identical from one
        # call to another. The upper bound of randint is exclusive:
        # using the shape itself reaches the last row and the last
        # column and works with a single feature.
        n_nan = X.shape[0] // 3
        rows = state.randint(0, X.shape[0], n_nan)
        cols = state.randint(0, X.shape[1], n_nan)
        X[rows, cols] = numpy.nan
    X = X.astype(dtype)
    y = y.astype(numpy.int64)
    return (X, y, [('X', X[:1].astype(dtype))],
            'predict', 0, X.astype(dtype))

455 

456 

def _problem_for_cl_decision_function(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    multi-class classification problem checked with
    method *decision_function*.
    It is based on Iris dataset.

    @param      dtype       type of the features
    @param      n_features  number of features, *None* keeps
                            the original ones
    @return                 tuple of six elements
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    feats = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    labels = iris.target.astype(numpy.int64)
    init_types = [('X', feats[:1].astype(dtype))]
    return (feats, labels, init_types,
            'decision_function', 1, feats.astype(dtype))

474 

475 

def _problem_for_cl_decision_function_binary(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    binary classification problem checked with
    method *decision_function*.
    It is based on Iris dataset, classes 1 and 2 are merged.

    @param      dtype       type of the features
    @param      n_features  number of features, *None* keeps
                            the original ones
    @return                 tuple of six elements
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    feats = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    labels = iris.target.astype(numpy.int64)
    labels[labels == 2] = 1
    init_types = [('X', feats[:1].astype(dtype))]
    return (feats, labels, init_types,
            'decision_function', 1, feats.astype(dtype))

494 

495 

def _problem_for_label_encoder(dtype=numpy.int64, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:preprocessing:LabelEncoder`.
    The data is the target of the Iris dataset,
    there is no feature and *n_features* is not used.

    @param      dtype       type of the labels
    @param      n_features  unused
    @return                 tuple of six elements
    """
    labels = load_iris().target.astype(dtype)
    init_types = [('X', labels[:1].astype(dtype))]
    # The same array is used for training and for the runtime.
    labels = labels.astype(dtype)
    return (labels, None, init_types, 'transform', 0, labels)

506 

507 

def _problem_for_dict_vectorizer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:DictVectorizer`.
    Every observation is a dictionary ``{class: value}`` built
    from the target of the Iris dataset, *n_features* is not used.

    @param      dtype       type of the values stored in the dictionaries
    @param      n_features  unused
    @return                 tuple of six elements
    """
    from skl2onnx.common.data_types import (  # delayed
        FloatTensorType, DoubleTensorType, StringTensorType, DictionaryType)
    target = load_iris().target
    rows = [{label: dtype(1000 + pos)} for pos, label in enumerate(target)]
    # The first observation receives a second key.
    rows[0][2] = -2
    if dtype == numpy.float32:
        value_type = FloatTensorType
    else:
        value_type = DoubleTensorType
    init_types = [("X", DictionaryType(StringTensorType([1]), value_type([1])))]
    dicts = numpy.array(rows)
    labels = target.astype(numpy.int64)
    return (dicts, labels, init_types, 'transform', 0, dicts)

524 

525 

def _problem_for_tfidf_vectorizer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:text:TfidfVectorizer`.
    The texts and the targets come from *text_alpha_num*,
    *n_features* is not used.

    @param      dtype       type of the target
    @param      n_features  unused
    @return                 tuple of six elements
    """
    from skl2onnx.common.data_types import (  # delayed
        StringTensorType)
    texts = numpy.array([item[0] for item in text_alpha_num])
    target = numpy.array([item[1] for item in text_alpha_num], dtype=dtype)
    init_types = [("X", StringTensorType([None]))]
    return (texts, target, init_types, 'transform', 0, texts)

536 

537 

def _problem_for_tfidf_transformer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:text:TfidfTransformer`.
    The texts from *text_alpha_num* are first converted into
    counts with a *CountVectorizer*, *n_features* is not used.

    @param      dtype       type of the counts and of the target
    @param      n_features  unused
    @return                 tuple of six elements
    """
    from skl2onnx.common.data_types import (  # delayed
        FloatTensorType, DoubleTensorType)
    texts = numpy.array([item[0] for item in text_alpha_num])
    target = numpy.array([item[1] for item in text_alpha_num], dtype=dtype)
    counts = CountVectorizer().fit_transform(texts).astype(dtype)
    if dtype == numpy.float32:
        tensor_type = FloatTensorType
    else:
        tensor_type = DoubleTensorType
    init_types = [("X", tensor_type([None, counts.shape[1]]))]
    return (counts, target, init_types, 'transform', 0, counts)

550 

551 

def _problem_for_feature_hasher(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:FeatureHasher`.
    Every observation is a dictionary ``{'cl<class>': value}`` built
    from the target of the Iris dataset, *n_features* is not used.

    @param      dtype       type of the values stored in the dictionaries
    @param      n_features  unused
    @return                 tuple of six elements
    """
    from skl2onnx.common.data_types import (  # delayed
        FloatTensorType, DoubleTensorType, StringTensorType, DictionaryType)
    target = load_iris().target
    rows = [{("cl%d" % label): dtype(1000 + pos)}
            for pos, label in enumerate(target)]
    # The first observation receives a second key.
    rows[0]["cl2"] = -2
    if dtype == numpy.float32:
        value_type = FloatTensorType
    else:
        value_type = DoubleTensorType
    init_types = [("X", DictionaryType(StringTensorType([1]), value_type([1])))]
    dicts = numpy.array(rows)
    return (dicts, target, init_types, 'transform', 0, dicts)

567 

568 

def _problem_for_one_hot_encoder(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:preprocessing:OneHotEncoder`.
    The noisy features of the Iris dataset are truncated
    to integers and shuffled, only the first column is returned.

    @param      dtype       type of the features
    @param      n_features  number of features built before
                            the first column is extracted
    @return                 tuple of six elements
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    feats = _modify_dimension(iris.data + noise, n_features)
    # Going through int32 drops the decimals.
    feats = feats.astype(numpy.int32).astype(dtype)
    feats, labels = shuffle(feats, iris.target, random_state=1)
    # NOTE(review): the initial type is built from a full row whereas
    # only the first column is returned -- confirm this is intended.
    init_types = [('X', feats[:1].astype(dtype))]
    first_col = feats[:, :1]
    return (first_col, labels, init_types, 'transform', 0,
            first_col.astype(dtype))

582 

583 

def find_suitable_problem(model):
    """
    Determines problems suitable for a given
    :epkg:`scikit-learn` operator. It may be

    * `b-cl`: binary classification
    * `m-cl`: m-cl classification
    * `m-label`: classification m-label
      (multiple labels possible at the same time)
    * `reg`: regression
    * `m-reg`: regression multi-output
    * `num-tr`: transform numerical features
    * `num-tr-pos`: transform numerical positive features
    * `scoring`: transform numerical features, target is usually needed
    * `outlier`: outlier prediction
    * `linearsvc`: classifier without *predict_proba*
    * `cluster`: similar to transform
    * `num+y-tr`: similar to transform with targets
    * `num+y-tr-cl`: similar to transform with classes
    * `num-tr-clu`: similar to cluster, but returns
      scores or distances instead of cluster
    * `key-col`: list of dictionaries
    * `text-col`: one column of text

    Suffix `nofit` indicates the prediction happens
    without the model being fitted. This is the case
    for :epkg:`sklearn:gaussian_process:GaussianProcessRegressor`.
    The suffix `-cov` indicates the method `predict` was called
    with parameter ``return_cov=True``, `-std` tells
    method `predict` was called with parameter ``return_std=True``.
    The suffix ``-NSV`` creates an input variable
    like the following ``[('X', FloatTensorType([None, None]))]``.
    That's a way to bypass :epkg:`onnxruntime` shape checking
    as one part of the graph is designed to handle any
    kind of dimensions but apparently, if the input shape is
    precise, every part of the graph has to be precise. The strings
    used variables which means it is at the same time precise
    and imprecise. Suffix ``'-64'`` means the model will
    do double computations. Suffix ``-nop`` means the classifier
    does not implement method *predict_proba*. Suffix ``-1d``
    means a one dimension problem (one feature). Suffix ``-dec``
    checks method `decision_function`.

    The following script gives the list of :epkg:`scikit-learn`
    models and the problem they can be fitted on.

    .. runpython::
        :showcode:
        :warningout: DeprecationWarning
        :rst:

        from mlprodict.onnxrt.validate.validate import (
            sklearn_operators, find_suitable_problem)
        from pyquickhelper.pandashelper import df2rst
        from pandas import DataFrame
        res = sklearn_operators()
        rows = []
        for model in res[:20]:
            name = model['name']
            row = dict(name=name)
            try:
                prob = find_suitable_problem(model['cl'])
                if prob is None:
                    continue
                for p in prob:
                    row[p] = 'X'
            except RuntimeError:
                pass
            rows.append(row)
        df = DataFrame(rows).set_index('name')
        df = df.sort_index()
        print(df2rst(df, index=True))

    The list is truncated. The full list can be found at
    :ref:`l-model-problem-list`.

    @param      model       model class (the function relies on
                            *issubclass* and ``model.__name__``)
    @return                 list of problem names, every one of them
                            is a key of ``_problems``

    The function raises *RuntimeError* if no problem was found
    for the model, *ValueError* if one of the selected problems
    is not a key of ``_problems``.
    """
    from ...onnx_conv.validate_scenarios import find_suitable_problem as ext_find_suitable_problem

    def _internal(model):  # pylint: disable=R0911
        # The tests are done in sequence, the first match wins:
        # explicit lists of models first, generic rules based
        # on the name and the base classes at the end.

        # checks that this model is not overwritten by this module
        ext = ext_find_suitable_problem(model)
        if ext is not None:
            return ext

        # Exceptions
        if model in {GaussianProcessRegressor}:
            # m-reg causes MemoryError on some machine.
            return ['~b-reg-NF-64',  # '~m-reg-NF-64',
                    '~b-reg-NF-cov-64',  # '~m-reg-NF-cov-64',
                    '~b-reg-NF-std-64',  # '~m-reg-NF-std-64',
                    '~b-reg-NSV-64',  # '~m-reg-NSV-64',
                    '~b-reg-cov-64',  # '~m-reg-cov-64',
                    '~b-reg-std-NSV-64',  # '~m-reg-std-NSV-64',
                    'b-reg', '~b-reg-64',  # 'm-reg'
                    ]

        # models which do not consume a numerical matrix
        if model in {DictVectorizer}:
            return ['key-int-col']

        if model in {TfidfVectorizer, CountVectorizer}:
            return ['text-col']

        if model in {TfidfTransformer}:
            return ['bow']

        if model in {FeatureHasher}:
            return ['key-str-col']

        if model in {OneHotEncoder}:
            return ['one-hot']

        if model in {LabelBinarizer, LabelEncoder}:
            return ['int-col']

        if model in {NuSVC, SVC, SGDClassifier,
                     HistGradientBoostingClassifier}:
            return ['b-cl', 'm-cl', '~b-cl-64', '~b-cl-nan']

        if model in {GaussianProcessClassifier}:
            return ['b-cl', 'm-cl', '~b-cl-64']

        if model in {BaggingClassifier, BernoulliNB, CalibratedClassifierCV,
                     ComplementNB, GaussianNB,
                     GradientBoostingClassifier, LabelPropagation, LabelSpreading,
                     LinearDiscriminantAnalysis, LogisticRegressionCV,
                     MultinomialNB, QuadraticDiscriminantAnalysis,
                     RandomizedSearchCV}:
            return ['b-cl', 'm-cl']

        if model in {Perceptron}:
            return ['~b-cl-nop', '~m-cl-nop', '~b-cl-dec', '~m-cl-dec']

        if model in {AdaBoostRegressor}:
            return ['b-reg', '~b-reg-64']

        if model in {HistGradientBoostingRegressor}:
            return ['b-reg', '~b-reg-64', '~b-reg-nan', '~b-reg-nan-64']

        if model in {LinearSVC, NearestCentroid}:
            return ['~b-cl-nop', '~b-cl-nop-64']

        if model in {RFE, RFECV}:
            return ['num+y-tr']

        if model in {GridSearchCV}:
            return ['b-cl', 'm-cl',
                    'b-reg', 'm-reg',
                    '~b-reg-64', '~b-cl-64',
                    'cluster', 'outlier', '~m-label']

        if model in {VotingClassifier}:
            return ['b-cl', 'm-cl']

        # StackingClassifier and StackingRegressor are None
        # if the import at the top of the module failed.
        if StackingClassifier is not None and model in {StackingClassifier}:
            return ['b-cl']

        if StackingRegressor is not None and model in {StackingRegressor}:
            return ['b-reg']

        # specific scenarios
        if model in {IsotonicRegression}:
            return ['~num+y-tr-1d', '~b-reg-1d']

        if model in {ARDRegression, BayesianRidge, ElasticNetCV,
                     GradientBoostingRegressor,
                     LarsCV, LassoCV, LassoLarsCV, LassoLarsIC,
                     LinearSVR, NuSVR, OrthogonalMatchingPursuitCV,
                     PassiveAggressiveRegressor, SGDRegressor,
                     TheilSenRegressor, HuberRegressor, SVR}:
            return ['b-reg', '~b-reg-64']

        if model in {MultiOutputClassifier}:
            return ['m-cl', '~m-label']

        if model in {MultiOutputRegressor, MultiTaskElasticNet,
                     MultiTaskElasticNetCV, MultiTaskLassoCV,
                     MultiTaskLasso}:
            return ['m-reg']

        if model in {OneVsOneClassifier, OutputCodeClassifier,
                     PassiveAggressiveClassifier, RadiusNeighborsClassifier}:
            return ['~b-cl-nop', '~m-cl-nop']

        if model in {RidgeClassifier, RidgeClassifierCV}:
            return ['~b-cl-nop', '~m-cl-nop', '~m-label']

        # trainable transform
        if model in {GenericUnivariateSelect,
                     NeighborhoodComponentsAnalysis,
                     PLSSVD, SelectKBest,
                     SelectPercentile, SelectFromModel}:
            return ["num+y-tr"]

        if model in {SelectFwe, SelectFdr, SelectFpr}:
            return ["num+y-tr-cl"]

        # no m-label
        if model in {AdaBoostClassifier}:
            return ['b-cl', '~b-cl-64', 'm-cl']

        if model in {LogisticRegression}:
            return ['b-cl', '~b-cl-64', 'm-cl', '~b-cl-dec', '~m-cl-dec']

        if model in {RandomForestClassifier}:
            return ['b-cl', '~b-cl-64', 'm-cl', '~m-label']

        if model in {DecisionTreeClassifier, ExtraTreeClassifier}:
            return ['b-cl', '~b-cl-64', 'm-cl', '~b-cl-f100', '~m-label']

        if model in {DecisionTreeRegressor}:
            return ['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64', '~b-reg-f100']

        if model in {LatentDirichletAllocation, NMF, PowerTransformer}:
            return ['num-tr-pos']

        # predictors recognized from the string representation
        # of the class
        if hasattr(model, 'predict'):
            if "Classifier" in str(model):
                return ['b-cl', '~b-cl-64', 'm-cl', '~m-label']
            elif "Regressor" in str(model):
                return ['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64']

        # Generic case.
        # The problems are accumulated based on the methods
        # and the mixins of the class.
        res = []
        if hasattr(model, 'transform'):
            if issubclass(model, (RegressorMixin, ClassifierMixin)):
                res.extend(['num+y-tr'])
            elif issubclass(model, (ClusterMixin, BiclusterMixin)):
                res.extend(['~num-tr-clu', '~num-tr-clu-64'])
            else:
                res.extend(['num-tr'])

        if hasattr(model, 'predict') and issubclass(model, (ClusterMixin, BiclusterMixin)):
            res.extend(['cluster', '~b-clu-64'])

        if issubclass(model, (OutlierMixin)):
            res.extend(['outlier'])

        if issubclass(model, ClassifierMixin):
            # OneVsRestClassifier returns immediately and drops
            # whatever was added to *res* before.
            if model is OneVsRestClassifier:
                return ['m-cl', '~m-label']
            res.extend(['b-cl', '~b-cl-64', 'm-cl', '~m-label'])
        if issubclass(model, RegressorMixin):
            res.extend(['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64'])
        if issubclass(model, BaseMixture):
            res.extend(['mix', '~mix-64'])

        if len(res) > 0:
            return res

        raise RuntimeError("Unable to find problem for model '{}' - {}."
                           "".format(model.__name__, model.__bases__))

    res = _internal(model)
    # Every selected problem must be registered in _problems.
    for r in res:
        if r not in _problems:
            raise ValueError(  # pragma: no cover
                "Unrecognized problem '{}' in\n{}".format(
                    r, "\n".join(sorted(_problems))))
    return res

844 

845 

# Registry of validation problems: every key is a short problem identifier and
# every value is a callable building the data for that problem.
# Keys prefixed with "~" are the variants listed after the standard problems.
_problems = {
    # --- standard problems -------------------------------------------------
    "b-cl": _problem_for_predictor_binary_classification,
    "m-cl": _problem_for_predictor_multi_classification,
    "b-reg": _problem_for_predictor_regression,
    "m-reg": _problem_for_predictor_multi_regression,
    "num-tr": _problem_for_numerical_transform,
    "num-tr-pos": _problem_for_numerical_transform_positive,
    "outlier": _problem_for_outlier,
    "cluster": _problem_for_clustering,
    "num+y-tr": _problem_for_numerical_trainable_transform,
    "num+y-tr-cl": _problem_for_numerical_trainable_transform_cl,
    "mix": _problem_for_mixture,

    # --- other problems ----------------------------------------------------
    "~num-tr-clu": _problem_for_clustering_scores,
    "~m-label": _problem_for_predictor_multi_classification_label,
    "~scoring": _problem_for_numerical_scoring,
    "~b-cl-nop": _problem_for_clnoproba_binary,
    "~m-cl-nop": _problem_for_clnoproba,
    "~b-cl-dec": _problem_for_cl_decision_function_binary,
    "~m-cl-dec": _problem_for_cl_decision_function,

    # --- variants built with add_nan=True ----------------------------------
    "~b-reg-nan": lambda n_features=None: _problem_for_predictor_regression(
        n_features=n_features, add_nan=True),
    "~b-reg-nan-64": lambda n_features=None: _problem_for_predictor_regression(
        dtype=numpy.float64, n_features=n_features, add_nan=True),
    # Only entry whose wrapper also exposes *dtype* to the caller.
    "~b-cl-nan": lambda dtype=numpy.float32, n_features=None:
        _problem_for_predictor_binary_classification(
            dtype=dtype, n_features=n_features, add_nan=True),

    # --- 100 features: a falsy n_features falls back to 100 ----------------
    "~b-reg-f100": lambda n_features=100: _problem_for_predictor_regression(
        n_features=n_features or 100),
    "~b-cl-f100": lambda n_features=100:
        _problem_for_predictor_binary_classification(
            n_features=n_features or 100),

    # --- same generators called with dtype=numpy.float64 -------------------
    "~b-cl-64": lambda n_features=None:
        _problem_for_predictor_binary_classification(
            dtype=numpy.float64, n_features=n_features),
    "~b-reg-64": lambda n_features=None: _problem_for_predictor_regression(
        dtype=numpy.float64, n_features=n_features),
    # NOTE(review): "~b-cl-nop" maps to _problem_for_clnoproba_binary whereas
    # this float64 variant calls _problem_for_clnoproba; confirm it is
    # intended before changing it.
    "~b-cl-nop-64": lambda n_features=None: _problem_for_clnoproba(
        dtype=numpy.float64, n_features=n_features),
    "~b-clu-64": lambda n_features=None: _problem_for_clustering(
        dtype=numpy.float64, n_features=n_features),
    "~b-cl-dec-64": lambda n_features=None:
        _problem_for_cl_decision_function_binary(
            dtype=numpy.float64, n_features=n_features),
    "~num-tr-clu-64": lambda n_features=None: _problem_for_clustering_scores(
        dtype=numpy.float64, n_features=n_features),
    "~m-reg-64": lambda n_features=None:
        _problem_for_predictor_multi_regression(
            dtype=numpy.float64, n_features=n_features),
    "~num-tr-64": lambda n_features=None: _problem_for_numerical_transform(
        dtype=numpy.float64, n_features=n_features),
    "~mix-64": lambda n_features=None: _problem_for_mixture(
        dtype=numpy.float64, n_features=n_features),

    # --- "NF" variants: the generator's tuple gets a trailing False --------
    # (the meaning of that extra element is defined by the consumer of the
    # tuple, which is not visible here)
    "~b-cl-NF": lambda n_features=None:
        _problem_for_predictor_binary_classification(
            n_features=n_features) + (False, ),
    "~m-cl-NF": lambda n_features=None:
        _problem_for_predictor_multi_classification(
            n_features=n_features) + (False, ),
    "~b-reg-NF": lambda n_features=None: _problem_for_predictor_regression(
        n_features=n_features) + (False, ),
    "~m-reg-NF": lambda n_features=None:
        _problem_for_predictor_multi_regression(
            n_features=n_features) + (False, ),

    # --- "NF" variants with dtype=numpy.float64 ----------------------------
    "~b-cl-NF-64": lambda n_features=None:
        _problem_for_predictor_binary_classification(
            dtype=numpy.float64, n_features=n_features) + (False, ),
    "~m-cl-NF-64": lambda n_features=None:
        _problem_for_predictor_multi_classification(
            dtype=numpy.float64, n_features=n_features) + (False, ),
    "~b-reg-NF-64": lambda n_features=None: _problem_for_predictor_regression(
        dtype=numpy.float64, n_features=n_features) + (False, ),
    "~m-reg-NF-64": lambda n_features=None:
        _problem_for_predictor_multi_regression(
            dtype=numpy.float64, n_features=n_features) + (False, ),

    # --- GaussianProcess: return_cov, with trailing False ------------------
    "~b-reg-NF-cov-64": lambda n_features=None:
        _problem_for_predictor_regression(
            True, options={GaussianProcessRegressor: {"return_cov": True}},
            return_cov=True, dtype=numpy.float64,
            n_features=n_features) + (False, ),
    "~m-reg-NF-cov-64": lambda n_features=None:
        _problem_for_predictor_multi_regression(
            True, options={GaussianProcessRegressor: {"return_cov": True}},
            return_cov=True, dtype=numpy.float64,
            n_features=n_features) + (False, ),

    # --- GaussianProcess: return_std, with trailing False ------------------
    "~b-reg-NF-std-64": lambda n_features=None:
        _problem_for_predictor_regression(
            True, options={GaussianProcessRegressor: {"return_std": True}},
            return_std=True, dtype=numpy.float64,
            n_features=n_features) + (False, ),
    "~m-reg-NF-std-64": lambda n_features=None:
        _problem_for_predictor_multi_regression(
            True, options={GaussianProcessRegressor: {"return_std": True}},
            return_std=True, dtype=numpy.float64,
            n_features=n_features) + (False, ),

    # --- GaussianProcess: return_cov ---------------------------------------
    "~b-reg-cov-64": lambda n_features=None:
        _problem_for_predictor_regression(
            True, options={GaussianProcessRegressor: {"return_cov": True}},
            return_cov=True, dtype=numpy.float64, n_features=n_features),
    "~m-reg-cov-64": lambda n_features=None:
        _problem_for_predictor_multi_regression(
            True, options={GaussianProcessRegressor: {"return_cov": True}},
            return_cov=True, dtype=numpy.float64, n_features=n_features),

    # --- GaussianProcess: return_std ---------------------------------------
    # NOTE(review): this key has no "b-" prefix unlike "~b-reg-cov-64";
    # it is kept unchanged because problems are looked up by key.
    "~reg-std-64": lambda n_features=None:
        _problem_for_predictor_regression(
            True, options={GaussianProcessRegressor: {"return_std": True}},
            return_std=True, dtype=numpy.float64, n_features=n_features),
    "~m-reg-std-64": lambda n_features=None:
        _problem_for_predictor_multi_regression(
            True, options={GaussianProcessRegressor: {"return_std": True}},
            return_std=True, dtype=numpy.float64, n_features=n_features),

    # --- "NSV" variants: float64 generators wrapped by _noshapevar ---------
    "~b-reg-NSV-64": _noshapevar(
        lambda n_features=None: _problem_for_predictor_regression(
            dtype=numpy.float64, n_features=n_features)),
    "~m-reg-NSV-64": _noshapevar(
        lambda n_features=None: _problem_for_predictor_multi_regression(
            dtype=numpy.float64, n_features=n_features)),
    "~b-reg-std-NSV-64": _noshapevar(
        lambda n_features=None: _problem_for_predictor_regression(
            True, options={GaussianProcessRegressor: {"return_std": True}},
            return_std=True, dtype=numpy.float64, n_features=n_features)),
    "~m-reg-std-NSV-64": _noshapevar(
        lambda n_features=None: _problem_for_predictor_multi_regression(
            True, options={GaussianProcessRegressor: {"return_std": True}},
            return_std=True, dtype=numpy.float64, n_features=n_features)),

    # --- isotonic: generators wrapped by _1d_problem -----------------------
    "~b-reg-1d": _1d_problem(_problem_for_predictor_regression),
    "~num+y-tr-1d": _1d_problem(_problem_for_numerical_trainable_transform),

    # --- text --------------------------------------------------------------
    "key-int-col": _problem_for_dict_vectorizer,
    "key-str-col": _problem_for_feature_hasher,
    "int-col": _problem_for_label_encoder,
    "one-hot": _problem_for_one_hot_encoder,
    "text-col": _problem_for_tfidf_vectorizer,
    "bow": _problem_for_tfidf_transformer,
}