Benchmark Random Forests, Tree Ensemble, Multi-Classification

The script compares different implementations of the operator TreeEnsembleClassifier on a multi-class classification problem. It replicates the benchmark "Benchmark Random Forests, Tree Ensemble, (AoS and SoA)" for multi-classification.


import warnings
from time import perf_counter as time
from multiprocessing import cpu_count
import numpy
from numpy.random import rand
from numpy.testing import assert_almost_equal
import pandas
import matplotlib.pyplot as plt
from sklearn import config_context
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils._testing import ignore_warnings
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
from onnxruntime import InferenceSession
from mlprodict.onnxrt import OnnxInference

Available optimisation on this machine.

from mlprodict.testing.experimental_c_impl.experimental_c import code_optimisation




def version():
    """Return the versions of the libraries used by the benchmark.

    Returns:
        pandas.DataFrame: one row per entry, with the columns ``name`` and
        ``version``. The first row (``date``) holds the current local
        timestamp, the remaining rows the ``__version__`` of numpy,
        scikit-learn, onnx, onnxruntime, skl2onnx and mlprodict.
    """
    from datetime import datetime
    import sklearn
    import numpy
    import onnx
    import onnxruntime
    import skl2onnx
    import mlprodict
    df = pandas.DataFrame([
        # str(datetime) yields e.g. "2022-04-05 06:13:50.173727"
        {"name": "date", "version": str(datetime.now())},
        {"name": "numpy", "version": numpy.__version__},
        {"name": "scikit-learn", "version": sklearn.__version__},
        {"name": "onnx", "version": onnx.__version__},
        {"name": "onnxruntime", "version": onnxruntime.__version__},
        {"name": "skl2onnx", "version": skl2onnx.__version__},
        {"name": "mlprodict", "version": mlprodict.__version__},
    ])
    return df

name version
0 date 2022-04-05 06:13:50.173727
1 numpy 1.21.5
2 scikit-learn 1.0.2
3 onnx 1.11.0
4 onnxruntime 1.11.0
5 skl2onnx 1.11.1
6 mlprodict 0.8.1762

Implementations to benchmark

def fcts_model(X, y, max_depth, n_estimators, n_jobs):
    """Train a random forest and build the prediction functions to compare.

    The fitted forest is converted to ONNX with the ``zipmap`` option
    disabled, then loaded once into onnxruntime and three times into the
    mlprodict python runtime. Each mlprodict instance re-initialises its
    first node with a different second argument (1, 2 or 3).
    NOTE(review): that argument presumably selects the tree-ensemble
    implementation variant - confirm against mlprodict.

    Args:
        X: training features; ``X.shape[1]`` fixes the ONNX input width.
        y: training labels.
        max_depth: maximum depth of every tree.
        n_estimators: number of trees in the forest.
        n_jobs: number of jobs handed to scikit-learn.

    Returns:
        dict: ``{'predict': (skl, onnxruntime, mlprodict1, mlprodict2,
        mlprodict3)}``, five callables that each take a feature matrix.
    """
    rf = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators,
                                n_jobs=n_jobs)
    rf.fit(X, y)

    initial_types = [('X', FloatTensorType([None, X.shape[1]]))]
    onx = convert_sklearn(rf, initial_types=initial_types,
                          options={id(rf): {'zipmap': False}})
    sess = InferenceSession(onx.SerializeToString())
    outputs = [o.name for o in sess.get_outputs()]
    # The mlprodict functions read the second declared output of the model.
    name = outputs[1]

    oinf = OnnxInference(onx, runtime="python")
    oinf.sequence_[0].ops_._init(numpy.float32, 1)
    oinf2 = OnnxInference(onx, runtime="python")
    oinf2.sequence_[0].ops_._init(numpy.float32, 2)
    oinf3 = OnnxInference(onx, runtime="python")
    oinf3.sequence_[0].ops_._init(numpy.float32, 3)

    # The objects are bound as default arguments so that every function
    # keeps a direct reference to the runtime it measures.
    def predict_skl_predict(X, model=rf):
        return model.predict_proba(X)

    def predict_onnxrt_predict(X, sess=sess):
        # Only the first declared output is requested from onnxruntime.
        return sess.run(outputs[:1], {'X': X})[0]

    def predict_onnx_inference(X, oinf=oinf):
        return oinf.run({'X': X})[name]

    def predict_onnx_inference2(X, oinf2=oinf2):
        return oinf2.run({'X': X})[name]

    def predict_onnx_inference3(X, oinf3=oinf3):
        return oinf3.run({'X': X})[name]

    return {'predict': (
        predict_skl_predict, predict_onnxrt_predict,
        predict_onnx_inference, predict_onnx_inference2,
        predict_onnx_inference3)}


def allow_configuration(**kwargs):
    """Tell whether a benchmark configuration should be measured.

    The keyword arguments describing the configuration are accepted but
    ignored: every configuration is allowed.

    Returns:
        bool: always ``True``.
    """
    return True

def _average_call_time(fct, inputs, max_calls):
    """Call *fct* on at most *max_calls* inputs and average the duration.

    Returns:
        tuple: ``(average seconds per call, output of the last call)``.
    """
    begin = time()
    calls = 0
    output = None
    for X in inputs:
        output = fct(X)
        calls += 1
        if calls >= max_calls:
            break
    return (time() - begin) / calls, output


def bench(n_obs, n_features, max_depths, n_estimatorss, n_jobss,
          methods, repeat=10, verbose=False):
    """Measure the prediction time of every implementation on a grid.

    For each number of features a random training set of 50000 rows with
    four classes (0 to 3) is drawn, and one model is trained per
    combination of ``n_jobs``, ``max_depth`` and ``n_estimators``. Each
    model is then timed for every batch size and method.

    Args:
        n_obs: batch sizes to predict on.
        n_features: numbers of features to try.
        max_depths: tree depths to try.
        n_estimatorss: numbers of trees to try.
        n_jobss: values of ``n_jobs`` to try.
        methods: keys of the dictionary returned by ``fcts_model``.
        repeat: number of distinct input batches created per measure.
        verbose: print every observation as soon as it is available.

    Returns:
        list: one dictionary per measured configuration with the keys
        ``n_obs``, ``nfeat``, ``max_depth``, ``n_estimators``, ``method``,
        ``n_jobs``, ``time_skl``, ``time_ort``, ``time_mlprodict``,
        ``time_mlprodict2`` and ``time_mlprodict3`` (average seconds per
        call).

    Raises:
        ValueError: if ``repeat`` is lower than 1, since no call could be
            timed.
    """
    if repeat < 1:
        raise ValueError("repeat must be >= 1, got %r" % (repeat,))

    res = []
    for nfeat in n_features:

        ntrain = 50000
        X_train = rand(ntrain, nfeat).astype(numpy.float32)
        eps = rand(ntrain) - 0.5
        y_train_f = X_train.sum(axis=1) + eps
        # Four classes: 0 below 12, 1 above 12, 2 above 15 and 3 below 10.
        y_train = (y_train_f > 12).astype(numpy.int64)
        y_train[y_train_f > 15] = 2
        y_train[y_train_f < 10] = 3

        for n_jobs in n_jobss:
            for max_depth in max_depths:
                for n_estimators in n_estimatorss:
                    fcts = fcts_model(X_train, y_train,
                                      max_depth, n_estimators, n_jobs)

                    for n in n_obs:
                        for method in methods:

                            fct1, fct2, fct3, fct4, fct5 = fcts[method]

                            if not allow_configuration(
                                    n=n, nfeat=nfeat, max_depth=max_depth,
                                    n_estimator=n_estimators, n_jobs=n_jobs,
                                    method=method):
                                continue

                            obs = dict(n_obs=n, nfeat=nfeat,
                                       max_depth=max_depth,
                                       n_estimators=n_estimators,
                                       method=method,
                                       n_jobs=n_jobs)

                            # creates different inputs to avoid caching;
                            # float32 matches the FloatTensorType input
                            # declared for the ONNX model
                            Xs = [rand(n, nfeat).astype(numpy.float32)
                                  for _ in range(repeat)]

                            # measures the baseline
                            with config_context(assume_finite=True):
                                st = time()
                                repeated = 0
                                for X in Xs:
                                    p1 = fct1(X)
                                    repeated += 1
                                    if time() - st >= 1:
                                        break  # stops if longer than a second
                                end = time()
                                obs["time_skl"] = (end - st) / repeated

                            # measures the other implementations on the
                            # same number of batches as the baseline
                            candidates = (("time_ort", fct2),
                                          ("time_mlprodict", fct3),
                                          ("time_mlprodict2", fct4),
                                          ("time_mlprodict3", fct5))
                            for key, fct in candidates:
                                obs[key], p2 = _average_call_time(
                                    fct, Xs, repeated)

                            # final
                            res.append(obs)
                            if verbose:
                                print("bench", len(res), ":", obs)

                            # checks that the baseline and the last
                            # implementation produce the same outputs on
                            # the last batch; a mismatch only warns so the
                            # benchmark keeps running
                            if n <= 10000:
                                try:
                                    assert_almost_equal(
                                        p1.ravel(), p2.ravel(), decimal=5)
                                except AssertionError as e:
                                    warnings.warn(str(e))
    return res


def plot_rf_models(dfr):
    """Plot the speed-up of every engine against scikit-learn.

    A column ``speedup_<engine>`` (``time_skl / time_<engine>``) is added
    to *dfr* in place for every ``time_*`` column other than ``time_skl``.
    The figure has one row per engine and one column per combination of
    ``max_depth``, ``nfeat``, ``n_estimators`` and ``n_jobs``; each bar is
    the speed-up for one batch size.

    Args:
        dfr: pandas.DataFrame of observations as produced by ``bench``.

    Returns:
        tuple: ``(fig, ax)`` where ``ax`` is the last axes drawn.

    Raises:
        ValueError: if *dfr* has no ``time_*`` column besides ``time_skl``.
    """
    from itertools import product

    def autolabel(ax, rects):
        """Write the speed-up factor above every bar."""
        for rect in rects:
            height = rect.get_height()
            # NOTE(review): the original call may have carried one more
            # keyword (e.g. a font size) that was lost - confirm.
            ax.annotate('%1.1fx' % height,
                        xy=(rect.get_x() + rect.get_width() / 2, height),
                        xytext=(0, 3),  # 3 points vertical offset
                        textcoords="offset points",
                        ha='center', va='bottom')

    engines = [_.split('_')[-1] for _ in dfr.columns if _.startswith("time_")]
    engines = [_ for _ in engines if _ != 'skl']
    if not engines:
        raise ValueError(
            "dfr has no 'time_*' column to compare with 'time_skl'")
    for engine in engines:
        dfr["speedup_%s" % engine] = dfr["time_skl"] / dfr["time_%s" % engine]
    # Shows the last observations with one column per observation.
    print(dfr.tail().T)

    # One subplot column per configuration instead of a fixed count of 4.
    configs = list(product(sorted(set(dfr.max_depth)),
                           sorted(set(dfr.nfeat)),
                           sorted(set(dfr.n_estimators)),
                           sorted(set(dfr.n_jobs))))
    ncols = max(len(configs), 1)
    # squeeze=False keeps axs two-dimensional even with a single engine
    # or a single configuration, so axs[row, pos] is always valid.
    fig, axs = plt.subplots(len(engines), ncols, figsize=(
        3.5 * ncols, 4 * len(engines)), sharey=True, squeeze=False)

    ax = None
    for row, engine in enumerate(engines):
        name = "RandomForestClassifier - %s" % engine
        for pos, (max_depth, nf, est, n_jobs) in enumerate(configs):
            sub = dfr[(dfr.max_depth == max_depth) &
                      (dfr.nfeat == nf) &
                      (dfr.n_estimators == est) &
                      (dfr.n_jobs == n_jobs)]
            ax = axs[row, pos]
            labels = sub.n_obs
            means = sub["speedup_%s" % engine]

            x = numpy.arange(len(labels))
            width = 0.90

            rects1 = ax.bar(x, means, width, label='Speedup')
            if pos == 0:
                ax.set_ylim([0.1, max(dfr["speedup_%s" % engine])])
                # NOTE(review): the y-axis label is a reconstruction.
                ax.set_ylabel('Speedup')

            ax.set_title(
                '%s\ndepth %d - %d features\n %d estimators '
                '%d jobs' % (name, max_depth, nf, est, n_jobs))
            if row == len(engines) - 1:
                ax.set_xlabel('batch size')
            # One tick per bar, labelled with the batch size.
            ax.set_xticks(x)
            ax.set_xticklabels(labels)
            autolabel(ax, rects1)
            # NOTE(review): the per-tick loops lost their body; a small
            # label size is assumed - confirm the intended value.
            ax.tick_params(axis='both', which='major', labelsize=8)

    return fig, ax

Run benchmarks

def run_bench(repeat=100, verbose=False):
    """Run the benchmark on a fixed grid and return the measures.

    The grid covers batch sizes 1 to 10000, 30 features, depths 6 to 12,
    100 trees, as many jobs as ``cpu_count()`` reports and the
    ``'predict'`` method. The total duration is printed once the
    benchmark is over.

    Args:
        repeat: forwarded to ``bench``.
        verbose: forwarded to ``bench``.

    Returns:
        pandas.DataFrame: built from the list returned by ``bench``.
    """
    grid = dict(
        n_obs=[1, 10, 100, 1000, 10000],
        n_features=[30],
        max_depths=[6, 8, 10, 12],
        n_estimatorss=[100],
        n_jobss=[cpu_count()],
        methods=['predict'],
    )

    begin = time()
    measures = bench(repeat=repeat, verbose=verbose, **grid)
    elapsed = time() - begin

    frame = pandas.DataFrame(measures)
    print("Total time = %0.3f sec cpu=%d\n" % (elapsed, cpu_count()))
    return frame

# Runs the benchmark with progress output, saves the measures as CSV and
# Excel files, then draws the speed-up chart and saves it as a PNG.
# All three files share the same stem.
name = "plot_random_forest_cls_multi"
df = run_bench(verbose=True)
df.to_csv("%s.csv" % name, index=False)
df.to_excel("%s.xlsx" % name, index=False)
# plot_rf_models also adds the speedup_* columns to df
fig, ax = plot_rf_models(df)
fig.savefig("%s.png" % name)
RandomForestClassifier - ort depth 6 - 30 features  100 estimators 8 jobs, RandomForestClassifier - ort depth 8 - 30 features  100 estimators 8 jobs, RandomForestClassifier - ort depth 10 - 30 features  100 estimators 8 jobs, RandomForestClassifier - ort depth 12 - 30 features  100 estimators 8 jobs, RandomForestClassifier - mlprodict depth 6 - 30 features  100 estimators 8 jobs, RandomForestClassifier - mlprodict depth 8 - 30 features  100 estimators 8 jobs, RandomForestClassifier - mlprodict depth 10 - 30 features  100 estimators 8 jobs, RandomForestClassifier - mlprodict depth 12 - 30 features  100 estimators 8 jobs, RandomForestClassifier - mlprodict2 depth 6 - 30 features  100 estimators 8 jobs, RandomForestClassifier - mlprodict2 depth 8 - 30 features  100 estimators 8 jobs, RandomForestClassifier - mlprodict2 depth 10 - 30 features  100 estimators 8 jobs, RandomForestClassifier - mlprodict2 depth 12 - 30 features  100 estimators 8 jobs, RandomForestClassifier - mlprodict3 depth 6 - 30 features  100 estimators 8 jobs, RandomForestClassifier - mlprodict3 depth 8 - 30 features  100 estimators 8 jobs, RandomForestClassifier - mlprodict3 depth 10 - 30 features  100 estimators 8 jobs, RandomForestClassifier - mlprodict3 depth 12 - 30 features  100 estimators 8 jobs


Total time = 291.966 sec cpu=8

                            15         16         17         18        19
n_obs                        1         10        100       1000     10000
nfeat                       30         30         30         30        30
max_depth                   12         12         12         12        12
n_estimators               100        100        100        100       100
method                 predict    predict    predict    predict   predict
n_jobs                       8          8          8          8         8
time_skl              0.061098   0.066983   0.081959   0.115955  0.162307
time_ort              0.000176   0.001231   0.002462    0.01241  0.093968
time_mlprodict        0.004886    0.00143   0.007573   0.032509  0.255113
time_mlprodict2       0.000256   0.001713   0.004833   0.036906  0.300205
time_mlprodict3       0.000191   0.000977   0.001557   0.008751  0.058248
speedup_ort         347.593266  54.415797  33.295689   9.343991  1.727249
speedup_mlprodict     12.50479  46.838871  10.822188   3.566848  0.636214
speedup_mlprodict2  238.562945  39.110322  16.958438   3.141931  0.540652
speedup_mlprodict3  319.415312  68.570292  52.622911  13.250676  2.786474

Total running time of the script: ( 5 minutes 8.492 seconds)

