Note

Click here to download the full example code

Benchmark onnxruntime optimization¶

onnxruntime optimizes the ONNX graph before running inference. For example, it tries to fuse a matrix multiplication that follows, or is followed by, a transpose, choosing the most efficient path.

One ONNX file
Building the model
Functions
Benchmark
Graph

One ONNX file ¶

This section creates an ONNX graph if one does not already exist.

import os
from collections import OrderedDict, Counter
import numpy
import onnx
from cpyquickhelper.numbers.speed_measure import measure_time
import pandas
from onnxruntime import InferenceSession, SessionOptions, get_device
from onnxruntime.capi._pybind_state import (  # pylint: disable=E0611
    SessionIOBinding, OrtDevice as C_OrtDevice, OrtValue as C_OrtValue,
    GraphOptimizationLevel)
from sklearn.neighbors import RadiusNeighborsRegressor
from skl2onnx import to_onnx
from tqdm import tqdm
from mlprodict.testing.experimental_c_impl.experimental_c import code_optimisation

Optimisations available on this machine.

# Ask mlprodict's experimental C extension which optimisations it reports
# for this machine, then display the answer.
optimisations = code_optimisation()
print(optimisations)

Out:

AVX-omp=8

Building the model ¶

filename = "onnx_to_profile.onnx"

# The model is trained and converted only when the ONNX file is missing,
# so an existing file is reused as is.
if not os.path.exists(filename):
    print("Generate a graph for %r." % filename)
    # 1000 random rows of 10 features; the target is the sum of each row,
    # kept as a column vector of shape (1000, 1).
    X = numpy.random.randn(1000, 10).astype(numpy.float64)
    y = X.sum(axis=1, keepdims=True)

    model = RadiusNeighborsRegressor().fit(X, y)
    # NOTE(review): the 'cdist' option presumably asks the converter to use
    # a CDist node (one appears in the operator table below) - confirm.
    onx = to_onnx(model, X, options={'optim': 'cdist'})

    with open(filename, "wb") as f:
        f.write(onx.SerializeToString())

Functions ¶

We need to generate random inputs to test the graph.

def random_input(typ, shape, batch):
    """
    Creates a random array matching an ONNX input description.

    :param typ: ONNX type string, ``'tensor(double)'`` or
        ``'tensor(float)'``
    :param shape: sequence of dimensions; the first one may be unknown
        (``None`` or a symbolic name)
    :param batch: value used for the first dimension when it is unknown
    :return: numpy array drawn from a standard normal distribution
    :raises NotImplementedError: if *typ* is not a supported tensor type
    """
    if typ == 'tensor(double)':
        dtype = numpy.float64
    elif typ == 'tensor(float)':
        dtype = numpy.float32
    else:
        raise NotImplementedError(
            "Unable to guess dtype from %r." % typ)

    new_shape = tuple(shape)
    # An unknown leading dimension (None or a symbolic name such as 'N')
    # is replaced by the batch size, whatever the rank of the tensor.
    if new_shape and not isinstance(new_shape[0], (int, numpy.integer)):
        new_shape = (batch,) + new_shape[1:]
    # randn() without arguments returns a Python float: asarray turns it
    # into a 0-d array so that scalar inputs are supported as well.
    return numpy.asarray(numpy.random.randn(*new_shape)).astype(dtype)


def random_feed(sess, batch=10):
    """
    Builds one random array per input of an inference session.

    :param sess: object exposing ``get_inputs()``, each returned item
        having attributes ``name``, ``type`` and ``shape``
    :param batch: dimension to use as batch dimension if unknown
    :return: ordered dictionary ``{input name: random array}``,
        in the order returned by ``get_inputs()``
    """
    return OrderedDict(
        (node.name, random_input(node.type, node.shape, batch))
        for node in sess.get_inputs())

A function which calls the API for any device.

def run_with_iobinding(sess, bind, ort_device, feed_ort_value, outputs):
    """
    Runs a session through an IO binding and returns the outputs.

    :param sess: session wrapper; the run goes through ``sess._sess``
    :param bind: binding object receiving the inputs and outputs
    :param ort_device: device passed to every ``bind_input`` and
        ``bind_output`` call
    :param feed_ort_value: dictionary ``{name: (value, dtype)}`` where
        *value* exposes ``shape()`` and ``data_ptr()``
    :param outputs: names of the outputs to bind
    :return: list with ``numpy()`` called on every bound output
    """
    for input_name in feed_ort_value:
        ort_value, elem_type = feed_ort_value[input_name]
        dims = ort_value.shape()
        address = ort_value.data_ptr()
        bind.bind_input(input_name, ort_device, elem_type, dims, address)
    for output_name in outputs:
        bind.bind_output(output_name, ort_device)
    sess._sess.run_with_iobinding(bind, None)
    return [result.numpy() for result in bind.get_outputs()]

Benchmark ¶

Let’s choose the device available on this machine. The batch dimension is set to 200.

batch = 200

# A single flag drives both the device object and the execution provider,
# so the two can never disagree.
on_gpu = get_device().upper() == 'GPU'
device_type = C_OrtDevice.cuda() if on_gpu else C_OrtDevice.cpu()
provider = 'CUDAExecutionProvider' if on_gpu else 'CPUExecutionProvider'
ort_device = C_OrtDevice(device_type, C_OrtDevice.default_memory(), 0)
print("provider = %r" % provider)

Out:

provider = 'CPUExecutionProvider'

We load the graph.

# Parse the ONNX file into the model object ``onx``; the sessions created
# below are built from its serialized form.
with open(filename, 'rb') as f:
    onx = onnx.load(f)

Creation of one session per optimization level, followed by the benchmark.

data = []
files = []
legend = []

# The serialized model and the base file name are identical for every
# optimization level: compute them once instead of once per iteration.
onx_bytes = onx.SerializeToString()
model_name = os.path.split(filename)[-1]

for graph_opt, name_opt in tqdm([
        (GraphOptimizationLevel.ORT_DISABLE_ALL, "ORT_DISABLE_ALL"),
        (GraphOptimizationLevel.ORT_ENABLE_BASIC, "ORT_ENABLE_BASIC"),
        (GraphOptimizationLevel.ORT_ENABLE_EXTENDED, "ORT_ENABLE_EXTENDED"),
        (GraphOptimizationLevel.ORT_ENABLE_ALL, "ORT_ENABLE_ALL")]):

    # One session per optimization level. The optimized graph is written
    # to its own file so that the graphs can be compared afterwards.
    so = SessionOptions()
    so.graph_optimization_level = graph_opt
    so.optimized_model_filepath = (
        model_name + ".optimized.%s.onnx" % name_opt)
    files.append(so.optimized_model_filepath)
    legend.append(name_opt)
    sess = InferenceSession(onx_bytes, so, providers=[provider])
    bind = SessionIOBinding(sess._sess)

    #####################################
    # Creates random data
    feed = random_feed(sess, batch)

    #####################################
    # moving the data on CPU or GPU
    feed_ort_value = OrderedDict(
        (name, (C_OrtValue.ortvalue_from_numpy(v, ort_device), v.dtype))
        for name, v in feed.items())
    outputs = [o.name for o in sess.get_outputs()]

    #######################################
    # The profiling.
    # The lambda is called inside this iteration only, so it always sees
    # the session and bindings of the current optimization level.

    obs = measure_time(
        lambda: run_with_iobinding(
            sess, bind, ort_device, feed_ort_value, outputs),
        context=dict(run_with_iobinding=run_with_iobinding,
                     feed_ort_value=feed_ort_value, outputs=outputs,
                     sess=sess, bind=bind, ort_device=ort_device),
        repeat=10, number=10, div_by_number=True)
    obs['name'] = name_opt
    data.append(obs)


# One row per optimization level.
df = pandas.DataFrame(data)
df

Out:

  0%|          | 0/4 [00:00<?, ?it/s]
 25%|##5       | 1/4 [00:03<00:09,  3.08s/it]
 50%|#####     | 2/4 [00:06<00:06,  3.05s/it]
 75%|#######5  | 3/4 [00:09<00:03,  3.04s/it]
100%|##########| 4/4 [00:12<00:00,  3.04s/it]
100%|##########| 4/4 [00:12<00:00,  3.04s/it]

	average	deviation	min_exec	max_exec	repeat	number	ttime	context_size	name
0	0.030690	0.000058	0.030657	0.030861	10	10	0.306897	360	ORT_DISABLE_ALL
1	0.030198	0.000008	0.030178	0.030206	10	10	0.301984	360	ORT_ENABLE_BASIC
2	0.030185	0.000023	0.030165	0.030241	10	10	0.301846	360	ORT_ENABLE_EXTENDED
3	0.030152	0.000008	0.030142	0.030172	10	10	0.301523	360	ORT_ENABLE_ALL

Graph ¶

# Index the timings by optimization level, then give the deviations the
# column name 'average' so they can be passed as error bars for the
# plotted 'average' column.
df = df.set_index('name')
dev = df[['deviation']].rename(columns={'deviation': 'average'})
ax = df[['average']].plot.bar(yerr=dev)
ax.set_title(os.path.split(filename)[-1])
ax.tick_params(axis='x', labelrotation=15)

The results are similar because the optimized models are very similar.

# Count the operator types found in each optimized graph saved on disk.
data = []
for name in files:
    with open(name, "rb") as f:
        onx = onnx.load(f)
    data.append(Counter(node.op_type for node in onx.graph.node))

# Rows are labelled with the optimization levels, then transposed:
# one row per operator type, one column per optimization level.
df = pandas.DataFrame(data, index=legend).T
df

	ORT_DISABLE_ALL	ORT_ENABLE_BASIC	ORT_ENABLE_EXTENDED	ORT_ENABLE_ALL
CDist	1	1	1	1
Shape	2	2	2	2
Less	1	1	1	1
Cast	3	2	2	2
ConstantOfShape	1	1	1	1
ReduceSum	2	2	2	2
CumSum	1	1	1	1
Neg	1	1	1	1
Add	1	1	1	1
Where	1	1	1	1
Flatten	1	1	1	1
ArrayFeatureExtractor	1	1	1	1
Reshape	3	3	3	3
Mul	1	1	1	1
Div	1	1	1	1

Graph.

# Operator counts are exact values: the timing deviations stored in `dev`
# (indexed by optimization level) do not apply to this table, so no error
# bars are requested.
ax = df.plot.barh()
ax.set_title(os.path.split(filename)[-1])

# Uncomment to display the figures when running as a plain script.
# import matplotlib.pyplot as plt
# plt.show()

Out:

Text(0.5, 1.0, 'onnx_to_profile.onnx')

Total running time of the script: ( 0 minutes 13.603 seconds)

Gallery generated by Sphinx-Gallery

Benchmark inference for a linear regression

Profiling

Benchmark onnxruntime optimization¶

One ONNX file¶

Building the model¶

Functions¶

Benchmark¶

Graph¶

One ONNX file ¶

Building the model ¶

Functions ¶

Benchmark ¶

Graph ¶