Benchmark onnxruntime optimization

onnxruntime optimizes the ONNX graph before running inference. For example, it tries to fuse a matrix multiplication that follows or is followed by a transpose, choosing the most efficient path.

One ONNX file

This section creates an ONNX graph if one does not already exist.

import os
from collections import OrderedDict, Counter
import numpy
import onnx
from cpyquickhelper.numbers.speed_measure import measure_time
import pandas
from onnxruntime import InferenceSession, SessionOptions, get_device
from onnxruntime.capi._pybind_state import (  # pylint: disable=E0611
    SessionIOBinding, OrtDevice as C_OrtDevice, OrtValue as C_OrtValue,
    GraphOptimizationLevel)
from sklearn.neighbors import RadiusNeighborsRegressor
from skl2onnx import to_onnx
from tqdm import tqdm
from mlprodict.testing.experimental_c_impl.experimental_c import code_optimisation

Available optimisations on this machine.

# Displays the string returned by code_optimisation() (imported from
# mlprodict.testing.experimental_c_impl.experimental_c).
print(code_optimisation())

Out:

AVX-omp=8

Building the model

filename = "onnx_to_profile.onnx"

# The model is trained and converted only when the ONNX file is missing,
# an existing file is reused as is.
if not os.path.exists(filename):
    print("Generate a graph for %r." % filename)
    # Random regression problem: the target is the sum of the features,
    # stored as a column vector.
    X = numpy.random.randn(1000, 10).astype(numpy.float64)
    y = X.sum(axis=1, keepdims=True)

    model = RadiusNeighborsRegressor().fit(X, y)
    # NOTE(review): option 'optim': 'cdist' presumably asks the converter
    # to rely on a CDist node to compute the pairwise distances.
    onx = to_onnx(model, X, options={'optim': 'cdist'})

    with open(filename, "wb") as f:
        f.write(onx.SerializeToString())

Functions

We need to generate random inputs to test the graph.

def random_input(typ, shape, batch):
    """
    Creates a random array for one input of an ONNX graph.

    :param typ: type of the input as a string, either
        ``'tensor(double)'`` or ``'tensor(float)'``
    :param shape: expected shape of the input, the first dimension
        may be unknown (``None`` or a symbolic name given as a string)
    :param batch: value to use for the first dimension if it is unknown
    :return: numpy array of the requested type filled with values
        drawn by ``numpy.random.randn``
    :raises NotImplementedError: if *typ* is not one of the supported types
    """
    if typ == 'tensor(double)':
        dtype = numpy.float64
    elif typ == 'tensor(float)':
        dtype = numpy.float32
    else:
        raise NotImplementedError(
            "Unable to guess dtype from %r." % typ)

    new_shape = tuple(shape)
    # An unknown leading dimension (None or a symbolic name) is replaced
    # by the batch size, this must also happen for 1-D inputs.
    if new_shape and (new_shape[0] is None or
                      isinstance(new_shape[0], str)):
        new_shape = (batch, ) + new_shape[1:]
    # randn() called without any dimension returns a python float,
    # asarray guarantees an array (0-d) for scalar inputs.
    return numpy.asarray(numpy.random.randn(*new_shape)).astype(dtype)


def random_feed(sess, batch=10):
    """
    Builds random data for every input of an inference session.

    :param sess: object exposing ``get_inputs()``, every returned input
        has attributes ``name``, ``type`` and ``shape``
    :param batch: dimension to use as batch dimension if unknown
    :return: ordered dictionary ``{input name: random array}``,
        inputs are kept in the order returned by ``get_inputs()``
    """
    return OrderedDict(
        (node.name, random_input(node.type, node.shape, batch))
        for node in sess.get_inputs())

A function which calls the API for any device.

def run_with_iobinding(sess, bind, ort_device, feed_ort_value, outputs):
    """
    Binds inputs and outputs to a device, runs the session through
    the binding and returns the outputs.

    :param sess: session, the call goes through ``sess._sess``
    :param bind: binding object receiving the inputs and the outputs
    :param ort_device: device every input and output is bound to
    :param feed_ort_value: dictionary ``{input name: (value, dtype)}``,
        *value* must expose ``shape()`` and ``data_ptr()``
    :param outputs: names of the outputs to bind
    :return: list with the result of ``numpy()`` for every value
        returned by ``bind.get_outputs()``
    """
    for input_name in feed_ort_value:
        ort_value, elem_type = feed_ort_value[input_name]
        dims = ort_value.shape()
        address = ort_value.data_ptr()
        bind.bind_input(input_name, ort_device, elem_type, dims, address)
    for output_name in outputs:
        bind.bind_output(output_name, ort_device)
    # The private C session is called directly with the binding.
    sess._sess.run_with_iobinding(bind, None)
    return list(map(lambda ort_out: ort_out.numpy(), bind.get_outputs()))

Benchmark

Let’s choose the device available on this machine. The batch dimension is set to 200.

batch = 200

# The execution provider and the device holding the data must agree:
# CUDA is only selected when onnxruntime reports a GPU device,
# everything stays on CPU otherwise.
if get_device().upper() != 'GPU':
    provider = 'CPUExecutionProvider'
    ort_device = C_OrtDevice(
        C_OrtDevice.cpu(), C_OrtDevice.default_memory(), 0)
else:
    provider = 'CUDAExecutionProvider'
    ort_device = C_OrtDevice(
        C_OrtDevice.cuda(), C_OrtDevice.default_memory(), 0)
print("provider = %r" % provider)

Out:

provider = 'CPUExecutionProvider'

We load the graph.

# Reads the ONNX file back into *onx*, whether it was generated by this
# script or was already on disk.
with open(filename, 'rb') as f:
    onx = onnx.load(f)

Creation of the sessions, one per optimization level.

data = []
files = []
legend = []

# The serialized graph does not depend on the optimization level,
# it is computed once and shared by every session.
onx_bytes = onx.SerializeToString()

for graph_opt, name_opt in tqdm([
        (GraphOptimizationLevel.ORT_DISABLE_ALL, "ORT_DISABLE_ALL"),
        (GraphOptimizationLevel.ORT_ENABLE_BASIC, "ORT_ENABLE_BASIC"),
        (GraphOptimizationLevel.ORT_ENABLE_EXTENDED, "ORT_ENABLE_EXTENDED"),
        (GraphOptimizationLevel.ORT_ENABLE_ALL, "ORT_ENABLE_ALL")]):

    # One session per optimization level. The name stored in
    # optimized_model_filepath is kept in *files*: these files are
    # loaded again later to compare the graphs.
    so = SessionOptions()
    so.graph_optimization_level = graph_opt
    so.optimized_model_filepath = (
        os.path.basename(filename) + ".optimized.%s.onnx" % name_opt)
    files.append(so.optimized_model_filepath)
    legend.append(name_opt)
    sess = InferenceSession(onx_bytes, so, providers=[provider])
    bind = SessionIOBinding(sess._sess)

    # Creates random data.
    feed = random_feed(sess, batch)

    # Moves the data to the selected device (CPU or GPU), the numpy
    # dtype is kept next to every value as bind_input requires it.
    feed_ort_value = OrderedDict(
        (name, (C_OrtValue.ortvalue_from_numpy(v, ort_device), v.dtype))
        for name, v in feed.items())
    outputs = [o.name for o in sess.get_outputs()]

    # The profiling: the same inputs are used for every call,
    # the measure is tagged with the name of the optimization level.
    obs = measure_time(
        lambda: run_with_iobinding(
            sess, bind, ort_device, feed_ort_value, outputs),
        context=dict(run_with_iobinding=run_with_iobinding,
                     feed_ort_value=feed_ort_value, outputs=outputs,
                     sess=sess, bind=bind, ort_device=ort_device),
        repeat=10, number=10, div_by_number=True)
    obs['name'] = name_opt
    data.append(obs)


# One row per optimization level.
df = pandas.DataFrame(data)
df

Out:

  0%|          | 0/4 [00:00<?, ?it/s]
 25%|##5       | 1/4 [00:03<00:09,  3.08s/it]
 50%|#####     | 2/4 [00:06<00:06,  3.05s/it]
 75%|#######5  | 3/4 [00:09<00:03,  3.04s/it]
100%|##########| 4/4 [00:12<00:00,  3.04s/it]
100%|##########| 4/4 [00:12<00:00,  3.04s/it]
average deviation min_exec max_exec repeat number ttime context_size name
0 0.030690 0.000058 0.030657 0.030861 10 10 0.306897 360 ORT_DISABLE_ALL
1 0.030198 0.000008 0.030178 0.030206 10 10 0.301984 360 ORT_ENABLE_BASIC
2 0.030185 0.000023 0.030165 0.030241 10 10 0.301846 360 ORT_ENABLE_EXTENDED
3 0.030152 0.000008 0.030142 0.030172 10 10 0.301523 360 ORT_ENABLE_ALL


Graph

df = df.set_index('name')
# The error bars are given as a DataFrame whose column has the same name
# as the plotted column, hence 'deviation' renamed into 'average'.
dev = df[['deviation']].rename(columns={'deviation': 'average'})
ax = df[['average']].plot.bar(yerr=dev)
ax.set_title(os.path.split(filename)[-1])
ax.tick_params(axis='x', labelrotation=15)
onnx_to_profile.onnx

The results are similar because the optimized models are very similar.

# Counts the nodes of every type in each optimized graph saved on disk.
data = []
for name in files:
    with open(name, "rb") as f:
        onx = onnx.load(f)
    op_names = [op.op_type for op in onx.graph.node]
    data.append(Counter(op_names))

# One row per node type, one column per optimization level. A node type
# missing from one graph would give NaN and turn the counts into floats,
# it is replaced by 0 to keep integer counts.
df = pandas.DataFrame(data).T.fillna(0).astype(int)
df.columns = legend
df
ORT_DISABLE_ALL ORT_ENABLE_BASIC ORT_ENABLE_EXTENDED ORT_ENABLE_ALL
CDist 1 1 1 1
Shape 2 2 2 2
Less 1 1 1 1
Cast 3 2 2 2
ConstantOfShape 1 1 1 1
ReduceSum 2 2 2 2
CumSum 1 1 1 1
Neg 1 1 1 1
Add 1 1 1 1
Where 1 1 1 1
Flatten 1 1 1 1
ArrayFeatureExtractor 1 1 1 1
Reshape 3 3 3 3
Mul 1 1 1 1
Div 1 1 1 1


Graph.

# No error bar here: *dev* holds the deviations of the execution times,
# it is indexed by optimization level and does not describe the node
# counts plotted below.
ax = df.plot.barh()
ax.set_title(os.path.split(filename)[-1])

# import matplotlib.pyplot as plt
# plt.show()
onnx_to_profile.onnx

Out:

Text(0.5, 1.0, 'onnx_to_profile.onnx')

Total running time of the script: ( 0 minutes 13.603 seconds)

Gallery generated by Sphinx-Gallery