Benchmark and profile of operator Slice
This short script compares the execution time of the operator Slice in numpy and onnxruntime for three slicing configurations.
A simple example
import json
import numpy
from numpy.testing import assert_almost_equal
import pandas
from pandas import DataFrame
import matplotlib.pyplot as plt
from onnxruntime import InferenceSession, get_device, SessionOptions
from onnxruntime.capi._pybind_state import ( # pylint: disable=E0611
OrtValue as C_OrtValue)
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.algebra.onnx_ops import OnnxSlice, OnnxAdd, OnnxMul
from cpyquickhelper.numbers import measure_time
from tqdm import tqdm
from mlprodict.testing.experimental_c_impl.experimental_c import (
from mlprodict.onnxrt.ops_whole.session import OnnxWholeSession
from onnxcustom.utils.onnxruntime_helper import get_ort_device
print([code_optimisation(), get_device()])
['AVX-omp=8', 'CPU']
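The functions to compare. The function below builds an ONNX graph chaining Slice, Add, Slice and Mul, checks it against the equivalent numpy expression, and returns the ONNX model, the onnxruntime function, the numpy function and, when the device is a GPU, a function running onnxruntime on GPU.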
def build_ort_op(op_version=14, save=None, **kwargs): # opset=13, 14, ...
slices = kwargs['slices']
slice1, slice2 = slices
slice1 = slice(0, None) if slice1 is None else slice(*slice1)
slice2 = slice(0, None) if slice2 is None else slice(*slice2)
axes = []
starts = []
ends = []
for i in [0, 1]:
if slices[i] is None:
starts = numpy.array(starts, dtype=numpy.int64)
ends = numpy.array(ends, dtype=numpy.int64)
axes = numpy.array(axes, dtype=numpy.int64)
node1 = OnnxSlice('X', starts, ends, axes, op_version=op_version)
node2 = OnnxAdd(node1, numpy.array([1], dtype=numpy.float32),
node3 = OnnxSlice(node2, starts, ends, axes,
node4 = OnnxMul(node3, numpy.array([2], dtype=numpy.float32),
op_version=op_version, output_names=['Y'])
onx = node4.to_onnx(inputs=[('X', FloatTensorType([None, None]))],
sess = InferenceSession(onx.SerializeToString(),
if save is not None:
with open(save, "wb") as f:
def npy_fct(x):
return ((x[slice1, slice2] + 1)[slice1, slice2] * 2).copy()
rnd = numpy.random.randn(10, 10).astype(numpy.float32)
expected = npy_fct(rnd)
got =, {'X': rnd})[0]
assert_almost_equal(expected, got)
except AssertionError as e:
raise AssertionError(
"kwargs=%r slice1=%r slice2=%r shapes=%r ? %r "
"(x[slice1, slice2].shape)=%r" % (
kwargs, slice1, slice2, expected.shape,
got.shape, rnd[slice1, slice2].shape)) from e
if get_device().upper() == 'GPU':
sessg = InferenceSession(onx.SerializeToString(),
io_binding = sessg.io_binding()._iobinding
device = get_ort_device('cuda:0')
def run_gpu(x):
'X', device, numpy.float32, x.shape(), x.data_ptr())
io_binding.bind_output('Y', device)
return sessg._sess.run_with_iobinding(io_binding, None)
return onx, lambda x:, {'X': x}), npy_fct, run_gpu
return onx, lambda x:, {'X': x}), npy_fct, None
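The benchmark. For every dimension, it measures numpy and onnxruntime (and onnxruntime on GPU when available) on the same random inputs, then profiles onnxruntime and plots the results.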
def loop_fct(fct, xs):
for x in xs:
def benchmark_op(repeat=10, number=10, name="Slice", shape_slice_fct=None,
save=None, opset=14, repeat_profile=1500, verbose=1):
if verbose:
print("[benchmark_op] start repeat=%d number=%d repeat_profile=%d"
" opset=%d." % (repeat, number, repeat_profile, opset))
res = []
for dim in tqdm([8, 16, 32, 64, 100, 128, 200,
256, 400, 512, 600, 784, 800,
1000, 1024, 1200]):
shape, slices = shape_slice_fct(dim)
onx, ort_fct, npy_fct, ort_fct_gpu = build_ort_op(
save=save, op_version=opset, slices=slices)
n_arrays = 20
if dim >= 512:
n_arrays = 10
xs = [numpy.random.rand(*shape).astype(numpy.float32)
for _ in range(n_arrays)]
info = dict(shape=shape)
ctx = dict(xs=xs, loop_fct=loop_fct)
# numpy
ctx['fct'] = npy_fct
obs = measure_time(
lambda: loop_fct(npy_fct, xs),
div_by_number=True, context=ctx, repeat=repeat, number=number)
obs['dim'] = dim
obs['fct'] = 'numpy'
obs['shape'] = ",".join(map(str, shape))
obs['slices'] = str(slices)
# onnxruntime
ctx['fct'] = ort_fct
obs = measure_time(
lambda: loop_fct(ort_fct, xs),
div_by_number=True, context=ctx, repeat=repeat, number=number)
obs['dim'] = dim
obs['fct'] = 'ort'
obs['shape'] = ",".join(map(str, shape))
obs['slices'] = str(slices)
if ort_fct_gpu is not None:
# onnxruntime on GPU
dev = get_ort_device('cuda:0')
ctx['xs'] = [
C_OrtValue.ortvalue_from_numpy(x, dev)
for x in xs]
ctx['fct'] = ort_fct_gpu
obs = measure_time(
lambda: loop_fct(ort_fct_gpu, ctx['xs']),
div_by_number=True, context=ctx, repeat=repeat, number=number)
obs['dim'] = dim
obs['fct'] = 'ort_gpu'
obs['shape'] = ",".join(map(str, shape))
obs['slices'] = str(slices)
# profiling CPU
if verbose:
print("[benchmark_op] done.")
print("[benchmark_op] profile CPU.")
so = SessionOptions()
so.enable_profiling = True
sess = InferenceSession(onx.SerializeToString(), so,
for i in range(0, repeat_profile):, {'X': xs[-1]}, )
prof = sess.end_profiling()
with open(prof, "r") as f:
js = json.load(f)
dfprof = DataFrame(OnnxWholeSession.process_profiling(js))
dfprof['shape'] = ",".join(map(str, shape))
dfprof['slices'] = str(slices)
if verbose:
print("[benchmark_op] done.")
# profiling GPU
if ort_fct_gpu is not None:
if verbose:
print("[benchmark_op] profile GPU.")
so = SessionOptions()
so.enable_profiling = True
sess = InferenceSession(onx.SerializeToString(), so,
io_binding = sess.io_binding()._iobinding
device = get_ort_device('cpu')
for i in range(0, repeat_profile):
x = ctx['xs'][-1]
'X', device, numpy.float32, x.shape(), x.data_ptr())
io_binding.bind_output('Y', device)
sess._sess.run_with_iobinding(io_binding, None)
prof = sess.end_profiling()
with open(prof, "r") as f:
js = json.load(f)
dfprofgpu = DataFrame(OnnxWholeSession.process_profiling(js))
dfprofgpu['shape'] = ",".join(map(str, shape))
dfprofgpu['slices'] = str(slices)
if verbose:
print("[benchmark_op] profile done.")
dfprofgpu = None
# Dataframes
shape_name = str(shape).replace(str(dim), "N")
df = pandas.DataFrame(res)
piv = df.pivot('shape', 'fct', 'average')
rs = piv.copy()
for c in ['numpy', 'ort', 'ort_gpu']:
if c in rs.columns:
rs["numpy/%s" % c] = rs['numpy'] / rs[c]
rs = rs[[c for c in rs.columns if "/numpy" not in c]].copy()
# Graphs.
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
piv.plot(logx=True, logy=True, ax=ax[0],
title="%s benchmark\n%r"
" lower better" % (name, shape_name))
ax[0].legend(prop={"size": 9})
rs.plot(logx=True, logy=True, ax=ax[1],
title="%s Speedup, baseline=numpy\n%r"
" higher better" % (name, shape_name))
ax[1].plot([min(rs.index), max(rs.index)], [0.5, 0.5], 'g--')
ax[1].plot([min(rs.index), max(rs.index)], [2., 2.], 'g--')
ax[1].legend(prop={"size": 9})
return dfprof, dfprofgpu, df, rs, ax
The results.
# code_optimisation() returned 'AVX-omp=8' in the run shown above; the integer
# after '=' is extracted here (presumably the number of OpenMP threads --
# TODO confirm). It scales `number` and `repeat_profile` in the calls below.
nth = int(code_optimisation().split('=')[1])
# Column names kept for the profiling tables (assumed to be the groupby keys
# of the profiling dataframes -- the calls using them are not fully visible).
cols_profile = ["shape", "slices", "args_op_name", 'args_provider']
shape = (256, N) - slice = [1:-1], :
dfs = []
dfprof, dfprofgpu, df, piv, ax = benchmark_op(
shape_slice_fct=lambda dim: ((256, dim), ((1, -1), None)),
save="bslice.onnx", number=nth * 4, repeat=8, repeat_profile=100 * nth)
piv2 = df.pivot("fct", "shape", "average")
print("slices = [1:-1], :")
print(dfprof.drop(['pid', 'tid', 'ts'], axis=1).groupby(
if dfprofgpu is not None:
print(dfprofgpu.drop(['pid', 'tid'], axis=1).groupby(

[benchmark_op] start repeat=8 number=32 repeat_profile=800 opset=14.
[benchmark_op] done.
[benchmark_op] profile CPU.
[benchmark_op] done.
slices = [1:-1], :
| shape | numpy | ort | numpy/ort |
| (256, 8) | 0.00122474 | 0.00213213 | 0.57442 |
| (256, 16) | 0.00138635 | 0.00236698 | 0.585704 |
| (256, 32) | 0.00158061 | 0.00260299 | 0.607228 |
| (256, 64) | 0.00196467 | 0.00340243 | 0.577431 |
| (256, 100) | 0.00229879 | 0.00443202 | 0.518679 |
| (256, 128) | 0.00273199 | 0.00525296 | 0.520086 |
| (256, 200) | 0.00416482 | 0.00816956 | 0.509797 |
| (256, 256) | 0.00483228 | 0.011073 | 0.436402 |
| (256, 400) | 0.00932988 | 0.0182123 | 0.512283 |
| (256, 512) | 0.00643529 | 0.0107145 | 0.600618 |
| (256, 600) | 0.00839677 | 0.0121431 | 0.691486 |
| (256, 784) | 0.00966727 | 0.0145427 | 0.664752 |
| (256, 800) | 0.0096974 | 0.0145289 | 0.667455 |
| (256, 1000) | 0.0119276 | 0.0178634 | 0.66771 |
| (256, 1024) | 0.0122553 | 0.0182789 | 0.670461 |
| (256, 1200) | 0.0139913 | 0.0209278 | 0.668551 |
| | dur |
| ('256,1200', '((1, -1), None)', 'Add', 'CPUExecutionProvider') | 227335 |
| ('256,1200', '((1, -1), None)', 'Mul', 'CPUExecutionProvider') | 223940 |
| ('256,1200', '((1, -1), None)', 'Slice', 'CPUExecutionProvider') | 942561 |
shape = (256, N) - slice = :, [1:-1]
dfs = []
dfprof, dfprofgpu, df, piv, ax = benchmark_op(
shape_slice_fct=lambda dim: ((256, dim), (None, (1, -1))),
save="bslice.onnx", number=nth * 4, repeat=8, repeat_profile=100 * nth)
piv2 = df.pivot("fct", "shape", "average")
print("slices = :, [1:-1]")
print(dfprof.drop(['pid', 'tid', 'ts'], axis=1).groupby(
if dfprofgpu is not None:
print(dfprofgpu.drop(['pid', 'tid'], axis=1).groupby(

[benchmark_op] start repeat=8 number=32 repeat_profile=800 opset=14.
[benchmark_op] done.
[benchmark_op] profile CPU.
[benchmark_op] done.
slices = :, [1:-1]
| shape | numpy | ort | numpy/ort |
| (256, 8) | 0.00202208 | 0.00208558 | 0.969553 |
| (256, 16) | 0.00229975 | 0.00224515 | 1.02432 |
| (256, 32) | 0.00262508 | 0.00256523 | 1.02333 |
| (256, 64) | 0.00377195 | 0.00330289 | 1.14201 |
| (256, 100) | 0.0046357 | 0.00419682 | 1.10457 |
| (256, 128) | 0.00584651 | 0.00532873 | 1.09717 |
| (256, 200) | 0.00774097 | 0.00822542 | 0.941103 |
| (256, 256) | 0.0105949 | 0.0111949 | 0.946404 |
| (256, 400) | 0.0180014 | 0.0196297 | 0.91705 |
| (256, 512) | 0.0114267 | 0.0111348 | 1.02622 |
| (256, 600) | 0.0121301 | 0.0122574 | 0.989617 |
| (256, 784) | 0.0138878 | 0.0144415 | 0.961664 |
| (256, 800) | 0.0139389 | 0.0148834 | 0.936539 |
| (256, 1000) | 0.0174043 | 0.0179164 | 0.971417 |
| (256, 1024) | 0.017487 | 0.018385 | 0.951155 |
| (256, 1200) | 0.019937 | 0.0208039 | 0.958329 |
| | dur |
| ('256,1200', '(None, (1, -1))', 'Add', 'CPUExecutionProvider') | 228597 |
| ('256,1200', '(None, (1, -1))', 'Mul', 'CPUExecutionProvider') | 237380 |
| ('256,1200', '(None, (1, -1))', 'Slice', 'CPUExecutionProvider') | 932484 |
shape = (256, N) - slice = [1:-1], [1:-1]
dfs = []
dfprof, dfprofgpu, df, piv, ax = benchmark_op(
shape_slice_fct=lambda dim: ((256, dim), ((1, -1), (1, -1))),
save="bslice.onnx", number=nth * 4, repeat=8, repeat_profile=100 * nth)
piv2 = df.pivot("fct", "shape", "average")
print("slices = [1:-1], [1:-1]")
print(dfprof.drop(['pid', 'tid', 'ts'], axis=1).groupby(
if dfprofgpu is not None:
print(dfprofgpu.drop(['pid', 'tid'], axis=1).groupby(

[benchmark_op] start repeat=8 number=32 repeat_profile=800 opset=14.
[benchmark_op] done.
[benchmark_op] profile CPU.
[benchmark_op] done.
slices = [1:-1], [1:-1]
| shape | numpy | ort | numpy/ort |
| (256, 8) | 0.00202314 | 0.00207122 | 0.976791 |
| (256, 16) | 0.00225834 | 0.00224831 | 1.00446 |
| (256, 32) | 0.00270728 | 0.00259146 | 1.04469 |
| (256, 64) | 0.00372587 | 0.00327614 | 1.13728 |
| (256, 100) | 0.00461852 | 0.00416746 | 1.10823 |
| (256, 128) | 0.0059576 | 0.00522772 | 1.13962 |
| (256, 200) | 0.00901611 | 0.00816554 | 1.10417 |
| (256, 256) | 0.0120908 | 0.0110545 | 1.09375 |
| (256, 400) | 0.0193612 | 0.019482 | 0.993802 |
| (256, 512) | 0.0116121 | 0.010995 | 1.05613 |
| (256, 600) | 0.0119881 | 0.0124442 | 0.963347 |
| (256, 784) | 0.0151558 | 0.0152803 | 0.991856 |
| (256, 800) | 0.0138089 | 0.0146576 | 0.942097 |
| (256, 1000) | 0.0171397 | 0.0176591 | 0.970589 |
| (256, 1024) | 0.017269 | 0.0182734 | 0.945036 |
| (256, 1200) | 0.0195565 | 0.020961 | 0.932994 |
| | dur |
| ('256,1200', '((1, -1), (1, -1))', 'Add', 'CPUExecutionProvider') | 241762 |
| ('256,1200', '((1, -1), (1, -1))', 'Mul', 'CPUExecutionProvider') | 233176 |
| ('256,1200', '((1, -1), (1, -1))', 'Slice', 'CPUExecutionProvider') | 914787 |
Total running time of the script: ( 4 minutes 18.669 seconds)