Benchmark operator Slice

This short code compares the execution of the operator Slice on CPU and GPU in three configurations.

A simple example

import numpy
from numpy.testing import assert_almost_equal
from pandas import DataFrame, pivot_table
from onnxruntime import InferenceSession, get_device
from onnxruntime.capi._pybind_state import (  # pylint: disable=E0611
    OrtValue as C_OrtValue)
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.algebra.onnx_ops import OnnxSlice, OnnxAdd, OnnxMul
from cpyquickhelper.numbers.speed_measure import measure_time
from mlprodict.testing.experimental_c_impl.experimental_c import code_optimisation
from mlprodict.onnxrt import OnnxInference
from mlprodict.plotting.plotting_onnx import plot_onnx
from onnxcustom.utils.onnxruntime_helper import get_ort_device
from tqdm import tqdm


# Show the CPU optimisation flags (SIMD / OpenMP) and whether
# onnxruntime was built for CPU or GPU.
print([code_optimisation(), get_device()])

Out:

['AVX-omp=8', 'CPU']

The ONNX graph used for the comparison: Slice → Add → Slice → Mul.

def build_ort_op(op_version=14, save=None, slices=None):  # opset=13, 14, ...
    """Build an ONNX graph ``Slice -> Add -> Slice -> Mul``.

    :param op_version: target opset (13, 14, ...)
    :param save: unused, kept for backward compatibility
    :param slices: ``None`` for the default slice ``[1, 1] -> [-1, -1]``,
        otherwise a tuple ``(starts, ends)``; entries equal to ``None``
        are dropped and the indices of the remaining entries become the
        optional ``axes`` input of ``Slice``
    :return: ONNX model with one float input ``'X'`` and one output ``'Y'``
    """
    if slices is None:
        starts = numpy.array([1, 1], dtype=numpy.int64)
        ends = numpy.array([-1, -1], dtype=numpy.int64)
        axes = None
    else:
        starts, ends = slices
        # Generalized: the original code only looked at starts[0], so a
        # None in any later position crashed numpy.array below.
        if any(s is None for s in starts):
            indexes = [i for i, s in enumerate(starts) if s is not None]
            starts = numpy.array(
                [n for n in starts if n is not None], dtype=numpy.int64)
            ends = numpy.array(
                [n for n in ends if n is not None], dtype=numpy.int64)
            axes = numpy.array(indexes, dtype=numpy.int64)
        else:
            starts = numpy.array(starts, dtype=numpy.int64)
            ends = numpy.array(ends, dtype=numpy.int64)
            axes = None

    # `axes` is an optional Slice input: omit it when every axis is sliced.
    slice_inputs = [starts, ends] if axes is None else [starts, ends, axes]
    node1 = OnnxSlice('X', *slice_inputs, op_version=op_version)
    node2 = OnnxAdd(node1, numpy.array([1], dtype=numpy.float32),
                    op_version=op_version)
    node3 = OnnxSlice(node2, *slice_inputs, op_version=op_version)
    node4 = OnnxMul(node3, numpy.array([2], dtype=numpy.float32),
                    op_version=op_version, output_names=['Y'])
    onx = node4.to_onnx(inputs=[('X', FloatTensorType([None, None]))],
                        target_opset=op_version)
    return onx


# Build the model with the default slices and draw its graph.
onx = build_ort_op()
plot_onnx(onx)
plot benchmark op short

Out:

<AxesSubplot:>

Execution on CPU

# Random 50x50 float input.
x = numpy.random.rand(50, 50).astype(numpy.float32)

# Verbose run with mlprodict's Python runtime: prints every
# intermediate result (shape, dtype, min/max) while executing.
oinf = OnnxInference(onx)
oinf.run({'X': x}, verbose=1, fLOG=print)

Out:

+ki='Sl_Slicecst': (2,) (dtype=int64 min=1 max=1)
+ki='Sl_Slicecst1': (2,) (dtype=int64 min=-1 max=-1)
+ki='Ad_Addcst': (1,) (dtype=float32 min=1.0 max=1.0)
+ki='Mu_Mulcst': (1,) (dtype=float32 min=2.0 max=2.0)
-- OnnxInference: run 6 nodes
Onnx-Identity(Sl_Slicecst) -> Sl_Slicecst2    (name='Sl_Slicecst2_op')
+kr='Sl_Slicecst2': (2,) (dtype=int64 min=1 max=1)
Onnx-Identity(Sl_Slicecst1) -> Sl_Slicecst3    (name='Sl_Slicecst3_op')
+kr='Sl_Slicecst3': (2,) (dtype=int64 min=-1 max=-1)
Onnx-Slice(X, Sl_Slicecst, Sl_Slicecst1) -> Sl_output01    (name='Sl_Slice')
+kr='Sl_output01': (48, 48) (dtype=float32 min=8.141066791722551e-05 max=0.9998125433921814)
Onnx-Add(Sl_output01, Ad_Addcst) -> Ad_C0    (name='Ad_Add')
+kr='Ad_C0': (48, 48) (dtype=float32 min=1.0000814199447632 max=1.9998126029968262)
Onnx-Slice(Ad_C0, Sl_Slicecst2, Sl_Slicecst3) -> Sl_output0    (name='Sl_Slice1')
+kr='Sl_output0': (46, 46) (dtype=float32 min=1.0000814199447632 max=1.9998126029968262)
Onnx-Mul(Sl_output0, Mu_Mulcst) -> Y    (name='Mu_Mul')
+kr='Y': (46, 46) (dtype=float32 min=2.0001628398895264 max=3.9996252059936523)

{'Y': array([[2.202823 , 2.3310156, 3.7436275, ..., 3.395742 , 3.6053634,
        3.893557 ],
       [2.0253925, 3.8714616, 2.7886271, ..., 2.9062228, 2.9643648,
        2.6978688],
       [2.7763417, 3.8809776, 3.2185361, ..., 2.3637347, 3.59551  ,
        2.441475 ],
       ...,
       [2.415699 , 3.4867158, 3.6288283, ..., 3.279631 , 2.7204676,
        3.8106637],
       [3.6561089, 3.1871939, 2.5588193, ..., 2.2869947, 2.3464262,
        2.1570392],
       [3.0212526, 3.8279028, 2.8945808, ..., 3.6432106, 3.2940612,
        2.3520088]], dtype=float32)}

With onnxruntime.

# Reference result computed with onnxruntime on CPU; y_cpu is
# compared against the GPU output below.
sess = InferenceSession(onx.SerializeToString(),
                        providers=["CPUExecutionProvider"])
y_cpu = sess.run(None, {'X': x})[0]

Execution on GPU

If available…

# Probe CUDA availability: the build must report GPU support and the
# input must actually be movable onto the device.
cuda = False
if get_device().upper() == 'GPU':
    dev = get_ort_device('cuda:0')
    try:
        gx = C_OrtValue.ortvalue_from_numpy(x, dev)
    except RuntimeError as e:
        # GPU build but no usable device: fall back to CPU only.
        print(e)
    else:
        cuda = True

if cuda:
    # Run the same model on GPU through IO binding so that input and
    # output stay on the device (no implicit CPU<->GPU copies).
    sessg = InferenceSession(onx.SerializeToString(),
                             providers=["CUDAExecutionProvider"])

    io_binding = sessg.io_binding()._iobinding
    io_binding.bind_input(
        'X', dev, numpy.float32, gx.shape(), gx.data_ptr())
    io_binding.bind_output('Y', dev)
    sessg._sess.run_with_iobinding(io_binding, None)
    # Copy the result back only once, to compare with the CPU baseline.
    y_gpu = io_binding.copy_outputs_to_cpu()[0]
    assert_almost_equal(y_cpu, y_gpu)

Benchmark

data = []
# Square, tall and wide shapes to measure the impact of each dimension.
shapes = ([(n, n) for n in [10, 100, 1000]] +
          [(n, 100) for n in [10, 100, 1000, 10000]] +
          [(100, n) for n in [10, 100, 1000, 10000]])
# Three equivalent ways to express the same slicing.
slices = [([1, 1], [-1, -1]), ([1], [-1]), ([None, 1], [None, -1])]
shape_slices = [(sh, sl) for sh in shapes for sl in slices]

# Loop variable renamed from `slices` to `sl` so it no longer shadows
# the list defined above.
for shape, sl in tqdm(shape_slices):
    onx = build_ort_op(slices=sl)
    x = numpy.random.rand(*shape).astype(numpy.float32)

    # Fewer repetitions on the biggest inputs to keep the benchmark short.
    number = 10 if x.size >= 100000 else 100

    sess = InferenceSession(
        onx.SerializeToString(),
        providers=["CPUExecutionProvider"])
    sess.run(None, {'X': x})  # warm-up

    obs = dict(shape=str(shape).replace(" ", ""),
               slice=str(sl).replace(" ", ""))
    r = measure_time(lambda: sess.run(None, {'X': x}),
                     number=number, div_by_number=True,
                     context={})
    obs.update(r)
    obs['provider'] = 'CPU'
    data.append(obs)

    if cuda:
        def sess_run(sess, io_binding, gx, dev):
            # Bind the on-device input/output and run; nothing is
            # copied back to CPU, so this times the GPU kernels only.
            # BUGFIX: the parameter (formerly `x`) was ignored and the
            # closure variable `gx` used instead; the parameter is now
            # named `gx` and actually used.
            io_binding.bind_input(
                'X', dev, numpy.float32, gx.shape(), gx.data_ptr())
            io_binding.bind_output('Y', dev)
            sess._sess.run_with_iobinding(io_binding)

        sess = InferenceSession(
            onx.SerializeToString(),
            providers=["CUDAExecutionProvider"])
        # BUGFIX: take the io_binding from the CUDA session; the
        # original code created it from the CPU session before `sess`
        # was rebound to the CUDA one.
        io_binding = sess.io_binding()._iobinding
        dev = get_ort_device('cuda:0')
        gx = C_OrtValue.ortvalue_from_numpy(x, dev)
        sess_run(sess, io_binding, gx, dev)  # warm-up
        obs = dict(shape=str(shape).replace(" ", ""),
                   slice=str(sl).replace(" ", ""))
        r = measure_time(
            # BUGFIX: the original lambda passed `io_binding` twice
            # (5 arguments for a 4-parameter function), raising
            # TypeError whenever CUDA was available.
            lambda: sess_run(sess, io_binding, gx, dev),
            number=number,
            div_by_number=True,
            context={
                'sess': sess, 'gx': gx, 'io_binding': io_binding,
                'dev': dev, 'sess_run': sess_run})
        obs.update(r)
        obs['provider'] = 'GPU'
        data.append(obs)

df = DataFrame(data)
print(df)

Out:

  0%|          | 0/33 [00:00<?, ?it/s]
  3%|3         | 1/33 [00:00<00:03,  8.32it/s]
  6%|6         | 2/33 [00:00<00:03,  8.91it/s]
  9%|9         | 3/33 [00:00<00:03,  8.92it/s]
 12%|#2        | 4/33 [00:00<00:03,  7.99it/s]
 15%|#5        | 5/33 [00:00<00:03,  7.55it/s]
 18%|#8        | 6/33 [00:00<00:03,  7.19it/s]
 21%|##1       | 7/33 [00:01<00:08,  3.16it/s]
 24%|##4       | 8/33 [00:02<00:10,  2.31it/s]
 27%|##7       | 9/33 [00:02<00:12,  1.94it/s]
 30%|###       | 10/33 [00:02<00:08,  2.56it/s]
 33%|###3      | 11/33 [00:03<00:06,  3.25it/s]
 36%|###6      | 12/33 [00:03<00:05,  4.01it/s]
 39%|###9      | 13/33 [00:03<00:04,  4.59it/s]
 42%|####2     | 14/33 [00:03<00:03,  5.11it/s]
 45%|####5     | 15/33 [00:03<00:03,  5.47it/s]
 48%|####8     | 16/33 [00:03<00:02,  6.06it/s]
 52%|#####1    | 17/33 [00:03<00:02,  6.58it/s]
 55%|#####4    | 18/33 [00:04<00:02,  6.90it/s]
 58%|#####7    | 19/33 [00:04<00:04,  3.16it/s]
 61%|######    | 20/33 [00:05<00:05,  2.20it/s]
 64%|######3   | 21/33 [00:06<00:06,  1.88it/s]
 67%|######6   | 22/33 [00:06<00:04,  2.46it/s]
 70%|######9   | 23/33 [00:06<00:03,  3.12it/s]
 73%|#######2  | 24/33 [00:06<00:02,  3.85it/s]
 76%|#######5  | 25/33 [00:06<00:01,  4.44it/s]
 79%|#######8  | 26/33 [00:06<00:01,  4.97it/s]
 82%|########1 | 27/33 [00:07<00:01,  5.36it/s]
 85%|########4 | 28/33 [00:07<00:00,  6.06it/s]
 88%|########7 | 29/33 [00:07<00:00,  6.67it/s]
 91%|######### | 30/33 [00:07<00:00,  7.05it/s]
 94%|#########3| 31/33 [00:08<00:00,  3.34it/s]
 97%|#########6| 32/33 [00:08<00:00,  2.43it/s]
100%|##########| 33/33 [00:09<00:00,  2.01it/s]
100%|##########| 33/33 [00:09<00:00,  3.51it/s]
          shape                 slice  ...  context_size  provider
0       (10,10)       ([1,1],[-1,-1])  ...            64       CPU
1       (10,10)            ([1],[-1])  ...            64       CPU
2       (10,10)  ([None,1],[None,-1])  ...            64       CPU
3     (100,100)       ([1,1],[-1,-1])  ...            64       CPU
4     (100,100)            ([1],[-1])  ...            64       CPU
5     (100,100)  ([None,1],[None,-1])  ...            64       CPU
6   (1000,1000)       ([1,1],[-1,-1])  ...            64       CPU
7   (1000,1000)            ([1],[-1])  ...            64       CPU
8   (1000,1000)  ([None,1],[None,-1])  ...            64       CPU
9      (10,100)       ([1,1],[-1,-1])  ...            64       CPU
10     (10,100)            ([1],[-1])  ...            64       CPU
11     (10,100)  ([None,1],[None,-1])  ...            64       CPU
12    (100,100)       ([1,1],[-1,-1])  ...            64       CPU
13    (100,100)            ([1],[-1])  ...            64       CPU
14    (100,100)  ([None,1],[None,-1])  ...            64       CPU
15   (1000,100)       ([1,1],[-1,-1])  ...            64       CPU
16   (1000,100)            ([1],[-1])  ...            64       CPU
17   (1000,100)  ([None,1],[None,-1])  ...            64       CPU
18  (10000,100)       ([1,1],[-1,-1])  ...            64       CPU
19  (10000,100)            ([1],[-1])  ...            64       CPU
20  (10000,100)  ([None,1],[None,-1])  ...            64       CPU
21     (100,10)       ([1,1],[-1,-1])  ...            64       CPU
22     (100,10)            ([1],[-1])  ...            64       CPU
23     (100,10)  ([None,1],[None,-1])  ...            64       CPU
24    (100,100)       ([1,1],[-1,-1])  ...            64       CPU
25    (100,100)            ([1],[-1])  ...            64       CPU
26    (100,100)  ([None,1],[None,-1])  ...            64       CPU
27   (100,1000)       ([1,1],[-1,-1])  ...            64       CPU
28   (100,1000)            ([1],[-1])  ...            64       CPU
29   (100,1000)  ([None,1],[None,-1])  ...            64       CPU
30  (100,10000)       ([1,1],[-1,-1])  ...            64       CPU
31  (100,10000)            ([1],[-1])  ...            64       CPU
32  (100,10000)  ([None,1],[None,-1])  ...            64       CPU

[33 rows x 11 columns]

Better display

# One row per (shape, slice), one column per provider holding the
# average execution time; the GPU/CPU ratio is added only when GPU
# timings are present.
piv = df.pivot_table(index=["shape", "slice"],
                     columns="provider", values="average")
if 'GPU' in piv.columns:
    piv['ratio'] = piv['GPU'] / piv['CPU']
print(piv)

Out:

provider                               CPU
shape       slice
(10,10)     ([1,1],[-1,-1])       0.000084
            ([1],[-1])            0.000084
            ([None,1],[None,-1])  0.000085
(10,100)    ([1,1],[-1,-1])       0.000088
            ([1],[-1])            0.000088
            ([None,1],[None,-1])  0.000090
(100,10)    ([1,1],[-1,-1])       0.000093
            ([1],[-1])            0.000095
            ([None,1],[None,-1])  0.000092
(100,100)   ([1,1],[-1,-1])       0.000122
            ([1],[-1])            0.000122
            ([None,1],[None,-1])  0.000125
(100,1000)  ([1,1],[-1,-1])       0.000882
            ([1],[-1])            0.000878
            ([None,1],[None,-1])  0.000910
(100,10000) ([1,1],[-1,-1])       0.006057
            ([1],[-1])            0.006091
            ([None,1],[None,-1])  0.006290
(1000,100)  ([1,1],[-1,-1])       0.000959
            ([1],[-1])            0.000946
            ([None,1],[None,-1])  0.000964
(1000,1000) ([1,1],[-1,-1])       0.006183
            ([1],[-1])            0.006215
            ([None,1],[None,-1])  0.006293
(10000,100) ([1,1],[-1,-1])       0.006508
            ([1],[-1])            0.007114
            ([None,1],[None,-1])  0.006491

Graphs.

# Plot the average execution time per provider for every configuration.
piv.plot()
plot benchmark op short

Out:

<AxesSubplot:xlabel='shape,slice'>

Total running time of the script: ( 0 minutes 11.672 seconds)

Gallery generated by Sphinx-Gallery