Note

Click here to download the full example code

Compares implementations of Add#

This example compares the addition implemented by numpy with the one implemented by onnxruntime. The addition is chained three times (x + y + y + y) in every implementation, which reduces the relative cost of copying the data from Python to an external library. If available, tensorflow and pytorch are included as well. The numpy implementation is not optimal: it allocates more buffers than necessary because the parameter out is not used to reuse buffers.

Add implementations
(5, N, N) + (5, N, N)
(5, N, N) + (5, N, 1)
(5, N, N) + (5, 1, N)
(5, N, 5, N) + (1, N, 1, 1)
Conclusion

import numpy
import pandas
import matplotlib.pyplot as plt
from onnxruntime import InferenceSession
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.algebra.onnx_ops import OnnxAdd
from cpyquickhelper.numbers import measure_time
from tqdm import tqdm
from mlprodict.testing.experimental_c_impl.experimental_c import code_optimisation
# Display the string returned by code_optimisation(), imported above from
# mlprodict's experimental C implementation.
optimisation = code_optimisation()
print(optimisation)

Out:

AVX-omp=8

Add implementations #

# tensorflow and pytorch are optional dependencies: when an import fails,
# the matching ``*_add`` name is set to None and benchmark_op skips the
# measurements for that library.
try:
    from tensorflow.math import add as tf_add
    from tensorflow import convert_to_tensor
except ImportError:
    tf_add = None
try:
    from torch import add as torch_add, from_numpy
except ImportError:
    torch_add = None


def build_ort_add(op_version=12):
    """
    Builds a callable running three chained ONNX *Add* nodes
    (``x + y + y + y``) with onnxruntime.

    :param op_version: opset version given to every ``OnnxAdd`` node and
        used as ``target_opset`` when the graph is converted to ONNX
    :return: function ``f(x, y)`` returning the result of
        ``InferenceSession.run`` for inputs ``x`` and ``y``
    """
    # Each node consumes the previous one; only the last node names its
    # output ('z').
    add_once = OnnxAdd('x', 'y', op_version=op_version)
    add_twice = OnnxAdd(add_once, 'y', op_version=op_version)
    add_thrice = OnnxAdd(
        add_twice, 'y', op_version=op_version, output_names=['z'])

    input_types = [('x', FloatTensorType()), ('y', FloatTensorType())]
    model = add_thrice.to_onnx(inputs=input_types, target_opset=op_version)
    session = InferenceSession(model.SerializeToString())

    def run(x, y):
        # None requests every output of the graph.
        return session.run(None, {'x': x, 'y': y})

    return run


def loop_fct(fct, xs, ys):
    """
    Calls ``fct(x, y)`` on every pair taken in lockstep from *xs* and *ys*.

    The results of *fct* are discarded and iteration stops with the
    shorter of the two sequences.

    :param fct: function of two arguments
    :param xs: iterable of first arguments
    :param ys: iterable of second arguments
    """
    for pair in zip(xs, ys):
        fct(*pair)


def benchmark_op(repeat=5, number=2, name="Add", shape_fcts=None):
    """
    Measures three chained additions (``x + y + y + y``) with numpy,
    onnxruntime and, when they could be imported, tensorflow and pytorch,
    for a fixed list of dimensions, then plots the timings and the
    speedups relative to numpy.

    :param repeat: forwarded to ``measure_time`` as *repeat*
    :param number: forwarded to ``measure_time`` as *number*
    :param name: operator name displayed in the graph titles
    :param shape_fcts: pair of functions, each one maps the dimension
        *dim* to the shape of the first (resp. second) operand,
        by default both operands have shape ``(5, dim, dim)``
    :return: tuple ``(df, rs, ax)``: *df* holds one row per measurement
        (the dimension is stored in column ``N``), *rs* is the table of
        ratios ``numpy time / implementation time`` indexed by ``N``,
        *ax* is the pair of matplotlib axes holding both graphs
    """
    if shape_fcts is None:
        def shape_fct(dim):
            return (5, dim, dim)
        shape_fcts = (shape_fct, shape_fct)
    ort_fct = build_ort_add()
    res = []

    def record(label, ctx, dim, info):
        # Times one pass over all the arrays stored in ctx and stores
        # the measure along with the dimension, the implementation label
        # and the shapes.
        obs = measure_time(
            "loop_fct(fct, xs, ys)",
            div_by_number=True, context=ctx, repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = label
        obs.update(info)
        res.append(obs)

    for dim in tqdm([8, 16, 32, 64, 100, 128, 200,
                     256, 400, 512, 1024, 1536, 2048, 2560]):
        shape1 = shape_fcts[0](dim)
        shape2 = shape_fcts[1](dim)
        # Fewer arrays for the big dimensions, and four times fewer again
        # when the first operand has more than three axes.
        n_arrays = 16 if dim < 512 else 4
        if len(shape1) > 3:
            n_arrays //= 4
        xs = [numpy.random.rand(*shape1).astype(numpy.float32)
              for _ in range(n_arrays)]
        ys = [numpy.random.rand(*shape2).astype(numpy.float32)
              for _ in range(n_arrays)]
        info = dict(shape1=shape1, shape2=shape2)

        # numpy
        ctx = dict(
            xs=xs, ys=ys,
            fct=lambda x, y: numpy.add(numpy.add(numpy.add(x, y), y), y),
            loop_fct=loop_fct)
        record('numpy', ctx, dim, info)

        # onnxruntime
        ctx['fct'] = ort_fct
        record('ort', ctx, dim, info)

        if tf_add is not None:
            # tensorflow, the conversion to tensors is done before the
            # measure and is therefore not timed
            ctx['fct'] = lambda x, y: tf_add(tf_add(tf_add(x, y), y), y)
            ctx['xs'] = [convert_to_tensor(x) for x in xs]
            ctx['ys'] = [convert_to_tensor(y) for y in ys]
            record('tf', ctx, dim, info)

        if torch_add is not None:
            # torch, the conversion to tensors is done before the
            # measure and is therefore not timed
            ctx['fct'] = lambda x, y: torch_add(
                torch_add(torch_add(x, y), y), y)
            ctx['xs'] = [from_numpy(x) for x in xs]
            ctx['ys'] = [from_numpy(y) for y in ys]
            record('torch', ctx, dim, info)

    # Dataframes
    # The shapes of the last iteration give the labels: the last
    # dimension is replaced by the letter N.
    shape1_name = str(shape1).replace(str(dim), "N")
    shape2_name = str(shape2).replace(str(dim), "N")
    df = pandas.DataFrame(res)
    df.columns = [_.replace('dim', 'N') for _ in df.columns]
    # Keyword arguments: positional arguments of DataFrame.pivot were
    # deprecated in pandas 1.5 and are rejected by pandas >= 2.0.
    piv = df.pivot(index='N', columns='fct', values='average')

    # Speedups: ratio numpy time / implementation time.
    rs = piv.copy()
    for c in ['ort', 'torch', 'tf']:
        if c in rs.columns:
            rs[c] = rs['numpy'] / rs[c]
    rs['numpy'] = 1.

    # Graphs.
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    piv.plot(logx=True, logy=True, ax=ax[0],
             title="%s benchmark\n%s + %s"
                   " lower better" % (name, shape1_name, shape2_name))
    ax[0].legend(prop={"size": 9})
    rs.plot(logx=True, logy=True, ax=ax[1],
            title="%s Speedup, baseline=numpy\n%s + %s"
                  " higher better" % (name, shape1_name, shape2_name))
    # Dashed guides at speedups 0.5 and 2.
    ax[1].plot([min(rs.index), max(rs.index)], [0.5, 0.5], 'g--')
    ax[1].plot([min(rs.index), max(rs.index)], [2., 2.], 'g--')
    ax[1].legend(prop={"size": 9})
    return df, rs, ax


# Accumulates the dataframe returned by every call to benchmark_op.
dfs = []

(5, N, N) + (5, N, N)#

# Default shapes: both operands are (5, N, N).
df, piv, ax = benchmark_op()
dfs.append(df)
# Keyword arguments: positional arguments of DataFrame.pivot were
# deprecated in pandas 1.5 and are rejected by pandas >= 2.0.
df.pivot(index="fct", columns="N", values="average")

Out:

  0%|          | 0/14 [00:00<?, ?it/s]
  7%|7         | 1/14 [00:00<00:09,  1.31it/s]
 29%|##8       | 4/14 [00:00<00:01,  5.04it/s]
 36%|###5      | 5/14 [00:01<00:03,  2.51it/s]
 43%|####2     | 6/14 [00:03<00:05,  1.52it/s]
 50%|#####     | 7/14 [00:07<00:12,  1.75s/it]
 57%|#####7    | 8/14 [00:11<00:13,  2.29s/it]
 64%|######4   | 9/14 [00:17<00:17,  3.45s/it]
 71%|#######1  | 10/14 [00:19<00:12,  3.06s/it]
 79%|#######8  | 11/14 [00:24<00:11,  3.71s/it]
 86%|########5 | 12/14 [00:35<00:11,  5.76s/it]
 93%|#########2| 13/14 [00:56<00:10, 10.37s/it]
100%|##########| 14/14 [02:07<00:00, 28.40s/it]
100%|##########| 14/14 [02:07<00:00,  9.10s/it]

N	8	16	32	64	100	128	200	256	400	512	1024	1536	2048	2560
fct
numpy	0.000313	0.000413	0.000649	0.001387	0.003768	0.008607	0.021641	0.034943	0.080951	0.034458	0.129122	0.287434	0.544592	0.821934
ort	0.001275	0.001407	0.001644	0.002826	0.006378	0.011757	0.015676	0.026253	0.059766	0.025158	0.090838	0.217476	0.371043	0.611302
torch	0.072136	0.001152	0.001873	0.004580	0.075854	0.101256	0.382547	0.263609	0.397462	0.118290	0.169283	0.244059	0.427637	0.538100

(5, N, N) + (5, N, 1)#

# The second operand has a last axis of size 1.
shape_fcts = (lambda dim: (5, dim, dim),
              lambda dim: (5, dim, 1))

df, piv, ax = benchmark_op(shape_fcts=shape_fcts)
dfs.append(df)
# Keyword arguments: positional arguments of DataFrame.pivot were
# deprecated in pandas 1.5 and are rejected by pandas >= 2.0.
df.pivot(index="fct", columns="N", values="average")

Out:

  0%|          | 0/14 [00:00<?, ?it/s]
 21%|##1       | 3/14 [00:00<00:00, 20.36it/s]
 43%|####2     | 6/14 [00:06<00:09,  1.20s/it]
 50%|#####     | 7/14 [00:09<00:11,  1.60s/it]
 57%|#####7    | 8/14 [00:10<00:08,  1.43s/it]
 64%|######4   | 9/14 [00:13<00:09,  1.86s/it]
 71%|#######1  | 10/14 [00:14<00:06,  1.61s/it]
 79%|#######8  | 11/14 [00:18<00:06,  2.30s/it]
 86%|########5 | 12/14 [00:26<00:08,  4.06s/it]
 93%|#########2| 13/14 [00:40<00:06,  6.76s/it]
100%|##########| 14/14 [01:12<00:00, 14.12s/it]
100%|##########| 14/14 [01:12<00:00,  5.16s/it]

N	8	16	32	64	100	128	200	256	400	512	1024	1536	2048	2560
fct
numpy	0.000843	0.001005	0.001440	0.002739	0.004956	0.008518	0.022672	0.035007	0.079446	0.032021	0.142182	0.309083	0.470722	0.916425
ort	0.001484	0.001632	0.002097	0.003776	0.007310	0.011106	0.013630	0.020410	0.051022	0.020781	0.075765	0.167926	0.295317	0.460302
torch	0.001167	0.001369	0.002125	0.004817	0.257137	0.293941	0.255456	0.016010	0.137560	0.019911	0.126672	0.220532	0.302104	0.868772

(5, N, N) + (5, 1, N)#

# The second operand has a middle axis of size 1.
shape_fcts = (lambda dim: (5, dim, dim),
              lambda dim: (5, 1, dim))

df, piv, ax = benchmark_op(shape_fcts=shape_fcts)
dfs.append(df)
# Keyword arguments: positional arguments of DataFrame.pivot were
# deprecated in pandas 1.5 and are rejected by pandas >= 2.0.
df.pivot(index="fct", columns="N", values="average")

Out:

  0%|          | 0/14 [00:00<?, ?it/s]
 21%|##1       | 3/14 [00:00<00:00, 20.05it/s]
 43%|####2     | 6/14 [00:06<00:09,  1.20s/it]
 50%|#####     | 7/14 [00:08<00:10,  1.52s/it]
 57%|#####7    | 8/14 [00:11<00:10,  1.75s/it]
 64%|######4   | 9/14 [00:17<00:14,  2.83s/it]
 71%|#######1  | 10/14 [00:18<00:09,  2.39s/it]
 79%|#######8  | 11/14 [00:21<00:07,  2.64s/it]
 86%|########5 | 12/14 [00:29<00:08,  4.02s/it]
 93%|#########2| 13/14 [00:43<00:06,  6.94s/it]
100%|##########| 14/14 [01:20<00:00, 15.68s/it]
100%|##########| 14/14 [01:20<00:00,  5.75s/it]

N	8	16	32	64	100	128	200	256	400	512	1024	1536	2048	2560
fct
numpy	0.000861	0.000993	0.001294	0.002486	0.004608	0.009292	0.021217	0.037826	0.078725	0.031128	0.120280	0.276108	0.473129	1.493048
ort	0.001480	0.001682	0.002152	0.003727	0.007476	0.011202	0.013769	0.020439	0.049782	0.021183	0.075196	0.168423	0.296354	0.473466
torch	0.001222	0.001457	0.002258	0.005372	0.267329	0.278235	0.219311	0.172050	0.429306	0.048130	0.066225	0.152193	0.265537	0.863066

(5, N, 5, N) + (1, N, 1, 1)#

# Four axes: the second operand only varies along its second axis.
shape_fcts = (lambda dim: (5, dim, 5, dim),
              lambda dim: (1, dim, 1, 1))

df, piv, ax = benchmark_op(shape_fcts=shape_fcts)
dfs.append(df)
# Keyword arguments: positional arguments of DataFrame.pivot were
# deprecated in pandas 1.5 and are rejected by pandas >= 2.0.
df.pivot(index="fct", columns="N", values="average")

Out:

  0%|          | 0/14 [00:00<?, ?it/s]
 21%|##1       | 3/14 [00:00<00:00, 23.45it/s]
 43%|####2     | 6/14 [00:00<00:01,  5.85it/s]
 57%|#####7    | 8/14 [00:03<00:03,  1.62it/s]
 64%|######4   | 9/14 [00:07<00:05,  1.17s/it]
 71%|#######1  | 10/14 [00:08<00:04,  1.19s/it]
 79%|#######8  | 11/14 [00:13<00:06,  2.12s/it]
 86%|########5 | 12/14 [01:01<00:27, 13.96s/it]
 93%|#########2| 13/14 [02:28<00:33, 33.79s/it]
100%|##########| 14/14 [04:45<00:00, 62.29s/it]
100%|##########| 14/14 [04:45<00:00, 20.36s/it]

N	8	16	32	64	100	128	200	256	400	512	1024	1536	2048	2560
fct
numpy	0.000257	0.000353	0.000777	0.002537	0.006603	0.010339	0.023632	0.040129	0.092183	0.037394	0.150007	2.927870	5.279610	8.156890
ort	0.001438	0.000487	0.001249	0.002138	0.004658	0.006895	0.016315	0.024141	0.057268	0.023894	0.097068	0.988756	2.041703	2.866919
torch	0.000376	0.000586	0.001444	0.008529	0.012194	0.012964	0.052373	0.089791	0.130507	0.043433	0.165567	0.399070	0.606843	0.965634

Conclusion #

It is difficult to draw a final conclusion, as the cost of adding two vectors is of the same order of magnitude as the cost of a copy between Python and the C++ code of onnxruntime, pytorch or tensorflow. numpy is much better on small vectors. onnxruntime, pytorch and tensorflow are not optimized for this case because it is not very common in deep learning.

# Concatenate the measurements of every run and export them as CSV and
# Excel, then save and display the graph.
name = "add"
csv_file = "plot_%s.csv" % name
excel_file = "plot_%s.xlsx" % name
png_file = "plot_%s.png" % name

merged = pandas.concat(dfs)
merged.to_csv(csv_file, index=False)
merged.to_excel(excel_file, index=False)

# NOTE(review): plt.savefig writes the current figure only, presumably
# the one created by the last call to benchmark_op -- confirm this is
# the intent.
plt.savefig(png_file)
plt.show()

Total running time of the script: ( 9 minutes 45.707 seconds)

Gallery generated by Sphinx-Gallery

Compares numba, numpy, onnxruntime for simple functions

Compares implementations of ReduceMax

Compares implementations of Add#

Add implementations#

(5, N, N) + (5, N, N)#

(5, N, N) + (5, N, 1)#

(5, N, N) + (5, 1, N)#

(5, N, 5, N) + (1, N, 1, 1)#

Conclusion#

Add implementations #

Conclusion #