Compares numba, numpy, onnxruntime for simple functions#

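The following benchmark is inspired by bench_arrayexprs.py. It compares numba, numpy and onnxruntime on a few simple element-wise functions. As expected, numba is the fastest option on large arrays; on very small arrays plain numpy can still win (see `sum` with n=10 in the output below).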

The functions
ONNX version
numba optimized
Benchmark
Graphs

The functions #

import numpy
import pandas
import matplotlib.pyplot as plt
from numba import jit
from typing import Any
import numpy as np
from tqdm import tqdm
from cpyquickhelper.numbers.speed_measure import measure_time
from mlprodict.npy import NDArray, onnxnumpy_np
from mlprodict.npy.onnx_numpy_annotation import NDArrayType
import mlprodict.npy.numpy_onnx_impl as npnx


# Not decorated with @jit here: this plain version is the "numpy" baseline,
# and the numba variant is built from it later with jit(nopython=True).
def sum(a, b):
    """Return ``a + b``.

    NOTE: the name shadows the builtin ``sum`` inside this script; it is
    kept because the benchmark records ``__name__`` as the function label.
    """
    total = a + b
    return total

# Not decorated with @jit here: this plain version is the "numpy" baseline,
# and the numba variant is built from it later with jit(nopython=True).
def sq_diff(a, b):
    """Return ``(a - b) * (a + b)``, i.e. the difference of squares."""
    difference = a - b
    total = a + b
    return difference * total

# Not decorated with @jit here: this plain version is the "numpy" baseline,
# and the numba variant is built from it later with jit(nopython=True).
def rel_diff(a, b):
    """Return ``(a - b) / (a + b)``.

    No guard is applied for ``a + b == 0``; the result then follows the
    division rules of the operand types.
    """
    difference = a - b
    total = a + b
    return difference / total

# Not decorated with @jit here: this plain version is the "numpy" baseline,
# and the numba variant is built from it later with jit(nopython=True).
def square(a):
    """Return ``a ** 2``."""
    # Carried-over note: with numba, the power form was reported as slower
    # than `a ** 2 + b` because of how LLVM seems to lower the power
    # intrinsic, yet still faster than a naive `exp(2 * log(a))` lowering.
    # NOTE(review): this function takes no `b`; the remark was presumably
    # copied from the benchmark this script is inspired by -- confirm.
    squared = a ** 2
    return squared


def cube(a):
    """Return ``a ** 3``."""
    cubed = a ** 3
    return cubed

ONNX version #

The implementation uses the numpy API for ONNX to keep the same code.

@onnxnumpy_np(
    signature=NDArrayType(("T:all", "T"), dtypes_out=("T",)),
    runtime="onnxruntime")
def onnx_sum_32(a, b):
    """ONNX counterpart of ``sum``: ``a + b``.

    The ``onnxnumpy_np`` decorator is configured with
    ``runtime="onnxruntime"``.
    NOTE(review): the ``"T:all"`` signature presumably accepts every
    supported element type despite the ``_32`` suffix -- the benchmark in
    this file calls the function with float32 and float64 inputs.
    """
    total = a + b
    return total


@onnxnumpy_np(
    signature=NDArrayType(("T:all", "T"), dtypes_out=("T",)),
    runtime="onnxruntime")
def onnx_sq_diff_32(a, b):
    """ONNX counterpart of ``sq_diff``: ``(a - b) * (a + b)``.

    The ``onnxnumpy_np`` decorator is configured with
    ``runtime="onnxruntime"``.
    NOTE(review): the ``"T:all"`` signature presumably accepts every
    supported element type despite the ``_32`` suffix -- the benchmark in
    this file calls the function with float32 and float64 inputs.
    """
    difference = a - b
    total = a + b
    return difference * total


@onnxnumpy_np(
    signature=NDArrayType(("T:all", "T"), dtypes_out=("T",)),
    runtime="onnxruntime")
def onnx_rel_diff_32(a, b):
    """ONNX counterpart of ``rel_diff``: ``(a - b) / (a + b)``.

    The ``onnxnumpy_np`` decorator is configured with
    ``runtime="onnxruntime"``.
    NOTE(review): the ``"T:all"`` signature presumably accepts every
    supported element type despite the ``_32`` suffix -- the benchmark in
    this file calls the function with float32 and float64 inputs.
    """
    difference = a - b
    total = a + b
    return difference / total


@onnxnumpy_np(
    signature=NDArrayType(("T:all", ), dtypes_out=("T",)),
    runtime="onnxruntime")
def onnx_square_32(a):
    """ONNX counterpart of ``square``: ``a ** 2``.

    The ``onnxnumpy_np`` decorator is configured with
    ``runtime="onnxruntime"``.
    NOTE(review): the ``"T:all"`` signature presumably accepts every
    supported element type despite the ``_32`` suffix -- the benchmark in
    this file calls the function with float32 and float64 inputs.
    """
    squared = a ** 2
    return squared


@onnxnumpy_np(
    signature=NDArrayType(("T:all", ), dtypes_out=("T",)),
    runtime="onnxruntime")
def onnx_cube_32(a):
    """ONNX counterpart of ``cube``: ``a ** 3``.

    The ``onnxnumpy_np`` decorator is configured with
    ``runtime="onnxruntime"``.
    NOTE(review): the ``"T:all"`` signature presumably accepts every
    supported element type despite the ``_32`` suffix -- the benchmark in
    this file calls the function with float32 and float64 inputs.
    """
    cubed = a ** 3
    return cubed

numba optimized #

# Wrap every plain-Python function with numba's jit in nopython mode.
# The un-jitted originals stay available as the numpy baseline.
jitter = jit(nopython=True)
nu_sum, nu_sq_diff, nu_rel_diff, nu_square, nu_cube = (
    jitter(plain_fct)
    for plain_fct in (sum, sq_diff, rel_diff, square, cube))

Benchmark #

def _measure(fct, sample, number):
    """Warm up *fct* on *sample*, then time it with ``measure_time``.

    :param fct: callable taking ``len(sample)`` positional arrays
    :param sample: list of one or two input arrays
    :param number: passed to ``measure_time`` as ``number``
    :return: the dictionary returned by ``measure_time``
    """
    # Two untimed calls first, so that one-off costs on the first call
    # do not end up in the measurement.
    fct(*sample)
    fct(*sample)
    # Inputs are exposed to the timed statement as ``a`` (and ``b``), so
    # the statement reads ``fct(a)`` or ``fct(a,b)``.
    names = "ab"[:len(sample)]
    context = {'fct': fct}
    context.update(zip(names, sample))
    return measure_time('fct(%s)' % ",".join(names), number=number,
                        div_by_number=True, context=context)


# One dictionary per (size, dtype, function, engine) measurement.
obs = []

for n in tqdm([10, 100, 1000, 10000, 100000, 1000000]):
    # Fewer calls per measurement for the largest size.
    number = 100 if n < 1000000 else 10
    for dtype in [numpy.float32, numpy.float64]:
        # samples[0] feeds one-input functions, samples[1] two-input ones.
        samples = [
            [numpy.random.uniform(1.0, 2.0, size=n).astype(dtype)],
            [numpy.random.uniform(1.0, 2.0, size=n).astype(dtype)
             for _ in range(2)]]

        for fct1, fct2, fct3, n_inputs in [
                (sum, nu_sum, onnx_sum_32, 2),
                (sq_diff, nu_sq_diff, onnx_sq_diff_32, 2),
                (rel_diff, nu_rel_diff, onnx_rel_diff_32, 2),
                (square, nu_square, onnx_square_32, 1),
                (cube, nu_cube, onnx_cube_32, 1)]:
            sample = samples[n_inputs - 1]
            # Same order as before: numpy, then numba, then onnx. The
            # "fct" column always carries the plain function's name so
            # the three engines share one label.
            for name, fct in [('numpy', fct1), ('numba', fct2),
                              ('onnx', fct3)]:
                r = _measure(fct, sample, number)
                r.update(dict(dtype=dtype, name=name, n=n,
                              fct=fct1.__name__))
                obs.append(r)

df = pandas.DataFrame(obs)
print(df)

Out:

  0%|          | 0/6 [00:00<?, ?it/s]
 17%|#6        | 1/6 [00:08<00:40,  8.09s/it]
 33%|###3      | 2/6 [00:09<00:17,  4.40s/it]
 50%|#####     | 3/6 [00:12<00:10,  3.42s/it]
 67%|######6   | 4/6 [00:18<00:08,  4.39s/it]
 83%|########3 | 5/6 [01:01<00:18, 18.55s/it]
100%|##########| 6/6 [01:42<00:00, 26.16s/it]
100%|##########| 6/6 [01:42<00:00, 17.10s/it]
      average     deviation  min_exec  ...   name        n      fct
0    0.000005  9.411357e-08  0.000005  ...  numpy       10      sum
1    0.000012  1.614372e-07  0.000011  ...  numba       10      sum
2    0.000148  8.374489e-07  0.000148  ...   onnx       10      sum
3    0.000014  1.436907e-07  0.000014  ...  numpy       10  sq_diff
4    0.000012  1.383735e-07  0.000012  ...  numba       10  sq_diff
..        ...           ...       ...  ...    ...      ...      ...
175  0.002935  3.809907e-05  0.002877  ...  numba  1000000   square
176  0.004226  7.632749e-05  0.004118  ...   onnx  1000000   square
177  0.173547  3.042716e-05  0.173500  ...  numpy  1000000     cube
178  0.003174  4.761741e-06  0.003170  ...  numba  1000000     cube
179  0.004159  1.051921e-04  0.004047  ...   onnx  1000000     cube

[180 rows x 12 columns]

Graphs #

# One row of plots per benchmarked function, one column per dtype;
# each plot shows the average time against n, one line per engine.
fcts = sorted(set(df["fct"]))
fig, ax = plt.subplots(len(fcts), 2, figsize=(14, len(fcts) * 3))

panels = [(numpy.float32, "fct=%s - float32"),
          (numpy.float64, "fct=%s - float64")]

for i, fn in enumerate(fcts):
    for j, (dt, title) in enumerate(panels):
        subset = df[(df["fct"] == fn) & (df["dtype"] == dt)]
        piv = pandas.pivot(data=subset,
                           index="n", columns="name", values="average")
        piv.plot(title=title % fn,
                 logx=True, logy=True, ax=ax[i, j])
plt.show()

fct=cube - float32, fct=cube - float64, fct=rel_diff - float32, fct=rel_diff - float64, fct=sq_diff - float32, fct=sq_diff - float64, fct=square - float32, fct=square - float64, fct=sum - float32, fct=sum - float64

Total running time of the script: ( 1 minutes 56.999 seconds)

Gallery generated by Sphinx-Gallery

Benchmark Random Forests, Tree Ensemble

Compares implementations of Add

Compares numba, numpy, onnxruntime for simple functions#

The functions#

ONNX version#

numba optimized#

Benchmark#

Graphs#