Compares implementations of ReduceSumSquare#

This example compares the numpy implementation of the operator ReduceSumSquare to the onnxruntime implementation. If available, tensorflow and pytorch are included as well.

Available optimisation#

The code below shows which parallelisation optimisations (AVX or SSE) can be used and how many processors are available.

import numpy
import pandas
import matplotlib.pyplot as plt
from onnxruntime import InferenceSession
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.algebra.onnx_ops import OnnxReduceSumSquare
from cpyquickhelper.numbers import measure_time
from tqdm import tqdm
from mlprodict.testing.experimental_c_impl.experimental_c import code_optimisation
print(code_optimisation())

Out:

AVX-omp=8

ReduceSumSquare implementations#

try:
    from tensorflow.math import reduce_sum as tf_reduce_sum
    from tensorflow import convert_to_tensor
except ImportError:
    tf_reduce_sum = None
try:
    from torch import sum as torch_sum, from_numpy
except ImportError:
    torch_sum = None


def build_ort_reducesumsquare(axes, op_version=14):  # opset=13, 14, ...
    node = OnnxReduceSumSquare('x', axes=axes, op_version=op_version,
                               output_names=['z'])
    onx = node.to_onnx(inputs=[('x', FloatTensorType())],
                       target_opset=op_version)
    sess = InferenceSession(onx.SerializeToString())
    return lambda x, y: sess.run(None, {'x': x})


def loop_fct(fct, xs, ys):
    for x, y in zip(xs, ys):
        fct(x, y)


def benchmark_op(axes, repeat=2, number=5, name="ReduceSumSquare", shape_fct=None):
    if shape_fct is None:
        def shape_fct(dim):
            return (3, dim, 1, 128, 64)
    ort_fct = build_ort_reducesumsquare(axes)
    res = []
    for dim in tqdm([8, 16, 32, 64, 100, 128, 200,
                     256, 400, 512, 1024]):
        shape = shape_fct(dim)
        n_arrays = 10 if dim < 512 else 4
        xs = [numpy.random.rand(*shape).astype(numpy.float32)
              for _ in range(n_arrays)]
        ys = [numpy.array(axes, dtype=numpy.int64)
              for _ in range(n_arrays)]
        info = dict(axes=axes, shape=shape)

        # numpy
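        # Note: '*y' unpacks the axes array into numpy.sum's positional
        # arguments (axis, dtype, out). With three reduced axes the third
        # value is taken as 'out', which raises
        # "TypeError: output must be an array" (see the RKR case below).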
        ctx = dict(
            xs=xs, ys=ys,
            fct=lambda x, y: numpy.sum(x ** 2, *y),
            loop_fct=loop_fct)
        obs = measure_time(
            "loop_fct(fct, xs, ys)",
            div_by_number=True, context=ctx, repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = 'numpy'
        obs.update(info)
        res.append(obs)

        # onnxruntime
        ctx['fct'] = ort_fct
        obs = measure_time(
            "loop_fct(fct, xs, ys)",
            div_by_number=True, context=ctx, repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = 'ort'
        obs.update(info)
        res.append(obs)

        if tf_reduce_sum is not None:
            # tensorflow
            ctx['fct'] = lambda x, y: tf_reduce_sum(x ** 2, y)
            ctx['xs'] = [convert_to_tensor(x) for x in xs]
            ctx['ys'] = ys
            obs = measure_time(
                "loop_fct(fct, xs, ys)",
                div_by_number=True, context=ctx, repeat=repeat, number=number)
            obs['dim'] = dim
            obs['fct'] = 'tf'
            obs.update(info)
            res.append(obs)

        if torch_sum is not None:
            def torch_sum1(x, y):
                return torch_sum(x ** 2, y[0])

            def torch_sum2(x, y):
                return torch_sum(torch_sum(x ** 2, y[1]), y[0])

            # torch
            ctx['fct'] = torch_sum1 if len(axes) == 1 else torch_sum2
            ctx['xs'] = [from_numpy(x) for x in xs]
            ctx['ys'] = ys  # [from_numpy(y) for y in ys]
            obs = measure_time(
                "loop_fct(fct, xs, ys)",
                div_by_number=True, context=ctx, repeat=repeat, number=number)
            obs['dim'] = dim
            obs['fct'] = 'torch'
            obs.update(info)
            res.append(obs)

    # Dataframes
    shape_name = str(shape).replace(str(dim), "N")
    df = pandas.DataFrame(res)
    df.columns = [_.replace('dim', 'N') for _ in df.columns]
    piv = df.pivot('N', 'fct', 'average')

    rs = piv.copy()
    for c in ['ort', 'torch', 'tf', 'tf_copy']:
        if c in rs.columns:
            rs[c] = rs['numpy'] / rs[c]
    rs['numpy'] = 1.

    # Graphs.
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    piv.plot(logx=True, logy=True, ax=ax[0],
             title="%s benchmark\n%r - %r"
                   " lower better" % (name, shape_name, axes))
    ax[0].legend(prop={"size": 9})
    rs.plot(logx=True, logy=True, ax=ax[1],
            title="%s Speedup, baseline=numpy\n%r - %r"
                  " higher better" % (name, shape_name, axes))
    ax[1].plot([min(rs.index), max(rs.index)], [0.5, 0.5], 'g--')
    ax[1].plot([min(rs.index), max(rs.index)], [2., 2.], 'g--')
    ax[1].legend(prop={"size": 9})
    return df, rs, ax


dfs = []
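
Before looking at particular cases, a quick sanity check (not part of the original script) confirms that the ONNX ReduceSumSquare node returns the same values as numpy.sum(x ** 2, axis=...). The helper name check_reducesumsquare below is made up for this illustration; it assumes the imports and build_ort_reducesumsquare defined above, and that ReduceSumSquare keeps the reduced dimensions by default (keepdims=1).

def check_reducesumsquare(shape=(3, 4, 5), axes=(1, )):
    # Compare the onnxruntime result with the numpy formula.
    x = numpy.random.rand(*shape).astype(numpy.float32)
    ort_fct = build_ort_reducesumsquare(axes)
    expected = numpy.sum(x ** 2, axis=axes)
    got = ort_fct(x, None)[0]
    # ReduceSumSquare keeps the reduced dimensions, drop them before comparing.
    numpy.testing.assert_allclose(expected, got.squeeze(axis=axes), rtol=1e-5)


check_reducesumsquare()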

Reduction on a particular case KR#

Consecutive non-reduced axes and consecutive reduced axes are merged before the reduction. KR means a block of kept axes followed by a block of reduced axes.
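
For instance (a sketch that is not in the original script), reducing (8, 24, 48, N) over axis 3 is a KR case: the three kept axes can be merged, so the reduction is equivalent to reducing a 2-D array of shape (8 * 24 * 48, N) over its last axis.

x = numpy.random.rand(8, 24, 48, 10).astype(numpy.float32)
red_kr = numpy.sum(x ** 2, axis=3)
red_merged = numpy.sum(x.reshape((8 * 24 * 48, 10)) ** 2, axis=1)
# Both computations return the same values, only the shape differs.
numpy.testing.assert_allclose(red_kr.ravel(), red_merged, rtol=1e-5)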

(8, 24, 48, N), axis=(3, )#

axes = (3, )
df, piv, ax = benchmark_op(axes, shape_fct=lambda dim: (8, 24, 48, dim))
dfs.append(df)
df.pivot("fct", "N", "average")
Figure: ReduceSumSquare benchmark '(8, 24, 48, N)' - (3,), lower is better (left); ReduceSumSquare speedup, baseline=numpy, higher is better (right).

Out:

100%|##########| 11/11 [00:27<00:00,  2.54s/it]
Average time in seconds:

fct \ N         8        16        32        64       100       128       200       256       400       512      1024
numpy    0.006181  0.008826  0.012483  0.018670  0.026495  0.033579  0.046562  0.059994  0.090614  0.047537  0.095109
ort      0.001538  0.002176  0.002677  0.004899  0.007955  0.009179  0.011474  0.014128  0.020230  0.010900  0.018956
torch    0.147824  0.118604  0.167639  0.141806  0.183608  0.157962  0.163983  0.203748  0.210881  0.087258  0.109231


Reduction on a particular case RK#

Consecutive non-reduced axes and consecutive reduced axes are merged before the reduction. RK means a block of reduced axes followed by a block of kept axes.

(8, 24, 48, N), axis=(0, )#

axes = (0, )
df, piv, ax = benchmark_op(axes, shape_fct=lambda dim: (8, 24, 48, dim))
dfs.append(df)
df.pivot("fct", "N", "average")
Figure: ReduceSumSquare benchmark '(8, 24, 48, N)' - (0,), lower is better (left); ReduceSumSquare speedup, baseline=numpy, higher is better (right).

Out:

100%|##########| 11/11 [00:35<00:00,  3.20s/it]
fct \ N         8        16        32        64       100       128       200       256       400       512      1024
numpy    0.002677  0.005719  0.010070  0.017688  0.028340  0.037166  0.057435  0.072518  0.116120  0.057836  0.115867
ort      0.001700  0.002711  0.004898  0.009925  0.013652  0.017818  0.025793  0.032996  0.049789  0.026137  0.049646
torch    0.081948  0.111710  0.154624  0.247651  0.247233  0.214680  0.262958  0.311617  0.313712  0.115689  0.144442


Reduction on a particular case KRK#

Consecutive non-reduced axes and consecutive reduced axes are merged before the reduction. KRK means kept axes - reduced axes - kept axes (see the sketch below).
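
As a sketch (not in the original script), the KRK case with axes=(1, 2) on (8, 24, 48, N) is equivalent to reducing the merged shape (8, 24 * 48, N) over axis 1, which is exactly the next benchmarked case.

x = numpy.random.rand(8, 24, 48, 10).astype(numpy.float32)
red_krk = numpy.sum(x ** 2, axis=(1, 2))
red_merged = numpy.sum(x.reshape((8, 24 * 48, 10)) ** 2, axis=1)
# The merged reduction gives the same result up to float32 rounding.
numpy.testing.assert_allclose(red_krk, red_merged, rtol=1e-4)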

(8, 24, 48, N), axis=(1, 2)#

axes = (1, 2)
df, piv, ax = benchmark_op(axes, shape_fct=lambda dim: (8, 24, 48, dim))
dfs.append(df)
df.pivot("fct", "N", "average")
Figure: ReduceSumSquare benchmark '(8, 24, 48, N)' - (1, 2), lower is better (left); ReduceSumSquare speedup, baseline=numpy, higher is better (right).

Out:

100%|##########| 11/11 [00:45<00:00,  4.15s/it]
fct \ N         8        16        32        64       100       128       200       256       400       512      1024
numpy    0.004968  0.009799  0.017924  0.034020  0.052499  0.066060  0.102772  0.130349  0.199938  0.103928  0.210616
ort      0.001490  0.002371  0.005162  0.010539  0.012890  0.029250  0.028614  0.170020  0.061849  0.136599  0.302943
torch    0.069996  0.082852  0.119642  0.183246  0.199402  0.166115  0.355614  0.376059  0.405998  0.167439  0.194333


(8, 24 * 48, N), axis=1#

axes = (1, )
df, piv, ax = benchmark_op(axes, shape_fct=lambda dim: (8, 24 * 48, dim))
dfs.append(df)
df.pivot("fct", "N", "average")
Figure: ReduceSumSquare benchmark '(8, 1152, N)' - (1,), lower is better (left); ReduceSumSquare speedup, baseline=numpy, higher is better (right).

Out:

100%|##########| 11/11 [00:36<00:00,  3.28s/it]
fct \ N         8        16        32        64       100       128       200       256       400       512      1024
numpy    0.006445  0.008930  0.012796  0.020390  0.028057  0.035834  0.049329  0.061447  0.091979  0.045699  0.089666
ort      0.001496  0.002352  0.004700  0.008626  0.013274  0.026774  0.027203  0.148989  0.063515  0.137514  0.302562
torch    0.174241  0.174849  0.098034  0.187555  0.196886  0.198943  0.194237  0.196828  0.217066  0.092444  0.133390


(2, 8, 12, 24, 2, N), axis=(2, 3)#

axes = (2, 3)
df, piv, ax = benchmark_op(axes, shape_fct=lambda dim: (2, 8, 12, 24, 2, dim))
dfs.append(df)
df.pivot("fct", "N", "average")
Figure: ReduceSumSquare benchmark '(2, 8, 12, 24, 2, N)' - (2, 3), lower is better (left); ReduceSumSquare speedup, baseline=numpy, higher is better (right).

Out:

100%|##########| 11/11 [00:50<00:00,  4.62s/it]
fct \ N         8        16        32        64       100       128       200       256       400       512      1024
numpy    0.005047  0.010037  0.018026  0.034599  0.052939  0.067618  0.103806  0.131711  0.203364  0.104909  0.218271
ort      0.001299  0.002757  0.005273  0.009501  0.009586  0.017647  0.021785  0.060642  0.102327  0.128796  0.288689
torch    0.172208  0.175655  0.177975  0.191151  0.357315  0.377904  0.357312  0.376845  0.387392  0.167647  0.187043


Reduction on a particular case RKR#

(N, 64, 16, 16), axis=(0, 2, 3)#

axes = (0, 2, 3)
df, piv, ax = benchmark_op(
    axes, shape_fct=lambda dim: (dim, 64, 16, 16))
dfs.append(df)
df.pivot("fct", "N", "average")
Traceback (most recent call last):
  File "/var/lib/jenkins/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_reducesumsquare.py", line 234, in <module>
    df, piv, ax = benchmark_op(
  File "/var/lib/jenkins/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_reducesumsquare.py", line 81, in benchmark_op
    obs = measure_time(
  File "/var/lib/jenkins/workspace/mlprodict/mlprodict_UT_39_std/_venv/lib/python3.9/site-packages/cpyquickhelper/numbers/speed_measure.py", line 86, in measure_time
    res = numpy.array(tim.repeat(repeat=repeat, number=number))
  File "/usr/local/lib/python3.9/timeit.py", line 205, in repeat
    t = self.timeit(number)
  File "/usr/local/lib/python3.9/timeit.py", line 177, in timeit
    timing = self.inner(it, self.timer)
  File "<timeit-src>", line 6, in inner
  File "/var/lib/jenkins/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_reducesumsquare.py", line 57, in loop_fct
    fct(x, y)
  File "/var/lib/jenkins/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_reducesumsquare.py", line 79, in <lambda>
    fct=lambda x, y: numpy.sum(x ** 2, *y),
  File "<__array_function__ internals>", line 5, in sum
  File "/var/lib/jenkins/workspace/mlprodict/mlprodict_UT_39_std/_venv/lib/python3.9/site-packages/numpy/core/fromnumeric.py", line 2259, in sum
    return _wrapreduction(a, np.add, 'sum', axis, dtype, out, keepdims=keepdims,
  File "/var/lib/jenkins/workspace/mlprodict/mlprodict_UT_39_std/_venv/lib/python3.9/site-packages/numpy/core/fromnumeric.py", line 86, in _wrapreduction
    return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
TypeError: output must be an array

Reduction on a particular case RKRK#

(8, 24, 48, N), axis=(0, 2)#

axes = (0, 2)
df, piv, ax = benchmark_op(axes, shape_fct=lambda dim: (8, 24, 48, dim))
dfs.append(df)
df.pivot("fct", "N", "average")

Conclusion#

Some of the configurations should be investigated (l-reducesum-problem1). The reduction with tensorflow along a single dimension seems to be lazy.
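
If the tensorflow measurement is suspected to be affected by deferred execution, one possible check (an assumption, not something verified in this example) is to force the result back to numpy inside the timed function:

if tf_reduce_sum is not None:
    # Hypothetical variant: .numpy() copies the EagerTensor back to host
    # memory, so the timing cannot exclude the actual computation.
    def tf_reduce_sum_eager(x, y):
        return tf_reduce_sum(x ** 2, y).numpy()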

merged = pandas.concat(dfs)
name = "reducesumsquare"
merged.to_csv("plot_%s.csv" % name, index=False)
merged.to_excel("plot_%s.xlsx" % name, index=False)
plt.savefig("plot_%s.png" % name)

plt.show()

Total running time of the script: (3 minutes 27.433 seconds)

Gallery generated by Sphinx-Gallery