TopK benchmark#
This example compares onnxruntime and mlprodict implementations of operator TopK. We compare the two runtimes by computing the ratio of their execution times and displaying it in the following kind of graphs.
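The metric displayed in those graphs is simply the ratio of two measured execution times. A minimal sketch of the idea (the two numpy calls below are placeholders, not the implementations compared later):

from timeit import timeit
import numpy

x = numpy.random.randn(1000, 100).astype(numpy.float32)
# time two candidate implementations of the same operation
t_a = timeit(lambda: numpy.sort(x, axis=1), number=100)
t_b = timeit(lambda: numpy.partition(x, 5, axis=1), number=100)
# a ratio below 1 means the second implementation is faster
print("ratio:", t_b / t_a)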
Graph to compare performance#
from numpy.random import randn
import numpy
import matplotlib.pyplot as plt
from pandas import DataFrame
from onnxruntime import InferenceSession, __version__ as ort_version
from tqdm import tqdm
from cpyquickhelper.numbers import measure_time
from pyquickhelper.pycode.profiling import profile
from skl2onnx.algebra.onnx_ops import OnnxTopK_11
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.algebra.onnx_ops import OnnxTopK
from mlprodict.onnxrt.validate.validate_benchmark import benchmark_fct
from mlprodict.onnxrt import OnnxInference
from mlprodict.onnxrt.ops_cpu.op_topk import (
topk_sorted_implementation, topk_sorted_implementation_cpp)
from mlprodict import __version__ as mlp_version
from mlprodict.plotting.plotting import plot_benchmark_metrics
Available optimisation on this machine.
from mlprodict.testing.experimental_c_impl.experimental_c import code_optimisation
print(code_optimisation())
Out:
AVX-omp=8
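The string above presumably reports the instruction set (AVX) and the number of openmp threads. The core count can be checked independently, for instance (a quick sketch):

import multiprocessing
# number of logical cores available on this machine
print(multiprocessing.cpu_count())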
Graph.
def plot_metric(metric, ax=None, xlabel="N", ylabel="k", middle=1.,
                transpose=False, shrink=1.0, title=None):
    ax, cbar = plot_benchmark_metrics(
        metric, ax=ax, xlabel=xlabel, ylabel=ylabel, middle=middle,
        transpose=transpose, cbar_kw={'shrink': shrink})
    if title is not None:
        ax.set_title(title)
    return ax


data = {(1, 1): 0.1, (10, 1): 1, (1, 10): 2,
        (10, 10): 100, (100, 1): 100, (100, 10): 1000}

fig, ax = plt.subplots(1, 2, figsize=(10, 4))
plot_metric(data, ax[0], shrink=0.6)
Out:
<AxesSubplot:xlabel='N', ylabel='k'>
plot_metric(data, ax[1], transpose=True)
Out:
<AxesSubplot:xlabel='k', ylabel='N'>
TopK in ONNX#
The following lines create an ONNX graph containing a single TopK node. The outcome is the textual representation of the resulting ONNX graph.
X32 = randn(100000, 100).astype(numpy.float32)

node = OnnxTopK_11('X', numpy.array([5], dtype=numpy.int64),
                   output_names=['dist', 'ind'])

model_onnx = node.to_onnx(
    [('X', X32)], target_opset=12,
    # shape inference does not seem to work in onnxruntime
    # so we specify the output shape
    outputs=[('dist', X32[:1, :5]),
             ('ind', X32[:1, :5].astype(numpy.int64))])
model_onnx
Out:
ir_version: 6
producer_name: "skl2onnx"
producer_version: "1.11.1"
domain: "ai.onnx"
model_version: 0
graph {
  node {
    input: "X"
    input: "To_TopKcst"
    output: "dist"
    output: "ind"
    name: "To_TopK"
    op_type: "TopK"
    domain: ""
  }
  name: "OnnxTopK_11"
  initializer {
    dims: 1
    data_type: 7
    int64_data: 5
    name: "To_TopKcst"
  }
  input {
    name: "X"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
          }
          dim {
            dim_value: 100
          }
        }
      }
    }
  }
  output {
    name: "dist"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
          }
          dim {
            dim_value: 5
          }
        }
      }
    }
  }
  output {
    name: "ind"
    type {
      tensor_type {
        elem_type: 7
        shape {
          dim {
          }
          dim {
            dim_value: 5
          }
        }
      }
    }
  }
}
opset_import {
  domain: ""
  version: 11
}
That gives…
oinf = OnnxInference(model_onnx, runtime="python")
res = oinf.run({'X': X32})
dist, ind = res['dist'], res['ind']
dist[:2], ind[:2]
Out:
(array([[2.0272512, 1.8558152, 1.8149445, 1.4867951, 1.3442959],
[2.6460865, 1.7889596, 1.7093822, 1.4970592, 1.4434265]],
dtype=float32), array([[64, 77, 9, 61, 47],
[ 9, 25, 36, 44, 6]]))
With onnxruntime.
sess = InferenceSession(model_onnx.SerializeToString())
dist, ind = sess.run(None, {'X': X32})
dist[:2], ind[:2]
Out:
(array([[2.0272512, 1.8558152, 1.8149445, 1.4867951, 1.3442959],
[2.6460865, 1.7889596, 1.7093822, 1.4970592, 1.4434265]],
dtype=float32), array([[64, 77, 9, 61, 47],
[ 9, 25, 36, 44, 6]], dtype=int64))
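Both runtimes return the same top 5 values. As a quick sanity check (a sketch, not part of the original example), the values can be recovered with plain numpy:

ref_ind = numpy.argsort(-X32, axis=1)[:, :5]
ref_dist = numpy.take_along_axis(X32, ref_ind, axis=1)
# values must match the TopK output above (indices may differ on ties)
numpy.testing.assert_allclose(ref_dist, dist)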
Let’s compare two implementations.
def benchmark(X, fct1, fct2, N, repeat=10, number=10):

    def ti(n):
        if n <= 1:
            return 50
        if n <= 1000:
            return 2
        if n <= 10000:
            return 0.51
        return 0.11

    # to warm up the engine
    time_kwargs = {n: dict(repeat=10, number=10) for n in N[:2]}
    benchmark_fct(fct1, X, time_kwargs=time_kwargs, skip_long_test=False)
    benchmark_fct(fct2, X, time_kwargs=time_kwargs, skip_long_test=False)
    # real measure
    time_kwargs = {n: dict(repeat=int(repeat * ti(n)),
                           number=int(number * ti(n))) for n in N}
    res1 = benchmark_fct(fct1, X, time_kwargs=time_kwargs,
                         skip_long_test=False)
    res2 = benchmark_fct(fct2, X, time_kwargs=time_kwargs,
                         skip_long_test=False)
    res = {}
    for r in sorted(res1):
        r1 = res1[r]
        r2 = res2[r]
        ratio = r2['ttime'] / r1['ttime']
        res[r] = ratio
    return res


N = [1, 10, 100, 1000, 10000, 100000]

res = benchmark(X32, lambda x: sess.run(None, {'X': x}),
                lambda x: oinf.run({'X': x}), N=N)
res
Out:
{1: 1.3491936839804246, 10: 1.2996703310122397, 100: 24.31770123598381, 1000: 11.200222562568541, 10000: 2.680006426590349, 100000: 1.0835793513909697}
The two implementations are close for a few rows and for a very large number of rows; in between, onnxruntime is significantly faster. For many rows, the mlprodict implementation catches up because it relies on openmp to parallelise the computation.
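One way to look at these ratios directly, as a quick sketch: a value above 1 means onnxruntime is faster, below 1 means mlprodict is faster.

df_ratio = DataFrame(sorted(res.items()), columns=["N", "ratio"])
df_ratio.set_index("N").plot(logx=True, title="mlprodict / onnxruntime")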
C++ implementation vs numpy#
scikit-learn uses numpy to compute the top k elements.
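A numpy-based top-k typically combines argpartition, which selects the k best candidates in linear time, with a final argsort to order them. The sketch below illustrates the idea for the largest values; it is not the exact code of topk_sorted_implementation.

def numpy_topk(x, k):
    # indices of the k largest values of each row, in no particular order
    part = numpy.argpartition(-x, k - 1, axis=1)[:, :k]
    vals = numpy.take_along_axis(x, part, axis=1)
    # sort those k candidates in decreasing order
    order = numpy.argsort(-vals, axis=1)
    ind = numpy.take_along_axis(part, order, axis=1)
    return numpy.take_along_axis(x, ind, axis=1), ind

numpy_topk(X32[:2], 5)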
res = benchmark(X32, lambda x: topk_sorted_implementation(x, 5, 1, 0),
                lambda x: topk_sorted_implementation_cpp(x, 5, 1, 0), N=N)
res
Out:
{1: 0.3050468267705555, 10: 0.29974319199162397, 100: 8.586906612338494, 1000: 1.8539561753429743, 10000: 0.3519928839345043, 100000: 0.13709498200454023}
The C++ implementation seems faster as well, except for intermediate sizes. Let's profile the numpy implementation.
xr = randn(1000000, 100)

text = profile(lambda: topk_sorted_implementation(xr, 5, 1, 0),
               pyinst_format='text')[1]
print(text)
Out:
pyinstrument v4.1.1 -- Recorded: 05:48:31 AM  Samples: 5
Duration: 6.664  CPU time: 6.599
Program: /var/lib/jenkins/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_onnx_topk.py

6.664 profile  ../pycode/profiling.py:457
`- 6.664 <lambda>  plot_op_onnx_topk.py:177
      [15 frames hidden]  plot_op_onnx_topk, mlprodict, <__arra...
         5.336 ndarray.argpartition  <built-in>:0
Parallelisation#
We need to disable the parallelisation to really compare both implementations.
def benchmark_test(X, fct1, fct2, N, K, repeat=10, number=10):
    res = {}
    for k in tqdm(K):
        def f1(x, k=k): return fct1(x, k=k)
        def f2(x, k=k): return fct2(x, k=k)
        r = benchmark(X, f1, f2, N=N, repeat=repeat, number=number)
        for n, v in r.items():
            res[n, k] = v
    return res
K = [1, 2, 5, 10, 15]
N = [1, 2, 3, 10, 100, 1000, 10000]
bench_para = benchmark_test(
    X32, (lambda x, k: topk_sorted_implementation_cpp(
        x, k=k, axis=1, largest=0, th_para=100000000)),
    (lambda x, k: topk_sorted_implementation_cpp(
        x, k=k, axis=1, largest=0, th_para=1)),
    N=N, K=K)
bench_para
Out:
0%| | 0/5 [00:00<?, ?it/s]
20%|## | 1/5 [00:29<01:56, 29.22s/it]
40%|#### | 2/5 [01:02<01:34, 31.46s/it]
60%|###### | 3/5 [01:35<01:04, 32.47s/it]
80%|######## | 4/5 [02:06<00:31, 31.87s/it]
100%|##########| 5/5 [02:40<00:00, 32.36s/it]
100%|##########| 5/5 [02:40<00:00, 32.02s/it]
{(1, 1): 1.0019932270429421, (2, 1): 220.80063479781504, (3, 1): 221.67549421569402, (10, 1): 69.233887837786, (100, 1): 24.453764003632923, (1000, 1): 20.776148018831723, (10000, 1): 2.5208520860435053, (1, 2): 0.9953727637513775, (2, 2): 72.22934386606052, (3, 2): 220.9212633588702, (10, 2): 194.68582938198458, (100, 2): 82.42947243987165, (1000, 2): 12.475049139663017, (10000, 2): 1.4807891809287688, (1, 5): 0.9976739836905998, (2, 5): 215.0298508146255, (3, 5): 93.43249284395424, (10, 5): 158.9577147990169, (100, 5): 48.49672393457181, (1000, 5): 5.738895557417205, (10000, 5): 0.8801927954561015, (1, 10): 0.9968480030454562, (2, 10): 106.72026978546145, (3, 10): 100.2405380008792, (10, 10): 47.303245103046756, (100, 10): 23.654833569670085, (1000, 10): 2.610032781828763, (10000, 10): 0.5377695979668052, (1, 15): 0.9952193911710505, (2, 15): 113.64538418489951, (3, 15): 68.265585458802, (10, 15): 111.67960543582143, (100, 15): 8.761230393816833, (1000, 15): 2.300441861980231, (10000, 15): 0.44169301848551096}
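Besides the heatmap below, the dictionary can be reshaped into a table for easier inspection (a quick sketch using pandas):

df_para = DataFrame([(n, k, v) for (n, k), v in bench_para.items()],
                    columns=["N", "k", "ratio"])
df_para.pivot(index="N", columns="k", values="ratio")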
As a graph.
plot_metric(bench_para, transpose=False, title="TopK and parallelisation\n"
            "< 1 means parallelisation is faster", shrink=0.75)
Out:
<AxesSubplot:title={'center':'TopK and parallelisation\n< 1 means parallelisation is faster'}, xlabel='N', ylabel='k'>
This is somewhat expected. The first column is close to 1 because the same code is being compared. The next columns are red, meaning the parallelisation does not pay off there; it only helps for a large enough number of rows, here around 10000 depending on k.
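The sketch below illustrates why such a threshold exists; it is plain Python with a thread pool, not mlprodict's actual C++/openmp code, and th_para is just an illustrative parameter. Splitting the rows across workers only pays off once there are enough rows to amortise the dispatch overhead.

from concurrent.futures import ThreadPoolExecutor

def topk_maybe_parallel(x, k, th_para=50, n_workers=4):
    def block_topk(block):
        # top k values of each row of the block, sorted in decreasing order
        part = numpy.argpartition(-block, k - 1, axis=1)[:, :k]
        vals = numpy.take_along_axis(block, part, axis=1)
        return -numpy.sort(-vals, axis=1)

    if x.shape[0] < th_para:
        # serial path: cheaper when there are only a few rows
        return block_topk(x)
    # parallel path: process chunks of rows concurrently
    chunks = numpy.array_split(x, n_workers)
    with ThreadPoolExecutor(n_workers) as ex:
        return numpy.vstack(list(ex.map(block_topk, chunks)))

topk_maybe_parallel(X32, 5).shape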
Parallelisation with ONNX#
We replicate the same experiment with an ONNX graph.
k_ = numpy.array([3], dtype=numpy.int64)

node = OnnxTopK_11('X', 'k',
                   output_names=['dist', 'ind'])

model_onnx = node.to_onnx(
    [('X', X32), ('k', k_)], target_opset=12,
    # shape inference does not seem to work in onnxruntime
    # so we specify the output shape
    outputs=[('dist', X32[:1, :5]),
             ('ind', X32[:1, :5].astype(numpy.int64))])
Test
oinf_no_para = OnnxInference(model_onnx, runtime="python")
res = oinf_no_para.run({'X': X32, 'k': k_})
dist, ind = res['dist'], res['ind']
dist[:2], ind[:2]
Out:
(array([[2.0272512, 1.8558152, 1.8149445],
[2.6460865, 1.7889596, 1.7093822]], dtype=float32), array([[64, 77, 9],
[ 9, 25, 36]]))
Let’s play with the thresholds triggering the parallelisation.
oinf_para = OnnxInference(model_onnx, runtime="python")
oinf_no_para.sequence_[0].ops_.th_para = 100000000
oinf_para.sequence_[0].ops_.th_para = 1
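Reading the attributes back confirms the two instances now only differ by this threshold:

print(oinf_no_para.sequence_[0].ops_.th_para,
      oinf_para.sequence_[0].ops_.th_para)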
Results.
bench_onnx_para = benchmark_test(
    X32, (lambda x, k: oinf_no_para.run(
        {'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    (lambda x, k: oinf_para.run(
        {'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    N=N, K=K)
bench_onnx_para
Out:
0%| | 0/5 [00:00<?, ?it/s]
20%|## | 1/5 [01:05<04:23, 65.95s/it]
40%|#### | 2/5 [02:14<03:22, 67.63s/it]
60%|###### | 3/5 [03:20<02:13, 67.00s/it]
80%|######## | 4/5 [04:36<01:10, 70.22s/it]
100%|##########| 5/5 [05:48<00:00, 71.15s/it]
100%|##########| 5/5 [05:48<00:00, 69.79s/it]
{(1, 1): 0.975706691683112, (2, 1): 49.601354337456144, (3, 1): 46.49450110342639, (10, 1): 38.84852373728811, (100, 1): 26.857226943795812, (1000, 1): 9.199350707769051, (10000, 1): 1.6043779750671139, (1, 2): 0.9938224338698936, (2, 2): 25.688027590350895, (3, 2): 49.503092367856425, (10, 2): 50.84182742272852, (100, 2): 42.41523442344668, (1000, 2): 10.559183958771861, (10000, 2): 1.4574175507872738, (1, 5): 1.0243738161505538, (2, 5): 26.401865587344624, (3, 5): 48.52095957466945, (10, 5): 33.63194736112299, (100, 5): 11.760348391353677, (1000, 5): 5.091039893474074, (10000, 5): 0.8102099686933483, (1, 10): 0.9929557823809304, (2, 10): 71.13648945978277, (3, 10): 69.07119010305276, (10, 10): 60.04878730343963, (100, 10): 18.875668529491847, (1000, 10): 3.2351301490538584, (10000, 10): 0.5358086314400785, (1, 15): 0.9992351719888004, (2, 15): 21.485182730786452, (3, 15): 37.439324416499154, (10, 15): 52.62069973791653, (100, 15): 14.962330180210555, (1000, 15): 2.5303630078697097, (10000, 15): 0.44418851754702626}
As a graph.
plot_metric(bench_onnx_para, transpose=False,
            title="TopK and parallelisation with ONNX\n< 1 means "
            "parallelisation is faster", shrink=0.75)
Out:
<AxesSubplot:title={'center':'TopK and parallelisation with ONNX\n< 1 means parallelisation is faster'}, xlabel='N', ylabel='k'>
Pretty much the same results.
onnxruntime vs mlprodict (no parallelisation)#
sess = InferenceSession(model_onnx.SerializeToString())

bench_ort = benchmark_test(
    X32, (lambda x, k: sess.run(
        None, {'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    (lambda x, k: oinf_no_para.run(
        {'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    N=N, K=K)
bench_ort
Out:
0%| | 0/5 [00:00<?, ?it/s]
20%|## | 1/5 [00:50<03:23, 50.79s/it]
40%|#### | 2/5 [01:42<02:34, 51.60s/it]
60%|###### | 3/5 [02:35<01:43, 51.95s/it]
80%|######## | 4/5 [03:30<00:53, 53.04s/it]
100%|##########| 5/5 [04:25<00:00, 54.08s/it]
100%|##########| 5/5 [04:25<00:00, 53.19s/it]
{(1, 1): 1.2399499156868747, (2, 1): 1.2094894848183175, (3, 1): 1.2105110564726131, (10, 1): 1.19044942514807, (100, 1): 1.1083407957783988, (1000, 1): 0.9113368931400883, (10000, 1): 3.085530624873318, (1, 2): 1.2523970421376955, (2, 2): 1.2588352129887856, (3, 2): 1.260563258463255, (10, 2): 1.2399222644514638, (100, 2): 1.1154792452364959, (1000, 2): 0.9348969271371034, (10000, 2): 3.3511550574368503, (1, 5): 1.2029171752316001, (2, 5): 1.1875865586033656, (3, 5): 1.1836905404868134, (10, 5): 1.1459635885564035, (100, 5): 1.006216120685874, (1000, 5): 2.291875340616187, (10000, 5): 3.2273500464994513, (1, 10): 1.2234724125898908, (2, 10): 1.2381808356335182, (3, 10): 1.2273196255323788, (10, 10): 1.1613369045063797, (100, 10): 0.9700041883458369, (1000, 10): 3.019908951236195, (10000, 10): 3.2239363879588776, (1, 15): 1.210372412861998, (2, 15): 1.2275438077356726, (3, 15): 1.214728783284041, (10, 15): 1.1342570067644326, (100, 15): 0.9274965821838104, (1000, 15): 3.07315536998904, (10000, 15): 3.203859125109382}
As a graph.
plot_metric(bench_ort, transpose=False,
            title="TopK, onnxruntime vs mlprodict\n< 1 means mlprodict "
            "is faster\nno parallelisation", shrink=0.75)
Out:
<AxesSubplot:title={'center':'TopK, onnxruntime vs mlprodict\n< 1 means mlprodict is faster\nno parallelisation'}, xlabel='N', ylabel='k'>
It seems the implementation of operator TopK in onnxruntime can still be improved for the configurations where the ratio drops below 1: mlprodict is faster there even without parallelisation.
Versions:
ort_version, mlp_version
Out:
('1.11.0', '0.8.1762')
And with parallelisation above 50 rows.
oinf_para.sequence_[0].ops_.th_para = 50

bench_ort_para = benchmark_test(
    X32, (lambda x, k: sess.run(
        None, {'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    (lambda x, k: oinf_para.run(
        {'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    N=N, K=K)
bench_ort_para
Out:
0%| | 0/5 [00:00<?, ?it/s]
20%|## | 1/5 [00:56<03:47, 57.00s/it]
40%|#### | 2/5 [01:51<02:46, 55.58s/it]
60%|###### | 3/5 [02:48<01:52, 56.19s/it]
80%|######## | 4/5 [03:45<00:56, 56.50s/it]
100%|##########| 5/5 [04:44<00:00, 57.41s/it]
100%|##########| 5/5 [04:44<00:00, 56.90s/it]
{(1, 1): 1.2505949602987918, (2, 1): 1.2575817835047145, (3, 1): 1.2518761364734767, (10, 1): 1.2435556169333721, (100, 1): 54.17368450163326, (1000, 1): 17.33084434239648, (10000, 1): 7.8733501828184815, (1, 2): 1.2151734105527625, (2, 2): 1.2534168980600628, (3, 2): 1.251783848412169, (10, 2): 1.2366884322143938, (100, 2): 33.3121952674599, (1000, 2): 4.90139308882305, (10000, 2): 4.171299002833656, (1, 5): 1.2168394636681619, (2, 5): 1.2456791695217482, (3, 5): 1.2323706642295496, (10, 5): 1.1981445249633078, (100, 5): 27.577476020804642, (1000, 5): 9.816508210814916, (10000, 5): 2.5538622256802945, (1, 10): 1.195007625008709, (2, 10): 1.183188460079545, (3, 10): 1.1691493752968305, (10, 10): 1.1190647081525935, (100, 10): 10.066629137023765, (1000, 10): 8.67901152305891, (10000, 10): 1.7435514623457984, (1, 15): 1.2172619035593142, (2, 15): 1.233174811429135, (3, 15): 1.216972500976091, (10, 15): 1.126908399446414, (100, 15): 9.799250815552321, (1000, 15): 6.421975992523156, (10000, 15): 1.4566611547745267}
As a graph.
plot_metric(bench_ort_para, transpose=False,
            title="TopK, onnxruntime vs mlprodict\n< 1 means mlprodict "
            "is faster\nparallelisation above 50 rows", shrink=0.75)
Out:
<AxesSubplot:title={'center':'TopK, onnxruntime vs mlprodict\n< 1 means mlprodict is faster\nparallelisation above 50 rows'}, xlabel='N', ylabel='k'>
onnxruntime and mlprodict implement the same algorithm. The only difference comes from the threshold which triggers the parallelisation; it depends on the machine. That explains the difference in time for 100 observations.
X = numpy.array([
    [0, 1, 2, 3],
    [4, 5, 6, 7],
    [8, 9, 10, 11],
], dtype=numpy.float32)

K = numpy.array([3], dtype=numpy.int64)

node = OnnxTopK('X', K, output_names=['values', 'indices'],
                op_version=12)
onx = node.to_onnx([('X', FloatTensorType())])
py_topk = OnnxInference(onx, runtime="python_compiled")
ort_topk = InferenceSession(onx.SerializeToString())
Check the outputs.
r1 = py_topk.run({'X': X})
r1
Out:
{'values': array([[ 3., 2., 1.],
[ 7., 6., 5.],
[11., 10., 9.]], dtype=float32), 'indices': array([[3, 2, 1],
[3, 2, 1],
[3, 2, 1]])}
r2 = ort_topk.run(None, {'X': X})
r2
Out:
[array([[ 3., 2., 1.],
[ 7., 6., 5.],
[11., 10., 9.]], dtype=float32), array([[3, 2, 1],
[3, 2, 1],
[3, 2, 1]], dtype=int64)]
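Both runtimes return the same values and indices; a quick check with the objects above (a sketch):

numpy.testing.assert_allclose(r1['values'], r2[0])
numpy.testing.assert_array_equal(r1['indices'], r2[1])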
Some figures.
bs = []

bs.append(measure_time(lambda: py_topk.run({'X': X}),
                       context=globals(), div_by_number=True))
bs[-1]['c'] = 'py'
bs[-1]
Out:
{'average': 6.372089684009551e-05, 'deviation': 5.553969884520887e-07, 'min_exec': 6.322313100099564e-05, 'max_exec': 6.524350494146346e-05, 'repeat': 10, 'number': 50, 'ttime': 0.0006372089684009551, 'context_size': 2272, 'c': 'py'}
bs.append(measure_time(lambda: ort_topk.run(None, {'X': X}),
                       context=globals(), div_by_number=True))
bs[-1]['c'] = 'or'
bs[-1]
Out:
{'average': 6.829253584146499e-05, 'deviation': 6.741774447923558e-07, 'min_exec': 6.774332374334335e-05, 'max_exec': 7.021646946668625e-05, 'repeat': 10, 'number': 50, 'ttime': 0.00068292535841465, 'context_size': 2272, 'c': 'or'}
X = numpy.random.randn(10000, 100).astype(numpy.float32)

bs.append(measure_time(lambda: py_topk.run({'X': X}),
                       context=globals(), div_by_number=True))
bs[-1]['c'] = 'py-100'
bs[-1]
Out:
{'average': 0.010219269327819348, 'deviation': 0.00018311647805134725, 'min_exec': 0.009838960394263267, 'max_exec': 0.010480948910117149, 'repeat': 10, 'number': 50, 'ttime': 0.10219269327819347, 'context_size': 2272, 'c': 'py-100'}
bs.append(measure_time(lambda: ort_topk.run(None, {'X': X}),
                       context=globals(), div_by_number=True))
bs[-1]['c'] = 'ort-100'
bs[-1]
Out:
{'average': 0.0026605716310441493, 'deviation': 1.4536568251660117e-05, 'min_exec': 0.0026473639160394667, 'max_exec': 0.0026886112987995147, 'repeat': 10, 'number': 50, 'ttime': 0.026605716310441493, 'context_size': 2272, 'c': 'ort-100'}
df = DataFrame(bs)
df
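A quick way to visualise these timings, as a sketch assuming the DataFrame built above with columns 'c' and 'average':

ax = df.set_index('c')['average'].plot(
    kind='bar', logy=True, rot=0, title="TopK average time per run")
ax.set_ylabel("seconds")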
Total running time of the script: ( 19 minutes 48.620 seconds)