Note
Click here to download the full example code
Measuring CPU performance#
Processor caches must be taken into account when writing an algorithm; see "Memory part 2: CPU caches" by Ulrich Drepper.
Cache Performance#
from tqdm import tqdm
import matplotlib.pyplot as plt
from pyquickhelper.loghelper import run_cmd
from pandas import DataFrame, concat
from onnx_extended.ext_test_case import unit_test_going
from onnx_extended.validation.cpu._validation import (
benchmark_cache,
benchmark_cache_tree,
)
# Measure ``benchmark_cache`` for sizes growing by ``step`` up to 2**20.
# Each size is measured three times and the smallest value is kept.
obs = []
step = 2**12
for i in tqdm(range(step, 2**20 + step, step)):
    res = min(benchmark_cache(i, False) for _ in range(3))
    if res < 0:
        # overflow
        continue
    obs.append({"size": i, "perf": res})

df = DataFrame(obs)
mean = df.perf.mean()
lag = 32
n_obs = df.shape[0]
for i in range(2, n_obs):
    # ``.loc`` slices by label and includes both bounds of the slice.
    df.loc[i, "smooth"] = df.loc[i - 8 : i + 8, "perf"].median()
    if lag < i < n_obs - lag:
        # ``delta`` compares the mean of the window starting at ``i`` with
        # the values of the window ending at ``i + 1``, shifted by the
        # global mean.
        ahead = df.loc[i : i + lag, "perf"].mean()
        behind = df.loc[i - lag + 1 : i + 1, "perf"]
        df.loc[i, "delta"] = (mean + ahead - behind).mean()
0%| | 0/256 [00:00<?, ?it/s]
48%|####8 | 123/256 [00:00<00:00, 1219.97it/s]
96%|#########5| 245/256 [00:00<00:00, 671.12it/s]
100%|##########| 256/256 [00:00<00:00, 680.30it/s]
Cache size estimator#
# ``argmax`` returns a position; it is then used as a ``.loc`` label, so
# this relies on ``df`` still having its default integer index.
# The estimate is twice the ``size`` stored on the row where ``delta`` peaks.
cache_size_index = int(df["delta"].argmax())
cache_size = 2 * df.loc[cache_size_index, "size"]
print(f"L2 cache size estimation is {cache_size / 2 ** 20:1.3f} Mb.")
L2 cache size estimation is 0.680 Mb.
Verification#
# Print the cache lines reported by ``lscpu``; any failure (for example the
# command being unavailable) is only printed so the script carries on.
try:
    out, err = run_cmd("lscpu", wait=True)
    cache_lines = [line for line in out.split("\n") if "cache:" in line]
    print("\n".join(cache_lines))
except Exception as e:
    print(f"failed due to {e}")

# Plot every measured column against the size, on a logarithmic y axis.
df = df.set_index("size")
fig, ax = plt.subplots(1, 1, figsize=(12, 4))
df.plot(ax=ax, title="Cache Performance time/size", logy=True)
fig.savefig("plot_benchmark_cpu_array.png")
L1d cache: 128 KiB (4 instances)
L1i cache: 128 KiB (4 instances)
L2 cache: 1 MiB (4 instances)
L3 cache: 8 MiB (1 instance)
TreeEnsemble Performance#
We simulate the computation of a TreeEnsemble with 50 features, 100 trees and a depth of 10 (so \(2^{10}\) nodes per tree).
# Run the tree benchmark up to ten times. Each run yields one dataframe with
# two columns, ``i{n}`` and ``time{n}``.
dfs = []
cols = []  # names of the timing columns, one per run
drop = []  # names of the per-run ``i{n}`` columns, removed after merging
for n in tqdm(range(10)):
    res = benchmark_cache_tree(
        n_rows=2000,
        n_features=50,
        n_trees=100,
        tree_size=1024,
        max_depth=10,
        search_step=64,
    )
    # The first value of each pair is never smaller than the result position.
    res = [[max(r.row, i), r.time] for i, r in enumerate(res)]
    df = DataFrame(res)
    row_col, time_col = f"i{n}", f"time{n}"
    df.columns = [row_col, time_col]
    dfs.append(df)
    cols.append(time_col)
    drop.append(row_col)
    if unit_test_going() and len(dfs) >= 3:
        # three runs are enough when executed as a unit test
        break

# Put the runs side by side, keep a single ``i`` column taken from the first
# run, and add the mean and median of the timings across runs.
df = concat(dfs, axis=1).reset_index(drop=True)
df["i"] = df["i0"]
df = df.drop(columns=drop)
timings = df[cols]
df["time_avg"] = timings.mean(axis=1)
df["time_med"] = timings.median(axis=1)
df.head()
0%| | 0/10 [00:00<?, ?it/s]
10%|# | 1/10 [00:01<00:09, 1.10s/it]
20%|## | 2/10 [00:02<00:09, 1.22s/it]
30%|### | 3/10 [00:03<00:08, 1.19s/it]
40%|#### | 4/10 [00:04<00:07, 1.22s/it]
50%|##### | 5/10 [00:06<00:06, 1.25s/it]
60%|###### | 6/10 [00:07<00:04, 1.23s/it]
70%|####### | 7/10 [00:08<00:03, 1.30s/it]
80%|######## | 8/10 [00:09<00:02, 1.27s/it]
90%|######### | 9/10 [00:11<00:01, 1.26s/it]
100%|##########| 10/10 [00:12<00:00, 1.17s/it]
100%|##########| 10/10 [00:12<00:00, 1.22s/it]
Estimation#
Optimal batch size is among:
i time_med time_avg
0 704 0.033590 0.035638
1 512 0.034039 0.035084
2 448 0.034549 0.036290
3 768 0.034759 0.036731
4 1408 0.035074 0.037413
5 320 0.035154 0.042175
6 1664 0.035267 0.039321
7 960 0.035772 0.038349
8 640 0.035871 0.035799
9 1344 0.035922 0.037078
One possible estimation
Estimation: 827.0797076889988
Plots.
# Top axis: every column except the two aggregates, drawn with thin lines.
# Bottom axis: the average and median only. Both use a logarithmic y axis.
cols_time = ["time_avg", "time_med"]
fig, ax = plt.subplots(2, 1, figsize=(12, 6))
indexed = df.set_index("i")
indexed.drop(cols_time, axis=1).plot(
    ax=ax[0],
    title="TreeEnsemble Performance time per row",
    logy=True,
    linewidth=0.2,
)
indexed[cols_time].plot(ax=ax[1], linewidth=1.0, logy=True)
fig.savefig("plot_bench_cpu.png")
Total running time of the script: ( 0 minutes 13.827 seconds)