Commit 840f809

use randomly generated data instead of models for hash experiment
As far as hashing is concerned, bytes are bytes. By generating our own bytes, we avoid the I/O associated with reading models from disk. While we could read the model into memory, recreating the filesystem seems complicated.

Signed-off-by: Spencer Schrock <[email protected]>
1 parent 1ed45f0 commit 840f809
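In miniature, the point is that a hash function only ever sees a stream of bytes, so a buffer filled by numpy's PRNG is just as good an input as a model file read from disk, and it keeps disk throughput out of the measured time. A minimal sketch of the idea, with hashlib's sha256 standing in for the repository's hash engines:

    import hashlib

    import numpy as np

    # Fixed seed, so every run hashes identical bytes.
    np.random.seed(42)
    data = np.random.randint(0, 256, 1024 * 1024, dtype=np.uint8).tobytes()

    # No filesystem involved: the 1 MiB buffer lives entirely in memory.
    print(hashlib.sha256(data).hexdigest())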

File tree: 1 file changed, +51 -23 lines

benchmarks/exp_hash.py

Lines changed: 51 additions & 23 deletions
@@ -16,18 +16,23 @@
 """Script for running a benchmark to pick a hashing algorithm."""
 
 import argparse
-import pathlib
 import timeit
+from typing import Final
 
+import numpy as np
 import serialize
 
 
+KB: Final[int] = 1024
+MB: Final[int] = 1024 * KB
+GB: Final[int] = 1024 * MB
+
+
 def build_parser() -> argparse.ArgumentParser:
     """Builds the command line parser for the hash experiment."""
     parser = argparse.ArgumentParser(
         description="hash algorithm benchmark data for model signing"
     )
-    parser.add_argument("path", help="path to model", type=pathlib.Path)
 
     parser.add_argument(
         "--repeat",
@@ -44,28 +49,51 @@ def build_parser() -> argparse.ArgumentParser:
         default=["sha256", "blake2"],
     )
 
+    parser.add_argument(
+        "--data-sizes",
+        help="data sizes to benchmark, in bytes",
+        nargs="+",
+        type=int,
+        default=[KB, MB, 512 * MB, GB, 4 * GB, 16 * GB, 32 * GB],
+    )
+
     return parser
 
 
+def _human_size(size: int) -> str:
+    if size >= GB:
+        return str(size / GB) + " GB"
+    elif size >= MB:
+        return str(size / MB) + " MB"
+    elif size >= KB:
+        return str(size / KB) + " KB"
+    return str(size) + " B"
+
+
+def _generate_data(size: int) -> bytes:
+    if size < 0:
+        raise ValueError("Cannot generate negative bytes")
+    return np.random.randint(0, 256, size, dtype=np.uint8).tobytes()
+
+
 if __name__ == "__main__":
-    hash_args = build_parser().parse_args()
-    bench_parser = serialize.build_parser()
-    for algorithm in hash_args.methods:
-        args = bench_parser.parse_args(
-            [
-                str(hash_args.path),
-                "--skip_manifest",
-                "--hash_method",
-                algorithm,
-                "--merge_hasher",
-                algorithm,
-            ]
-        )
-        times = timeit.repeat(
-            lambda args=args: serialize.run(args),
-            number=1,
-            repeat=hash_args.repeat,
-        )
-        # Grab the min time, as suggested by the docs
-        # https://docs.python.org/3/library/timeit.html#timeit.Timer.repeat
-        print(f"algorithm: {algorithm}, best time: {min(times)}s")
+    np.random.seed(42)
+    args = build_parser().parse_args()
+    data = _generate_data(max(args.data_sizes))
+    for size in args.data_sizes:
+        for algorithm in args.methods:
+            hasher = serialize.get_hash_engine_factory(algorithm)()
+
+            def hash(hasher=hasher, size=size):
+                hasher.update(data[:size])
+                return hasher.compute()
+
+            times = timeit.repeat(lambda: hash(), number=1, repeat=args.repeat)
+
+            # Grab the min time, as suggested by the docs
+            # https://docs.python.org/3/library/timeit.html#timeit.Timer.repeat
+            print(
+                f"algorithm: {algorithm}, "
+                f"size: {_human_size(size)}, "
+                f"best time: {min(times)}s"
+            )
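With the positional model path removed, a run needs only the benchmark flags. A hypothetical invocation (assuming the methods argument elided from the hunk above is spelled --methods, to match args.methods):

    python benchmarks/exp_hash.py --repeat 5 --methods sha256 blake2 --data-sizes 1024 1048576 1073741824

Each (algorithm, size) pair prints one line of the form "algorithm: sha256, size: 1.0 GB, best time: ...s", keeping only the minimum of the repeated timings as the timeit docs suggest. Note the design choice of generating a single buffer of max(args.data_sizes) bytes and slicing data[:size] for each run, so the cost of random generation is paid once rather than once per size.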
