Commit 840f809

use randomly generated data instead of models for hash experiment
As far as hashing is concerned, bytes are bytes. By generating our own bytes, we avoid the I/O associated with reading models from disk. While we could read the model into memory, recreating the filesystem seems complicated.

Signed-off-by: Spencer Schrock <[email protected]>
1 parent 1ed45f0 commit 840f809
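In miniature, the point is that a hash function only ever sees a stream of bytes, so a buffer filled by numpy's PRNG is just as good an input as a model file read from disk, and it keeps disk throughput out of the measured time. A minimal sketch of the idea, with hashlib's sha256 standing in for the repository's hash engines:

    import hashlib

    import numpy as np

    # Fixed seed, so every run hashes identical bytes.
    np.random.seed(42)
    data = np.random.randint(0, 256, 1024 * 1024, dtype=np.uint8).tobytes()

    # No filesystem involved: the 1 MiB buffer lives entirely in memory.
    print(hashlib.sha256(data).hexdigest())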

File tree: 1 file changed, +51 -23 lines

benchmarks/exp_hash.py

Lines changed: 51 additions & 23 deletions
@@ -16,18 +16,23 @@
 """Script for running a benchmark to pick a hashing algorithm."""
 
 import argparse
-import pathlib
 import timeit
+from typing import Final
 
+import numpy as np
 import serialize
 
 
+KB: Final[int] = 1024
+MB: Final[int] = 1024 * KB
+GB: Final[int] = 1024 * MB
+
+
 def build_parser() -> argparse.ArgumentParser:
     """Builds the command line parser for the hash experiment."""
     parser = argparse.ArgumentParser(
         description="hash algorithm benchmark data for model signing"
     )
-    parser.add_argument("path", help="path to model", type=pathlib.Path)
 
     parser.add_argument(
         "--repeat",
@@ -44,28 +49,51 @@ def build_parser() -> argparse.ArgumentParser:
         default=["sha256", "blake2"],
     )
 
+    parser.add_argument(
+        "--data-sizes",
+        help="data sizes to benchmark, in bytes",
+        nargs="+",
+        type=int,
+        default=[KB, MB, 512 * MB, GB, 4 * GB, 16 * GB, 32 * GB],
+    )
+
     return parser
 
 
+def _human_size(size: int) -> str:
+    if size >= GB:
+        return str(size / GB) + " GB"
+    elif size >= MB:
+        return str(size / MB) + " MB"
+    elif size >= KB:
+        return str(size / KB) + " KB"
+    return str(size) + " B"
+
+
+def _generate_data(size: int) -> bytes:
+    if size < 0:
+        raise ValueError("Cannot generate negative bytes")
+    return np.random.randint(0, 256, size, dtype=np.uint8).tobytes()
+
+
 if __name__ == "__main__":
-    hash_args = build_parser().parse_args()
-    bench_parser = serialize.build_parser()
-    for algorithm in hash_args.methods:
-        args = bench_parser.parse_args(
-            [
-                str(hash_args.path),
-                "--skip_manifest",
-                "--hash_method",
-                algorithm,
-                "--merge_hasher",
-                algorithm,
-            ]
-        )
-        times = timeit.repeat(
-            lambda args=args: serialize.run(args),
-            number=1,
-            repeat=hash_args.repeat,
-        )
-        # Grab the min time, as suggested by the docs
-        # https://docs.python.org/3/library/timeit.html#timeit.Timer.repeat
-        print(f"algorithm: {algorithm}, best time: {min(times)}s")
+    np.random.seed(42)
+    args = build_parser().parse_args()
+    data = _generate_data(max(args.data_sizes))
+    for size in args.data_sizes:
+        for algorithm in args.methods:
+            hasher = serialize.get_hash_engine_factory(algorithm)()
+
+            def hash(hasher=hasher, size=size):
+                hasher.update(data[:size])
+                return hasher.compute()
+
+            times = timeit.repeat(lambda: hash(), number=1, repeat=args.repeat)
+
+            # Grab the min time, as suggested by the docs
+            # https://docs.python.org/3/library/timeit.html#timeit.Timer.repeat
+            print(
+                f"algorithm: {algorithm}, "
+                f"size: {_human_size(size)}, "
+                f"best time: {min(times)}s"
+            )
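With the positional model path removed, a run needs only the benchmark flags. A hypothetical invocation (assuming the methods argument elided from the hunk above is spelled --methods, to match args.methods):

    python benchmarks/exp_hash.py --repeat 5 --methods sha256 blake2 --data-sizes 1024 1048576 1073741824

Each (algorithm, size) pair prints one line of the form "algorithm: sha256, size: 1.0 GB, best time: ...s", keeping only the minimum of the repeated timings as the timeit docs suggest. Note the design choice of generating a single buffer of max(args.data_sizes) bytes and slicing data[:size] for each run, so the cost of random generation is paid once rather than once per size.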
