use generated models instead of real models

spencerschrock · spencerschrock · commit c566568ed08f · 2025-07-18T15:45:56.000-06:00
Signed-off-by: Spencer Schrock <sschrock@google.com>
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
@@ -81,7 +81,6 @@ jobs:
     permissions:
       id-token: 'write'
     env:
-      MODEL: deepseek-ai/DeepSeek-R1-Distill-Qwen-14B
       TAG: ${{needs.publish-benchmark-container.outputs.head}}
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -92,11 +91,10 @@ jobs:
           workload_identity_provider: projects/306323169285/locations/global/workloadIdentityPools/github-actions-pool/providers/github-actions-provider
           service_account: 'model-transparency-gha@sigstore-infra-playground.iam.gserviceaccount.com'
       - run: |
-          export OUTPUT_FILE=$(date --utc +%Y%m%d%H%M%S)_$TAG.json
           gcloud batch jobs submit \
             --job-prefix=bench \
             --project sigstore-infra-playground \
             --location us-central1 \
             --config - <<EOF
-            $(envsubst '$TAG','$MODEL','$OUTPUT_FILE' < benchmarks/cloud_batch.json)
+            $(envsubst '$TAG' < benchmarks/cloud_batch.json)
             EOF
diff --git a/benchmarks/Containerfile b/benchmarks/Containerfile
@@ -14,7 +14,7 @@
 
 FROM python:3.13-slim
 
-RUN python -m pip install --no-cache-dir hatch huggingface_hub[cli]
+RUN python -m pip install --no-cache-dir hatch
 
 COPY pyproject.toml LICENSE README.md ./
 COPY src ./src
diff --git a/benchmarks/cloud_batch.json b/benchmarks/cloud_batch.json
@@ -9,7 +9,7 @@
                             "entrypoint": "/bin/sh",
                             "commands": [
                                 "-c",
-                                "benchmarks/run.sh ${MODEL} /mnt/disks/models /mnt/disks/gcs/${OUTPUT_FILE}"
+                                "benchmarks/run.sh /mnt/disks/models /mnt/disks/gcs ${TAG}"
                             ]
                         }
                     }
diff --git a/benchmarks/run.sh b/benchmarks/run.sh
@@ -1,11 +1,25 @@
 #!/bin/bash
 set -euxo pipefail
 
-MODEL=$1
-MODEL_DIR=$2
-MODEL_PATH=$MODEL_DIR/$(echo $MODEL | cut --delimiter='/' --fields=2-)
-OUTPUT_FILE=$3
-
-huggingface-cli download $MODEL --local-dir "$MODEL_PATH"
-hatch run bench.py3.11:python benchmarks/time_serialize.py "$MODEL_PATH" \
-    --output=$OUTPUT_FILE
+MODEL_DIR=$1    # directory under which the generated models are created
+OUTPUT_DIR=$2   # directory that receives the benchmark result files
+REVISION=$3     # revision identifier embedded in the result file names
+FILENAME_BASE=$OUTPUT_DIR/$(date --utc +%Y%m%d%H%M%S)_$REVISION
+
+for SIZE in 32 48 128; do
+    MODEL=${SIZE}gb
+    MODEL_PATH=$MODEL_DIR/$MODEL
+    mkdir -p "$MODEL_PATH"  # ensure the per-size model directory exists
+
+    # simulate a handful of small metadata files in the repository
+    hatch run bench.py3.11:generate dir --root "$MODEL_PATH" -n 8 16384
+    # followed by model shards which are 8GiB each
+    N=$((SIZE / 8))
+    SIZE_BYTES=$((SIZE * 1024 * 1024 * 1024))
+    hatch run bench.py3.11:generate dir --root "$MODEL_PATH" -n "$N" "$SIZE_BYTES"
+    # one result file per model size: <timestamp>_<revision>_<size>gb.json
+    hatch run bench.py3.11:python benchmarks/time_serialize.py "$MODEL_PATH" \
+        --output="${FILENAME_BASE}_${MODEL}.json"
+done
+
+

Original file line number	Diff line number	Diff line change
`@@ -9,7 +9,7 @@`
`9`	`9`	`"entrypoint": "/bin/sh",`
`10`	`10`	`"commands": [`
`11`	`11`	`"-c",`
`12`		`- "benchmarks/run.sh ${MODEL} /mnt/disks/models /mnt/disks/gcs/${OUTPUT_FILE}"`
	`12`	`+ "benchmarks/run.sh /mnt/disks/models /mnt/disks/gcs ${TAG}"`
`13`	`13`	`]`
`14`	`14`	`}`
`15`	`15`	`}`