Commit be28ac6

Merge branch 'main' into rocm-triton-fallback

2 parents 2a6c86c + c2ed069

557 files changed (+18231 −6672 lines)
.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml

Lines changed: 14 additions & 0 deletions

```diff
@@ -0,0 +1,14 @@
+model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
+tasks:
+- name: "mmlu_pro"
+  metrics:
+  - name: "exact_match,custom-extract"
+    value: 0.82
+limit: 250 # will run on 250 * 14 subjects = 3500 samples
+num_fewshot: 5
+enforce_eager: false # we use false to speed up the eval process
+kv_cache_dtype: fp8 # we use fp8 to speed up the eval process
+max_model_len: 40960
+apply_chat_template: true
+fewshot_as_multiturn: true
+gen_kwargs: "temperature=0,top_p=1,top_k=0,max_gen_toks=5632,until=<|ENDANSWER|>"
```
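For reference, a minimal sketch (not part of the commit) of how a config like this can be loaded and sanity-checked; the file path is hypothetical and `yaml` refers to PyYAML:

```python
# Minimal sketch, not vLLM's harness code: load the config above and inspect
# the fields the harness consumes. The file path is hypothetical.
import yaml

with open("Qwen3-235B-A22B-Instruct-2507-FP8.yaml") as f:
    cfg = yaml.safe_load(f)

# limit applies per subject; mmlu_pro has 14 subjects, so 250 * 14 = 3500 samples.
print(cfg["limit"] * 14)  # 3500

# gen_kwargs is a comma-separated key=value string forwarded to lm-eval.
gen_kwargs = dict(kv.split("=", 1) for kv in cfg["gen_kwargs"].split(","))
print(gen_kwargs["until"])  # <|ENDANSWER|>
```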

.buildkite/lm-eval-harness/configs/models-large-h100.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.
Lines changed: 1 addition & 0 deletions

```diff
@@ -0,0 +1 @@
+Qwen3-235B-A22B-Instruct-2507-FP8.yaml
```
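How these model-list .txt files are consumed is not shown in this commit; as a rough, hypothetical sketch, a CI step could resolve each listed name against the configs directory (the list filename below is made up):

```python
# Hedged sketch, not the actual CI script: resolve each entry of a model-list
# file to a YAML config path. "models-large.txt" is a made-up name.
from pathlib import Path

configs_dir = Path(".buildkite/lm-eval-harness/configs")
for line in (configs_dir / "models-large.txt").read_text().splitlines():
    name = line.strip()
    if name:
        print(configs_dir / name)  # e.g. .../Qwen3-235B-A22B-Instruct-2507-FP8.yaml
```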

.buildkite/lm-eval-harness/test_lm_eval_correctness.py

Lines changed: 11 additions & 3 deletions
```diff
@@ -21,10 +21,13 @@ def launch_lm_eval(eval_config, tp_size):
     max_model_len = eval_config.get("max_model_len", 4096)
     batch_size = eval_config.get("batch_size", "auto")
     backend = eval_config.get("backend", "vllm")
+    enforce_eager = eval_config.get("enforce_eager", "true")
+    kv_cache_dtype = eval_config.get("kv_cache_dtype", "auto")
     model_args = (
         f"pretrained={eval_config['model_name']},"
         f"tensor_parallel_size={tp_size},"
-        f"enforce_eager=true,"
+        f"enforce_eager={enforce_eager},"
+        f"kv_cache_dtype={kv_cache_dtype},"
         f"add_bos_token=true,"
         f"trust_remote_code={trust_remote_code},"
         f"max_model_len={max_model_len},"
@@ -37,8 +40,13 @@ def launch_lm_eval(eval_config, tp_size):
         limit=eval_config["limit"],
         # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
         # text models. however, this is regressing measured strict-match for
-        # existing text models in CI, so only apply it for mm.
-        apply_chat_template=backend == "vllm-vlm",
+        # existing text models in CI, so only apply it for mm, or when explicitly set.
+        apply_chat_template=eval_config.get(
+            "apply_chat_template", backend == "vllm-vlm"
+        ),
+        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
+        # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
+        gen_kwargs=eval_config.get("gen_kwargs"),
         batch_size=batch_size,
     )
     return results
```
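Putting the two changes together: with the Qwen3 config above, the updated launch_lm_eval would assemble a model_args string roughly like the sketch below (tp_size and trust_remote_code values are illustrative, not from the commit):

```python
# Standalone sketch of the model_args construction after this change.
# eval_config mirrors the new YAML; tp_size/trust_remote_code are illustrative.
eval_config = {
    "model_name": "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8",
    "enforce_eager": False,
    "kv_cache_dtype": "fp8",
    "max_model_len": 40960,
}
tp_size = 8                # illustrative value
trust_remote_code = False  # illustrative value

enforce_eager = eval_config.get("enforce_eager", "true")    # YAML false -> False
kv_cache_dtype = eval_config.get("kv_cache_dtype", "auto")  # YAML fp8
model_args = (
    f"pretrained={eval_config['model_name']},"
    f"tensor_parallel_size={tp_size},"
    f"enforce_eager={enforce_eager},"
    f"kv_cache_dtype={kv_cache_dtype},"
    f"add_bos_token=true,"
    f"trust_remote_code={trust_remote_code},"
    f"max_model_len={eval_config.get('max_model_len', 4096)},"
)
print(model_args)
# (one line, wrapped here:)
# pretrained=Qwen/Qwen3-235B-A22B-Instruct-2507-FP8,tensor_parallel_size=8,
# enforce_eager=False,kv_cache_dtype=fp8,add_bos_token=true,
# trust_remote_code=False,max_model_len=40960,
```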

.buildkite/nightly-benchmarks/benchmark-pipeline.yaml

Lines changed: 0 additions & 184 deletions
This file was deleted.

.buildkite/nightly-benchmarks/nightly-annotation.md

Lines changed: 0 additions & 28 deletions
This file was deleted.

.buildkite/nightly-benchmarks/nightly-descriptions.md

Lines changed: 0 additions & 39 deletions
This file was deleted.
