Commit 072abb8

mgoin authored and lywa1998 committed
[CI] Prune Quantization Tests and skip compilation (vllm-project#27038)
Signed-off-by: mgoin <[email protected]>
1 parent 9007739 · commit 072abb8

9 files changed: +62 additions, -134 deletions
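The change applied across these files is the same in every test: pass enforce_eager=True to the vllm_runner fixture so the model skips compilation and CUDA-graph warmup, and shrink max_tokens so the greedy smoke check stays cheap. A minimal sketch of the resulting test shape, assuming vLLM's vllm_runner pytest fixture forwards keyword arguments such as enforce_eager to the engine; the test name and model below are illustrative only, not part of this commit:

import pytest

MODELS = ["nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"]  # illustrative model


@pytest.mark.parametrize("model", MODELS)
def test_quantized_smoke(vllm_runner, model):
    # enforce_eager=True disables compilation/CUDA-graph capture for a faster CI run;
    # a tiny max_tokens keeps the generation check to a handful of decode steps.
    with vllm_runner(model, enforce_eager=True) as llm:
        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
        assert output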

tests/quantization/test_auto_round.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 )
 @pytest.mark.parametrize("model", MODELS)
 def test_auto_round(vllm_runner, model):
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
         output = llm.generate_greedy(["The capital of France is"], max_tokens=8)
         assert output
         print(f"{output[0][1]}")

tests/quantization/test_compressed_tensors.py

Lines changed: 29 additions & 76 deletions
@@ -66,13 +66,6 @@ def enable_pickle(monkeypatch):
             2560,
             True,
         ),
-        (
-            "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
-            "channel",
-            QuantizationType.INT,
-            2560,
-            True,
-        ),
         (
             "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
             "tensor",
@@ -138,20 +131,17 @@ def zp_valid(zp: torch.Tensor | None):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
+        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         assert output
 
 
 @pytest.mark.parametrize(
     "model_path",
     [
         "neuralmagic/Llama-3.2-1B-quantized.w8a8",
-        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
-        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
-        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
     ],
 )
-@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("max_tokens", [8])
 @pytest.mark.parametrize("num_logprobs", [10])
 @pytest.mark.parametrize(
     "use_aiter", [True, False] if current_platform.is_rocm() else [False]
@@ -211,23 +201,18 @@ def test_compressed_tensors_w8a8_logprobs(
 def test_compressed_tensors_no_enforce_eager(vllm_runner):
     model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
     with vllm_runner(model_path) as llm:
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
 @pytest.mark.parametrize(
     "model_args",
     [
         ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"),
-        ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym", "tensor"),
         (
             "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
             "channel",
         ),
-        (
-            "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
-            "channel",
-        ),
     ],
 )
 @pytest.mark.parametrize(
@@ -253,7 +238,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
         # this will enable VLLM_ROCM_USE_AITER_LINEAR
         monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
 
-    with vllm_runner(model_path, dtype=torch.float16) as llm:
+    with vllm_runner(model_path, enforce_eager=True, dtype=torch.float16) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -268,7 +253,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
+        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         assert output
 
 
@@ -283,38 +268,6 @@ def check_model(model):
             True,
             False,
         ),
-        (
-            "nm-testing/tinyllama-oneshot-w4a16-group128-v2",
-            "group",
-            128,
-            8,
-            True,
-            False,
-        ),
-        (
-            "nm-testing/tinyllama-oneshot-w8a16-per-channel",
-            "channel",
-            None,
-            4,
-            True,
-            False,
-        ),
-        (
-            "nm-testing/TinyLlama-1.1B-Chat-v1.0-awq-group128-asym256",
-            "group",
-            128,
-            8,
-            False,
-            False,
-        ),
-        (
-            "nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-Channel",
-            "channel",
-            None,
-            8,
-            False,
-            False,
-        ),
         (
             "nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-ActOrder",
             "group",
@@ -330,7 +283,7 @@ def check_model(model):
 )
 def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
     model, strategy, group, pack_factor, symmetric, has_g_idx = wNa16_args
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -348,7 +301,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
@@ -357,7 +310,7 @@ def check_model(model):
 )
 def test_compressed_tensors_w4a16_marlin24(vllm_runner):
     model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
-    with vllm_runner(model_path) as llm:
+    with vllm_runner(model_path, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -370,13 +323,13 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
 def test_compressed_tensors_fp8(vllm_runner):
     model_path = "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
-    with vllm_runner(model_path) as llm:
+    with vllm_runner(model_path, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -399,7 +352,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
@@ -412,8 +365,8 @@ def check_model(model):
 )
 def test_compressed_tensors_kv_cache(vllm_runner):
     model_path = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
-    with vllm_runner(model_path, kv_cache_dtype="fp8") as llm:
-        output = llm.generate_greedy("Hello world!", max_tokens=20)
+    with vllm_runner(model_path, enforce_eager=True, kv_cache_dtype="fp8") as llm:
+        output = llm.generate_greedy("Hello world!", max_tokens=4)
         assert output
 
 
@@ -465,7 +418,7 @@ def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy, format="d
 )
 def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -476,7 +429,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -512,7 +465,7 @@ def check_model(model):
 )
 def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -528,7 +481,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -564,7 +517,7 @@ def check_model(model):
 )
 def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -580,7 +533,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -611,7 +564,7 @@ def check_model(model):
 )
 def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -622,7 +575,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -637,7 +590,7 @@ def check_model(model):
 )
 def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
     model = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -656,7 +609,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -670,7 +623,7 @@ def check_model(model):
 )
 def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
     model = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -689,7 +642,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -723,7 +676,7 @@ def check_model(model):
             assert qkv_proj.scheme.group_size == 16
 
         llm.apply_model(check_model)
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -758,7 +711,7 @@ def check_model(model):
             assert proj.scheme.group_size == 128
 
         llm.apply_model(check_model)
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -792,7 +745,7 @@ def test_compressed_tensors_transforms_perplexity(
 
 def test_compressed_tensors_fp8_block_enabled(vllm_runner):
     model_path = "RedHatAI/Qwen3-0.6B-FP8-BLOCK"
-    with vllm_runner(model_path) as llm:
+    with vllm_runner(model_path, enforce_eager=True) as llm:
         fp8_dtype = current_platform.fp8_dtype()
 
         def check_model(model):
@@ -816,5 +769,5 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
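Most of the retained compressed-tensors tests share the check_model/apply_model shape seen in the hunks above. A sketch of that pattern follows; the scheme assertion is kept deliberately loose because the exact scheme class differs per test, and the qkv_proj attribute access is assumed from the surrounding file rather than shown in this diff:

def test_scheme_smoke(vllm_runner):
    model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
    with vllm_runner(model_path, enforce_eager=True) as llm:

        def check_model(model):
            # Inspect the first decoder layer's fused QKV projection and confirm
            # vLLM attached a compressed-tensors quantization scheme to it.
            layer = model.model.layers[0]
            qkv_proj = layer.self_attn.qkv_proj
            assert qkv_proj.scheme is not None

        # apply_model runs the callback against the loaded model weights in-process.
        llm.apply_model(check_model)

        output = llm.generate_greedy("Hello my name is", max_tokens=4)
        assert output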

tests/quantization/test_cpu_offload.py

Lines changed: 0 additions & 35 deletions
@@ -16,13 +16,6 @@
     reason="fp8 is not supported on this GPU type.",
 )
 def test_cpu_offload_fp8():
-    # Test quantization of an unquantized checkpoint
-    compare_two_settings(
-        "meta-llama/Llama-3.2-1B-Instruct",
-        ["--quantization", "fp8"],
-        ["--quantization", "fp8", "--cpu-offload-gb", "1"],
-        max_wait_seconds=480,
-    )
     # Test loading a quantized checkpoint
     compare_two_settings(
         "neuralmagic/Qwen2-1.5B-Instruct-FP8",
@@ -46,13 +39,6 @@ def test_cpu_offload_gptq(monkeypatch):
         ["--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )
-    # Test GPTQ
-    compare_two_settings(
-        "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
-        ["--quantization", "gptq"],
-        ["--quantization", "gptq", "--cpu-offload-gb", "1"],
-        max_wait_seconds=480,
-    )
 
 
 @pytest.mark.skipif(
@@ -69,13 +55,6 @@ def test_cpu_offload_awq(monkeypatch):
         ["--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )
-    # Test AWQ
-    compare_two_settings(
-        "Qwen/Qwen2-1.5B-Instruct-AWQ",
-        ["--quantization", "awq"],
-        ["--quantization", "awq", "--cpu-offload-gb", "1"],
-        max_wait_seconds=480,
-    )
 
 
 @pytest.mark.skipif(
@@ -92,17 +71,3 @@ def test_cpu_offload_compressed_tensors(monkeypatch):
         ["--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )
-    # Test w4a16_marlin24
-    compare_two_settings(
-        "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
-        [],
-        ["--cpu-offload-gb", "1"],
-        max_wait_seconds=480,
-    )
-    # Test w8a8
-    compare_two_settings(
-        "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
-        [],
-        ["--cpu-offload-gb", "1"],
-        max_wait_seconds=480,
-    )
