Commit 072abb8

mgoin authored and lywa1998 committed
[CI] Prune Quantization Tests and skip compilation (vllm-project#27038)
Signed-off-by: mgoin <[email protected]>
1 parent 9007739 · commit 072abb8

9 files changed: +62 additions, -134 deletions
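The change applied across these files is the same in every test: pass enforce_eager=True to the vllm_runner fixture so the model skips compilation and CUDA-graph warmup, and shrink max_tokens so the greedy smoke check stays cheap. A minimal sketch of the resulting test shape, assuming vLLM's vllm_runner pytest fixture forwards keyword arguments such as enforce_eager to the engine; the test name and model below are illustrative only, not part of this commit:

import pytest

MODELS = ["nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"]  # illustrative model


@pytest.mark.parametrize("model", MODELS)
def test_quantized_smoke(vllm_runner, model):
    # enforce_eager=True disables compilation/CUDA-graph capture for a faster CI run;
    # a tiny max_tokens keeps the generation check to a handful of decode steps.
    with vllm_runner(model, enforce_eager=True) as llm:
        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
        assert output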

tests/quantization/test_auto_round.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 )
 @pytest.mark.parametrize("model", MODELS)
 def test_auto_round(vllm_runner, model):
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
         output = llm.generate_greedy(["The capital of France is"], max_tokens=8)
         assert output
         print(f"{output[0][1]}")

tests/quantization/test_compressed_tensors.py

Lines changed: 29 additions & 76 deletions
@@ -66,13 +66,6 @@ def enable_pickle(monkeypatch):
             2560,
             True,
         ),
-        (
-            "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
-            "channel",
-            QuantizationType.INT,
-            2560,
-            True,
-        ),
         (
             "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
             "tensor",
@@ -138,20 +131,17 @@ def zp_valid(zp: torch.Tensor | None):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
+        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         assert output
 
 
 @pytest.mark.parametrize(
     "model_path",
     [
         "neuralmagic/Llama-3.2-1B-quantized.w8a8",
-        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
-        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
-        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
     ],
 )
-@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("max_tokens", [8])
 @pytest.mark.parametrize("num_logprobs", [10])
 @pytest.mark.parametrize(
     "use_aiter", [True, False] if current_platform.is_rocm() else [False]
@@ -211,23 +201,18 @@ def test_compressed_tensors_w8a8_logprobs(
 def test_compressed_tensors_no_enforce_eager(vllm_runner):
     model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
     with vllm_runner(model_path) as llm:
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
 @pytest.mark.parametrize(
     "model_args",
     [
         ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"),
-        ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym", "tensor"),
         (
             "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
             "channel",
         ),
-        (
-            "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
-            "channel",
-        ),
     ],
 )
 @pytest.mark.parametrize(
@@ -253,7 +238,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
         # this will enable VLLM_ROCM_USE_AITER_LINEAR
         monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
 
-    with vllm_runner(model_path, dtype=torch.float16) as llm:
+    with vllm_runner(model_path, enforce_eager=True, dtype=torch.float16) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -268,7 +253,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
+        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         assert output
 
 
@@ -283,38 +268,6 @@ def check_model(model):
             True,
             False,
         ),
-        (
-            "nm-testing/tinyllama-oneshot-w4a16-group128-v2",
-            "group",
-            128,
-            8,
-            True,
-            False,
-        ),
-        (
-            "nm-testing/tinyllama-oneshot-w8a16-per-channel",
-            "channel",
-            None,
-            4,
-            True,
-            False,
-        ),
-        (
-            "nm-testing/TinyLlama-1.1B-Chat-v1.0-awq-group128-asym256",
-            "group",
-            128,
-            8,
-            False,
-            False,
-        ),
-        (
-            "nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-Channel",
-            "channel",
-            None,
-            8,
-            False,
-            False,
-        ),
         (
             "nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-ActOrder",
             "group",
@@ -330,7 +283,7 @@ def check_model(model):
 )
 def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
     model, strategy, group, pack_factor, symmetric, has_g_idx = wNa16_args
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -348,7 +301,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
@@ -357,7 +310,7 @@ def check_model(model):
 )
 def test_compressed_tensors_w4a16_marlin24(vllm_runner):
     model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
-    with vllm_runner(model_path) as llm:
+    with vllm_runner(model_path, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -370,13 +323,13 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
 def test_compressed_tensors_fp8(vllm_runner):
     model_path = "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
-    with vllm_runner(model_path) as llm:
+    with vllm_runner(model_path, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -399,7 +352,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
@@ -412,8 +365,8 @@ def check_model(model):
 )
 def test_compressed_tensors_kv_cache(vllm_runner):
     model_path = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
-    with vllm_runner(model_path, kv_cache_dtype="fp8") as llm:
-        output = llm.generate_greedy("Hello world!", max_tokens=20)
+    with vllm_runner(model_path, enforce_eager=True, kv_cache_dtype="fp8") as llm:
+        output = llm.generate_greedy("Hello world!", max_tokens=4)
         assert output
 
 
@@ -465,7 +418,7 @@ def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy, format="d
 )
 def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -476,7 +429,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -512,7 +465,7 @@ def check_model(model):
 )
 def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -528,7 +481,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -564,7 +517,7 @@ def check_model(model):
 )
 def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -580,7 +533,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -611,7 +564,7 @@ def check_model(model):
 )
 def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -622,7 +575,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -637,7 +590,7 @@ def check_model(model):
 )
 def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
     model = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -656,7 +609,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -670,7 +623,7 @@ def check_model(model):
 )
 def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
     model = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -689,7 +642,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -723,7 +676,7 @@ def check_model(model):
             assert qkv_proj.scheme.group_size == 16
 
         llm.apply_model(check_model)
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -758,7 +711,7 @@ def check_model(model):
             assert proj.scheme.group_size == 128
 
         llm.apply_model(check_model)
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -792,7 +745,7 @@ def test_compressed_tensors_transforms_perplexity(
 
 def test_compressed_tensors_fp8_block_enabled(vllm_runner):
     model_path = "RedHatAI/Qwen3-0.6B-FP8-BLOCK"
-    with vllm_runner(model_path) as llm:
+    with vllm_runner(model_path, enforce_eager=True) as llm:
         fp8_dtype = current_platform.fp8_dtype()
 
         def check_model(model):
@@ -816,5 +769,5 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
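Most of the retained compressed-tensors tests share the check_model/apply_model shape seen in the hunks above. A sketch of that pattern follows; the scheme assertion is kept deliberately loose because the exact scheme class differs per test, and the qkv_proj attribute access is assumed from the surrounding file rather than shown in this diff:

def test_scheme_smoke(vllm_runner):
    model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
    with vllm_runner(model_path, enforce_eager=True) as llm:

        def check_model(model):
            # Inspect the first decoder layer's fused QKV projection and confirm
            # vLLM attached a compressed-tensors quantization scheme to it.
            layer = model.model.layers[0]
            qkv_proj = layer.self_attn.qkv_proj
            assert qkv_proj.scheme is not None

        # apply_model runs the callback against the loaded model weights in-process.
        llm.apply_model(check_model)

        output = llm.generate_greedy("Hello my name is", max_tokens=4)
        assert output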

tests/quantization/test_cpu_offload.py

Lines changed: 0 additions & 35 deletions
@@ -16,13 +16,6 @@
     reason="fp8 is not supported on this GPU type.",
 )
 def test_cpu_offload_fp8():
-    # Test quantization of an unquantized checkpoint
-    compare_two_settings(
-        "meta-llama/Llama-3.2-1B-Instruct",
-        ["--quantization", "fp8"],
-        ["--quantization", "fp8", "--cpu-offload-gb", "1"],
-        max_wait_seconds=480,
-    )
     # Test loading a quantized checkpoint
     compare_two_settings(
         "neuralmagic/Qwen2-1.5B-Instruct-FP8",
@@ -46,13 +39,6 @@ def test_cpu_offload_gptq(monkeypatch):
         ["--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )
-    # Test GPTQ
-    compare_two_settings(
-        "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
-        ["--quantization", "gptq"],
-        ["--quantization", "gptq", "--cpu-offload-gb", "1"],
-        max_wait_seconds=480,
-    )
 
 
 @pytest.mark.skipif(
@@ -69,13 +55,6 @@ def test_cpu_offload_awq(monkeypatch):
         ["--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )
-    # Test AWQ
-    compare_two_settings(
-        "Qwen/Qwen2-1.5B-Instruct-AWQ",
-        ["--quantization", "awq"],
-        ["--quantization", "awq", "--cpu-offload-gb", "1"],
-        max_wait_seconds=480,
-    )
 
 
 @pytest.mark.skipif(
@@ -92,17 +71,3 @@ def test_cpu_offload_compressed_tensors(monkeypatch):
         ["--cpu-offload-gb", "1"],
         max_wait_seconds=480,
     )
-    # Test w4a16_marlin24
-    compare_two_settings(
-        "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
-        [],
-        ["--cpu-offload-gb", "1"],
-        max_wait_seconds=480,
-    )
-    # Test w8a8
-    compare_two_settings(
-        "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
-        [],
-        ["--cpu-offload-gb", "1"],
-        max_wait_seconds=480,
-    )
