@@ -66,13 +66,6 @@ def enable_pickle(monkeypatch):
             2560,
             True,
         ),
-        (
-            "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
-            "channel",
-            QuantizationType.INT,
-            2560,
-            True,
-        ),
         (
             "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
             "tensor",
@@ -138,20 +131,17 @@ def zp_valid(zp: torch.Tensor | None):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
+        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         assert output
 
 
 @pytest.mark.parametrize(
     "model_path",
     [
         "neuralmagic/Llama-3.2-1B-quantized.w8a8",
-        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
-        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
-        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
     ],
 )
-@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("max_tokens", [8])
 @pytest.mark.parametrize("num_logprobs", [10])
 @pytest.mark.parametrize(
     "use_aiter", [True, False] if current_platform.is_rocm() else [False]
@@ -211,23 +201,18 @@ def test_compressed_tensors_w8a8_logprobs(
 def test_compressed_tensors_no_enforce_eager(vllm_runner):
     model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
     with vllm_runner(model_path) as llm:
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
 @pytest.mark.parametrize(
     "model_args",
     [
         ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"),
-        ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym", "tensor"),
         (
             "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
             "channel",
         ),
-        (
-            "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
-            "channel",
-        ),
     ],
 )
 @pytest.mark.parametrize(
@@ -253,7 +238,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
     # this will enable VLLM_ROCM_USE_AITER_LINEAR
     monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
 
-    with vllm_runner(model_path, dtype=torch.float16) as llm:
+    with vllm_runner(model_path, enforce_eager=True, dtype=torch.float16) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -268,7 +253,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
+        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         assert output
 
 
@@ -283,38 +268,6 @@ def check_model(model):
             True,
             False,
         ),
-        (
-            "nm-testing/tinyllama-oneshot-w4a16-group128-v2",
-            "group",
-            128,
-            8,
-            True,
-            False,
-        ),
-        (
-            "nm-testing/tinyllama-oneshot-w8a16-per-channel",
-            "channel",
-            None,
-            4,
-            True,
-            False,
-        ),
-        (
-            "nm-testing/TinyLlama-1.1B-Chat-v1.0-awq-group128-asym256",
-            "group",
-            128,
-            8,
-            False,
-            False,
-        ),
-        (
-            "nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-Channel",
-            "channel",
-            None,
-            8,
-            False,
-            False,
-        ),
         (
             "nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-ActOrder",
             "group",
@@ -330,7 +283,7 @@ def check_model(model):
 )
 def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
     model, strategy, group, pack_factor, symmetric, has_g_idx = wNa16_args
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -348,7 +301,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
@@ -357,7 +310,7 @@ def check_model(model):
 )
 def test_compressed_tensors_w4a16_marlin24(vllm_runner):
     model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
-    with vllm_runner(model_path) as llm:
+    with vllm_runner(model_path, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -370,13 +323,13 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
 def test_compressed_tensors_fp8(vllm_runner):
     model_path = "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
-    with vllm_runner(model_path) as llm:
+    with vllm_runner(model_path, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -399,7 +352,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
@@ -412,8 +365,8 @@ def check_model(model):
 )
 def test_compressed_tensors_kv_cache(vllm_runner):
     model_path = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
-    with vllm_runner(model_path, kv_cache_dtype="fp8") as llm:
-        output = llm.generate_greedy("Hello world!", max_tokens=20)
+    with vllm_runner(model_path, enforce_eager=True, kv_cache_dtype="fp8") as llm:
+        output = llm.generate_greedy("Hello world!", max_tokens=4)
         assert output
 
 
@@ -465,7 +418,7 @@ def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy, format="d
 )
 def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -476,7 +429,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -512,7 +465,7 @@ def check_model(model):
 )
 def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -528,7 +481,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -564,7 +517,7 @@ def check_model(model):
 )
 def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -580,7 +533,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -611,7 +564,7 @@ def check_model(model):
 )
 def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -622,7 +575,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -637,7 +590,7 @@ def check_model(model):
 )
 def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
     model = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -656,7 +609,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -670,7 +623,7 @@ def check_model(model):
 )
 def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
     model = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -689,7 +642,7 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -723,7 +676,7 @@ def check_model(model):
             assert qkv_proj.scheme.group_size == 16
 
         llm.apply_model(check_model)
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -758,7 +711,7 @@ def check_model(model):
             assert proj.scheme.group_size == 128
 
         llm.apply_model(check_model)
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -792,7 +745,7 @@ def test_compressed_tensors_transforms_perplexity(
 
 def test_compressed_tensors_fp8_block_enabled(vllm_runner):
     model_path = "RedHatAI/Qwen3-0.6B-FP8-BLOCK"
-    with vllm_runner(model_path) as llm:
+    with vllm_runner(model_path, enforce_eager=True) as llm:
         fp8_dtype = current_platform.fp8_dtype()
 
         def check_model(model):
@@ -816,5 +769,5 @@ def check_model(model):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output