Skip to content

Commit 1d4f748

Browse files
authored
[fix] Fix illegal mem access and possible accuracy loss. Cherry-pick … (#5017)
Signed-off-by: Jin Li <[email protected]>
1 parent f45aff2 commit 1d4f748

File tree

5 files changed

+19
-37
lines changed

5 files changed

+19
-37
lines changed

cpp/include/tensorrt_llm/kernels/kvCachePartialCopy.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ namespace tensorrt_llm
2323
namespace kernels
2424
{
2525
void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numLayers, unsigned int numHeads,
26-
unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, cudaStream_t stream);
26+
unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, int kvFactor,
27+
cudaStream_t stream);
2728
} // namespace kernels
2829
} // namespace tensorrt_llm

cpp/tensorrt_llm/batch_manager/kvCacheTransferManager.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -141,15 +141,16 @@ void KVCacheTransferManager::copyBlock(BlockPtr const& src, BlockPtr const& dst,
141141
{
142142
auto stream = (isOffload ? mOffloadManager : mOnboardManager).getStream().get();
143143
int const numLayers = pools[poolIdx].numLayers;
144+
int const kvFactor = pools[poolIdx].kvFactor;
144145
int const numHeads = pools[poolIdx].numKvHeads;
145146
int const sizePerHead = pools[poolIdx].sizePerHead;
146147
auto shape = srcPtr->getShape();
147148

148149
TLLM_CHECK_WITH_INFO(
149150
shape.nbDims == 4, "Expected KVCache block to have 4 dims, got %d", shape.nbDims);
150151

151-
tk::kvCacheBlockPartialCopy(
152-
*dstPtr, *srcPtr, numLayers, numHeads, tokensPerBlock, sizePerHead, numTokensToCopy, stream);
152+
tk::kvCacheBlockPartialCopy(*dstPtr, *srcPtr, numLayers, numHeads, tokensPerBlock, sizePerHead,
153+
numTokensToCopy, kvFactor, stream);
153154
}
154155
}
155156
}

cpp/tensorrt_llm/kernels/kvCachePartialCopy.cu

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,8 @@ unsigned int ipow2(unsigned int v)
5858

5959
template <typename T>
6060
void hostKVCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numLayers, unsigned int numHeads,
61-
unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, cudaStream_t stream)
61+
unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, int kvFactor,
62+
cudaStream_t stream)
6263
{
6364
unsigned int blockX = ipow2(numHidden); // ensure block shape is a power of 2
6465
blockX = std::min(blockX, 32u); // blockX should not exceed warp size
@@ -75,55 +76,56 @@ void hostKVCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int
7576
auto srcData = bufferCast<T>(src);
7677
auto dstData = bufferCast<T>(dst);
7778
cuKVCacheBlockPartialCopy<<<grid, block, 0, stream>>>(
78-
dstData, srcData, 2 * numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy);
79+
dstData, srcData, numLayers * kvFactor, numHeads, tokensPerBlock, numHidden, numTokensToCopy);
7980
}
8081
} // namespace
8182

8283
void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numLayers, unsigned int numHeads,
83-
unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, cudaStream_t stream)
84+
unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, int kvFactor,
85+
cudaStream_t stream)
8486
{
8587
auto dataType = src.getDataType();
8688
TLLM_CHECK_WITH_INFO(dataType == dst.getDataType(), "src and dst dataType does not match");
8789
switch (dataType)
8890
{
8991
case nvinfer1::DataType::kINT64:
9092
hostKVCacheBlockPartialCopy<SizeType64>(
91-
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
93+
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
9294
break;
9395
case nvinfer1::DataType::kINT32:
9496
hostKVCacheBlockPartialCopy<std::int32_t>(
95-
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
97+
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
9698
break;
9799
case nvinfer1::DataType::kFLOAT:
98100
hostKVCacheBlockPartialCopy<float>(
99-
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
101+
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
100102
break;
101103
#ifdef ENABLE_BF16
102104
case nvinfer1::DataType::kBF16:
103105
hostKVCacheBlockPartialCopy<__nv_bfloat16>(
104-
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
106+
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
105107
break;
106108
#endif
107109
case nvinfer1::DataType::kHALF:
108110
hostKVCacheBlockPartialCopy<half>(
109-
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
111+
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
110112
break;
111113
case nvinfer1::DataType::kBOOL:
112114
hostKVCacheBlockPartialCopy<bool>(
113-
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
115+
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
114116
break;
115117
case nvinfer1::DataType::kUINT8:
116118
hostKVCacheBlockPartialCopy<std::uint8_t>(
117-
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
119+
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
118120
break;
119121
case nvinfer1::DataType::kINT8:
120122
hostKVCacheBlockPartialCopy<std::int8_t>(
121-
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
123+
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
122124
break;
123125
#ifdef ENABLE_FP8
124126
case nvinfer1::DataType::kFP8:
125127
hostKVCacheBlockPartialCopy<__nv_fp8_e4m3>(
126-
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
128+
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
127129
break;
128130
#endif
129131
default: TLLM_THROW("Unknown data type");

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -503,8 +503,6 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
503503
[0, pytest.param(2, marks=skip_pre_hopper)])
504504
def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
505505
overlap_scheduler, torch_compile):
506-
if torch_compile:
507-
pytest.skip("https://nvbugs/5292037")
508506
if torch_compile and mtp_nextn > 0:
509507
pytest.skip("https://nvbugs/5252313")
510508
if torch_compile and attention_dp:
@@ -547,8 +545,6 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
547545
def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
548546
attention_dp, cuda_graph, overlap_scheduler,
549547
torch_compile):
550-
if torch_compile:
551-
pytest.skip("https://nvbugs/5292037")
552548
if torch_compile and mtp_nextn > 0:
553549
pytest.skip("https://nvbugs/5252313")
554550
if torch_compile and attention_dp:
@@ -593,8 +589,6 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
593589
@parametrize_with_ids("mtp_nextn", [0, 2])
594590
def test_fp8_block_scales(self, mtp_nextn, fp8kv, attention_dp, cuda_graph,
595591
overlap_scheduler, torch_compile):
596-
if torch_compile:
597-
pytest.skip("https://nvbugs/5292037")
598592
if torch_compile and mtp_nextn > 0:
599593
pytest.skip("https://nvbugs/5252313")
600594
if torch_compile and attention_dp:
@@ -712,8 +706,6 @@ def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn,
712706
def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
713707
fp8kv, attention_dp, cuda_graph,
714708
overlap_scheduler, torch_compile):
715-
if torch_compile:
716-
pytest.skip("https://nvbugs/5292037")
717709
if torch_compile and mtp_nextn > 0:
718710
pytest.skip("https://nvbugs/5252313")
719711
if torch_compile and attention_dp:

tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,6 @@ accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache SKIP (https://n
358358
accuracy/test_cli_flow.py::TestSantacoder::test_auto_dtype SKIP (https://nvbugs/5231468)
359359
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache SKIP (https://nvbugs/5231310)
360360
test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image] SKIP (https://nvbugs/5233423)
361-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5294983)
362361
examples/test_gemma.py::test_llm_hf_gemma_quantization_1gpu[gemma-2-27b-it-fp8-bfloat16-8] SKIP (https://nvbugs/5234164)
363362
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:1-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058)
364363
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058)
@@ -382,17 +381,6 @@ triton_server/test_triton.py::test_gpt_speculative_decoding[gpt-speculative-deco
382381
triton_server/test_triton.py::test_qwen2_vl[qwen2_vl] SKIP
383382
triton_server/test_triton.py::test_gpt_ib_speculative_decoding_bls[gpt-ib-speculative-decoding-bls] SKIP
384383
triton_server/test_triton_llm.py::test_mistral_v1_multi_models[False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization-4096--1-1-1-False-ensemble] SKIP
385-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
386-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
387-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
388-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5285965)
389-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5285965)
390-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
391-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
392-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
393-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5285965)
394-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5285965)
395-
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[latency] SKIP (https://nvbugs/5285965)
396384
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram SKIP (https://nvbugspro.nvidia.com/bug/5324239)
397385
examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int4-float16] SKIP (https://nvbugs/5289523)
398386
examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int8-float16] SKIP (https://nvbugs/5289523)
@@ -438,8 +426,6 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype
438426
test_e2e.py::test_openai_multi_chat_example SKIP (https://nvbugs/5236980)
439427
test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8] SKIP (https://nvbugs/5318059)
440428
test_e2e.py::test_ptq_quickstart_advanced_ngram[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct] SKIP (https://nvbugspro.nvidia.com/bug/5324239)
441-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5318087)
442-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5318087)
443429
unittest/_torch/auto_deploy/integration/test_ad_build.py SKIP (https://nvbugs/5318103)
444430
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp2pp2-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5318143)
445431
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp2pp2-attn_backend=TRTLLM-torch_compile=True] SKIP (https://nvbugs/5318143)

0 commit comments

Comments
 (0)