
Commit 6e57bb0

Merge branch 'main' into clean_cuda_graph

2 parents 84ad219 + e30d9ac

22 files changed: +1334 additions, -273 deletions

README.md

Lines changed: 2 additions & 2 deletions
@@ -7,8 +7,8 @@ TensorRT-LLM
 [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
 [![python](https://img.shields.io/badge/python-3.12-green)](https://www.python.org/downloads/release/python-3123/)
 [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
-[![cuda](https://img.shields.io/badge/cuda-12.9.1-green)](https://developer.nvidia.com/cuda-downloads)
-[![trt](https://img.shields.io/badge/TRT-10.11.0-green)](https://developer.nvidia.com/tensorrt)
+[![cuda](https://img.shields.io/badge/cuda-13.0.0-green)](https://developer.nvidia.com/cuda-downloads)
+[![trt](https://img.shields.io/badge/TRT-10.13.2-green)](https://developer.nvidia.com/tensorrt)
 [![version](https://img.shields.io/badge/release-1.1.0rc6-green)](./tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

cpp/tensorrt_llm/deep_ep/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-set(DEEP_EP_COMMIT 515a311f290eb6d9592fcccfcc80c40f5123ca72)
+set(DEEP_EP_COMMIT be2582ffe69b5e7d61c3bc9bf7a5316bc48261f9)
 set(NVSHMEM_URL_HASH
     SHA256=eb2c8fb3b7084c2db86bd9fd905387909f1dfd483e7b45f7b3c3d5fcf5374b5a)

cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h

Lines changed: 1 addition & 0 deletions
@@ -553,6 +553,7 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface
         || std::is_same_v<T, __nv_fp8_e5m2>) &&!std::is_same_v<WeightType, cutlass::uint4b_t>;
     static constexpr bool use_w4afp8
         = std::is_same_v<WeightType, cutlass::uint4b_t> && std::is_same_v<T, __nv_fp8_e4m3>;
+    static constexpr bool use_fp8_input = std::is_same_v<InputType, __nv_fp8_e4m3>;
     static_assert(!std::is_same_v<BackBoneType, __nv_fp8_e4m3>, "Current logic requires backbone type to be >=16-bits");
     static_assert(!std::is_same_v<OutputType, __nv_fp8_e4m3>, "Current logic requires output type to be >=16-bits");
 #else

cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu

Lines changed: 3 additions & 2 deletions
@@ -1625,7 +1625,7 @@ void expandInputRowsKernelLauncher(InputActivationsType const* unpermuted_input,
     else if constexpr (std::is_same_v<ExpandedActivationsType, __nv_fp8_e4m3>
         && std::is_same_v<InputActivationsType, __nv_fp8_e4m3>)
     {
-        TLLM_CHECK_WITH_INFO(!prequant_scales, "NVFP4 is not supported for AWQ");
+        TLLM_CHECK_WITH_INFO(!prequant_scales, "FP8 is not supported for AWQ");
         return quant_params.mxfp8_mxfp4.fc1.weight_block_scale
             ? &expandInputRowsKernel<InputActivationsType, ExpandedActivationsType,
                 TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType::MXFPX, false>
@@ -3689,7 +3689,7 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enab
         permuted_token_final_scales_, permuted_row_to_unpermuted_row_, num_rows, hidden_size, experts_per_token,
         num_experts_per_node, quant_params, use_per_expert_act_scale, expert_first_token_offset_,
         fc1_fp4_act_scale_, input_sf, swizzled_input_sf,
-        use_w4afp8 ? quant_params.groupwise.fc1.act_scales : nullptr, stream);
+        (use_w4afp8 && !use_fp8_input) ? quant_params.groupwise.fc1.act_scales : nullptr, stream);
     auto const* gemm1_input = gemm1_input_expand;
 
     sync_check_cuda_error(stream);
@@ -4755,6 +4755,7 @@ template class CutlassMoeFCRunner<__nv_fp8_e4m3, cutlass::uint4b_t, half, half>;
 template class CutlassMoeFCRunner<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16>;
 template class CutlassMoeFCRunner<__nv_bfloat16, __nv_fp8_e4m3, __nv_bfloat16>;
 template class CutlassMoeFCRunner<__nv_fp8_e4m3, cutlass::uint4b_t, __nv_bfloat16, __nv_bfloat16>;
+template class CutlassMoeFCRunner<__nv_fp8_e4m3, cutlass::uint4b_t, __nv_bfloat16, __nv_fp8_e4m3>;
 #endif
 #endif
 #ifdef ENABLE_FP4
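
Note on the change above: the new use_fp8_input trait declared in moe_kernels.h combines with use_w4afp8 to gate the groupwise fc1 activation scales, so a W4A8 runner whose input is already FP8 no longer forwards act scales to the expand kernel. Below is a minimal standalone sketch of that compile-time gate; the types fp8_e4m3_t and uint4b_t are hypothetical stand-ins for __nv_fp8_e4m3 and cutlass::uint4b_t, not the real CUDA/CUTLASS types.

// Minimal sketch (stand-in types) of the (use_w4afp8 && !use_fp8_input) gate used above.
#include <iostream>
#include <type_traits>

// Hypothetical stand-ins for __nv_fp8_e4m3 and cutlass::uint4b_t.
struct fp8_e4m3_t {};
struct uint4b_t {};

template <typename T, typename WeightType, typename InputType>
struct MoeTraits
{
    static constexpr bool use_w4afp8
        = std::is_same_v<WeightType, uint4b_t> && std::is_same_v<T, fp8_e4m3_t>;
    static constexpr bool use_fp8_input = std::is_same_v<InputType, fp8_e4m3_t>;

    // Activation scales are only needed when the input still has to be quantized on the fly,
    // i.e. when it is not FP8 yet.
    static constexpr bool pass_act_scales = use_w4afp8 && !use_fp8_input;
};

int main()
{
    // Non-FP8 input (float as a stand-in) into a W4A8 runner: scales are forwarded.
    std::cout << MoeTraits<fp8_e4m3_t, uint4b_t, float>::pass_act_scales << '\n';      // prints 1
    // FP8 input into a W4A8 runner (the newly instantiated combination): they are not.
    std::cout << MoeTraits<fp8_e4m3_t, uint4b_t, fp8_e4m3_t>::pass_act_scales << '\n'; // prints 0
}

With these stand-ins, the non-FP8-input instantiation prints 1 (scales forwarded) and the new FP8-input instantiation prints 0 (scales dropped), mirroring the call-site change above.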

cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp

Lines changed: 45 additions & 47 deletions
@@ -21,7 +21,8 @@
 #include "tensorrt_llm/common/envUtils.h"
 #include "trtllmGen_bmm_export/BatchedGemmInterface.h"
 #include "trtllmGen_bmm_export/trtllm/gen/DtypeDecl.h"
-// DO NOT include logger.h before BatchedGemmInterface.h as it #undef TLLM_LOG_INFO and co.
+// DO NOT include cudaUtils.h and logger.h before BatchedGemmInterface.h as it #undef TLLM_LOG_INFO and co.
+#include "tensorrt_llm/common/cudaUtils.h"
 #include "tensorrt_llm/common/logger.h"
 
 namespace tensorrt_llm
@@ -306,6 +307,8 @@ std::vector<int64_t> TrtllmGenBatchedGemmRunner::getValidConfigIndices(int32_t m
     auto const bmm = BatchedGemmInterface();
     auto const configs = bmm.getBatchedGemmConfigs();
 
+    int32_t multiProcessorCount = tensorrt_llm::common::getMultiProcessorCount();
+
     BatchedGemmData gemmData;
     // Dims
     gemmData.mProblemDimensions.mNumBatches = numBatches;
@@ -319,73 +322,68 @@ std::vector<int64_t> TrtllmGenBatchedGemmRunner::getValidConfigIndices(int32_t m
     gemmData.mProblemDimensions.mRank = 0;
     gemmData.mProblemDimensions.mWorldSize = 1;
     gemmData.mProblemDimensions.mMaxNumCtasInTokenDim = maxNumCtasInBatchDim;
-    // Tier 0: K < tileK, prefer higher efficiency.
-    auto cmpTier0 = [&configs, &gemmData](int64_t idx0, int64_t idx1)
+    auto cmpFunc = [&configs, &gemmData, &bmm, &multiProcessorCount](int64_t idx0, int64_t idx1)
     {
         auto const& optionsA = configs[idx0].mOptions;
         auto const& optionsB = configs[idx1].mOptions;
         int32_t sizeK = gemmData.mProblemDimensions.mK;
-        // Both waste computation, prefer higher efficiency.
-        if (sizeK <= optionsA.mTileK && sizeK <= optionsB.mTileK)
-        {
-            double eff_a = (double) sizeK / optionsA.mTileK;
-            double eff_b = (double) sizeK / optionsB.mTileK;
-            return eff_a > eff_b;
-        }
-        // If either can be utilized, sort by tileK.
-        else
+
+        // Tier 0: K < tileK, prefer higher efficiency.
+        if (optionsA.mTileK != optionsB.mTileK)
        {
-            return optionsA.mTileK > optionsB.mTileK;
+            // Both waste computation, prefer higher efficiency.
+            if (sizeK <= optionsA.mTileK && sizeK <= optionsB.mTileK)
+            {
+                double eff_a = (double) sizeK / optionsA.mTileK;
+                double eff_b = (double) sizeK / optionsB.mTileK;
+                return eff_a > eff_b;
+            }
+            // If either can be utilized, sort by tileK.
+            else
+            {
+                return optionsA.mTileK > optionsB.mTileK;
+            }
         }
-    };
-    // Tier 1: When tileK is the same, prefer unroll loop 2x for mma.
-    auto cmpTier1 = [&configs](int64_t idx0, int64_t idx1)
-    {
-        auto const& optionsA = configs[idx0].mOptions;
-        auto const& optionsB = configs[idx1].mOptions;
-        if (optionsA.mTileK == optionsB.mTileK)
+
+        // Tier 1: When tileK is the same, prefer unroll loop 2x for mma.
+        if (optionsA.mUseUnrollLoop2xForMma != optionsB.mUseUnrollLoop2xForMma)
         {
             return optionsA.mUseUnrollLoop2xForMma;
         }
-        return false;
-    };
-    // Tier 2+: When previous comparators are the same, prefer higher tileM.
-    auto cmpTier2 = [&configs](int64_t idx0, int64_t idx1)
-    {
-        auto const& optionsA = configs[idx0].mOptions;
-        auto const& optionsB = configs[idx1].mOptions;
-        if (optionsA.mTileK == optionsB.mTileK && optionsA.mUseUnrollLoop2xForMma == optionsB.mUseUnrollLoop2xForMma)
+
+        // Tier 2+: When previous comparators are the same, prefer higher tileM.
+        if (optionsA.mTileM != optionsB.mTileM)
         {
             return optionsA.mTileM > optionsB.mTileM;
         }
-        return false;
-    };
-    // Tier 2+: When previous comparators are the same, and when number of estimated CTAs is on the larger side, prefer
-    // persistent tile scheduler. The threshold is hardcoded as >148 CTAs at the moment.
-    auto cmpTier3 = [&configs, &gemmData](int64_t idx0, int64_t idx1)
-    {
-        int32_t sizeM = gemmData.mProblemDimensions.mM;
-        int32_t sizeN = gemmData.mProblemDimensions.mN;
-        auto const& optionsA = configs[idx0].mOptions;
-        auto const& optionsB = configs[idx1].mOptions;
-        if (optionsA.mTileK == optionsB.mTileK && optionsA.mUseUnrollLoop2xForMma == optionsB.mUseUnrollLoop2xForMma
-            && optionsA.mTileM == optionsB.mTileM)
+
+        // Tier 2+: When previous comparators are the same, prefer higher tileN.
+        if (optionsA.mTileN != optionsB.mTileN)
+        {
+            return optionsA.mTileN > optionsB.mTileN;
+        }
+
+        // Tier 2+: When previous comparators are the same, and when the number of estimated CTAs is on the larger side,
+        // prefer persistent tile scheduler.
+        if (optionsA.mTileScheduler != optionsB.mTileScheduler)
         {
-            int64_t numTilesM = divUp(sizeM, optionsA.mTileM);
-            int64_t numTilesN = divUp(sizeN, optionsA.mTileN);
-            if (numTilesM * numTilesN > 148)
+            auto options = bmm.getOptionsFromConfigAndData(configs[idx0], gemmData);
+            auto numCtas = bmm.getNumCtas(options, gemmData.mProblemDimensions.mMaxNumCtasInTokenDim);
+            if (numCtas > multiProcessorCount)
             {
                 return optionsA.mTileScheduler == batchedGemm::gemm::TileScheduler::Persistent;
             }
+            else
+            {
+                return optionsB.mTileScheduler == batchedGemm::gemm::TileScheduler::Persistent;
+            }
         }
+
         return false;
     };
     // Sort configs by options.
     std::vector<int64_t> sortedIndices = mPassingConfigIndices;
-    std::sort(sortedIndices.begin(), sortedIndices.end(), cmpTier0);
-    std::sort(sortedIndices.begin(), sortedIndices.end(), cmpTier1);
-    std::sort(sortedIndices.begin(), sortedIndices.end(), cmpTier2);
-    std::sort(sortedIndices.begin(), sortedIndices.end(), cmpTier3);
+    std::sort(sortedIndices.begin(), sortedIndices.end(), cmpFunc);
 
     // Special rules for corner cases, if applicable.
     std::vector<int64_t> prioritizedIndices = prioritizePredefinedConfigs(m, n, k, sortedIndices, configs);
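
Note on the change above: the four separate cmpTier* sort passes are folded into a single comparator, and the hardcoded >148-CTA threshold is replaced by comparing the estimated CTA count (bmm.getNumCtas on options built via getOptionsFromConfigAndData) against the device's SM count from getMultiProcessorCount. A condensed sketch of the resulting tier ordering follows; the TileOptions struct, the sortConfigs helper, and the single problem-level estimatedNumCtas parameter are simplifications for a self-contained example, not the real interfaces.

// Condensed sketch of the consolidated comparator (stand-in structs; the real code builds
// options via bmm.getOptionsFromConfigAndData(), estimates CTAs with bmm.getNumCtas(), and
// reads the SM count from tensorrt_llm::common::getMultiProcessorCount()).
#include <algorithm>
#include <cstdint>
#include <vector>

enum class TileScheduler { Static, Persistent };

struct TileOptions
{
    int32_t tileM, tileN, tileK;
    bool useUnrollLoop2xForMma;
    TileScheduler tileScheduler;
};

std::vector<int64_t> sortConfigs(
    std::vector<TileOptions> const& configs, int32_t sizeK, int32_t estimatedNumCtas, int32_t smCount)
{
    std::vector<int64_t> indices(configs.size());
    for (size_t i = 0; i < indices.size(); ++i)
        indices[i] = static_cast<int64_t>(i);

    auto cmp = [&](int64_t a, int64_t b)
    {
        auto const& oa = configs[a];
        auto const& ob = configs[b];
        // Tier 0: when both tiles exceed K, prefer the one wasting less work; otherwise the larger tileK.
        if (oa.tileK != ob.tileK)
        {
            if (sizeK <= oa.tileK && sizeK <= ob.tileK)
                return (double) sizeK / oa.tileK > (double) sizeK / ob.tileK;
            return oa.tileK > ob.tileK;
        }
        // Tier 1: prefer the 2x-unrolled MMA loop.
        if (oa.useUnrollLoop2xForMma != ob.useUnrollLoop2xForMma)
            return oa.useUnrollLoop2xForMma;
        // Tier 2+: prefer larger tileM, then larger tileN.
        if (oa.tileM != ob.tileM)
            return oa.tileM > ob.tileM;
        if (oa.tileN != ob.tileN)
            return oa.tileN > ob.tileN;
        // Last tier: prefer the persistent scheduler only when the grid oversubscribes the SMs.
        if (oa.tileScheduler != ob.tileScheduler)
        {
            bool oversubscribed = estimatedNumCtas > smCount;
            return oversubscribed ? oa.tileScheduler == TileScheduler::Persistent
                                  : ob.tileScheduler == TileScheduler::Persistent;
        }
        return false; // equivalent under every tier
    };
    std::sort(indices.begin(), indices.end(), cmp);
    return indices;
}

A single problem-level CTA estimate is used here only to keep the sketch self-contained; the real comparator queries the estimate per config, and configs that tie on every earlier tier already share the same tile shape.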

cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h

Lines changed: 3 additions & 2 deletions
@@ -524,12 +524,13 @@ class BatchedGemmInterface
     // Returns true if the configuration of the cubin can be executed for the given params.
     bool isValidConfig(BatchedGemmConfig const& config, BatchedGemmData const& data) const;
 
+    // Creates GemmOptions from kernel and data.
+    BatchedGemmOptions getOptionsFromConfigAndData(BatchedGemmConfig const& config, BatchedGemmData const& data) const;
+
 private:
     // Aligns the pointer to the alignment
     template <typename Dtype>
     inline Dtype* alignPtr(Dtype* ptr, int64_t alignment) const;
-    // Creates GemmOptions from kernel and data.
-    BatchedGemmOptions getOptionsFromConfigAndData(BatchedGemmConfig const& config, BatchedGemmData const& data) const;
 
     // Returns the size of the workspace buffers in bytes
     std::vector<size_t> getWorkspaceSizesInBytes(BatchedGemmConfig const& config, BatchedGemmData const& data) const;

cpp/tensorrt_llm/thop/moeOp.cpp

Lines changed: 15 additions & 0 deletions
@@ -201,6 +201,21 @@ class FusedMoeRunner : public torch::CustomClassHolder
         }
         switch (mActivationDtype)
         {
+#ifdef ENABLE_FP8
+        case c10::ScalarType::Float8_e4m3fn:
+        {
+            if (isInt4Quant() and mUseW4GroupScaling)
+            {
+                mKernelRunner = std::make_unique<
+                    kernels::CutlassMoeFCRunner<__nv_fp8_e4m3, cutlass::uint4b_t, __nv_bfloat16, __nv_fp8_e4m3>>();
+            }
+            else
+            {
+                C10_THROW_ERROR_FORMATTED(Error, "FP8 activation type is not supported for non-W4A8 quantization");
+            }
+            break;
+        }
+#endif
         case c10::ScalarType::Half: mKernelRunner = create_weight_quant_runner<half>(); break;
         case c10::ScalarType::BFloat16: mKernelRunner = create_weight_quant_runner<__nv_bfloat16>(); break;
         default: C10_THROW_ERROR_FORMATTED(Error, "Unsupported activation type for int-type weight");
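
Note on the change above: FP8 activations are now accepted, but only on the W4A8 group-scaling path, where the constructor picks the FP8-input CutlassMoeFCRunner instantiation added in moe_kernels.cu; every other quantization mode still rejects FP8 activations. Below is a condensed sketch of that dispatch rule; the ActivationDtype enum, the pickWeightQuantRunner helper, and the returned strings are hypothetical stand-ins for the c10 scalar types and the templated runner instantiations.

// Condensed sketch of the new dispatch rule (stand-in enum and strings, not the real torch op).
#include <stdexcept>
#include <string>

enum class ActivationDtype { Half, BFloat16, Fp8E4m3 };

// Hypothetical helper mirroring the switch in FusedMoeRunner's constructor.
std::string pickWeightQuantRunner(ActivationDtype act, bool isInt4Quant, bool useW4GroupScaling)
{
    switch (act)
    {
    case ActivationDtype::Fp8E4m3:
        // FP8 activations are only accepted on the W4A8 group-scaling path, which maps to
        // CutlassMoeFCRunner<__nv_fp8_e4m3, cutlass::uint4b_t, __nv_bfloat16, __nv_fp8_e4m3>.
        if (isInt4Quant && useW4GroupScaling)
            return "w4a8_group_scaling_runner_with_fp8_input";
        throw std::invalid_argument("FP8 activation type is not supported for non-W4A8 quantization");
    case ActivationDtype::Half: return "weight_quant_runner<half>";
    case ActivationDtype::BFloat16: return "weight_quant_runner<bfloat16>";
    }
    throw std::invalid_argument("Unsupported activation type for int-type weight");
}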
(Three additional large files, 4.43 MB, 933 KB, and 255 KB, are not rendered in this view.)
