Skip to content

Commit 73a8c90

Browse files
committed
rebase
Signed-off-by: junq <[email protected]>
2 parents: fd5d5cb + 2923eb8 — commit 73a8c90

File tree

34 files changed

+1251
-424
lines changed

34 files changed

+1251
-424
lines changed

cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -75,27 +75,19 @@ class CreateNewDecoderRequests : Algorithm
7575
std::vector<executor::LookaheadDecodingConfig>>
7676
operator()(runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
7777
executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests,
78-
runtime::BufferManager const& bufferManager, nvinfer1::DataType logitsType, DecoderInputBuffers& inputBuffers,
79-
runtime::decoder::DecoderState& decoderState, CudaStream const& runtimeStream, CudaStream const& decoderStream,
80-
SizeType32 maxSequenceLength, SizeType32 beamWidth, OptionalRef<MedusaBuffers const> medusaBuffers) const;
78+
nvinfer1::DataType logitsType, DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState,
79+
CudaStream const& runtimeStream, CudaStream const& decoderStream, SizeType32 maxSequenceLength,
80+
SizeType32 beamWidth, OptionalRef<MedusaBuffers const> medusaBuffers) const;
8181

8282
[[nodiscard]] std::tuple<std::vector<runtime::ITensor::SharedConstPtr>,
8383
std::vector<executor::LookaheadDecodingConfig>>
8484
createDecoderRequests(RequestVector const& finishedContextRequests, TensorPtr const& inputIds,
8585
executor::DecodingConfig const& decodingConfig, runtime::decoder::DecoderState& decoderState,
86-
runtime::BufferManager const& bufferManager, nvinfer1::DataType logitsType,
87-
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
86+
nvinfer1::DataType logitsType, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
8887
runtime::CudaStream const& runtimeStream, runtime::CudaStream const& decoderStream,
8988
SizeType32 maxSequenceLength, OptionalRef<MedusaBuffers const> medusaBuffers) const;
9089

9190
private:
92-
//! @brief Initialize the decoder at `batchSlot` with a new `request`. Exposed only for static batching via
93-
//! GptDecoderBatched::newBatch()
94-
static void newRequest(SizeType32 batchSlot, runtime::decoder_batch::Request const& request,
95-
SamplingConfig const& samplingConfig, runtime::ModelConfig const& modelConfig,
96-
runtime::decoder::DecoderState& decoderState, CudaStream const& runtimeStream, CudaStream const& decoderStream,
97-
SizeType32 maxSequenceLength);
98-
9991
//! @brief Setups decoder internal tensors for new speculative decoding request
10092
static void newRequestSpeculativeDecoding(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
10193
SamplingConfig const& samplingConfig, runtime::ModelConfig const& modelConfig,

cpp/include/tensorrt_llm/runtime/decoderState.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,11 @@ class DecoderState
173173
//! @brief Workspace for beam search in streaming mode.
174174
[[nodiscard]] BeamSearchBuffers const& getBeamSearchBuffers() const;
175175

176+
//! @brief Set the beam width for a specific request in the batch.
177+
//! @param batchIdx The index of the request in the batch.
178+
//! @param beamWidth The beam width for the specified request.
179+
void setBeamWidth(SizeType32 batchIdx, SizeType32 beamWidth);
180+
176181
//! @brief Cache indirection input for beam search.
177182
[[nodiscard]] TensorPtr getCacheIndirectionInput() const;
178183

cpp/include/tensorrt_llm/runtime/request.h

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -31,26 +31,16 @@ class Request
3131
using TensorPtr = ITensor::SharedPtr;
3232
using BufferPtr = IBuffer::SharedPtr;
3333

34-
explicit Request(TensorConstPtr ids, SizeType32 inputLen, std::optional<SizeType32> maxNewTokens = std::nullopt,
35-
std::optional<SizeType32> endId = std::nullopt)
36-
: ids{std::move(ids)}
37-
, inputLen(inputLen)
38-
, maxNewTokens{maxNewTokens}
39-
, endId{endId}
34+
explicit Request(SizeType32 inputLen)
35+
: inputLen(inputLen)
4036
{
4137
}
4238

4339
//! Mandatory parameters
44-
TensorConstPtr ids; // The input sequence of token ids, [inputSeqLen], on gpu
4540
SizeType32 inputLen; // Input length without draft tokens, increasing with generation steps
4641

4742
// optional parameters
48-
std::optional<SizeType32> maxNewTokens; // maximum number of tokens to generate for this request
49-
std::optional<SizeType32> endId; // end token id
5043
SizeType32 generatedTokensPerEngineStep{1}; //
51-
TensorPtr embeddingBias; // [vocabSizePadded], on gpu
52-
TensorPtr badWordsList; // [2, badWordsLength] on gpu
53-
TensorPtr stopWordsList; // [2, stopWordsLength] on gpu
5444

5545
//! Optional parameters for speculative decoding
5646
BufferPtr draftTokens; // [generatedTokensPerEngineStep - 1] on gpu

cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp

Lines changed: 210 additions & 176 deletions
Large diffs are not rendered by default.

cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1866,9 +1866,9 @@ void TrtGptModelInflightBatching::setupDecoderStep(
18661866
auto const logitsType = mRuntime->getEngine().getTensorDataType("logits");
18671867

18681868
auto [batchSlots, samplingConfigs, lookaheadPrompt, lookaheadAlgoConfigs]
1869-
= (*mCreateNewDecoderRequests)(mModelConfig, mWorldConfig, mDecodingConfig, contextRequests,
1870-
mRuntime->getBufferManager(), logitsType, inputBuffers, *mDecoderState, mRuntime->getStream(),
1871-
*mDecoder->getDecoderStream(), getMaxSequenceLen(), mOperatingBeamWidth, buffers.mMedusaBuffers);
1869+
= (*mCreateNewDecoderRequests)(mModelConfig, mWorldConfig, mDecodingConfig, contextRequests, logitsType,
1870+
inputBuffers, *mDecoderState, mRuntime->getStream(), *mDecoder->getDecoderStream(), getMaxSequenceLen(),
1871+
mOperatingBeamWidth, buffers.mMedusaBuffers);
18721872

18731873
auto const localBatchSize = batchSlots->getSize();
18741874
if (localBatchSize > 0)

cpp/tensorrt_llm/nanobind/batch_manager/algorithms.cpp

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -103,23 +103,21 @@ void tensorrt_llm::nanobind::batch_manager::algorithms::initBindings(nb::module_
103103
"__call__",
104104
[](CreateNewDecoderRequests& self, tr::ModelConfig const& modelConfig, tr::WorldConfig const& worldConfig,
105105
executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests,
106-
tr::BufferManager const& bufferManager, nvinfer1::DataType logitsType,
107-
DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState,
108-
tensorrt_llm::runtime::CudaStream const& runtimeStream,
106+
nvinfer1::DataType logitsType, DecoderInputBuffers& inputBuffers,
107+
runtime::decoder::DecoderState& decoderState, tensorrt_llm::runtime::CudaStream const& runtimeStream,
109108
tensorrt_llm::runtime::CudaStream const& decoderStream, SizeType32 maxSequenceLength,
110109
SizeType32 beamWidth)
111110
{
112111
OptionalRef<MedusaBuffers const> medusaBuffers = std::nullopt;
113-
auto [batchSlots, samplingConfigs, lookaheadPrompt, lookaheadAlgoConfigs] = self(modelConfig,
114-
worldConfig, decodingConfig, contextRequests, bufferManager, logitsType, inputBuffers, decoderState,
115-
runtimeStream, decoderStream, maxSequenceLength, beamWidth, medusaBuffers);
112+
auto [batchSlots, samplingConfigs, lookaheadPrompt, lookaheadAlgoConfigs]
113+
= self(modelConfig, worldConfig, decodingConfig, contextRequests, logitsType, inputBuffers,
114+
decoderState, runtimeStream, decoderStream, maxSequenceLength, beamWidth, medusaBuffers);
116115

117116
return std::tuple{runtime::Torch::tensor(batchSlots), std::move(samplingConfigs),
118117
std::move(lookaheadPrompt), std::move(lookaheadAlgoConfigs)};
119118
},
120119
nb::arg("model_config"), nb::arg("world_config"), nb::arg("decoding_config"), nb::arg("context_requests"),
121-
nb::arg("buffer_manager"), nb::arg("logits_type"), nb::arg("decoder_input_buffers"),
122-
nb::arg("decoder_state"), nb::arg("runtime_stream"), nb::arg("decoder_stream"),
123-
nb::arg("max_sequence_length"), nb::arg("beam_width"))
120+
nb::arg("logits_type"), nb::arg("decoder_input_buffers"), nb::arg("decoder_state"),
121+
nb::arg("runtime_stream"), nb::arg("decoder_stream"), nb::arg("max_sequence_length"), nb::arg("beam_width"))
124122
.def("name", [](CreateNewDecoderRequests const&) { return CreateNewDecoderRequests::name; });
125123
}

cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -105,23 +105,21 @@ void tensorrt_llm::pybind::batch_manager::algorithms::initBindings(pybind11::mod
105105
"__call__",
106106
[](CreateNewDecoderRequests& self, tr::ModelConfig const& modelConfig, tr::WorldConfig const& worldConfig,
107107
executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests,
108-
tr::BufferManager const& bufferManager, nvinfer1::DataType logitsType,
109-
DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState,
110-
tensorrt_llm::runtime::CudaStream const& runtimeStream,
108+
nvinfer1::DataType logitsType, DecoderInputBuffers& inputBuffers,
109+
runtime::decoder::DecoderState& decoderState, tensorrt_llm::runtime::CudaStream const& runtimeStream,
111110
tensorrt_llm::runtime::CudaStream const& decoderStream, SizeType32 maxSequenceLength,
112111
SizeType32 beamWidth)
113112
{
114113
OptionalRef<MedusaBuffers const> medusaBuffers = std::nullopt;
115-
auto [batchSlots, samplingConfigs, lookaheadPrompt, lookaheadAlgoConfigs] = self(modelConfig,
116-
worldConfig, decodingConfig, contextRequests, bufferManager, logitsType, inputBuffers, decoderState,
117-
runtimeStream, decoderStream, maxSequenceLength, beamWidth, medusaBuffers);
114+
auto [batchSlots, samplingConfigs, lookaheadPrompt, lookaheadAlgoConfigs]
115+
= self(modelConfig, worldConfig, decodingConfig, contextRequests, logitsType, inputBuffers,
116+
decoderState, runtimeStream, decoderStream, maxSequenceLength, beamWidth, medusaBuffers);
118117

119118
return std::tuple{runtime::Torch::tensor(batchSlots), std::move(samplingConfigs),
120119
std::move(lookaheadPrompt), std::move(lookaheadAlgoConfigs)};
121120
},
122121
py::arg("model_config"), py::arg("world_config"), py::arg("decoding_config"), py::arg("context_requests"),
123-
py::arg("buffer_manager"), py::arg("logits_type"), py::arg("decoder_input_buffers"),
124-
py::arg("decoder_state"), py::arg("runtime_stream"), py::arg("decoder_stream"),
125-
py::arg("max_sequence_length"), py::arg("beam_width"))
122+
py::arg("logits_type"), py::arg("decoder_input_buffers"), py::arg("decoder_state"),
123+
py::arg("runtime_stream"), py::arg("decoder_stream"), py::arg("max_sequence_length"), py::arg("beam_width"))
126124
.def("name", [](CreateNewDecoderRequests const&) { return CreateNewDecoderRequests::name; });
127125
}

cpp/tensorrt_llm/runtime/decoderState.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -644,6 +644,11 @@ void DecoderState::setGenerationSteps(std::vector<SizeType32> const& generationS
644644
mJointDecodingInput->generationSteps = generationSteps;
645645
}
646646

647+
void DecoderState::setBeamWidth(SizeType32 batchIdx, SizeType32 beamWidth)
648+
{
649+
mJointDecodingInput->beamWidths.at(batchIdx) = beamWidth;
650+
}
651+
647652
DecodingInput& DecoderState::getJointDecodingInput() const
648653
{
649654
return *mJointDecodingInput;

cpp/tests/runtime/gptDecoderBatchedTest.cpp

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -104,15 +104,14 @@ void newRequests(std::vector<std::shared_ptr<tb::LlmRequest>> const& requests, T
104104
SizeType32 maxSequenceLength, tb::DecoderInputBuffers& inputBuffers, decoder::DecoderState& decoderState)
105105
{
106106
auto const& decoderStream = *decoder.getDecoderStream();
107-
auto const bufferManager = BufferManager{std::make_shared<CudaStream>(runtimeStream.get())};
108107

109108
auto batchSlotsRange = BufferRange<SizeType32>(*batchSlots);
110109
auto const localBatchSize = batchSlots->getSize();
111110

112111
tb::CreateNewDecoderRequests createNewDecoderRequests(false, false, false);
113-
auto [lookaheadPrompt, lookaheadAlgoConfigs] = createNewDecoderRequests.createDecoderRequests(requests,
114-
inputBuffers.inputsIds, decodingConfig, decoderState, bufferManager, logitsType, modelConfig, worldConfig,
115-
runtimeStream, decoderStream, maxSequenceLength, std::nullopt);
112+
auto [lookaheadPrompt, lookaheadAlgoConfigs]
113+
= createNewDecoderRequests.createDecoderRequests(requests, inputBuffers.inputsIds, decodingConfig, decoderState,
114+
logitsType, modelConfig, worldConfig, runtimeStream, decoderStream, maxSequenceLength, std::nullopt);
116115

117116
std::vector<SamplingConfig> samplingConfigs;
118117
samplingConfigs.reserve(requests.size());

examples/auto_deploy/build_and_run_ad.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,6 @@ class PromptConfig(BaseModel):
4141
"In simple words and in a single sentence, explain the concept of gravity: ",
4242
"How to fix slicing in golf? ",
4343
"Where is the capital of Iceland? ",
44-
"How big is the universe? ",
45-
"In simple words and in a single sentence, explain the concept of gravity: ",
46-
"How to fix slicing in golf? ",
47-
"Where is the capital of Iceland? ",
4844
]
4945
)
5046
sp_kwargs: Dict[str, Any] = Field(

0 commit comments

Comments (0)