@@ -22,6 +22,9 @@ namespace turbomind {
 template <typename T>
 GuidedDecodeMaskLayer<T>::GuidedDecodeMaskLayer(const BaseParam& param): BaseDynamicDecodeLayer{param}
 {
+    const auto bitmask_size = xgrammar::GetBitmaskSize(vocab_size_padded_);
+    bitmask_buf_ = {{max_batch_size_, bitmask_size}, kCPU};
+    bitmask_ = {{max_batch_size_, bitmask_size}, kDEVICE};
 }

 template <typename T>
@@ -42,16 +45,14 @@ void GuidedDecodeMaskLayer<T>::Forward(TensorMap& args)
     Tensor_<float> logits = args.at("logits");
     const ssize_t  bsz    = logits.shape(0);

-    FT_CHECK(bsz == matchers_.size());
+    TM_CHECK(bsz == matchers_.size());

-    const auto           bitmask_size = xgrammar::GetBitmaskSize(vocab_size_padded_);
-    Tensor_<int32_t>     bitmask{{bsz, bitmask_size}, kCPU};
-    Tensor_<int32_t>     bitmask_device{{bsz, bitmask_size}, kDEVICE};
+    const auto           bitmask_size = bitmask_buf_.shape(1);
     std::vector<int64_t> bitmask_shape = {bsz, bitmask_size};

-    DLTensor bitmask_dltensor{bitmask.data(),
+    DLTensor bitmask_dltensor{bitmask_buf_.data(),
                               DLDevice{kDLCPU, 0},
-                              bitmask.ndim(),
+                              bitmask_buf_.ndim(),
                               xgrammar::GetBitmaskDLType(),
                               bitmask_shape.data(),
                               nullptr,
@@ -66,8 +67,8 @@ void GuidedDecodeMaskLayer<T>::Forward(TensorMap& args)
     }

     if (need_apply) {
-        Copy(bitmask, bitmask_device);
-        ApplyTokenBitmaskInplace(logits, bitmask_device);
+        Copy(bitmask_buf_, bitmask_);
+        ApplyTokenBitmaskInplace(logits, bitmask_.slice(0, bsz));
     }
 }

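For context: this change moves the bitmask allocation out of Forward and into the constructor, so the host staging buffer (bitmask_buf_, on CPU) and its device counterpart (bitmask_) are created once at max_batch_size_ rows instead of being reallocated for every batch; Forward then fills the buffer and applies only the first bsz rows to the logits via bitmask_.slice(0, bsz). A minimal sketch of the member declarations this implies, assuming the element type carries over from the removed Tensor_<int32_t> locals (the header itself is not part of this diff):

    // Hypothetical sketch of the new members; not shown in this diff.
    Tensor_<int32_t> bitmask_buf_;  // host-side bitmask, shape {max_batch_size_, GetBitmaskSize(vocab_size_padded_)}
    Tensor_<int32_t> bitmask_;      // device-side bitmask consumed by ApplyTokenBitmaskInplace

Because the device buffer is now sized for the maximum batch, the explicit slice(0, bsz) is what keeps ApplyTokenBitmaskInplace restricted to the rows actually filled for the current batch.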