
Commit d75e11e

test: add matrix grammar test
1 parent d562d8e commit d75e11e

7 files changed: +78 -48 lines changed

.github/workflows/unit-test.yml

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ jobs:
          python3 -m pip install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu118
      - name: Install lmdeploy
        run: |
-         python3 -m pip install pynvml packaging protobuf transformers_stream_generator matplotlib
+         python3 -m pip install pynvml packaging protobuf transformers_stream_generator matplotlib timm
          # manually install flash attn
          python3 -m pip install /root/packages/cu118/flash_attn-*.whl
          python3 -m pip install -r requirements_cuda.txt -r requirements/test.txt
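The extra timm package is presumably required at import time by the new InternVL3_5 test model introduced in tests/test_lmdeploy/test_grammar.py below; that link is an inference from this commit, not stated in it. A minimal post-install sanity check in the same style as the workflow's other commands could be:

python3 -c "import timm; print(timm.__version__)"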

CMakeLists.txt

Lines changed: 5 additions & 2 deletions
@@ -82,7 +82,7 @@ FetchContent_MakeAvailable(yaml-cpp)
 FetchContent_Declare(
   xgrammar
   GIT_REPOSITORY https://github.com/mlc-ai/xgrammar.git
-  GIT_TAG v0.1.21
+  GIT_TAG v0.1.25
   GIT_SUBMODULES "3rdparty/dlpack"
   GIT_PROGRESS TRUE
   USES_TERMINAL_DOWNLOAD TRUE
@@ -94,7 +94,10 @@ if(NOT xgrammar_POPULATED)
   # Fetch the content using previously declared details
   FetchContent_Populate(xgrammar)

-  file(WRITE ${xgrammar_SOURCE_DIR}/config.cmake "set(XGRAMMAR_BUILD_PYTHON_BINDINGS OFF)")
+  file(WRITE ${xgrammar_SOURCE_DIR}/config.cmake "set(XGRAMMAR_BUILD_PYTHON_BINDINGS OFF)\n")
+  if(NOT MSVC)
+    file(APPEND ${xgrammar_SOURCE_DIR}/config.cmake "set(CMAKE_CXX_FLAGS \"-Wno-error\")\n")
+  endif()

   # Bring the populated content into the build
   add_subdirectory(${xgrammar_SOURCE_DIR} ${xgrammar_BINARY_DIR})
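On non-MSVC toolchains, the generated config.cmake consumed by xgrammar now reads (reconstructed from the file(WRITE)/file(APPEND) calls above):

set(XGRAMMAR_BUILD_PYTHON_BINDINGS OFF)
set(CMAKE_CXX_FLAGS "-Wno-error")

Appending -Wno-error presumably keeps warnings in the newly pinned v0.1.25 sources from aborting the build when the parent project compiles with -Werror; MSVC is skipped because it does not accept that GCC/Clang flag.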

requirements/runtime_cuda.txt

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ fire
 mmengine-lite
 numpy<2.0.0
 openai
-outlines
+outlines<0.1.0
 partial_json_parser
 peft<=0.14.0
 pillow

requirements/runtime_rocm.txt

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ fire
 mmengine-lite
 numpy<2.0.0
 openai
-outlines
+outlines<0.1.0
 partial_json_parser
 peft<=0.14.0
 pillow
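Both runtime requirement files receive the same pin. Capping outlines below 0.1.0 presumably avoids the breaking API reorganization that outlines shipped in its 0.1.0 release; the commit itself does not state the reason. The specifier behaves as expected under packaging (version numbers below are illustrative):

from packaging.specifiers import SpecifierSet

# '<0.1.0' admits any pre-0.1.0 release and rejects 0.1.0 itself.
assert '0.0.46' in SpecifierSet('<0.1.0')
assert '0.1.0' not in SpecifierSet('<0.1.0')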

src/turbomind/kernels/apply_token_bitmask_inplace_cuda.cu

Lines changed: 21 additions & 6 deletions
@@ -214,19 +214,34 @@ void ApplyTokenBitmaskInplace(Tensor logits, Tensor bitmask, std::optional<Tenso

     switch (logits.dtype()) {
         case kFloat32: {
-            ApplyTokenBitmaskInplaceDispatchToPackedT(
-                logits.data<float>(), bitmask.data<int32_t>(), indices_ptr, vocab_size, 0, 0, num_rows);
+            ApplyTokenBitmaskInplaceDispatchToPackedT(logits.data<float>(),
+                                                      bitmask.data<int32_t>(),
+                                                      indices_ptr,
+                                                      vocab_size,
+                                                      logits.stride(0),
+                                                      bitmask.stride(0),
+                                                      num_rows);
             break;
         }
         case kFloat16: {
-            ApplyTokenBitmaskInplaceDispatchToPackedT(
-                logits.data<half_t>(), bitmask.data<int32_t>(), indices_ptr, vocab_size, 0, 0, num_rows);
+            ApplyTokenBitmaskInplaceDispatchToPackedT(logits.data<half_t>(),
+                                                      bitmask.data<int32_t>(),
+                                                      indices_ptr,
+                                                      vocab_size,
+                                                      logits.stride(0),
+                                                      bitmask.stride(0),
+                                                      num_rows);
             break;
         }
 #if __CUDA_ARCH__ >= 800
         case kBfloat16: {
-            ApplyTokenBitmaskInplaceDispatchToPackedT(
-                logits.data<bfloat16_t>(), bitmask.data<int32_t>(), indices_ptr, vocab_size, 0, 0, num_rows);
+            ApplyTokenBitmaskInplaceDispatchToPackedT(logits.data<bfloat16_t>(),
+                                                      bitmask.data<int32_t>(),
+                                                      indices_ptr,
+                                                      vocab_size,
+                                                      logits.stride(0),
+                                                      bitmask.stride(0),
+                                                      num_rows);
             break;
         }
 #endif
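The substantive change is passing logits.stride(0) and bitmask.stride(0) instead of hard-coded zeros, so the dispatch sees the real row pitch when the tensors are non-contiguous views. A short torch sketch of the failure mode this avoids (illustrative only, not lmdeploy code):

import torch

vocab_size = 8
buf = torch.randn(4, 16)        # padded backing buffer
logits = buf[:, :vocab_size]    # view: shape (4, 8) but stride(0) == 16

# Row i starts at i * stride(0) elements in memory, not i * vocab_size,
# so a kernel assuming a contiguous layout reads the wrong rows for i > 0.
row2 = buf.reshape(-1)[2 * logits.stride(0):2 * logits.stride(0) + vocab_size]
assert torch.equal(row2, logits[2])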

src/turbomind/python/xgrammar_bind.cpp

Lines changed: 6 additions & 13 deletions
@@ -107,15 +107,7 @@ PYBIND11_MODULE(_xgrammar, m)
             return TokenizerInfo::FromVocabAndMetadata(CommonEncodedVocabType(encoded_vocab), metadata);
         })

-        .def_static("_detect_metadata_from_hf", &TokenizerInfo::DetectMetadataFromHF)
-
-        .def("serialize_json", &TokenizerInfo::SerializeJSON)
-
-        .def_static(
-            "deserialize_json",
-            [](const std::string& str, const py::typing::List<std::variant<std::string, py::bytes>>& encoded_vocab) {
-                return TokenizerInfo::DeserializeJSON(str, CommonEncodedVocabType(encoded_vocab));
-            });
+        .def_static("_detect_metadata_from_hf", &TokenizerInfo::DetectMetadataFromHF);

     py::class_<CompiledGrammar>(m, "CompiledGrammar");

@@ -130,10 +122,11 @@ PYBIND11_MODULE(_xgrammar, m)
              &GrammarCompiler::CompileJSONSchema,
              py::call_guard<py::gil_scoped_release>(),
              py::arg("schema"),
-             py::arg("any_whitespace") = false,
-             py::arg("indent") = py::none(),
-             py::arg("separators") = py::none(),
-             py::arg("strict_mode") = true)
+             py::arg("any_whitespace") = false,
+             py::arg("indent") = py::none(),
+             py::arg("separators") = py::none(),
+             py::arg("strict_mode") = true,
+             py::arg("max_whitespace_cnt") = py::none())
         .def("compile_regex",
              &GrammarCompiler::CompileRegex,
              py::call_guard<py::gil_scoped_release>(),
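Seen from Python, the binding change means compile_json_schema gains one keyword argument. A hedged sketch of the resulting signature, with names read off the py::arg calls above and types guessed from their defaults (not official API documentation):

def compile_json_schema(schema: str,
                        any_whitespace: bool = False,
                        indent: 'int | None' = None,
                        separators: 'tuple | None' = None,
                        strict_mode: bool = True,
                        max_whitespace_cnt: 'int | None' = None) -> 'CompiledGrammar':
    ...

The serialize_json/deserialize_json bindings are dropped, presumably because the corresponding TokenizerInfo methods changed or disappeared between xgrammar v0.1.21 and the newly pinned v0.1.25.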

tests/test_lmdeploy/test_grammar.py

Lines changed: 43 additions & 24 deletions
@@ -4,20 +4,19 @@
 from jsonschema import validate

 from lmdeploy import pipeline
-from lmdeploy.messages import GenerationConfig, TurbomindEngineConfig
+from lmdeploy.messages import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig

+MODEL_IDS = [
+    'Qwen/Qwen3-0.6B',
+    'OpenGVLab/InternVL3_5-1B',
+]

-@pytest.fixture(scope='module')
-def tiny_model_id():
-    return 'internlm/internlm2_5-1_8b'
+BACKEND_FACTORIES = [
+    ('tm', lambda: TurbomindEngineConfig(max_batch_size=2, session_len=1024)),
+    ('pt', lambda: PytorchEngineConfig(max_batch_size=1, session_len=1024)),
+]

-
-@pytest.fixture(scope='module')
-def tmp_workspace(tmp_path_factory):
-    return tmp_path_factory.mktemp('tm_workspace')
-
-
-guide = {
+GUIDE_SCHEMA = {
     'type': 'object',
     'properties': {
         'name': {
@@ -29,7 +28,8 @@ def tmp_workspace(tmp_path_factory):
                 'type': 'string',
                 'maxLength': 10
             },
-            'minItems': 3
+            'minItems': 3,
+            'maxItems': 10,
         },
         'work history': {
             'type': 'array',
@@ -41,20 +41,39 @@ def tmp_workspace(tmp_path_factory):
                     },
                     'duration': {
                         'type': 'string'
-                    }
+                    },
                 },
-                'required': ['company']
-            }
-        }
+                'required': ['company'],
+            },
+        },
     },
-    'required': ['name', 'skills', 'work history']
+    'required': ['name', 'skills', 'work history'],
 }


-def test_tm_guided_pipeline(tiny_model_id):
-    pipe = pipeline(tiny_model_id,
-                    backend_config=TurbomindEngineConfig(max_batch_size=1, session_len=1024),
-                    log_level='INFO')
-    gen_config = GenerationConfig(response_format=dict(type='json_schema', json_schema=dict(name='test', schema=guide)))
-    response = pipe(['Make a self introduction please.'], gen_config=gen_config)
-    validate(instance=json.loads(response[0].text), schema=guide)
+@pytest.mark.parametrize('model_id', MODEL_IDS)
+@pytest.mark.parametrize('backend_name,backend_factory', BACKEND_FACTORIES)
+@pytest.mark.parametrize('enable_guide', [True, False])
+def test_guided_matrix(model_id, backend_name, backend_factory, enable_guide):
+    pipe = pipeline(
+        model_id,
+        backend_config=backend_factory(),
+        log_level='INFO',
+    )
+
+    try:
+        if enable_guide:
+            gen_config = GenerationConfig(response_format=dict(
+                type='json_schema',
+                json_schema=dict(name='test', schema=GUIDE_SCHEMA),
+            ), )
+        else:
+            gen_config = GenerationConfig()
+
+        response = pipe(['Make a self introduction please.'] * 3, gen_config=gen_config)
+        assert response and response[0].text
+
+        if enable_guide:
+            validate(instance=json.loads(response[0].text), schema=GUIDE_SCHEMA)
+    finally:
+        pipe.close()
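For reference, a response shaped like the following would satisfy GUIDE_SCHEMA, so the test's final validate call passes; the concrete values are invented for illustration, and GUIDE_SCHEMA is assumed to be in scope (e.g., imported from this test module):

import json
from jsonschema import validate

sample = {
    'name': 'Alice',
    'skills': ['python', 'cuda', 'cmake'],  # 3-10 items, each at most 10 chars
    'work history': [{'company': 'Acme', 'duration': '2 years'}],
}
validate(instance=sample, schema=GUIDE_SCHEMA)  # raises ValidationError on mismatch
print(json.dumps(sample))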
