Merged
2 changes: 1 addition & 1 deletion .github/workflows/windows.yml
@@ -622,7 +622,7 @@ jobs:
- name: 'GGUF Reader tests'
  cmd: 'tests/python_tests/test_gguf_reader.py'
  run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).GGUF.test || fromJSON(needs.smart_ci.outputs.affected_components).LLM.test }}
  timeout: 60
  timeout: 100
- name: 'Tokenizer tests'
  cmd: 'tests/python_tests/test_tokenizer.py'
  run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).tokenizers.test }}
16 changes: 15 additions & 1 deletion src/cpp/src/gguf_utils/gguf_tokenizer.cpp
@@ -122,12 +122,26 @@ bool is_special_token(int32_t token_type) {
    return token_type == 3 || token_type == 4;
}

std::string quote_meta(const std::string& str) {
    std::string result = "(";

    // TODO: also add UTF-8 validation
    for (char c : str) {
        if (!std::isalnum(c) && c != '_') {
            result += '\\';
        }
        result += c;
    }
    result += ")";
    return result;
}
Comment on lines +125 to +137
Copilot AI Sep 9, 2025

Spelling error in comment on line 128: 'utf validate' should be 'UTF validation' or 'UTF-8 validation'.

std::string join_special_tokens(const std::vector<std::string>& special_tokens) {
    std::ostringstream oss;
    for (size_t i = 0; i < special_tokens.size(); ++i) {
        if (i > 0)
            oss << "|";
        oss << special_tokens[i];
        oss << quote_meta(special_tokens[i]);
    }
    return oss.str();
}
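For reference, a minimal standalone sketch of what the escaping change produces (the token strings below are illustrative examples, not taken from any particular model): each special token is wrapped in its own capture group and every character that is not alphanumeric or an underscore is backslash-escaped, so characters such as '<', '|' and '>' in tokens like <|im_end|> are no longer interpreted as regex metacharacters when the alternation pattern is built.

#include <cctype>
#include <iostream>
#include <string>
#include <vector>

// Standalone copy of the escaping rule used by quote_meta() above, for illustration only.
static std::string escape_token(const std::string& token) {
    std::string result = "(";
    for (char c : token) {
        if (!std::isalnum(static_cast<unsigned char>(c)) && c != '_') {
            result += '\\';
        }
        result += c;
    }
    result += ")";
    return result;
}

int main() {
    const std::vector<std::string> special_tokens{"<|endoftext|>", "<|im_end|>"};
    std::string pattern;
    for (size_t i = 0; i < special_tokens.size(); ++i) {
        if (i > 0) pattern += "|";
        pattern += escape_token(special_tokens[i]);
    }
    // Prints: (\<\|endoftext\|\>)|(\<\|im_end\|\>)
    std::cout << pattern << std::endl;
    return 0;
}

Without the escaping, a token containing '|' would silently split the alternation into bogus branches instead of matching the literal token.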
16 changes: 13 additions & 3 deletions src/cpp/src/tokenizer/tokenizer_impl.cpp
@@ -4,6 +4,7 @@
#include "tokenizer/tokenizer_impl.hpp"
#include "add_second_input_pass.hpp"
#include "sampling/structured_output/structured_output_controller.hpp"
#include "openvino/genai/version.hpp"

namespace ov {
namespace genai {
@@ -273,7 +274,8 @@ void Tokenizer::TokenizerImpl::setup_tokenizer(const std::filesystem::path& mode
    std::shared_ptr<ov::Model> ov_tokenizer = nullptr;
    std::shared_ptr<ov::Model> ov_detokenizer = nullptr;
    auto [filtered_properties, enable_save_ov_model] = utils::extract_gguf_properties(properties);
    if (is_gguf_model(models_path)) {

    if (ov::genai::is_gguf_model(models_path)) {
        std::map<std::string, GGUFMetaData> tokenizer_config{};
        std::tie(ov_tokenizer, ov_detokenizer, tokenizer_config) =
            create_tokenizer_from_config(m_shared_object_ov_tokenizers, models_path);
@@ -293,6 +295,8 @@
        if (!m_chat_template.empty()) {
            m_chat_template = patch_gguf_chat_template(m_chat_template);
        }
        ov_tokenizer->set_rt_info(ov::genai::get_version().buildNumber, "openvino_genai_version");
        ov_detokenizer->set_rt_info(ov::genai::get_version().buildNumber, "openvino_genai_version");

        if (enable_save_ov_model){
            std::filesystem::path gguf_model_path(models_path);
@@ -372,8 +376,14 @@ void Tokenizer::TokenizerImpl::setup_tokenizer(const std::pair<std::shared_ptr<o
    auto core = get_core_singleton();
    std::string device = "CPU"; // only CPU is supported for now

    // Saving IR version was added only in 24.5, so if it's missing, then it's older than 24.5
    m_older_than_24_5 = !(ov_tokenizer ? ov_tokenizer : ov_detokenizer)->has_rt_info("openvino_tokenizers_version");
    // Saving the OpenVINO GenAI runtime version was added in 25.4 for GGUF models.
    // If it is present in the ov::Model, the model is newer than 24.5 and the 'openvino_tokenizers' version does not need to be checked.
    if ((ov_tokenizer ? ov_tokenizer : ov_detokenizer)->has_rt_info("openvino_genai_version")) {
        m_older_than_24_5 = false;
    } else {
        // Saving IR version was added only in 24.5, so if it's missing, then it's older than 24.5
        m_older_than_24_5 = !(ov_tokenizer ? ov_tokenizer : ov_detokenizer)->has_rt_info("openvino_tokenizers_version");
    }

    if (ov_tokenizer) {
        ov::pass::Manager manager;
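Taken together, the GGUF path now stamps the converted tokenizer and detokenizer models with the GenAI build number, and the loader checks for that tag before falling back to the openvino_tokenizers tag. A condensed sketch of the detection order (the helper name is hypothetical and only for illustration; `model` stands for whichever of the two ov::Model objects is available):

#include <memory>

#include "openvino/core/model.hpp"

// Hypothetical helper mirroring the rt_info checks above.
static bool is_older_than_24_5(const std::shared_ptr<ov::Model>& model) {
    // Models converted from GGUF by GenAI 25.4+ carry this tag, so no further checks are needed.
    if (model->has_rt_info("openvino_genai_version")) {
        return false;
    }
    // The openvino_tokenizers version tag itself was only introduced in 24.5,
    // so its absence means the IR predates 24.5.
    return !model->has_rt_info("openvino_tokenizers_version");
}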
20 changes: 17 additions & 3 deletions tests/python_tests/test_gguf_reader.py
@@ -61,15 +61,24 @@ def test_pipelines_with_gguf_generate(pipeline_type, model_ids):
@pytest.mark.parametrize("pipeline_type", get_gguf_pipeline_types())
@pytest.mark.parametrize("model_ids", get_gguf_model_list())
@pytest.mark.parametrize("enable_save_ov_model", [False, True])
@pytest.mark.parametrize("prompt", [
'Why is the Sun yellow?',
# To check that special tokens are handled correctly.
'<|endoftext|> <|im_end|>',
'<|endoftext|><|endoftext|><|im_end|>',
'<|endoftext|> Why the Sky is Blue? <|im_end|>',
])
@pytest.mark.precommit
def test_full_gguf_pipeline(pipeline_type, model_ids, enable_save_ov_model):
def test_full_gguf_pipeline(pipeline_type, model_ids, enable_save_ov_model, prompt):
    if sys.platform == 'darwin':
        pytest.skip(reason="168882: Sporadic segmentation fault failure on MacOS.")
    gguf_model_id = model_ids["gguf_model_id"]
    gguf_filename = model_ids["gguf_filename"]
    dynamic_quantization_group_size = model_ids["dynamic_quantization_group_size"]
    prompt = 'Why is the Sun yellow?'

    if gguf_model_id == "sammysun0711/tiny-random-deepseek-distill-qwen-gguf" and "<|endoftext|>" in prompt:
        pytest.skip(reason="Prompts to test special tokens for this model fail on HF side")

    opt_model = load_hf_model_from_gguf(gguf_model_id, gguf_filename)
    hf_tokenizer = load_hf_tokenizer_from_gguf(gguf_model_id, gguf_filename)
    gc.collect()
@@ -97,6 +106,11 @@ def test_full_gguf_pipeline(pipeline_type, model_ids, enable_save_ov_model):
    gguf_full_path = download_gguf_model(gguf_model_id, gguf_filename)
    ov_pipe_gguf = create_ov_pipeline(gguf_full_path, pipeline_type=pipeline_type, enable_save_ov_model=enable_save_ov_model, dynamic_quantization_group_size=dynamic_quantization_group_size)
    res_string_input_2 = ov_pipe_gguf.generate(prompt, generation_config=ov_generation_config)

    # Check that the eos_token and bos_token string representations are loaded correctly from the GGUF file.
    assert ov_pipe_gguf.get_tokenizer().get_eos_token() == hf_tokenizer.decode([ov_pipe_gguf.get_tokenizer().get_eos_token_id()])
    assert ov_pipe_gguf.get_tokenizer().get_bos_token() == hf_tokenizer.decode([ov_pipe_gguf.get_tokenizer().get_bos_token_id()])

    del ov_pipe_gguf
    gc.collect()

@@ -129,7 +143,7 @@ def test_full_gguf_qwen3_pipeline(pipeline_type, model_ids):
    # <think>\nOkay, the user is asking why the Sun is yellow. Let me start by recalling what I know about the Sun's color.
    # Prompt after applying chat template is identical between HF and GenAI, so the issue is not in the chat template.
    # TODO: Investigate output difference for GGUF models. Ticket: TBD
    res_string_input_1 = "</im_start>\nOkay, the user is asking why the Sun is yellow. Let me start by recalling what I know about the Sun's color."
    res_string_input_1 = "\nOkay, the user is asking why the Sun is yellow. Let me start by recalling what I know about the Sun's color. I remember"

    gguf_full_path = download_gguf_model(gguf_model_id, gguf_filename)
    ov_pipe_gguf = create_ov_pipeline(gguf_full_path, pipeline_type=pipeline_type)