Merged
Changes from 8 commits
1 change: 1 addition & 0 deletions samples/cpp/text_generation/benchmark_genai.cpp
@@ -55,6 +55,7 @@ int main(int argc, char* argv[]) try {

ov::genai::GenerationConfig config;
config.max_new_tokens = result["max_new_tokens"].as<size_t>();
config.apply_chat_template = false;

ov::genai::SchedulerConfig scheduler_config;
scheduler_config.enable_prefix_caching = false;
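The one-line addition above disables chat-template application so the benchmark measures the raw prompt rather than a chat-formatted one. A minimal sketch of the same setting from Python, mirroring the change the next file makes (the model path is a placeholder):

import openvino_genai as ov_genai

# Hedged sketch: disable chat-template application so timings reflect the raw prompt.
config = ov_genai.GenerationConfig()
config.max_new_tokens = 20
config.apply_chat_template = False  # same switch as the C++ line above

pipe = ov_genai.LLMPipeline("model_dir", "CPU")  # "model_dir" is a placeholder path
res = pipe.generate(["The Sky is blue because"], config)  # list input returns DecodedResults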
38 changes: 24 additions & 14 deletions samples/python/text_generation/benchmark_genai.py
@@ -1,11 +1,13 @@
# Copyright (C) 2023-2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import sys
import argparse
import sys

import openvino_genai as ov_genai
from openvino import get_version


def main():
parser = argparse.ArgumentParser(description="Help command")
parser.add_argument("-m", "--model", type=str, required=True, help="Path to model and tokenizers base directory")
@@ -15,31 +17,32 @@ def main():
parser.add_argument("-n", "--num_iter", type=int, default=2, help="Number of iterations")
parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens")
parser.add_argument("-d", "--device", type=str, default="CPU", help="Device")

args = parser.parse_args()

if args.prompt is not None and args.prompt_file is not None:
raise RuntimeError(f'Prompt and prompt file should not exist together!')
raise RuntimeError("Prompt and prompt file should not exist together!")
else:
if args.prompt_file is not None:
with open(args.prompt_file, 'r', encoding='utf-8') as f:
with open(args.prompt_file, "r", encoding="utf-8") as f:
prompt = [f.read()]
else:
prompt = ['The Sky is blue because'] if args.prompt is None else [args.prompt]
prompt = ["The Sky is blue because"] if args.prompt is None else [args.prompt]
if len(prompt) == 0:
raise RuntimeError(f'Prompt is empty!')
raise RuntimeError("Prompt is empty!")

print(f'openvino runtime version: {get_version()}, genai version: {ov_genai.__version__}')
print(f"openvino runtime version: {get_version()}, genai version: {ov_genai.__version__}")

# Perf metrics is stored in DecodedResults.
# Perf metrics is stored in DecodedResults.
# In order to get DecodedResults instead of a string input should be a list.
models_path = args.model
device = args.device
num_warmup = args.num_warmup
num_iter = args.num_iter

config = ov_genai.GenerationConfig()
config.max_new_tokens = args.max_new_tokens
config.apply_chat_template = False

if device == "NPU":
pipe = ov_genai.LLMPipeline(models_path, device)
@@ -55,21 +58,28 @@ def main():

for _ in range(num_warmup):
pipe.generate(prompt, config)

res = pipe.generate(prompt, config)
perf_metrics = res.perf_metrics
for _ in range(num_iter - 1):
res = pipe.generate(prompt, config)
perf_metrics += res.perf_metrics

print(f"Output token size: {res.perf_metrics.get_num_generated_tokens()}")
print(f"Load time: {perf_metrics.get_load_time():.2f} ms")
print(f"Generate time: {perf_metrics.get_generate_duration().mean:.2f} ± {perf_metrics.get_generate_duration().std:.2f} ms")
print(f"Tokenization time: {perf_metrics.get_tokenization_duration().mean:.2f} ± {perf_metrics.get_tokenization_duration().std:.2f} ms")
print(f"Detokenization time: {perf_metrics.get_detokenization_duration().mean:.2f} ± {perf_metrics.get_detokenization_duration().std:.2f} ms")
print(
f"Generate time: {perf_metrics.get_generate_duration().mean:.2f} ± {perf_metrics.get_generate_duration().std:.2f} ms"
)
print(
f"Tokenization time: {perf_metrics.get_tokenization_duration().mean:.2f} ± {perf_metrics.get_tokenization_duration().std:.2f} ms"
)
print(
f"Detokenization time: {perf_metrics.get_detokenization_duration().mean:.2f} ± {perf_metrics.get_detokenization_duration().std:.2f} ms"
)
print(f"TTFT: {perf_metrics.get_ttft().mean:.2f} ± {perf_metrics.get_ttft().std:.2f} ms")
print(f"TPOT: {perf_metrics.get_tpot().mean:.2f} ± {perf_metrics.get_tpot().std:.2f} ms")
print(f"Throughput : {perf_metrics.get_throughput().mean:.2f} ± {perf_metrics.get_throughput().std:.2f} tokens/s")


if __name__ == "__main__":
main()
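For reference, the aggregation pattern this file now uses: the PerfMetrics object carried in DecodedResults supports +=, and each accumulated stage exposes a mean/std pair. A condensed sketch, assuming pipe, prompt, config, and num_iter are set up as in the sample above:

res = pipe.generate(prompt, config)
perf_metrics = res.perf_metrics
for _ in range(num_iter - 1):
    res = pipe.generate(prompt, config)
    perf_metrics += res.perf_metrics  # accumulate statistics across iterations

# Each metric is a distribution over the accumulated runs.
print(f"TTFT: {perf_metrics.get_ttft().mean:.2f} ± {perf_metrics.get_ttft().std:.2f} ms")
print(f"Throughput: {perf_metrics.get_throughput().mean:.2f} ± {perf_metrics.get_throughput().std:.2f} tokens/s")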
49 changes: 18 additions & 31 deletions samples/python/text_generation/compound_grammar_generation.py
@@ -7,12 +7,13 @@
from typing import Any

from openvino_genai import (
LLMPipeline,
GenerationConfig,
StructuredOutputConfig as SOC,
LLMPipeline,
StreamingStatus,
)

from openvino_genai import (
StructuredOutputConfig as SOC,
)
from pydantic import BaseModel, Field


@@ -25,9 +26,7 @@ class booking_flight_tickets(BaseModel):
"""booking flights"""

origin_airport_code: str = Field(description="The name of Departure airport code")
destination_airport_code: str = Field(
description="The name of Destination airport code"
)
destination_airport_code: str = Field(description="The name of Destination airport code")
departure_date: str = Field(description="The date of outbound flight")
return_date: str = Field(description="The date of return flight")

@@ -74,12 +73,11 @@ def tools_to_array_schema(*tools: BaseModel) -> str:
return json.dumps(
{
"type": "array",
"items": {
"anyOf": [tool_to_dict(tool, with_description=False) for tool in tools]
},
"items": {"anyOf": [tool_to_dict(tool, with_description=False) for tool in tools]},
}
)


# modified system message from:
# https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_phi4_mini.jinja
sys_message = """You are a helpful AI assistant.
@@ -88,7 +86,7 @@ def tools_to_array_schema(*tools: BaseModel) -> str:
Use the following rule to decide when to call a function:
* if the response can be generated from your internal knowledge, do so, but use only yes or no as the response
* if you need external information that can be obtained by calling one or more of the provided functions, generate function calls

If you decide to call functions:
* prefix function calls with functools marker (no closing marker required)
* all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...]
@@ -118,16 +116,10 @@ def main():
user_text_1 = "Do dolphins have fingers?"
print("User: ", user_text_1)
chat_history.append({"role": "user", "content": user_text_1})
model_input = tokenizer.apply_chat_template(
chat_history, add_generation_prompt=True
)

# the example grammar works the same as SOC.Regex("yes|no")
# but the Union grammar is more flexible and can be extended with more options
yes_or_no = SOC.Regex("yes") | SOC.Regex(
"no"
) # SOC.Union(SOC.Regex("yes"), SOC.Regex("no"))
generation_config.structured_output_config = SOC(compound_grammar=yes_or_no)
model_input = tokenizer.apply_chat_template(chat_history, add_generation_prompt=True)
# same as SOC.Union(SOC.ConstString("yes"), SOC.ConstString("no"))
yes_or_no_grammar = SOC.ConstString("yes") | SOC.ConstString("no")
generation_config.structured_output_config = SOC(structural_tags_config=yes_or_no_grammar)
print("Assistant: ", end="")
answer = pipe.generate(model_input, generation_config, streamer=streamer)
chat_history.append({"role": "assistant", "content": answer})
@@ -139,21 +131,16 @@
)
print("User: ", user_text_2)
chat_history.append({"role": "user", "content": user_text_2})
model_input = tokenizer.apply_chat_template(
chat_history, add_generation_prompt=True
)
model_input = tokenizer.apply_chat_template(chat_history, add_generation_prompt=True)

start_tool_call_tag = SOC.Regex(r"functools")
tools_json = SOC.JSONSchema(
tools_to_array_schema(booking_flight_tickets, booking_hotels)
)
tool_call = (
start_tool_call_tag + tools_json
) # SOC.Concat(start_tool_call_tag, tools_json)
generation_config.structured_output_config.compound_grammar = tool_call
start_tool_call_tag = SOC.ConstString(r"functools")
tools_json = SOC.JSONSchema(tools_to_array_schema(booking_flight_tickets, booking_hotels))
tool_call_grammar = start_tool_call_tag + tools_json # SOC.Concat(start_tool_call_tag, tools_json)
generation_config.structured_output_config.structural_tags_config = tool_call_grammar

print("Assistant: ", end="")
pipe.generate(model_input, generation_config, streamer=streamer)
print()


if __name__ == "__main__":
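The refactor above replaces SOC.Regex with SOC.ConstString for fixed strings and assigns the grammar to structural_tags_config instead of compound_grammar. A hedged sketch of the two composition operators the sample relies on (| builds a Union, + builds a Concat); the model path and the JSON schema are placeholders:

from openvino_genai import GenerationConfig, LLMPipeline
from openvino_genai import StructuredOutputConfig as SOC

pipe = LLMPipeline("model_dir", "CPU")  # placeholder path
config = GenerationConfig()
config.max_new_tokens = 100

# Union: constrain output to exactly "yes" or "no".
# Equivalent to SOC.Union(SOC.ConstString("yes"), SOC.ConstString("no")).
config.structured_output_config = SOC(structural_tags_config=SOC.ConstString("yes") | SOC.ConstString("no"))
answer = pipe.generate("Do dolphins have fingers?", config)

# Concat: a fixed prefix followed by schema-constrained JSON.
# Equivalent to SOC.Concat(SOC.ConstString("functools"), SOC.JSONSchema(...)).
tool_call = SOC.ConstString("functools") + SOC.JSONSchema('{"type": "array"}')  # placeholder schema
config.structured_output_config = SOC(structural_tags_config=tool_call)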
45 changes: 20 additions & 25 deletions samples/python/text_generation/structural_tags_generation.py
@@ -3,20 +3,20 @@
# SPDX-License-Identifier: Apache-2.0

import argparse
import re
import json
import re
from datetime import datetime
from pprint import pprint
from typing import ClassVar

from openvino_genai import (
LLMPipeline,
GenerationConfig,
StructuredOutputConfig,
StructuralTagsConfig,
StructuralTagItem,
LLMPipeline,
StreamingStatus,
)
from typing import ClassVar
from openvino_genai import (
StructuredOutputConfig as SOC,
)
from pydantic import BaseModel, Field


@@ -35,9 +35,7 @@ class WeatherRequest(ToolRequest):

city: str = Field(description="City name")
country: str = Field(description="Country name")
date: str = Field(
pattern=r"2\d\d\d-[0-1]\d-[0-3]\d", description="Date in YYYY-MM-DD format"
)
date: str = Field(pattern=r"2\d\d\d-[0-1]\d-[0-3]\d", description="Date in YYYY-MM-DD format")


class CurrencyExchangeRequest(ToolRequest):
@@ -59,8 +57,8 @@ class CurrencyExchangeRequest(ToolRequest):
"You can use the following tools:\n"
f"{new_line.join([tool.string_representation() for tool in tools.values()])}\n"
"Please, only use the following format for tool calling in your responses:\n"
"<function=\"function_name\">"
"{\"argument1\": \"value1\", ...}"
'<function="function_name">'
'{"argument1": "value1", ...}'
"</function>\n"
"Use the tool name and arguments as defined in the tool schema.\n"
"If you don't know the answer, just say that you don't know, but try to call the tool if it helps to answer the question.\n"
@@ -77,10 +75,7 @@ def parse_tools_from_response(response: str) -> list[ToolRequest]:
<function="function_name">{"argument1": "value1", ...}</function>
"""
matches = re.finditer(function_pattern, response)
return [
tools.get(match.group(1)).model_validate_json(match.group(2))
for match in matches
]
return [tools.get(match.group(1)).model_validate_json(match.group(2)) for match in matches]


def streamer(subword):
@@ -89,7 +84,9 @@ def streamer(subword):


def main():
default_prompt = "What is the weather in London today and in Paris yesterday, and how many pounds can I get for 100 euros?"
default_prompt = (
"What is the weather in London today and in Paris yesterday, and how many pounds can I get for 100 euros?"
)

description = (
"This script demonstrates how to use OpenVINO GenAI with structured tags to generate responses "
@@ -116,26 +113,24 @@

for use_structural_tags in [False, True]:
print("=" * 80)
print(
f"{'Using structural tags' if use_structural_tags else 'Using no structural tags':^80}"
)
print(f"{'Using structural tags' if use_structural_tags else 'Using no structural tags':^80}")
print("=" * 80)
config = GenerationConfig()
config.max_new_tokens = 300

pipe.start_chat(sys_message)
if use_structural_tags:
config.structured_output_config = StructuredOutputConfig(
structural_tags_config=StructuralTagsConfig(
structural_tags=[
StructuralTagItem(
config.structured_output_config = SOC(
structural_tags_config=SOC.TriggeredTags(
triggers=["<function="],
tags=[
SOC.Tag(
begin=f'<function="{name}">',
schema=json.dumps(tool.model_json_schema()),
content=SOC.JSONSchema(json.dumps(tool.model_json_schema())),
end="</function>",
)
for name, tool in tools.items()
],
triggers=["<function="],
)
)
config.do_sample = True
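This file migrates from StructuralTagsConfig/StructuralTagItem to SOC.TriggeredTags/SOC.Tag, where a tag's body is itself a grammar (SOC.JSONSchema here) rather than a raw schema string. A hedged sketch of the new shape, assuming the API matches the usage in the diff; the tool name, schema, and model path are placeholders:

import json
from openvino_genai import GenerationConfig, LLMPipeline
from openvino_genai import StructuredOutputConfig as SOC

# Placeholder schema standing in for a pydantic model_json_schema() dump.
weather_schema = json.dumps({"type": "object", "properties": {"city": {"type": "string"}}})

config = GenerationConfig()
config.max_new_tokens = 300
config.structured_output_config = SOC(
    structural_tags_config=SOC.TriggeredTags(
        triggers=["<function="],  # free text until a trigger appears, then constrained decoding
        tags=[
            SOC.Tag(
                begin='<function="get_weather">',  # hypothetical tool name
                content=SOC.JSONSchema(weather_schema),
                end="</function>",
            )
        ],
    )
)

pipe = LLMPipeline("model_dir", "CPU")  # placeholder path
print(pipe.generate("What is the weather in London today?", config))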
34 changes: 21 additions & 13 deletions samples/python/text_generation/structured_output_generation.py
@@ -4,8 +4,9 @@

import argparse
import json
from openvino_genai import LLMPipeline, GenerationConfig, StructuredOutputConfig
from typing import Literal

from openvino_genai import GenerationConfig, LLMPipeline, StructuredOutputConfig
from pydantic import BaseModel, Field


@@ -14,7 +15,7 @@ class Person(BaseModel):
surname: str = Field(pattern=r"^[A-Z][a-z]{1,20}$")
age: int
city: Literal["Dublin", "Dubai", "Munich"]


class Car(BaseModel):
model: str = Field(pattern=r"^[A-Z][a-z]{1,20} ?[A-Z][a-z]{0,20} ?.?$")
@@ -40,37 +41,42 @@ class ItemQuantities(BaseModel):
"You generate JSON objects based on the user's request. You can generate JSON objects with different types of objects: person, car, transaction. "
"If the user requested a different type, the JSON fields should remain zero. "
"Please note that the words 'individual', 'person', 'people', 'man', 'human', 'woman', 'inhabitant', 'citizen' are synonyms and can be used interchangeably. "
"E.g. if the user wants 5 houses, then the JSON must be {\"person\": 0, \"car\": 0, \"transaction\": 0}. "
"If the user wants 3 people and 1 house, then the JSON must be {\"person\": 3, \"car\": 0, \"transaction\": 0}. "
'E.g. if the user wants 5 houses, then the JSON must be {"person": 0, "car": 0, "transaction": 0}. '
'If the user wants 3 people and 1 house, then the JSON must be {"person": 3, "car": 0, "transaction": 0}. '
"Make sure that the JSON contains the numbers that the user requested. If the user asks for specific attributes, like 'surname', 'model', etc., "
"ignore this information and generate JSON objects with the same fields as in the schema. "
"Please use double quotes for JSON keys and values. "
)

sys_message_for_items = "Please try to avoid generating the same JSON objects multiple times."


def main():
parser = argparse.ArgumentParser()
parser.add_argument('model_dir', help="Path to the model directory. It should contain the OpenVINO model files.")
parser.add_argument("model_dir", help="Path to the model directory. It should contain the OpenVINO model files.")
args = parser.parse_args()

device = 'CPU' # GPU can be used as well
device = "CPU" # GPU can be used as well
pipe = LLMPipeline(args.model_dir, device)

config = GenerationConfig()
config.max_new_tokens = 300

print("This is a smart assistant that generates structured output in JSON format. "
"You can ask to generate information about a person, car, or bank transaction. "
'For example, you can ask: "Please generate jsons for 3 persons and 1 transaction."')
print(
"This is a smart assistant that generates structured output in JSON format. "
"You can ask to generate information about a person, car, or bank transaction. "
'For example, you can ask: "Please generate jsons for 3 persons and 1 transaction."'
)

while True:
try:
prompt = input('> ')
prompt = input("> ")
except EOFError:
break
pipe.start_chat(sys_message)
config.structured_output_config = StructuredOutputConfig(json_schema = json.dumps(ItemQuantities.model_json_schema()))
config.structured_output_config = StructuredOutputConfig(
json_schema=json.dumps(ItemQuantities.model_json_schema())
)
config.do_sample = False
res = json.loads(pipe.generate(prompt, config))
pipe.finish_chat()
@@ -82,7 +88,9 @@ def main():
pipe.start_chat(sys_message_for_items)
generate_has_run = False
for item, quantity in res.items():
config.structured_output_config = StructuredOutputConfig(json_schema = json.dumps(items_map[item].model_json_schema()))
config.structured_output_config = StructuredOutputConfig(
json_schema=json.dumps(items_map[item].model_json_schema())
)
for _ in range(quantity):
generate_has_run = True
json_strs = pipe.generate(prompt, config)
Expand All @@ -92,5 +100,5 @@ def main():
print("No items generated. Please try again with a different request.")


if '__main__' == __name__:
if "__main__" == __name__:
main()
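The pattern at the heart of this sample: serialize a pydantic model's JSON schema into StructuredOutputConfig so generate() can only emit JSON that parses against it. A minimal self-contained sketch (model path and prompt are placeholders):

import json
from typing import Literal

from openvino_genai import GenerationConfig, LLMPipeline, StructuredOutputConfig
from pydantic import BaseModel, Field

class Person(BaseModel):
    surname: str = Field(pattern=r"^[A-Z][a-z]{1,20}$")
    age: int
    city: Literal["Dublin", "Dubai", "Munich"]

pipe = LLMPipeline("model_dir", "CPU")  # placeholder path
config = GenerationConfig()
config.max_new_tokens = 100
config.structured_output_config = StructuredOutputConfig(
    json_schema=json.dumps(Person.model_json_schema())
)

# The output is guaranteed to be valid JSON matching the Person schema.
person = json.loads(pipe.generate("Generate a person", config))
print(person)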