Merged
Changes from 8 commits
1 change: 1 addition & 0 deletions samples/cpp/text_generation/benchmark_genai.cpp
@@ -55,6 +55,7 @@ int main(int argc, char* argv[]) try {

ov::genai::GenerationConfig config;
config.max_new_tokens = result["max_new_tokens"].as<size_t>();
config.apply_chat_template = false;

ov::genai::SchedulerConfig scheduler_config;
scheduler_config.enable_prefix_caching = false;
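The one-line addition above disables chat-template application so the benchmark measures the raw prompt rather than a chat-formatted one. A minimal sketch of the same setting from Python, mirroring the change the next file makes (the model path is a placeholder):

import openvino_genai as ov_genai

# Hedged sketch: disable chat-template application so timings reflect the raw prompt.
config = ov_genai.GenerationConfig()
config.max_new_tokens = 20
config.apply_chat_template = False  # same switch as the C++ line above

pipe = ov_genai.LLMPipeline("model_dir", "CPU")  # "model_dir" is a placeholder path
res = pipe.generate(["The Sky is blue because"], config)  # list input returns DecodedResults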
38 changes: 24 additions & 14 deletions samples/python/text_generation/benchmark_genai.py
@@ -1,11 +1,13 @@
# Copyright (C) 2023-2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import sys
import argparse
import sys

import openvino_genai as ov_genai
from openvino import get_version


def main():
parser = argparse.ArgumentParser(description="Help command")
parser.add_argument("-m", "--model", type=str, required=True, help="Path to model and tokenizers base directory")
@@ -15,31 +17,32 @@ def main():
parser.add_argument("-n", "--num_iter", type=int, default=2, help="Number of iterations")
parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens")
parser.add_argument("-d", "--device", type=str, default="CPU", help="Device")

args = parser.parse_args()

if args.prompt is not None and args.prompt_file is not None:
raise RuntimeError(f'Prompt and prompt file should not exist together!')
raise RuntimeError("Prompt and prompt file should not exist together!")
else:
if args.prompt_file is not None:
with open(args.prompt_file, 'r', encoding='utf-8') as f:
with open(args.prompt_file, "r", encoding="utf-8") as f:
prompt = [f.read()]
else:
prompt = ['The Sky is blue because'] if args.prompt is None else [args.prompt]
prompt = ["The Sky is blue because"] if args.prompt is None else [args.prompt]
if len(prompt) == 0:
raise RuntimeError(f'Prompt is empty!')
raise RuntimeError("Prompt is empty!")

print(f'openvino runtime version: {get_version()}, genai version: {ov_genai.__version__}')
print(f"openvino runtime version: {get_version()}, genai version: {ov_genai.__version__}")

# Perf metrics is stored in DecodedResults.
# Perf metrics is stored in DecodedResults.
# In order to get DecodedResults instead of a string input should be a list.
models_path = args.model
device = args.device
num_warmup = args.num_warmup
num_iter = args.num_iter

config = ov_genai.GenerationConfig()
config.max_new_tokens = args.max_new_tokens
config.apply_chat_template = False

if device == "NPU":
pipe = ov_genai.LLMPipeline(models_path, device)
@@ -55,21 +58,28 @@ def main():

for _ in range(num_warmup):
pipe.generate(prompt, config)

res = pipe.generate(prompt, config)
perf_metrics = res.perf_metrics
for _ in range(num_iter - 1):
res = pipe.generate(prompt, config)
perf_metrics += res.perf_metrics

print(f"Output token size: {res.perf_metrics.get_num_generated_tokens()}")
print(f"Load time: {perf_metrics.get_load_time():.2f} ms")
print(f"Generate time: {perf_metrics.get_generate_duration().mean:.2f} ± {perf_metrics.get_generate_duration().std:.2f} ms")
print(f"Tokenization time: {perf_metrics.get_tokenization_duration().mean:.2f} ± {perf_metrics.get_tokenization_duration().std:.2f} ms")
print(f"Detokenization time: {perf_metrics.get_detokenization_duration().mean:.2f} ± {perf_metrics.get_detokenization_duration().std:.2f} ms")
print(
f"Generate time: {perf_metrics.get_generate_duration().mean:.2f} ± {perf_metrics.get_generate_duration().std:.2f} ms"
)
print(
f"Tokenization time: {perf_metrics.get_tokenization_duration().mean:.2f} ± {perf_metrics.get_tokenization_duration().std:.2f} ms"
)
print(
f"Detokenization time: {perf_metrics.get_detokenization_duration().mean:.2f} ± {perf_metrics.get_detokenization_duration().std:.2f} ms"
)
print(f"TTFT: {perf_metrics.get_ttft().mean:.2f} ± {perf_metrics.get_ttft().std:.2f} ms")
print(f"TPOT: {perf_metrics.get_tpot().mean:.2f} ± {perf_metrics.get_tpot().std:.2f} ms")
print(f"Throughput : {perf_metrics.get_throughput().mean:.2f} ± {perf_metrics.get_throughput().std:.2f} tokens/s")


if __name__ == "__main__":
main()
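For reference, the aggregation pattern this file now uses: the PerfMetrics object carried in DecodedResults supports +=, and each accumulated stage exposes a mean/std pair. A condensed sketch, assuming pipe, prompt, config, and num_iter are set up as in the sample above:

res = pipe.generate(prompt, config)
perf_metrics = res.perf_metrics
for _ in range(num_iter - 1):
    res = pipe.generate(prompt, config)
    perf_metrics += res.perf_metrics  # accumulate statistics across iterations

# Each metric is a distribution over the accumulated runs.
print(f"TTFT: {perf_metrics.get_ttft().mean:.2f} ± {perf_metrics.get_ttft().std:.2f} ms")
print(f"Throughput: {perf_metrics.get_throughput().mean:.2f} ± {perf_metrics.get_throughput().std:.2f} tokens/s")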
49 changes: 18 additions & 31 deletions samples/python/text_generation/compound_grammar_generation.py
@@ -7,12 +7,13 @@
from typing import Any

from openvino_genai import (
LLMPipeline,
GenerationConfig,
StructuredOutputConfig as SOC,
LLMPipeline,
StreamingStatus,
)

from openvino_genai import (
StructuredOutputConfig as SOC,
)
from pydantic import BaseModel, Field


@@ -25,9 +26,7 @@ class booking_flight_tickets(BaseModel):
"""booking flights"""

origin_airport_code: str = Field(description="The name of Departure airport code")
destination_airport_code: str = Field(
description="The name of Destination airport code"
)
destination_airport_code: str = Field(description="The name of Destination airport code")
departure_date: str = Field(description="The date of outbound flight")
return_date: str = Field(description="The date of return flight")

@@ -74,12 +73,11 @@ def tools_to_array_schema(*tools: BaseModel) -> str:
return json.dumps(
{
"type": "array",
"items": {
"anyOf": [tool_to_dict(tool, with_description=False) for tool in tools]
},
"items": {"anyOf": [tool_to_dict(tool, with_description=False) for tool in tools]},
}
)


# modified system message from:
# https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_phi4_mini.jinja
sys_message = """You are a helpful AI assistant.
@@ -88,7 +86,7 @@ def tools_to_array_schema(*tools: BaseModel) -> str:
Use the following rule to decide when to call a function:
* if the response can be generated from your internal knowledge, do so, but use only yes or no as the response
* if you need external information that can be obtained by calling one or more of the provided functions, generate function calls

If you decide to call functions:
* prefix function calls with functools marker (no closing marker required)
* all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...]
@@ -118,16 +116,10 @@ def main():
user_text_1 = "Do dolphins have fingers?"
print("User: ", user_text_1)
chat_history.append({"role": "user", "content": user_text_1})
model_input = tokenizer.apply_chat_template(
chat_history, add_generation_prompt=True
)

# the example grammar works the same as SOC.Regex("yes|no")
# but the Union grammar is more flexible and can be extended with more options
yes_or_no = SOC.Regex("yes") | SOC.Regex(
"no"
) # SOC.Union(SOC.Regex("yes"), SOC.Regex("no"))
generation_config.structured_output_config = SOC(compound_grammar=yes_or_no)
model_input = tokenizer.apply_chat_template(chat_history, add_generation_prompt=True)
# same as SOC.Union(SOC.ConstString("yes"), SOC.ConstString("no"))
yes_or_no_grammar = SOC.ConstString("yes") | SOC.ConstString("no")
generation_config.structured_output_config = SOC(structural_tags_config=yes_or_no_grammar)
print("Assistant: ", end="")
answer = pipe.generate(model_input, generation_config, streamer=streamer)
chat_history.append({"role": "assistant", "content": answer})
@@ -139,21 +131,16 @@
)
print("User: ", user_text_2)
chat_history.append({"role": "user", "content": user_text_2})
model_input = tokenizer.apply_chat_template(
chat_history, add_generation_prompt=True
)
model_input = tokenizer.apply_chat_template(chat_history, add_generation_prompt=True)

start_tool_call_tag = SOC.Regex(r"functools")
tools_json = SOC.JSONSchema(
tools_to_array_schema(booking_flight_tickets, booking_hotels)
)
tool_call = (
start_tool_call_tag + tools_json
) # SOC.Concat(start_tool_call_tag, tools_json)
generation_config.structured_output_config.compound_grammar = tool_call
start_tool_call_tag = SOC.ConstString(r"functools")
tools_json = SOC.JSONSchema(tools_to_array_schema(booking_flight_tickets, booking_hotels))
tool_call_grammar = start_tool_call_tag + tools_json # SOC.Concat(start_tool_call_tag, tools_json)
generation_config.structured_output_config.structural_tags_config = tool_call_grammar

print("Assistant: ", end="")
pipe.generate(model_input, generation_config, streamer=streamer)
print()


if __name__ == "__main__":
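The refactor above replaces SOC.Regex with SOC.ConstString for fixed strings and assigns the grammar to structural_tags_config instead of compound_grammar. A hedged sketch of the two composition operators the sample relies on (| builds a Union, + builds a Concat); the model path and the JSON schema are placeholders:

from openvino_genai import GenerationConfig, LLMPipeline
from openvino_genai import StructuredOutputConfig as SOC

pipe = LLMPipeline("model_dir", "CPU")  # placeholder path
config = GenerationConfig()
config.max_new_tokens = 100

# Union: constrain output to exactly "yes" or "no".
# Equivalent to SOC.Union(SOC.ConstString("yes"), SOC.ConstString("no")).
config.structured_output_config = SOC(structural_tags_config=SOC.ConstString("yes") | SOC.ConstString("no"))
answer = pipe.generate("Do dolphins have fingers?", config)

# Concat: a fixed prefix followed by schema-constrained JSON.
# Equivalent to SOC.Concat(SOC.ConstString("functools"), SOC.JSONSchema(...)).
tool_call = SOC.ConstString("functools") + SOC.JSONSchema('{"type": "array"}')  # placeholder schema
config.structured_output_config = SOC(structural_tags_config=tool_call)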
45 changes: 20 additions & 25 deletions samples/python/text_generation/structural_tags_generation.py
@@ -3,20 +3,20 @@
# SPDX-License-Identifier: Apache-2.0

import argparse
import re
import json
import re
from datetime import datetime
from pprint import pprint
from typing import ClassVar

from openvino_genai import (
LLMPipeline,
GenerationConfig,
StructuredOutputConfig,
StructuralTagsConfig,
StructuralTagItem,
LLMPipeline,
StreamingStatus,
)
from typing import ClassVar
from openvino_genai import (
StructuredOutputConfig as SOC,
)
from pydantic import BaseModel, Field


@@ -35,9 +35,7 @@ class WeatherRequest(ToolRequest):

city: str = Field(description="City name")
country: str = Field(description="Country name")
date: str = Field(
pattern=r"2\d\d\d-[0-1]\d-[0-3]\d", description="Date in YYYY-MM-DD format"
)
date: str = Field(pattern=r"2\d\d\d-[0-1]\d-[0-3]\d", description="Date in YYYY-MM-DD format")


class CurrencyExchangeRequest(ToolRequest):
@@ -59,8 +57,8 @@ class CurrencyExchangeRequest(ToolRequest):
"You can use the following tools:\n"
f"{new_line.join([tool.string_representation() for tool in tools.values()])}\n"
"Please, only use the following format for tool calling in your responses:\n"
"<function=\"function_name\">"
"{\"argument1\": \"value1\", ...}"
'<function="function_name">'
'{"argument1": "value1", ...}'
"</function>\n"
"Use the tool name and arguments as defined in the tool schema.\n"
"If you don't know the answer, just say that you don't know, but try to call the tool if it helps to answer the question.\n"
@@ -77,10 +75,7 @@ def parse_tools_from_response(response: str) -> list[ToolRequest]:
<function="function_name">{"argument1": "value1", ...}</function>
"""
matches = re.finditer(function_pattern, response)
return [
tools.get(match.group(1)).model_validate_json(match.group(2))
for match in matches
]
return [tools.get(match.group(1)).model_validate_json(match.group(2)) for match in matches]


def streamer(subword):
@@ -89,7 +84,9 @@ def streamer(subword):


def main():
default_prompt = "What is the weather in London today and in Paris yesterday, and how many pounds can I get for 100 euros?"
default_prompt = (
"What is the weather in London today and in Paris yesterday, and how many pounds can I get for 100 euros?"
)

description = (
"This script demonstrates how to use OpenVINO GenAI with structured tags to generate responses "
@@ -116,26 +113,24 @@

for use_structural_tags in [False, True]:
print("=" * 80)
print(
f"{'Using structural tags' if use_structural_tags else 'Using no structural tags':^80}"
)
print(f"{'Using structural tags' if use_structural_tags else 'Using no structural tags':^80}")
print("=" * 80)
config = GenerationConfig()
config.max_new_tokens = 300

pipe.start_chat(sys_message)
if use_structural_tags:
config.structured_output_config = StructuredOutputConfig(
structural_tags_config=StructuralTagsConfig(
structural_tags=[
StructuralTagItem(
config.structured_output_config = SOC(
structural_tags_config=SOC.TriggeredTags(
triggers=["<function="],
tags=[
SOC.Tag(
begin=f'<function="{name}">',
schema=json.dumps(tool.model_json_schema()),
content=SOC.JSONSchema(json.dumps(tool.model_json_schema())),
end="</function>",
)
for name, tool in tools.items()
],
triggers=["<function="],
)
)
config.do_sample = True
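This file migrates from StructuralTagsConfig/StructuralTagItem to SOC.TriggeredTags/SOC.Tag, where a tag's body is itself a grammar (SOC.JSONSchema here) rather than a raw schema string. A hedged sketch of the new shape, assuming the API matches the usage in the diff; the tool name, schema, and model path are placeholders:

import json
from openvino_genai import GenerationConfig, LLMPipeline
from openvino_genai import StructuredOutputConfig as SOC

# Placeholder schema standing in for a pydantic model_json_schema() dump.
weather_schema = json.dumps({"type": "object", "properties": {"city": {"type": "string"}}})

config = GenerationConfig()
config.max_new_tokens = 300
config.structured_output_config = SOC(
    structural_tags_config=SOC.TriggeredTags(
        triggers=["<function="],  # free text until a trigger appears, then constrained decoding
        tags=[
            SOC.Tag(
                begin='<function="get_weather">',  # hypothetical tool name
                content=SOC.JSONSchema(weather_schema),
                end="</function>",
            )
        ],
    )
)

pipe = LLMPipeline("model_dir", "CPU")  # placeholder path
print(pipe.generate("What is the weather in London today?", config))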
34 changes: 21 additions & 13 deletions samples/python/text_generation/structured_output_generation.py
@@ -4,8 +4,9 @@

import argparse
import json
from openvino_genai import LLMPipeline, GenerationConfig, StructuredOutputConfig
from typing import Literal

from openvino_genai import GenerationConfig, LLMPipeline, StructuredOutputConfig
from pydantic import BaseModel, Field


@@ -14,7 +15,7 @@ class Person(BaseModel):
surname: str = Field(pattern=r"^[A-Z][a-z]{1,20}$")
age: int
city: Literal["Dublin", "Dubai", "Munich"]


class Car(BaseModel):
model: str = Field(pattern=r"^[A-Z][a-z]{1,20} ?[A-Z][a-z]{0,20} ?.?$")
@@ -40,37 +41,42 @@ class ItemQuantities(BaseModel):
"You generate JSON objects based on the user's request. You can generate JSON objects with different types of objects: person, car, transaction. "
"If the user requested a different type, the JSON fields should remain zero. "
"Please note that the words 'individual', 'person', 'people', 'man', 'human', 'woman', 'inhabitant', 'citizen' are synonyms and can be used interchangeably. "
"E.g. if the user wants 5 houses, then the JSON must be {\"person\": 0, \"car\": 0, \"transaction\": 0}. "
"If the user wants 3 people and 1 house, then the JSON must be {\"person\": 3, \"car\": 0, \"transaction\": 0}. "
'E.g. if the user wants 5 houses, then the JSON must be {"person": 0, "car": 0, "transaction": 0}. '
'If the user wants 3 people and 1 house, then the JSON must be {"person": 3, "car": 0, "transaction": 0}. '
"Make sure that the JSON contains the numbers that the user requested. If the user asks for specific attributes, like 'surname', 'model', etc., "
"ignore this information and generate JSON objects with the same fields as in the schema. "
"Please use double quotes for JSON keys and values. "
)

sys_message_for_items = "Please try to avoid generating the same JSON objects multiple times."


def main():
parser = argparse.ArgumentParser()
parser.add_argument('model_dir', help="Path to the model directory. It should contain the OpenVINO model files.")
parser.add_argument("model_dir", help="Path to the model directory. It should contain the OpenVINO model files.")
args = parser.parse_args()

device = 'CPU' # GPU can be used as well
device = "CPU" # GPU can be used as well
pipe = LLMPipeline(args.model_dir, device)

config = GenerationConfig()
config.max_new_tokens = 300

print("This is a smart assistant that generates structured output in JSON format. "
"You can ask to generate information about a person, car, or bank transaction. "
'For example, you can ask: "Please generate jsons for 3 persons and 1 transaction."')
print(
"This is a smart assistant that generates structured output in JSON format. "
"You can ask to generate information about a person, car, or bank transaction. "
'For example, you can ask: "Please generate jsons for 3 persons and 1 transaction."'
)

while True:
try:
prompt = input('> ')
prompt = input("> ")
except EOFError:
break
pipe.start_chat(sys_message)
config.structured_output_config = StructuredOutputConfig(json_schema = json.dumps(ItemQuantities.model_json_schema()))
config.structured_output_config = StructuredOutputConfig(
json_schema=json.dumps(ItemQuantities.model_json_schema())
)
config.do_sample = False
res = json.loads(pipe.generate(prompt, config))
pipe.finish_chat()
@@ -82,7 +88,9 @@ def main():
pipe.start_chat(sys_message_for_items)
generate_has_run = False
for item, quantity in res.items():
config.structured_output_config = StructuredOutputConfig(json_schema = json.dumps(items_map[item].model_json_schema()))
config.structured_output_config = StructuredOutputConfig(
json_schema=json.dumps(items_map[item].model_json_schema())
)
for _ in range(quantity):
generate_has_run = True
json_strs = pipe.generate(prompt, config)
Expand All @@ -92,5 +100,5 @@ def main():
print("No items generated. Please try again with a different request.")


if '__main__' == __name__:
if "__main__" == __name__:
main()
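The pattern at the heart of this sample: serialize a pydantic model's JSON schema into StructuredOutputConfig so generate() can only emit JSON that parses against it. A minimal self-contained sketch (model path and prompt are placeholders):

import json
from typing import Literal

from openvino_genai import GenerationConfig, LLMPipeline, StructuredOutputConfig
from pydantic import BaseModel, Field

class Person(BaseModel):
    surname: str = Field(pattern=r"^[A-Z][a-z]{1,20}$")
    age: int
    city: Literal["Dublin", "Dubai", "Munich"]

pipe = LLMPipeline("model_dir", "CPU")  # placeholder path
config = GenerationConfig()
config.max_new_tokens = 100
config.structured_output_config = StructuredOutputConfig(
    json_schema=json.dumps(Person.model_json_schema())
)

# The output is guaranteed to be valid JSON matching the Person schema.
person = json.loads(pipe.generate("Generate a person", config))
print(person)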