
Commit aeb968b

feature: enable tool_call and reasoning_content parsing for qwen3 (#3615)

* feature: enable tool_call and reasoning_content parsing for qwen3 model (both stream & non-stream)
* fix: support multiple tool calls in a single response
* refactor: parse reasoning content in the reasoning parser
* fix: model best match for qwen3
* fix: compatibility with py3.9
* fix: revert to resolve conflict with another PR
* fix: yield the whole tool call at once
* fix: update qwen match rule for qwen3 compatibility
* fix: non-stream reasoning parser when the model has no think output
* fix: yield an empty delta message for p-d disagg compatibility
1 parent 8f4ad3d commit aeb968b
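
In practice (a hypothetical client sketch, not part of the commit): once the api_server is launched with the relevant parsers (the diff below references --tool-call-parser; a matching reasoning-parser option is assumed), streamed deltas carry reasoning_content alongside content. The base_url, api_key, and model name below are placeholders.

# Hypothetical client-side sketch of the new behavior; base_url, api_key and
# model name are placeholders, not values from this commit.
from openai import OpenAI

client = OpenAI(base_url='http://localhost:23333/v1', api_key='dummy')

stream = client.chat.completions.create(
    model='qwen3',  # placeholder model name
    messages=[{'role': 'user', 'content': 'What is 2 + 2?'}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta
    # reasoning_content carries the <think>...</think> text; content the answer.
    reasoning = getattr(delta, 'reasoning_content', None)
    if reasoning:
        print(reasoning, end='')
    if delta.content:
        print(delta.content, end='')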

File tree

8 files changed: +687 −21 lines

lmdeploy/model.py

Lines changed: 4 additions & 4 deletions
@@ -965,8 +965,7 @@ def match(cls, model_path: str) -> Optional[str]:
             model_path (str): the model path used for matching.
         """
         model_path = model_path.lower()
-        if ('qwen' in model_path and 'qwen2.5' not in model_path and 'qwen3' not in model_path
-                and 'qwq' not in model_path):
+        if 'qwen' in model_path and not any(keyword in model_path for keyword in ('qwen2.5', 'qwq', 'qwen3')):
             return 'qwen'
         if 'minicpm-v-2_6' in model_path:
             return 'minicpmv-2d6'
@@ -1976,7 +1975,8 @@ def best_match_model(query: str) -> Optional[str]:
         str: the possible model name.
     """
     for name, model in MODELS.module_dict.items():
-        if model.match(query):
-            return model.match(query)
+        matched_name = model.match(query)  # cache the result to avoid matching twice
+        if matched_name:
+            return matched_name
     logger.warning(f'Did not find a chat template matching {query}.')
     return 'base'
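
To make the new matching rule concrete, here is a standalone sketch (not part of the commit) that mirrors the updated condition; the example model paths are illustrative:

# Mirrors the updated match rule from lmdeploy/model.py above; example paths
# are illustrative, not taken from the commit.
def matches_plain_qwen(model_path: str) -> bool:
    model_path = model_path.lower()
    return 'qwen' in model_path and not any(keyword in model_path
                                            for keyword in ('qwen2.5', 'qwq', 'qwen3'))

assert matches_plain_qwen('Qwen/Qwen-7B-Chat')   # older Qwen still matches 'qwen'
assert not matches_plain_qwen('Qwen/Qwen3-8B')   # now excluded, handled by its own template
assert not matches_plain_qwen('Qwen/QwQ-32B')    # likewise excluded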

lmdeploy/serve/openai/api_server.py

Lines changed: 10 additions & 12 deletions
@@ -458,6 +458,7 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
         previous_token_ids = []
         current_token_ids = []
         delta_token_ids = []
+        has_parser = VariableInterface.tool_parser is not None or VariableInterface.reasoning_parser is not None
         streaming_tools = False
         async for res in result_generator:
             logprobs, usage = None, None
@@ -472,16 +473,17 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
                 total_tokens=total_tokens,
             )
             delta_message = DeltaMessage(role='assistant', content=res.response)
-            if request.tool_choice != 'none' and VariableInterface.tool_parser is not None:
-                if res.finish_reason == 'stop' and streaming_tools is True:
-                    res.finish_reason = 'tool_calls'
+            if has_parser:
                 current_text = current_text + res.response
                 delta_token_ids = res.token_ids if res.token_ids is not None else []
                 current_token_ids = current_token_ids + delta_token_ids
+            if request.tool_choice != 'none' and VariableInterface.tool_parser is not None:
+                if res.finish_reason == 'stop' and streaming_tools is True:
+                    res.finish_reason = 'tool_calls'
                 tool_delta = VariableInterface.tool_parser.extract_tool_calls_streaming(
                     previous_text=previous_text,
                     current_text=current_text,
-                    delta_text=res.response,
+                    delta_text=delta_message.content,
                     previous_token_ids=previous_token_ids,
                     current_token_ids=current_token_ids,
                     delta_token_ids=delta_token_ids,
@@ -491,24 +493,20 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
                     delta_message.content = tool_delta.content
                     if isinstance(tool_delta.tool_calls, List) and len(tool_delta.tool_calls):
                         streaming_tools = True
-                previous_text = current_text
-                previous_token_ids = current_token_ids
             elif request.tool_choice != 'none' and request.tools is not None and VariableInterface.tool_parser is None:
-                logger.error('Please lanuch the api_server with --tool-call-parser if you want to use tool.')
+                logger.error('Please launch the api_server with --tool-call-parser if you want to use tool.')
             if VariableInterface.reasoning_parser is not None:
-                current_text = current_text + res.response
-                delta_token_ids = res.token_ids if res.token_ids is not None else []
-                current_token_ids = current_token_ids + delta_token_ids
                 reasoning_delta = VariableInterface.reasoning_parser.extract_reasoning_content_streaming(
                     previous_text=previous_text,
                     current_text=current_text,
-                    delta_text=res.response,
+                    delta_text=delta_message.content or '',
                     previous_token_ids=previous_token_ids,
                     current_token_ids=current_token_ids,
                     delta_token_ids=delta_token_ids)
                 if reasoning_delta is not None:
                     delta_message.reasoning_content = reasoning_delta.reasoning_content
                     delta_message.content = reasoning_delta.content
+            if has_parser:
                 previous_text = current_text
                 previous_token_ids = current_token_ids
             response_json = create_stream_response_json(index=0,
@@ -561,7 +559,7 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
             logger.error(f'Failed to parse {text}. Exception: {e}.')
             return create_error_response(HTTPStatus.BAD_REQUEST, 'Failed to parse fc related info to json format!')
     elif request.tool_choice != 'none' and request.tools is not None and VariableInterface.tool_parser is None:
-        logger.error('Please lanuch the api_server with --tool-call-parser if you want to use tool.')
+        logger.error('Please launch the api_server with --tool-call-parser if you want to use tool.')

    if VariableInterface.reasoning_parser is not None:
        reasoning_content, text = VariableInterface.reasoning_parser.extract_reasoning_content(text, request)
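
The reshuffle above concentrates the stream bookkeeping behind a single has_parser flag. A condensed sketch (hypothetical helper, not the server's actual code) of the invariant it enforces:

# Sketch of the bookkeeping invariant introduced above: when either parser is
# active, current_text/current_token_ids grow exactly once per chunk, and the
# previous_* window advances only after BOTH parsers have seen the chunk, so
# each parser gets a consistent previous/current pair.
def track_chunk(res, state, tool_parser=None, reasoning_parser=None):
    has_parser = tool_parser is not None or reasoning_parser is not None
    if has_parser:
        state['current_text'] += res.response
        state['current_token_ids'] += res.token_ids or []
    if tool_parser is not None:
        pass  # tool_parser.extract_tool_calls_streaming(previous_text=..., current_text=..., ...)
    if reasoning_parser is not None:
        pass  # reasoning_parser.extract_reasoning_content_streaming(..., delta_text=res.response or '')
    if has_parser:
        # Advance the window only after both parsers have run for this chunk.
        state['previous_text'] = state['current_text']
        state['previous_token_ids'] = list(state['current_token_ids'])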
lmdeploy/serve/openai/reasoning_parser/qwen_qwq_reasoning_parser.py

Lines changed: 125 additions & 3 deletions
@@ -1,12 +1,134 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
-from .reasoning_parser import ReasoningParserManager
+import re
+from typing import Optional, Sequence, Tuple, Union
+
+from lmdeploy.serve.openai.protocol import ChatCompletionRequest, DeltaMessage
+
+from .reasoning_parser import ReasoningParser, ReasoningParserManager


 @ReasoningParserManager.register_module(name='qwen-qwq')
-class QwenQwQReasoningParser(DeepSeekR1ReasoningParser):
+class QwenQwQReasoningParser(ReasoningParser):
     """Reasoning parser for Qwen QwQ model.

     The Qwen QwQ model uses <think>...</think> tokens to denote reasoning text. This parser extracts the reasoning
     content from the model output.
     """
+
+    def __init__(self, tokenizer: object):
+        super().__init__(tokenizer)
+        self.think_start_token = '<think>'
+        self.think_end_token = '</think>'
+
+        self.reasoning_regex = re.compile(rf'{self.think_start_token}(.*?){self.think_end_token}', re.DOTALL)
+
+        if not self.model_tokenizer:
+            raise ValueError('The model tokenizer must be passed to the ReasoningParser '
+                             'constructor during construction.')
+
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        **kwargs,
+    ) -> Union[DeltaMessage, None]:
+        """Instance method that should be implemented for extracting reasoning
+        from an incomplete response; for use when handling reasoning calls and
+        streaming.
+
+        Has to be an instance method because it requires state - the current tokens/diffs, but also the information
+        about what has previously been parsed and extracted (see constructor)
+        """
+        # Skip single special tokens
+        if delta_text == self.think_end_token or delta_text == self.think_start_token:
+            return DeltaMessage(content='')
+
+        # Check if <think> is present in previous or delta.
+        # Keep compatibility with models that don't generate <think> tokens.
+        if self.think_start_token in previous_text:
+            if self.think_end_token in delta_text:
+                # <think> in previous, </think> in delta,
+                # extract reasoning content
+                end_index = delta_text.find(self.think_end_token)
+                reasoning_content = delta_text[:end_index]
+                content = delta_text[end_index + len(self.think_end_token):]
+                return DeltaMessage(reasoning_content=reasoning_content, content=content if content else None)
+            elif self.think_end_token in previous_text:
+                # <think> in previous, </think> in previous,
+                return DeltaMessage(content=delta_text)
+            else:
+                # <think> in previous, no </think> in previous or delta,
+                # reasoning content continues
+                return DeltaMessage(reasoning_content=delta_text)
+        elif self.think_start_token in delta_text:
+            if self.think_end_token in delta_text:
+                # <think> in delta, </think> in delta, extract reasoning content
+                start_index = delta_text.find(self.think_start_token)
+                end_index = delta_text.find(self.think_end_token)
+                reasoning_content = delta_text[start_index + len(self.think_start_token):end_index]
+                content = delta_text[end_index + len(self.think_end_token):]
+                return DeltaMessage(reasoning_content=reasoning_content, content=content if content else None)
+            else:
+                # <think> in delta, no </think> in delta,
+                # reasoning content continues
+                return DeltaMessage(reasoning_content=delta_text)
+        else:
+            # No <think> in previous or delta, also need to check for </think>.
+            # Because the model may have generated </think> without <think>
+            # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
+            if self.think_end_token in delta_text:
+                # </think> in delta with more tokens,
+                # extract reasoning content and content
+                end_index = delta_text.find(self.think_end_token)
+                reasoning_content = delta_text[:end_index]
+                content = delta_text[end_index + len(self.think_end_token):]
+                return DeltaMessage(reasoning_content=reasoning_content, content=content if content else None)
+            elif self.think_end_token in previous_text:
+                # </think> in previous, thinking content ends
+                return DeltaMessage(content=delta_text)
+            else:
+                # no </think> in previous or delta, reasoning content continues
+                return DeltaMessage(reasoning_content=delta_text)
+
+    def extract_reasoning_content(self, model_output: str, request: ChatCompletionRequest,
+                                  **kwargs) -> Tuple[Optional[str], Optional[str]]:
+        """Extract reasoning content from a complete model-generated string.
+
+        Used for non-streaming responses where we have the entire model response
+        available before sending to the client.
+
+        Args:
+            model_output (str): The model-generated string to extract reasoning content from.
+            request (ChatCompletionRequest): The request object that was used to generate the model_output.
+
+        Returns:
+            reasoning_content (str | None): The reasoning content.
+            final_output (str | None): The content.
+        """
+        # DeepSeek R1 doesn't generate <think> now.
+        # Thus we assume the reasoning content is always at the start.
+        # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
+        if self.think_end_token not in model_output:
+            # for the qwen3 model, the reasoning content is wrapped in <think> </think> xml tags
+            return None, model_output
+        # Add a start token if it's missing to keep compatibility.
+        if self.think_start_token not in model_output:
+            model_output = f'{self.think_start_token}{model_output}'
+        # Use a regex to find the reasoning content
+        reasoning_content = self.reasoning_regex.findall(model_output)[0]
+
+        end_index = len(f'{self.think_start_token}{reasoning_content}{self.think_end_token}')
+        final_output = model_output[end_index:]
+        if reasoning_content.startswith('\n'):
+            reasoning_content = reasoning_content[1:]
+        if reasoning_content.endswith('\n'):
+            reasoning_content = reasoning_content[:-1]
+
+        if len(final_output) == 0:
+            return reasoning_content, None
+
+        return reasoning_content, final_output
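
As a usage illustration (not from the commit), the non-streaming extractor behaves as follows; `my_tokenizer` and `req` are assumed stand-ins for a real tokenizer and ChatCompletionRequest:

# Hypothetical usage; my_tokenizer and req are stand-ins, since the parser only
# checks that a tokenizer was supplied and passes the request through.
parser = QwenQwQReasoningParser(tokenizer=my_tokenizer)

reasoning, answer = parser.extract_reasoning_content(
    '<think>\nThe sum is 4.\n</think>\n\nThe answer is 4.', request=req)
# reasoning == 'The sum is 4.'       (one leading/trailing newline stripped)
# answer    == '\n\nThe answer is 4.'

# Output with no </think> at all (e.g. a qwen3 reply with thinking disabled)
# passes through untouched:
reasoning, answer = parser.extract_reasoning_content('The answer is 4.', request=req)
# reasoning is None, answer == 'The answer is 4.'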

lmdeploy/serve/openai/tool_parser/__init__.py

Lines changed: 9 additions & 1 deletion
@@ -2,6 +2,14 @@
 from .internlm2_parser import Internlm2ToolParser
 from .llama3_parser import Llama3JsonToolParser
 from .qwen2d5_parser import Qwen2d5ToolParser
+from .qwen3_parser import Qwen3ToolParser
 from .tool_parser import ToolParser, ToolParserManager

-__all__ = ['Internlm2ToolParser', 'Qwen2d5ToolParser', 'ToolParser', 'ToolParserManager', 'Llama3JsonToolParser']
+__all__ = [
+    'Internlm2ToolParser',
+    'Qwen2d5ToolParser',
+    'Qwen3ToolParser',
+    'ToolParser',
+    'ToolParserManager',
+    'Llama3JsonToolParser',
+]

lmdeploy/serve/openai/tool_parser/qwen2d5_parser.py

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@
 logger = get_logger('lmdeploy')


-@ToolParserManager.register_module(['qwen'])
+@ToolParserManager.register_module(['qwen2d5'])
 class Qwen2d5ToolParser(ToolParser):

     def __init__(self, tokenizer: object):
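
One consequence of the rename, sketched below under the assumption that ToolParserManager exposes a module_dict keyed by registered names, by analogy with the MODELS registry used in lmdeploy/model.py above (an assumption, not shown in this diff):

# Assumes ToolParserManager keeps a module_dict of registered parsers, by
# analogy with the MODELS registry in lmdeploy/model.py; not shown in this diff.
from lmdeploy.serve.openai.tool_parser import ToolParserManager

# After this commit the Qwen2.5 parser is registered as 'qwen2d5' rather than
# 'qwen', so e.g. `--tool-call-parser qwen2d5` selects Qwen2d5ToolParser and
# the plain 'qwen3' name is left free for the new Qwen3ToolParser.
parser_cls = ToolParserManager.module_dict['qwen2d5']
print(parser_cls.__name__)  # expected: Qwen2d5ToolParser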
