
Commit aeb968b

feature: enable tool_call and reasoning_content parsing for qwen3 (#3615)

* feature: enable tool_call and reasoning_content parsing for qwen3 model (both stream & non-stream)
* fix: support multiple tool calls in a single response
* refactor: parse reasoning content in the reasoning parser
* fix: model best match for qwen3
* fix: compatibility with py3.9
* fix: revert to resolve conflict with another PR
* fix: yield the whole tool call at once
* fix: update qwen match rule for qwen3 compatibility
* fix: non-stream reasoning parser when the model has no think output
* fix: yield an empty delta message for p-d disagg compatibility
1 parent 8f4ad3d commit aeb968b
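
In practice (a hypothetical client sketch, not part of the commit): once the api_server is launched with the relevant parsers (the diff below references --tool-call-parser; a matching reasoning-parser option is assumed), streamed deltas carry reasoning_content alongside content. The base_url, api_key, and model name below are placeholders.

# Hypothetical client-side sketch of the new behavior; base_url, api_key and
# model name are placeholders, not values from this commit.
from openai import OpenAI

client = OpenAI(base_url='http://localhost:23333/v1', api_key='dummy')

stream = client.chat.completions.create(
    model='qwen3',  # placeholder model name
    messages=[{'role': 'user', 'content': 'What is 2 + 2?'}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta
    # reasoning_content carries the <think>...</think> text; content the answer.
    reasoning = getattr(delta, 'reasoning_content', None)
    if reasoning:
        print(reasoning, end='')
    if delta.content:
        print(delta.content, end='')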

File tree

8 files changed: +687 −21 lines

lmdeploy/model.py

Lines changed: 4 additions & 4 deletions
@@ -965,8 +965,7 @@ def match(cls, model_path: str) -> Optional[str]:
             model_path (str): the model path used for matching.
         """
         model_path = model_path.lower()
-        if ('qwen' in model_path and 'qwen2.5' not in model_path and 'qwen3' not in model_path
-                and 'qwq' not in model_path):
+        if 'qwen' in model_path and not any(keyword in model_path for keyword in ('qwen2.5', 'qwq', 'qwen3')):
             return 'qwen'
         if 'minicpm-v-2_6' in model_path:
             return 'minicpmv-2d6'
@@ -1976,7 +1975,8 @@ def best_match_model(query: str) -> Optional[str]:
         str: the possible model name.
     """
     for name, model in MODELS.module_dict.items():
-        if model.match(query):
-            return model.match(query)
+        matched_name = model.match(query)  # cache the result to avoid matching twice
+        if matched_name:
+            return matched_name
     logger.warning(f'Did not find a chat template matching {query}.')
     return 'base'
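
To make the new matching rule concrete, here is a standalone sketch (not part of the commit) that mirrors the updated condition; the example model paths are illustrative:

# Mirrors the updated match rule from lmdeploy/model.py above; example paths
# are illustrative, not taken from the commit.
def matches_plain_qwen(model_path: str) -> bool:
    model_path = model_path.lower()
    return 'qwen' in model_path and not any(keyword in model_path
                                            for keyword in ('qwen2.5', 'qwq', 'qwen3'))

assert matches_plain_qwen('Qwen/Qwen-7B-Chat')   # older Qwen still matches 'qwen'
assert not matches_plain_qwen('Qwen/Qwen3-8B')   # now excluded, handled by its own template
assert not matches_plain_qwen('Qwen/QwQ-32B')    # likewise excluded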

lmdeploy/serve/openai/api_server.py

Lines changed: 10 additions & 12 deletions
@@ -458,6 +458,7 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
         previous_token_ids = []
         current_token_ids = []
         delta_token_ids = []
+        has_parser = VariableInterface.tool_parser is not None or VariableInterface.reasoning_parser is not None
         streaming_tools = False
         async for res in result_generator:
             logprobs, usage = None, None
@@ -472,16 +473,17 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
                 total_tokens=total_tokens,
             )
             delta_message = DeltaMessage(role='assistant', content=res.response)
-            if request.tool_choice != 'none' and VariableInterface.tool_parser is not None:
-                if res.finish_reason == 'stop' and streaming_tools is True:
-                    res.finish_reason = 'tool_calls'
+            if has_parser:
                 current_text = current_text + res.response
                 delta_token_ids = res.token_ids if res.token_ids is not None else []
                 current_token_ids = current_token_ids + delta_token_ids
+            if request.tool_choice != 'none' and VariableInterface.tool_parser is not None:
+                if res.finish_reason == 'stop' and streaming_tools is True:
+                    res.finish_reason = 'tool_calls'
                 tool_delta = VariableInterface.tool_parser.extract_tool_calls_streaming(
                     previous_text=previous_text,
                     current_text=current_text,
-                    delta_text=res.response,
+                    delta_text=delta_message.content,
                     previous_token_ids=previous_token_ids,
                     current_token_ids=current_token_ids,
                     delta_token_ids=delta_token_ids,
@@ -491,24 +493,20 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
                     delta_message.content = tool_delta.content
                     if isinstance(tool_delta.tool_calls, List) and len(tool_delta.tool_calls):
                         streaming_tools = True
-                previous_text = current_text
-                previous_token_ids = current_token_ids
             elif request.tool_choice != 'none' and request.tools is not None and VariableInterface.tool_parser is None:
-                logger.error('Please lanuch the api_server with --tool-call-parser if you want to use tool.')
+                logger.error('Please launch the api_server with --tool-call-parser if you want to use tool.')
             if VariableInterface.reasoning_parser is not None:
-                current_text = current_text + res.response
-                delta_token_ids = res.token_ids if res.token_ids is not None else []
-                current_token_ids = current_token_ids + delta_token_ids
                 reasoning_delta = VariableInterface.reasoning_parser.extract_reasoning_content_streaming(
                     previous_text=previous_text,
                     current_text=current_text,
-                    delta_text=res.response,
+                    delta_text=delta_message.content or '',
                     previous_token_ids=previous_token_ids,
                     current_token_ids=current_token_ids,
                     delta_token_ids=delta_token_ids)
                 if reasoning_delta is not None:
                     delta_message.reasoning_content = reasoning_delta.reasoning_content
                     delta_message.content = reasoning_delta.content
+            if has_parser:
                 previous_text = current_text
                 previous_token_ids = current_token_ids
             response_json = create_stream_response_json(index=0,
@@ -561,7 +559,7 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
             logger.error(f'Failed to parse {text}. Exception: {e}.')
             return create_error_response(HTTPStatus.BAD_REQUEST, 'Failed to parse fc related info to json format!')
     elif request.tool_choice != 'none' and request.tools is not None and VariableInterface.tool_parser is None:
-        logger.error('Please lanuch the api_server with --tool-call-parser if you want to use tool.')
+        logger.error('Please launch the api_server with --tool-call-parser if you want to use tool.')

    if VariableInterface.reasoning_parser is not None:
        reasoning_content, text = VariableInterface.reasoning_parser.extract_reasoning_content(text, request)
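
The reshuffle above concentrates the stream bookkeeping behind a single has_parser flag. A condensed sketch (hypothetical helper, not the server's actual code) of the invariant it enforces:

# Sketch of the bookkeeping invariant introduced above: when either parser is
# active, current_text/current_token_ids grow exactly once per chunk, and the
# previous_* window advances only after BOTH parsers have seen the chunk, so
# each parser gets a consistent previous/current pair.
def track_chunk(res, state, tool_parser=None, reasoning_parser=None):
    has_parser = tool_parser is not None or reasoning_parser is not None
    if has_parser:
        state['current_text'] += res.response
        state['current_token_ids'] += res.token_ids or []
    if tool_parser is not None:
        pass  # tool_parser.extract_tool_calls_streaming(previous_text=..., current_text=..., ...)
    if reasoning_parser is not None:
        pass  # reasoning_parser.extract_reasoning_content_streaming(..., delta_text=res.response or '')
    if has_parser:
        # Advance the window only after both parsers have run for this chunk.
        state['previous_text'] = state['current_text']
        state['previous_token_ids'] = list(state['current_token_ids'])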
lmdeploy/serve/openai/reasoning_parser/qwen_qwq_reasoning_parser.py

Lines changed: 125 additions & 3 deletions
@@ -1,12 +1,134 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
-from .reasoning_parser import ReasoningParserManager
+import re
+from typing import Optional, Sequence, Tuple, Union
+
+from lmdeploy.serve.openai.protocol import ChatCompletionRequest, DeltaMessage
+
+from .reasoning_parser import ReasoningParser, ReasoningParserManager


 @ReasoningParserManager.register_module(name='qwen-qwq')
-class QwenQwQReasoningParser(DeepSeekR1ReasoningParser):
+class QwenQwQReasoningParser(ReasoningParser):
     """Reasoning parser for Qwen QwQ model.

     The Qwen QwQ model uses <think>...</think> tokens to denote reasoning text. This parser extracts the reasoning
     content from the model output.
     """
+
+    def __init__(self, tokenizer: object):
+        super().__init__(tokenizer)
+        self.think_start_token = '<think>'
+        self.think_end_token = '</think>'
+
+        self.reasoning_regex = re.compile(rf'{self.think_start_token}(.*?){self.think_end_token}', re.DOTALL)
+
+        if not self.model_tokenizer:
+            raise ValueError('The model tokenizer must be passed to the ReasoningParser '
+                             'constructor during construction.')
+
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        **kwargs,
+    ) -> Union[DeltaMessage, None]:
+        """Instance method that should be implemented for extracting reasoning
+        from an incomplete response; for use when handling reasoning calls and
+        streaming.
+
+        Has to be an instance method because it requires state - the current tokens/diffs, but also the information
+        about what has previously been parsed and extracted (see constructor)
+        """
+        # Skip single special tokens
+        if delta_text == self.think_end_token or delta_text == self.think_start_token:
+            return DeltaMessage(content='')
+
+        # Check if <think> is present in previous or delta.
+        # Keep compatibility with models that don't generate <think> tokens.
+        if self.think_start_token in previous_text:
+            if self.think_end_token in delta_text:
+                # <think> in previous, </think> in delta,
+                # extract reasoning content
+                end_index = delta_text.find(self.think_end_token)
+                reasoning_content = delta_text[:end_index]
+                content = delta_text[end_index + len(self.think_end_token):]
+                return DeltaMessage(reasoning_content=reasoning_content, content=content if content else None)
+            elif self.think_end_token in previous_text:
+                # <think> in previous, </think> in previous,
+                return DeltaMessage(content=delta_text)
+            else:
+                # <think> in previous, no </think> in previous or delta,
+                # reasoning content continues
+                return DeltaMessage(reasoning_content=delta_text)
+        elif self.think_start_token in delta_text:
+            if self.think_end_token in delta_text:
+                # <think> in delta, </think> in delta, extract reasoning content
+                start_index = delta_text.find(self.think_start_token)
+                end_index = delta_text.find(self.think_end_token)
+                reasoning_content = delta_text[start_index + len(self.think_start_token):end_index]
+                content = delta_text[end_index + len(self.think_end_token):]
+                return DeltaMessage(reasoning_content=reasoning_content, content=content if content else None)
+            else:
+                # <think> in delta, no </think> in delta,
+                # reasoning content continues
+                return DeltaMessage(reasoning_content=delta_text)
+        else:
+            # No <think> in previous or delta, also need to check for </think>.
+            # Because the model may have generated </think> without <think>
+            # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
+            if self.think_end_token in delta_text:
+                # </think> in delta with more tokens,
+                # extract reasoning content and content
+                end_index = delta_text.find(self.think_end_token)
+                reasoning_content = delta_text[:end_index]
+                content = delta_text[end_index + len(self.think_end_token):]
+                return DeltaMessage(reasoning_content=reasoning_content, content=content if content else None)
+            elif self.think_end_token in previous_text:
+                # </think> in previous, thinking content ends
+                return DeltaMessage(content=delta_text)
+            else:
+                # no </think> in previous or delta, reasoning content continues
+                return DeltaMessage(reasoning_content=delta_text)
+
+    def extract_reasoning_content(self, model_output: str, request: ChatCompletionRequest,
+                                  **kwargs) -> Tuple[Optional[str], Optional[str]]:
+        """Extract reasoning content from a complete model-generated string.
+
+        Used for non-streaming responses where we have the entire model response
+        available before sending to the client.
+
+        Args:
+            model_output (str): The model-generated string to extract reasoning content from.
+            request (ChatCompletionRequest): The request object that was used to generate the model_output.
+
+        Returns:
+            reasoning_content (str | None): The reasoning content.
+            final_output (str | None): The content.
+        """
+        # DeepSeek R1 doesn't generate <think> now.
+        # Thus we assume the reasoning content is always at the start.
+        # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
+        if self.think_end_token not in model_output:
+            # for the qwen3 model, the reasoning content is wrapped in <think> </think> xml tags
+            return None, model_output
+        # Add a start token if it's missing to keep compatibility.
+        if self.think_start_token not in model_output:
+            model_output = f'{self.think_start_token}{model_output}'
+        # Use a regex to find the reasoning content
+        reasoning_content = self.reasoning_regex.findall(model_output)[0]
+
+        end_index = len(f'{self.think_start_token}{reasoning_content}{self.think_end_token}')
+        final_output = model_output[end_index:]
+        if reasoning_content.startswith('\n'):
+            reasoning_content = reasoning_content[1:]
+        if reasoning_content.endswith('\n'):
+            reasoning_content = reasoning_content[:-1]
+
+        if len(final_output) == 0:
+            return reasoning_content, None
+
+        return reasoning_content, final_output
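
As a usage illustration (not from the commit), the non-streaming extractor behaves as follows; `my_tokenizer` and `req` are assumed stand-ins for a real tokenizer and ChatCompletionRequest:

# Hypothetical usage; my_tokenizer and req are stand-ins, since the parser only
# checks that a tokenizer was supplied and passes the request through.
parser = QwenQwQReasoningParser(tokenizer=my_tokenizer)

reasoning, answer = parser.extract_reasoning_content(
    '<think>\nThe sum is 4.\n</think>\n\nThe answer is 4.', request=req)
# reasoning == 'The sum is 4.'       (one leading/trailing newline stripped)
# answer    == '\n\nThe answer is 4.'

# Output with no </think> at all (e.g. a qwen3 reply with thinking disabled)
# passes through untouched:
reasoning, answer = parser.extract_reasoning_content('The answer is 4.', request=req)
# reasoning is None, answer == 'The answer is 4.'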

lmdeploy/serve/openai/tool_parser/__init__.py

Lines changed: 9 additions & 1 deletion
@@ -2,6 +2,14 @@
 from .internlm2_parser import Internlm2ToolParser
 from .llama3_parser import Llama3JsonToolParser
 from .qwen2d5_parser import Qwen2d5ToolParser
+from .qwen3_parser import Qwen3ToolParser
 from .tool_parser import ToolParser, ToolParserManager

-__all__ = ['Internlm2ToolParser', 'Qwen2d5ToolParser', 'ToolParser', 'ToolParserManager', 'Llama3JsonToolParser']
+__all__ = [
+    'Internlm2ToolParser',
+    'Qwen2d5ToolParser',
+    'Qwen3ToolParser',
+    'ToolParser',
+    'ToolParserManager',
+    'Llama3JsonToolParser',
+]

lmdeploy/serve/openai/tool_parser/qwen2d5_parser.py

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@
 logger = get_logger('lmdeploy')


-@ToolParserManager.register_module(['qwen'])
+@ToolParserManager.register_module(['qwen2d5'])
 class Qwen2d5ToolParser(ToolParser):

     def __init__(self, tokenizer: object):
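
One consequence of the rename, sketched below under the assumption that ToolParserManager exposes a module_dict keyed by registered names, by analogy with the MODELS registry used in lmdeploy/model.py above (an assumption, not shown in this diff):

# Assumes ToolParserManager keeps a module_dict of registered parsers, by
# analogy with the MODELS registry in lmdeploy/model.py; not shown in this diff.
from lmdeploy.serve.openai.tool_parser import ToolParserManager

# After this commit the Qwen2.5 parser is registered as 'qwen2d5' rather than
# 'qwen', so e.g. `--tool-call-parser qwen2d5` selects Qwen2d5ToolParser and
# the plain 'qwen3' name is left free for the new Qwen3ToolParser.
parser_cls = ToolParserManager.module_dict['qwen2d5']
print(parser_cls.__name__)  # expected: Qwen2d5ToolParser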
