
Commit e7cbc54

support gpt-oss function/reasoning in /v1/chat/completions (#3962)
* support gpt-oss final output
* support reasoning_effort
* output reasoning content
* fix reasoning_effort
* update
* fix ut
* support gpt-oss function/reasoning in /v1/chat/completions
* fix lint
* skip process prompt tokens
* remove commentary channel when no tools are provided
* update
* reduce warning
1 parent a25498b commit e7cbc54
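
For context, a minimal client-side sketch of what this commit enables on /v1/chat/completions. Everything below is illustrative: the server address, model name, and the get_weather tool are assumptions, not part of the commit.

from openai import OpenAI

client = OpenAI(base_url='http://localhost:23333/v1', api_key='none')  # assumed api_server address

tools = [{
    'type': 'function',
    'function': {
        'name': 'get_weather',  # hypothetical tool
        'description': 'Get the current weather for a city.',
        'parameters': {
            'type': 'object',
            'properties': {'city': {'type': 'string'}},
            'required': ['city'],
        },
    },
}]

response = client.chat.completions.create(
    model='gpt-oss-20b',  # assumed model name
    messages=[{'role': 'user', 'content': "What's the weather in Paris?"}],
    tools=tools,
    reasoning_effort='low',
)
message = response.choices[0].message
print(message.reasoning_content)  # parsed from the harmony 'analysis' channel
print(message.tool_calls)         # parsed from the 'commentary' channel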

File tree (4 files changed: +193 −72 lines)

lmdeploy/model.py
lmdeploy/serve/openai/api_server.py
lmdeploy/serve/openai/harmony_utils.py
lmdeploy/tokenizer.py

lmdeploy/model.py

Lines changed: 8 additions & 1 deletion
@@ -737,7 +737,7 @@ class HFChatTemplate(BaseChatTemplate):
 
     def __init__(self, model_path: str = '', **kwargs):
         try:
-            from transformers import AutoTokenizer
+            from transformers import AutoTokenizer, PretrainedConfig
             self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
             self.system_start, self.system_end = self._role_instruction('system')
             self.user_start, self.user_end = self._role_instruction('user')
@@ -747,6 +747,10 @@ def __init__(self, model_path: str = '', **kwargs):
                 self.stop_words.append(self.tokenizer.eos_token)
             if hasattr(self.tokenizer, 'eot_token') and self.tokenizer.eot_token is not None:
                 self.stop_words.append(self.tokenizer.eot_token)
+            cfg = PretrainedConfig.from_pretrained(model_path, trust_remote_code=True)
+            self.is_gpt_oss = getattr(cfg, 'architectures', [''])[0] == 'GptOssForCausalLM'
+            if self.is_gpt_oss:
+                self.stop_words.append('<|call|>')
         except Exception as e:
             raise ValueError(f'Try apply_chat_template failed: {e}')
 
@@ -787,6 +791,9 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs):
 
         if messages[-1]['role'] == 'assistant' and len(self.assistant_end) > 0:
             prompt = prompt[:-len(self.assistant_end)]  # prefix of response to let the model complete the response
+        if self.is_gpt_oss and not kwargs.get('tools'):
+            # for the gpt-oss model, removing this seems more conducive to instruction following
+            prompt = prompt.replace('commentary, ', '', 1)
         return prompt
 
     def _role_instruction(self, role):
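
The 'commentary, ' string stripped above comes from the channel list that the gpt-oss (harmony) chat template writes into the system header. A hedged sketch of the effect, assuming the header text of the harmony format (abbreviated here):

header = '# Valid channels: analysis, commentary, final.<|end|>'
print(header.replace('commentary, ', '', 1))
# -> '# Valid channels: analysis, final.<|end|>'

When the request carries no tools, the commentary channel has no consumer, so dropping it from the advertised list appears to help the model stick to instructions.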

lmdeploy/serve/openai/api_server.py

Lines changed: 92 additions & 66 deletions
@@ -30,6 +30,7 @@
                                                DistServeDropConnectionRequest, DistServeInitRequest,
                                                MigrationRequest)
 from lmdeploy.serve.async_engine import AsyncEngine
+from lmdeploy.serve.openai.harmony_utils import GptOssChatParser
 from lmdeploy.serve.openai.protocol import ChatCompletionResponse  # noqa: E501
 from lmdeploy.serve.openai.protocol import (ChatCompletionRequest, ChatCompletionResponseChoice,
                                             ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse,
@@ -372,6 +373,9 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
     adapter_name = model_name  # got an adapter name
     request_id = str(request.session_id)
     created_time = int(time.time())
+    gpt_oss_parser = None
+    if VariableInterface.async_engine.arch == 'GptOssForCausalLM':
+        gpt_oss_parser = GptOssChatParser()
 
     if isinstance(request.stop, str):
         request.stop = [request.stop]
@@ -423,12 +427,21 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
         gen_config.skip_special_tokens = False
         # internlm2 only uses contents inside function regardless of 'type'
         if not isinstance(request.tool_choice, str):
-            tools = [
-                item.function.model_dump() for item in request.tools
-                if item.function.name == request.tool_choice.function.name
-            ]
+            if gpt_oss_parser:
+                tools = [
+                    item.model_dump() for item in request.tools
+                    if item.function.name == request.tool_choice.function.name
+                ]
+            else:
+                tools = [
+                    item.function.model_dump() for item in request.tools
+                    if item.function.name == request.tool_choice.function.name
+                ]
         else:
-            tools = [item.function.model_dump() for item in request.tools]
+            if gpt_oss_parser:
+                tools = [item.model_dump() for item in request.tools]
+            else:
+                tools = [item.function.model_dump() for item in request.tools]
     # text completion for string input
     do_preprocess = False if isinstance(request.messages, str) else request.do_preprocess
     result_generator = VariableInterface.async_engine.generate(
@@ -486,46 +499,53 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
                     completion_tokens=res.generate_token_len,
                     total_tokens=total_tokens,
                 )
+
             delta_token_ids = res.token_ids if res.token_ids is not None else []
-            delta_message = DeltaMessage(role='assistant', content=res.response)
+            if gpt_oss_parser:
+                delta_message = gpt_oss_parser.parse_streaming(res.token_ids)
+                if res.finish_reason == 'stop' and len(delta_message.tool_calls) > 0:
+                    res.finish_reason = 'tool_calls'
+            else:
+                delta_message = DeltaMessage(role='assistant', content=res.response)
+                if has_parser:
+                    current_text = current_text + res.response
+                    current_token_ids = current_token_ids + delta_token_ids
+                if request.tool_choice != 'none' and VariableInterface.tool_parser is not None:
+                    if res.finish_reason == 'stop' and streaming_tools is True:
+                        res.finish_reason = 'tool_calls'
+                    tool_delta = VariableInterface.tool_parser.extract_tool_calls_streaming(
+                        previous_text=previous_text,
+                        current_text=current_text,
+                        delta_text=delta_message.content,
+                        previous_token_ids=previous_token_ids,
+                        current_token_ids=current_token_ids,
+                        delta_token_ids=delta_token_ids,
+                        request=request)
+                    if tool_delta is not None:
+                        delta_message.tool_calls = tool_delta.tool_calls
+                        delta_message.content = tool_delta.content
+                        if isinstance(tool_delta.tool_calls, List) and len(tool_delta.tool_calls):
+                            streaming_tools = True
+                elif (request.tool_choice != 'none' and request.tools is not None
+                      and VariableInterface.tool_parser is None):
+                    logger.error('Please launch the api_server with --tool-call-parser if you want to use tool.')
+
+                if VariableInterface.reasoning_parser is not None and request.enable_thinking is not False:
+                    reasoning_delta = VariableInterface.reasoning_parser.extract_reasoning_content_streaming(
+                        previous_text=previous_text,
+                        current_text=current_text,
+                        delta_text=delta_message.content or '',
+                        previous_token_ids=previous_token_ids,
+                        current_token_ids=current_token_ids,
+                        delta_token_ids=delta_token_ids)
+                    if reasoning_delta is not None:
+                        delta_message.reasoning_content = reasoning_delta.reasoning_content
+                        delta_message.content = reasoning_delta.content
+                if has_parser:
+                    previous_text = current_text
+                    previous_token_ids = current_token_ids
             if request.return_token_ids:
                 delta_message.gen_tokens = delta_token_ids
-            if has_parser:
-                current_text = current_text + res.response
-                current_token_ids = current_token_ids + delta_token_ids
-            if request.tool_choice != 'none' and VariableInterface.tool_parser is not None:
-                if res.finish_reason == 'stop' and streaming_tools is True:
-                    res.finish_reason = 'tool_calls'
-                tool_delta = VariableInterface.tool_parser.extract_tool_calls_streaming(
-                    previous_text=previous_text,
-                    current_text=current_text,
-                    delta_text=delta_message.content,
-                    previous_token_ids=previous_token_ids,
-                    current_token_ids=current_token_ids,
-                    delta_token_ids=delta_token_ids,
-                    request=request)
-                if tool_delta is not None:
-                    delta_message.tool_calls = tool_delta.tool_calls
-                    delta_message.content = tool_delta.content
-                    if isinstance(tool_delta.tool_calls, List) and len(tool_delta.tool_calls):
-                        streaming_tools = True
-            elif request.tool_choice != 'none' and request.tools is not None and VariableInterface.tool_parser is None:
-                logger.error('Please launch the api_server with --tool-call-parser if you want to use tool.')
-
-            if VariableInterface.reasoning_parser is not None and request.enable_thinking is not False:
-                reasoning_delta = VariableInterface.reasoning_parser.extract_reasoning_content_streaming(
-                    previous_text=previous_text,
-                    current_text=current_text,
-                    delta_text=delta_message.content or '',
-                    previous_token_ids=previous_token_ids,
-                    current_token_ids=current_token_ids,
-                    delta_token_ids=delta_token_ids)
-                if reasoning_delta is not None:
-                    delta_message.reasoning_content = reasoning_delta.reasoning_content
-                    delta_message.content = reasoning_delta.content
-            if has_parser:
-                previous_text = current_text
-                previous_token_ids = current_token_ids
             response_json = create_stream_response_json(index=0,
                                                         delta_message=delta_message,
                                                         finish_reason=res.finish_reason,
@@ -562,24 +582,34 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
                 cache_block_ids.append(res.cache_block_ids)
                 remote_token_ids.append(res.token_ids)
 
-        tool_calls = None
-        reasoning_content = None
-        if request.tool_choice != 'none' and VariableInterface.tool_parser is not None:
-            try:  # TODO add json_schema guidance to turbomind
-                tool_call_info = VariableInterface.tool_parser.extract_tool_calls(text, request=request)
-                text, tool_calls = tool_call_info.content, tool_call_info.tool_calls
-                if isinstance(tool_calls, List) and len(tool_calls):
-                    if final_res.finish_reason == 'stop':
-                        final_res.finish_reason = 'tool_calls'
-
-            except Exception as e:
-                logger.error(f'Failed to parse {text}. Exception: {e}.')
-                return create_error_response(HTTPStatus.BAD_REQUEST, 'Failed to parse fc related info to json format!')
-        elif request.tool_choice != 'none' and request.tools is not None and VariableInterface.tool_parser is None:
-            logger.error('Please launch the api_server with --tool-call-parser if you want to use tool.')
-
-        if VariableInterface.reasoning_parser is not None and request.enable_thinking is not False:
-            reasoning_content, text = VariableInterface.reasoning_parser.extract_reasoning_content(text, request)
+        if gpt_oss_parser:
+            message = gpt_oss_parser.parse_full(final_token_ids)
+            if final_res.finish_reason == 'stop' and len(message.tool_calls) > 0:
+                final_res.finish_reason = 'tool_calls'
+        else:
+            tool_calls = None
+            reasoning_content = None
+            if request.tool_choice != 'none' and VariableInterface.tool_parser is not None:
+                try:  # TODO add json_schema guidance to turbomind
+                    tool_call_info = VariableInterface.tool_parser.extract_tool_calls(text, request=request)
+                    text, tool_calls = tool_call_info.content, tool_call_info.tool_calls
+                    if isinstance(tool_calls, List) and len(tool_calls):
+                        if final_res.finish_reason == 'stop':
+                            final_res.finish_reason = 'tool_calls'
+
+                except Exception as e:
+                    logger.error(f'Failed to parse {text}. Exception: {e}.')
+                    return create_error_response(HTTPStatus.BAD_REQUEST, 'Failed to parse fc related info to json format!')
+            elif request.tool_choice != 'none' and request.tools is not None and VariableInterface.tool_parser is None:
+                logger.error('Please launch the api_server with --tool-call-parser if you want to use tool.')
+
+            if VariableInterface.reasoning_parser is not None and request.enable_thinking is not False:
+                reasoning_content, text = VariableInterface.reasoning_parser.extract_reasoning_content(text, request)
+
+            message = ChatMessage(role='assistant',
+                                  content=text,
+                                  tool_calls=tool_calls,
+                                  reasoning_content=reasoning_content)
 
         logprobs = None
         if gen_logprobs and len(final_logprobs):
@@ -588,15 +618,11 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
 
         assert final_res is not None
         choices = []
-        chat_message = ChatMessage(role='assistant',
-                                   content=text,
-                                   tool_calls=tool_calls,
-                                   reasoning_content=reasoning_content)
         if request.return_token_ids:
-            chat_message.gen_tokens = final_token_ids
+            message.gen_tokens = final_token_ids
         choice_data = ChatCompletionResponseChoice(
             index=0,
-            message=chat_message,
+            message=message,
             logprobs=logprobs,
             finish_reason=final_res.finish_reason,
         )
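
On the streaming path the parser is fed raw token ids per chunk and returns a DeltaMessage whose fields map onto harmony channels. A hedged consumer-side sketch (server address and model name are assumptions):

from openai import OpenAI

client = OpenAI(base_url='http://localhost:23333/v1', api_key='none')
stream = client.chat.completions.create(
    model='gpt-oss-20b',
    messages=[{'role': 'user', 'content': 'Briefly explain the KV cache.'}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta
    # analysis-channel tokens arrive as delta.reasoning_content,
    # final-channel tokens as delta.content, and function calls as
    # delta.tool_calls with incrementally streamed arguments
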
lmdeploy/serve/openai/harmony_utils.py

Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Modified from https://github.com/vllm-project/vllm/blob/v0.10.2rc1/vllm/entrypoints/harmony_utils.py
+from typing import List
+
+import shortuuid
+
+from lmdeploy.serve.openai.protocol import (ChatMessage, DeltaFunctionCall, DeltaMessage, DeltaToolCall, FunctionCall,
+                                            ToolCall)
+
+try:
+    from openai_harmony import HarmonyEncodingName, Role, StreamableParser, load_harmony_encoding
+except (ImportError, ModuleNotFoundError):
+    pass
+
+_harmony_encoding = None
+
+
+def get_encoding():
+    global _harmony_encoding
+    if _harmony_encoding is None:
+        _harmony_encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
+    return _harmony_encoding
+
+
+def get_streamable_parser_for_assistant() -> 'StreamableParser':
+    return StreamableParser(get_encoding(), role=Role.ASSISTANT)
+
+
+class GptOssChatParser:
+
+    def __init__(self):
+        self.parser = get_streamable_parser_for_assistant()
+
+    def parse_streaming(self, tokens: List[int]) -> DeltaMessage:
+        parser = self.parser
+        delta_message = DeltaMessage(role='assistant')
+        content = ''
+        reasoning_content = ''
+        tool_calls = []
+        delta_tool_call = None
+        for token in tokens:
+            prev_recipient = parser.current_recipient
+            parser.process(token)
+            cur_channel = parser.current_channel
+            cur_recipient = parser.current_recipient
+            delta_text = parser.last_content_delta or ''
+            if cur_channel == 'final':
+                content += delta_text
+            elif cur_channel == 'analysis':
+                reasoning_content += delta_text
+            elif cur_channel == 'commentary' and cur_recipient and cur_recipient.startswith('functions.'):
+                base_index = 0
+                for msg in parser.messages:
+                    if msg.channel == 'commentary' and msg.recipient and msg.recipient.startswith('functions.'):
+                        base_index += 1
+                if prev_recipient != cur_recipient:
+                    if delta_tool_call is not None:
+                        tool_calls.append(delta_tool_call)
+                    tool_name = cur_recipient.split('functions.', 1)[1]
+                    delta_tool_call = DeltaToolCall(id=f'chatcmpl-tool-{shortuuid.random()}',
+                                                    type='function',
+                                                    index=base_index,
+                                                    function=DeltaFunctionCall(name=tool_name, arguments=''))
+                elif delta_text:
+                    if delta_tool_call is None:
+                        delta_tool_call = DeltaToolCall(index=base_index,
+                                                        function=DeltaFunctionCall(arguments=''))
+                    delta_tool_call.function.arguments += delta_text
+
+        if delta_tool_call:
+            tool_calls.append(delta_tool_call)
+
+        delta_message.content = content if content else None
+        delta_message.reasoning_content = reasoning_content if reasoning_content else None
+        delta_message.tool_calls = tool_calls
+        return delta_message
+
+    def parse_full(self, tokens: List[int]) -> ChatMessage:
+        delta_message = self.parse_streaming(tokens)
+        tool_calls = []
+        for delta_tool_call in delta_message.tool_calls:
+            function = FunctionCall(**delta_tool_call.function.model_dump())
+            tool_calls.append(ToolCall(id=delta_tool_call.id, type=delta_tool_call.type, function=function))
+        chat_message = ChatMessage(role='assistant',
+                                   content=delta_message.content,
+                                   tool_calls=tool_calls,
+                                   reasoning_content=delta_message.reasoning_content)
+        return chat_message
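
A hedged usage sketch for the new parser. It assumes the optional openai_harmony package is installed and uses its encode(text, allowed_special=...) API (as shown in the harmony README) to fabricate assistant-side tokens; the tool name and message text are illustrative:

from lmdeploy.serve.openai.harmony_utils import GptOssChatParser, get_encoding

text = ('<|channel|>analysis<|message|>Need the weather; call the tool.<|end|>'
        '<|start|>assistant<|channel|>commentary to=functions.get_weather '
        '<|constrain|>json<|message|>{"city": "Paris"}<|call|>')
tokens = get_encoding().encode(text, allowed_special='all')

message = GptOssChatParser().parse_full(tokens)
print(message.reasoning_content)  # analysis channel -> reasoning_content
print(message.tool_calls)         # commentary channel -> ToolCall list

Note that StreamableParser keeps state across process() calls, which is why the server creates one GptOssChatParser per request and feeds it each chunk's token ids via parse_streaming.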

lmdeploy/tokenizer.py

Lines changed: 5 additions & 5 deletions
@@ -400,9 +400,6 @@ def detokenize_incrementally(self,
                                  spaces_between_special_tokens: bool = True):
         if not hasattr(state, 'stream'):
             state.stream = self.parser()
-            ids_offset = state.ids_offset
-            for token_id in all_input_ids[:ids_offset]:
-                state.stream.process(token_id)
 
         response = ''
         stream = state.stream
@@ -423,8 +420,11 @@ class Tokenizer:
     """
 
     def __init__(self, model_path: str):
-        from transformers import PretrainedConfig
-        model_cfg = PretrainedConfig.from_pretrained(model_path, trust_remote_code=True)
+        from transformers import AutoConfig, PretrainedConfig
+        try:
+            model_cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+        except Exception as e:  # noqa
+            model_cfg = PretrainedConfig.from_pretrained(model_path, trust_remote_code=True)
         is_gpt_oss = getattr(model_cfg, 'model_type', '') == 'gpt_oss'
         from transformers.models.auto.tokenization_auto import get_tokenizer_config
         tokenizer_config = get_tokenizer_config(model_path, trust_remote_code=True)
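
Why the try/except: AutoConfig.from_pretrained resolves the registered, architecture-specific config class and reliably populates fields such as model_type (which the is_gpt_oss check relies on), but it can raise for repos it cannot resolve, in which case the generic PretrainedConfig load remains the fallback. A minimal sketch with a placeholder path:

from transformers import AutoConfig, PretrainedConfig

model_path = 'openai/gpt-oss-20b'  # placeholder
try:
    cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
except Exception:
    cfg = PretrainedConfig.from_pretrained(model_path, trust_remote_code=True)
print(getattr(cfg, 'model_type', ''))  # expected: 'gpt_oss'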
