[Refactor]: Remove tokenizer when building engine (#3978)

RunningLeon · web-flow · commit 809530759c38 · 2025-09-17T11:26:16.000+08:00
* do not serialize tokenizer

* remove tokenizer when building mp engine

* remove tokenizer argument when building engine

* use hf tokenizer

* use raw tokenizer to align with original
diff --git a/benchmark/profile_generation.py b/benchmark/profile_generation.py
@@ -16,7 +16,6 @@
 
 from lmdeploy.cli.utils import ArgumentHelper, DefaultsAndTypesHelpFormatter
 from lmdeploy.messages import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig
-from lmdeploy.tokenizer import Tokenizer
 from lmdeploy.utils import get_logger
 
 get_logger('lmdeploy').setLevel('WARNING')
@@ -121,13 +120,12 @@ def profile_throughput(model_path: str, concurrency: int, input_seqlen: int,
           f'n_prompt_token: {input_seqlen}, '
           f'n_completion_token: {output_seqlen}, '
           f'test_round: {test_round}, warmup_round: {warmup_round}')
-    tokenizer = Tokenizer(model_path)
     if isinstance(engine_config, TurbomindEngineConfig):
         from lmdeploy.turbomind import TurboMind
-        tm_model = TurboMind.from_pretrained(model_path, tokenizer=tokenizer, engine_config=engine_config)
+        tm_model = TurboMind.from_pretrained(model_path, engine_config=engine_config)
     elif isinstance(engine_config, PytorchEngineConfig):
         from lmdeploy.pytorch.engine import Engine
-        tm_model = Engine(model_path, tokenizer=tokenizer, engine_config=engine_config)
+        tm_model = Engine(model_path, engine_config=engine_config)
 
     event_loop = asyncio.new_event_loop()
     asyncio.set_event_loop(event_loop)
diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py
@@ -138,11 +138,11 @@ def __init__(self, model_path: str, engine_config: Union[PytorchEngineConfig, Tu
         self.tokenizer = Tokenizer(model_path)
         if isinstance(engine_config, TurbomindEngineConfig):
             from lmdeploy.turbomind import TurboMind
-            tm_model = TurboMind.from_pretrained(model_path, tokenizer=self.tokenizer, engine_config=engine_config)
+            tm_model = TurboMind.from_pretrained(model_path, engine_config=engine_config)
             self.backend = 'turbomind'
         elif isinstance(engine_config, PytorchEngineConfig):
             from lmdeploy.pytorch.engine import Engine as PytorchEngine
-            tm_model = PytorchEngine.from_pretrained(model_path, tokenizer=self.tokenizer, engine_config=engine_config)
+            tm_model = PytorchEngine.from_pretrained(model_path, engine_config=engine_config)
             self.backend = 'pytorch'
 
         self.tm_model = tm_model
diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py
@@ -316,21 +316,18 @@ class Engine(EngineBase):
 
     Args:
         model_path (str): The hugging face model path.
-        tokenizer (lmdeploy.Tokenizer): an instance of lmdeploy.Tokenizer
         engine_config (PytorchEngineConfig): The config of the Engine.
         trust_remote_code (bool): Trust remote code.
     """
 
     def __init__(self,
                  model_path: str,
-                 tokenizer: object,
                  engine_config: PytorchEngineConfig = None,
                  trust_remote_code: bool = True) -> None:
         # make sure engine config exist
         engine_config = _update_engine_config(engine_config)
 
         # dist args
-        self.tokenizer = tokenizer
         self.tp = engine_config.tp
         self.dp = engine_config.dp
         self.dp_rank = engine_config.dp_rank
@@ -358,15 +355,11 @@ def __init__(self,
         misc_config = _build_misc_config(engine_config)
 
         # build model agent
-        raw_tokenizer = None
-        if tokenizer is not None:
-            raw_tokenizer = tokenizer.model.model
         self.executor = build_executor(model_path,
                                        cache_config=cache_config,
                                        backend_config=backend_config,
                                        dist_config=dist_config,
                                        misc_config=misc_config,
-                                       tokenizer=raw_tokenizer,
                                        adapters=adapters,
                                        device_type=engine_config.device_type,
                                        distributed_executor_backend=engine_config.distributed_executor_backend,
@@ -406,7 +399,6 @@ def __init__(self,
     @classmethod
     def from_pretrained(cls,
                         pretrained_model_name_or_path: str,
-                        tokenizer: object,
                         engine_config: PytorchEngineConfig = None,
                         trust_remote_code: bool = True,
                         **kwargs):
@@ -423,7 +415,6 @@ def from_pretrained(cls,
                       on huggingface.co, such as "InternLM/internlm-chat-7b",
                       "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
                       and so on.
-            tokenizer (lmdeploy.Tokenizer): an instance of lmdeploy.Tokenizer
             engine_config (PytorchEngineConfig): Pytorch engine config.
             trust_remote_code (bool): Trust remote code
         """
@@ -432,13 +423,11 @@ def from_pretrained(cls,
             backend = engine_config.mp_engine_backend
             return build_mp_engine(backend=backend,
                                    model_path=pretrained_model_name_or_path,
-                                   tokenizer=tokenizer,
                                    engine_config=engine_config,
                                    trust_remote_code=trust_remote_code)
         if len(kwargs) > 0:
             logger.debug(f'Get unexpected kwargs: {kwargs}')
         return cls(model_path=pretrained_model_name_or_path,
-                   tokenizer=tokenizer,
                    engine_config=engine_config,
                    trust_remote_code=trust_remote_code)
 
diff --git a/lmdeploy/pytorch/engine/executor/__init__.py b/lmdeploy/pytorch/engine/executor/__init__.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from logging import Logger
-from typing import Any, Dict
+from typing import Dict
 
 from lmdeploy.pytorch import envs
 from lmdeploy.pytorch.config import BackendConfig, CacheConfig, DistConfig, MiscConfig, ModelConfig
@@ -58,7 +58,6 @@ def build_executor(model_path: str,
                    backend_config: BackendConfig,
                    dist_config: DistConfig,
                    misc_config: MiscConfig,
-                   tokenizer: Any,
                    adapters: Dict[str, str] = None,
                    device_type: str = 'cuda',
                    distributed_executor_backend: str = None,
@@ -98,7 +97,6 @@ def build_executor(model_path: str,
             cache_config=cache_config,
             backend_config=backend_config,
             misc_config=misc_config,
-            tokenizer=tokenizer,
             adapters=adapters,
             device_type=device_type,
         )
@@ -111,7 +109,6 @@ def build_executor(model_path: str,
             backend_config=backend_config,
             dist_config=dist_config,
             misc_config=misc_config,
-            tokenizer=tokenizer,
             adapters=adapters,
             device_type=device_type,
         )
@@ -124,7 +121,6 @@ def build_executor(model_path: str,
             backend_config=backend_config,
             dist_config=dist_config,
             misc_config=misc_config,
-            tokenizer=tokenizer,
             adapters=adapters,
             device_type=device_type,
             dtype=dtype,
diff --git a/lmdeploy/pytorch/engine/executor/base.py b/lmdeploy/pytorch/engine/executor/base.py
@@ -23,7 +23,6 @@ def __init__(self,
                  backend_config: BackendConfig,
                  dist_config: DistConfig,
                  misc_config: MiscConfig,
-                 tokenizer: Any,
                  adapters: Dict[str, str] = None,
                  device_type: str = 'cuda'):
         """Initialize Executor."""
@@ -37,7 +36,6 @@ def __init__(self,
         self.backend_config = backend_config
         self.dist_config = dist_config
         self.misc_config = misc_config,
-        self.tokenizer = tokenizer
         self.dp = dist_config.dp
         self.tp = dist_config.tp
         self.world_size = dist_config.world_size
diff --git a/lmdeploy/pytorch/engine/executor/base_worker.py b/lmdeploy/pytorch/engine/executor/base_worker.py
@@ -29,7 +29,6 @@ def __init__(
         misc_config: MiscConfig,
         adapters: Dict[str, str] = None,
         device_type: str = 'cuda',
-        tokenizer: Any = None,
         log_level: int = 30,
     ):
         self.model_path = model_path
@@ -38,7 +37,6 @@ def __init__(
         self.backend_config = backend_config
         self.dist_config = dist_config
         self.misc_config = misc_config
-        self.tokenizer = tokenizer
         self.adapters = adapters
         self.device_type = device_type
         self.log_level = log_level
@@ -96,7 +94,6 @@ def build_model(self):
                                              cache_config=self.cache_config,
                                              backend_config=self.backend_config,
                                              misc_config=self.misc_config,
-                                             tokenizer=self.tokenizer,
                                              device_ctx=self.device_ctx,
                                              dist_ctx=self.dist_ctx,
                                              adapters=self.adapters)
diff --git a/lmdeploy/pytorch/engine/executor/mp_executor.py b/lmdeploy/pytorch/engine/executor/mp_executor.py
@@ -224,15 +224,13 @@ def __init__(self,
                  backend_config: BackendConfig,
                  dist_config: DistConfig,
                  misc_config: MiscConfig,
-                 tokenizer: Any,
                  adapters: Dict[str, str] = None,
                  device_type: str = 'cuda'):
         """Initialize Executor."""
         super().__init__(model_path=model_path,
                          model_config=model_config,
                          cache_config=cache_config,
                          backend_config=backend_config,
-                         tokenizer=tokenizer,
                          dist_config=dist_config,
                          misc_config=misc_config,
                          adapters=adapters,
@@ -266,7 +264,6 @@ def __init__(self,
                        backend_config=backend_config,
                        dist_config=dist_config,
                        misc_config=misc_config,
-                       tokenizer=tokenizer,
                        adapters=adapters,
                        device_type=device_type,
                        log_level=logger.level)
@@ -430,7 +427,6 @@ def __init__(
         misc_config: MiscConfig,
         adapters: Dict[str, str] = None,
         device_type: str = 'cuda',
-        tokenizer: Any = None,
         log_level: int = 30,
     ):
         super().__init__(
@@ -442,7 +438,6 @@ def __init__(
             misc_config=misc_config,
             adapters=adapters,
             device_type=device_type,
-            tokenizer=tokenizer,
             log_level=log_level,
         )
 
@@ -491,7 +486,6 @@ def _main_loop(
         backend_config: BackendConfig,
         dist_config: DistConfig,
         misc_config: MiscConfig,
-        tokenizer: Any,
         adapters: Dict[str, str] = None,
         device_type: str = 'cuda',
         log_level: int = 30,
@@ -515,7 +509,6 @@ def handle_sigterm(signum, frame):
                                  misc_config=misc_config,
                                  adapters=adapters,
                                  device_type=device_type,
-                                 tokenizer=tokenizer,
                                  log_level=log_level)
         try_import_deeplink(device_type)
         worker.init_process_group(proc_id)
diff --git a/lmdeploy/pytorch/engine/executor/ray_executor.py b/lmdeploy/pytorch/engine/executor/ray_executor.py
@@ -160,8 +160,6 @@ def __init__(
         init_backend(device_type)
         try_import_deeplink(device_type)
 
-        from lmdeploy.tokenizer import Tokenizer
-        tokenizer = Tokenizer(model_path).model.model
         model_config = ModelConfig.from_pretrained(model_path,
                                                    dtype=dtype,
                                                    hf_overrides=misc_config.hf_overrides,
@@ -176,7 +174,6 @@ def __init__(
             misc_config=misc_config,
             adapters=adapters,
             device_type=device_type,
-            tokenizer=tokenizer,
             log_level=log_level,
         )
         self.node_ip = ray.util.get_node_ip_address()
@@ -232,7 +229,6 @@ def __init__(self,
                  backend_config: BackendConfig,
                  dist_config: DistConfig,
                  misc_config: MiscConfig,
-                 tokenizer: Any,
                  adapters: Dict[str, str] = None,
                  device_type: str = 'cuda',
                  dtype: str = 'auto'):
@@ -243,7 +239,6 @@ def __init__(self,
                          backend_config=backend_config,
                          dist_config=dist_config,
                          misc_config=misc_config,
-                         tokenizer=tokenizer,
                          adapters=adapters,
                          device_type=device_type)
 
diff --git a/lmdeploy/pytorch/engine/executor/uni_executor.py b/lmdeploy/pytorch/engine/executor/uni_executor.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import asyncio
-from typing import Any, Dict, List
+from typing import Dict, List
 
 from lmdeploy.pytorch.config import BackendConfig, CacheConfig, DistConfig, MiscConfig, ModelConfig
 from lmdeploy.pytorch.devices import DeviceContext
@@ -23,7 +23,6 @@ def __init__(self,
                  cache_config: CacheConfig,
                  backend_config: BackendConfig,
                  misc_config: MiscConfig,
-                 tokenizer: Any,
                  adapters: Dict[str, str] = None,
                  device_type: str = 'cuda'):
         """Initialize Executor."""
@@ -33,7 +32,6 @@ def __init__(self,
                          backend_config=backend_config,
                          dist_config=DistConfig(),
                          misc_config=misc_config,
-                         tokenizer=tokenizer,
                          adapters=adapters,
                          device_type=device_type)
 
@@ -43,7 +41,6 @@ def __init__(self,
                                              cache_config=cache_config,
                                              backend_config=backend_config,
                                              misc_config=misc_config,
-                                             tokenizer=tokenizer,
                                              device_ctx=self.device_ctx,
                                              adapters=adapters)
 
diff --git a/lmdeploy/pytorch/engine/model_agent.py b/lmdeploy/pytorch/engine/model_agent.py
@@ -16,6 +16,7 @@
 
 from lmdeploy.pytorch.disagg.config import EngineRole
 from lmdeploy.serve.openai.protocol import UpdateParamsRequest
+from lmdeploy.tokenizer import Tokenizer
 from lmdeploy.utils import get_logger
 
 from ..backends import get_backend
@@ -307,16 +308,16 @@ def __init__(self,
                  cache_config: CacheConfig,
                  backend_config: BackendConfig,
                  misc_config: MiscConfig,
-                 tokenizer: Any,
                  dist_ctx: DistContext,
                  device_ctx: DeviceContext,
                  adapters: Dict[str, str] = None):
 
         self.model_config = model_config
         self.cache_config = cache_config
-        self.tokenizer = tokenizer
+        # use raw tokenizer
+        self.tokenizer = Tokenizer(model_path).model.model
         try:
-            self.sampling_vocab_size = len(tokenizer)
+            self.sampling_vocab_size = len(self.tokenizer)
         except BaseException:
             self.sampling_vocab_size = None
 
@@ -1158,7 +1159,6 @@ def build_model_agent(model_path: str,
                       cache_config: CacheConfig,
                       backend_config: BackendConfig,
                       misc_config: MiscConfig,
-                      tokenizer: Any,
                       dist_ctx: DistContext = None,
                       device_ctx: DeviceContext = None,
                       adapters: Dict[str, str] = None):
@@ -1187,7 +1187,6 @@ def build_model_agent(model_path: str,
         cache_config=cache_config,
         backend_config=backend_config,
         misc_config=misc_config,
-        tokenizer=tokenizer,
         adapters=adapters,
         dist_ctx=dist_ctx,
         device_ctx=device_ctx,
diff --git a/lmdeploy/pytorch/engine/mp_engine/__init__.py b/lmdeploy/pytorch/engine/mp_engine/__init__.py
@@ -2,17 +2,13 @@
 from lmdeploy.messages import PytorchEngineConfig
 
 
-def build_mp_engine(backend: str,
-                    model_path: str,
-                    tokenizer: object,
-                    engine_config: PytorchEngineConfig = None,
-                    **kwargs):
+def build_mp_engine(backend: str, model_path: str, engine_config: PytorchEngineConfig = None, **kwargs):
     """Build mp engine."""
     if backend == 'mp':
         from .zmq_engine import ZMQMPEngine
-        return ZMQMPEngine(model_path, tokenizer, engine_config=engine_config, **kwargs)
+        return ZMQMPEngine(model_path, engine_config=engine_config, **kwargs)
     elif backend == 'ray':
         from .ray_engine import RayMPEngine
-        return RayMPEngine(model_path, tokenizer, engine_config=engine_config, **kwargs)
+        return RayMPEngine(model_path, engine_config=engine_config, **kwargs)
     else:
         raise ValueError(f'Unsupported backend: {backend}')
diff --git a/lmdeploy/pytorch/engine/mp_engine/ray_engine.py b/lmdeploy/pytorch/engine/mp_engine/ray_engine.py
diff --git a/lmdeploy/pytorch/engine/mp_engine/zmq_engine.py b/lmdeploy/pytorch/engine/mp_engine/zmq_engine.py
diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py
diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py