Commit 06d3693

grimoire and CyCle1024 authored

Ray mp engine backend (#3790)

* add ray mp engine
* remove flag in profile throughput
* WIP
* lint
* remove
* add sleep, wakeup & update_params method for MPEngine, call update_co… (#5)
  * add sleep, wakeup & update_params method for MPEngine, call update_configs in ray_executor wakeup kvcache
  * fix lint

Co-authored-by: CyCle1024 <[email protected]>
1 parent 489bb15 commit 06d3693

File tree

12 files changed: +803 -402 lines

lmdeploy/messages.py

Lines changed: 3 additions & 0 deletions

@@ -326,6 +326,8 @@ class PytorchEngineConfig:
         migration_backend: migration backend. options: ['DLSlime'].
             Default to `MigrationBackend.DLSlime`.
         enable_mp_engine (bool): run engine in multi-process mode.
+        mp_engine_backend (str): backend of mp engine, options:
+            ['mp', 'ray']. Default to `mp`.
         model_format (str): weight quantization policy, options: ['fp8'].
         hf_overrides (Dict[str, Any]): Huggingface overrides for the model.
             It can be used to override the default config of the model,
@@ -359,6 +361,7 @@ class PytorchEngineConfig:
     enable_microbatch: bool = False
     enable_eplb: bool = False
     enable_mp_engine: bool = False
+    mp_engine_backend: str = 'mp'
     model_format: str = None
     enable_metrics: bool = False
     hf_overrides: Optional[Dict[str, Any]] = None
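
A quick usage sketch for the new option — assuming the standard `pipeline` entry point, with a placeholder model path:

from lmdeploy import pipeline, PytorchEngineConfig

# Run the PyTorch engine in a separate process and let Ray, rather than the
# default ZMQ-based 'mp' backend, host it. 'mp_engine_backend' only takes
# effect when 'enable_mp_engine' is set.
config = PytorchEngineConfig(enable_mp_engine=True, mp_engine_backend='ray')
pipe = pipeline('path/to/model', backend_config=config)  # placeholder path
print(pipe('Hello'))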

lmdeploy/pytorch/engine/base.py

Lines changed: 52 additions & 0 deletions

@@ -0,0 +1,52 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from lmdeploy.pytorch.disagg.conn.protocol import (DistServeConnectionRequest, DistServeDropConnectionRequest,
+                                                   DistServeInitRequest)
+
+
+class EngineBase:
+
+    def close(self) -> None:
+        """Close mp engine."""
+        raise NotImplementedError('This method is not implemented.')
+
+    def start_loop(self) -> None:
+        """Start mp engine loop."""
+
+    def end_session(self, session_id: int):
+        """End session."""
+        raise NotImplementedError('This method is not implemented.')
+
+    def p2p_initialize(self, conn_request: DistServeInitRequest):
+        """Init rdma link."""
+        raise NotImplementedError('This method is not implemented.')
+
+    def p2p_connect(self, conn_request: DistServeConnectionRequest):
+        """rdma_connect."""
+        raise NotImplementedError('This method is not implemented.')
+
+    def p2p_drop_connect(self, drop_conn_request: DistServeDropConnectionRequest):
+        """Drop connection.
+
+        1. drop engine connection (zmq connection)
+        2. TODO(JimyMa) drop RDMA Connection.
+        """
+        raise NotImplementedError('This method is not implemented.')
+
+    def create_instance(self, cuda_stream_id=0):
+        """Create instance."""
+        raise NotImplementedError('This method is not implemented.')
+
+
+class EngineInstanceBase:
+
+    async def async_end(self, session_id: int):
+        """End the given session."""
+        raise NotImplementedError('This method is not implemented.')
+
+    async def async_cancel(self, session_id: int):
+        """Stop current streaming inference."""
+        raise NotImplementedError('This method is not implemented.')
+
+    async def async_stream_infer(self, *args, **kwargs):
+        """Send stream inference request."""
+        raise NotImplementedError('This method is not implemented.')
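
Both bases raise `NotImplementedError` rather than using `abc`, so subclasses opt in method by method. A hypothetical stub, purely to illustrate the contract (not part of this commit):

from lmdeploy.pytorch.engine.base import EngineBase, EngineInstanceBase


class NoopEngine(EngineBase):
    """Hypothetical subclass: override only what callers will touch."""

    def close(self) -> None:
        pass  # nothing to tear down

    def end_session(self, session_id: int):
        pass  # sessions are stateless in this stub

    def create_instance(self, cuda_stream_id=0):
        # EngineInstanceBase is instantiable since it declares no abstract methods
        return EngineInstanceBase()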

lmdeploy/pytorch/engine/engine.py

Lines changed: 9 additions & 6 deletions

@@ -23,6 +23,7 @@
 from ..messages import MessageStatus, SchedulerSequence
 from ..model_inputs import ModelInputs, VisionModelInputs
 from ..paging import Scheduler
+from .base import EngineBase
 from .engine_checker import EngineChecker
 from .executor import build_executor
 from .logits_process import SamplingInputs
@@ -308,7 +309,7 @@ def build_inputs_maker(engine: 'Engine'):
     return InputsMakerAsync(engine)


-class Engine:
+class Engine(EngineBase):
    """The inference engine of lmdeploy pytorch.

    Args:
@@ -425,11 +426,13 @@ def from_pretrained(cls,
            trust_remote_code (bool): Trust remote code
        """
        if engine_config is not None and engine_config.enable_mp_engine:
-            from .mp_engine.mp_engine import MPEngine
-            return MPEngine(model_path=pretrained_model_name_or_path,
-                            tokenizer=tokenizer,
-                            engine_config=engine_config,
-                            trust_remote_code=trust_remote_code)
+            from .mp_engine import build_mp_engine
+            backend = engine_config.mp_engine_backend
+            return build_mp_engine(backend=backend,
+                                   model_path=pretrained_model_name_or_path,
+                                   tokenizer=tokenizer,
+                                   engine_config=engine_config,
+                                   trust_remote_code=trust_remote_code)
        if len(kwargs) > 0:
            logger.debug(f'Get unexpected kwargs: {kwargs}')
        return cls(model_path=pretrained_model_name_or_path,

lmdeploy/pytorch/engine/engine_instance.py

Lines changed: 2 additions & 1 deletion

@@ -5,6 +5,7 @@
 from lmdeploy.utils import get_logger

 from ..messages import SamplingParam
+from .base import EngineInstanceBase
 from .engine import Engine
 from .request import RequestSender, RequestType, Response, ResponseType

@@ -71,7 +72,7 @@ def cancel(req_sender: RequestSender, session_id: int):
                          f'Error: {resp.type}.'))


-class EngineInstance:
+class EngineInstance(EngineInstanceBase):
    """Instance of TurboMind.

    Args:
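
With `EngineInstance` now typed against `EngineInstanceBase`, callers can be written against the base interface. A sketch of driving the streaming API — the argument names follow the pre-existing `EngineInstance.async_stream_infer` signature and are an assumption here, since the base method only declares `*args, **kwargs`:

import asyncio


async def generate(instance, token_ids):
    # async_stream_infer is an async generator yielding incremental outputs
    async for output in instance.async_stream_infer(session_id=0, input_ids=token_ids):
        pass  # consume or forward each partial output
    await instance.async_end(session_id=0)  # release the session


# asyncio.run(generate(engine.create_instance(), [1, 2, 3]))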

lmdeploy/pytorch/engine/executor/ray_executor.py

Lines changed: 7 additions & 112 deletions

@@ -3,7 +3,6 @@
 import contextlib
 import json
 import os
-import time
 from typing import Any, Dict, List, Optional, Tuple

 import numpy as np
@@ -19,6 +18,7 @@
 from lmdeploy.pytorch.devices import DeviceContext, get_device_manager
 from lmdeploy.pytorch.disagg.conn.protocol import DistServeInitRequest, DistServeKVTransferEndpointInfo
 from lmdeploy.pytorch.disagg.messages import MigrationExecutionBatch
+from lmdeploy.pytorch.ray import RayContext
 from lmdeploy.utils import get_logger, try_import_deeplink

 from .base import ExecutorBase
@@ -27,8 +27,6 @@

 logger = get_logger('lmdeploy')

-PG_WAIT_TIMEOUT = 1800
-

 def get_device_str():
     """Get device str."""
@@ -43,109 +41,6 @@ def get_device_str():
     return device_type


-def _wait_until_pg_ready(current_placement_group: 'PlacementGroup'):
-    """Wait until a placement group is ready.
-
-    It prints the informative log messages if the placement group is not created within time.
-    """
-    # copy from vLLM
-    # Wait until PG is ready - this will block until all
-    # requested resources are available, and will timeout
-    # if they cannot be provisioned.
-    placement_group_specs = current_placement_group.bundle_specs
-
-    s = time.time()
-    pg_ready_ref = current_placement_group.ready()
-    wait_interval = 10
-    while time.time() - s < PG_WAIT_TIMEOUT:
-        ready, _ = ray.wait([pg_ready_ref], timeout=wait_interval)
-        if len(ready) > 0:
-            break
-
-        # Exponential backoff for warning print.
-        wait_interval *= 2
-        logger.info(
-            'Waiting for creating a placement group of specs for '
-            '%d seconds. specs=%s. Check '
-            '`ray status` to see if you have enough resources,'
-            ' and make sure the IP addresses used by ray cluster'
-            ' are the same as VLLM_HOST_IP environment variable'
-            ' specified in each node if you are running on a multi-node.', int(time.time() - s), placement_group_specs)
-
-    try:
-        ray.get(pg_ready_ref, timeout=0)
-    except ray.exceptions.GetTimeoutError:
-        raise ValueError('Cannot provide a placement group of '
-                         f'{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See '
-                         '`ray status` to make sure the cluster has enough resources.') from None
-
-
-def _get_obj_store_memory(dp: int = 1):
-    """Get obj store memory."""
-    import psutil
-    DEFAULT_OBJECT_STORE_MEMORY_PROPORTION = os.getenv('RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION', '0.3')
-    DEFAULT_OBJECT_STORE_MEMORY_PROPORTION = float(DEFAULT_OBJECT_STORE_MEMORY_PROPORTION)
-    DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES = os.getenv('RAY_DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES', None)
-    if DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES is None:
-        DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES = 80 * (10**9)
-    else:
-        DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES = int(DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES)
-    total_mem = psutil.virtual_memory().total
-    obj_store_mem = int(total_mem * DEFAULT_OBJECT_STORE_MEMORY_PROPORTION)
-    obj_store_mem = min(DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES, obj_store_mem)
-    if dp > 1:
-        obj_store_mem = obj_store_mem // min(8, dp)
-    return obj_store_mem
-
-
-def init_ray_cluster(world_size: int, ray_address: str = None, dp: int = 1):
-    """Init ray cluster."""
-    # modifier from vLLM
-    if not ray.is_initialized():
-        try:
-            num_cpus = world_size
-            object_store_memory = _get_obj_store_memory(dp=dp)
-            ray.init(address=ray_address,
-                     ignore_reinit_error=True,
-                     num_cpus=num_cpus,
-                     object_store_memory=object_store_memory)
-        except ValueError as e:
-            if e.args is not None and len(e.args) >= 1 and e.args[
-                    0] == 'When connecting to an existing cluster, num_cpus and num_gpus must not be provided.':
-                ray.init(address=ray_address, ignore_reinit_error=True)
-            else:
-                raise
-
-    device_str = get_device_str()
-
-    # Create placement group for worker processes
-    current_placement_group = ray.util.get_current_placement_group()
-    if not current_placement_group:
-        num_devices_in_cluster = ray.cluster_resources().get(device_str, 0)
-        if world_size > num_devices_in_cluster:
-            logger.warning(
-                'The number of required %ss exceeds the total '
-                'number of available %ss in the placement group.', device_str, device_str)
-        # Create a new placement group
-        placement_group_specs: List[Dict[str, float]] = ([{device_str: 1.0} for _ in range(world_size)])
-
-        # gcs_addr = ray.get_runtime_context().gcs_address
-        # master_addr = gcs_addr.split(':')[0]
-        # current_ip = master_addr
-        # # This way, at least bundle is required to be created in a current
-        # # node.
-        # placement_group_specs[0][f'node:{current_ip}'] = 0.001
-
-        # By default, Ray packs resources as much as possible.
-        current_placement_group = ray.util.placement_group(placement_group_specs, strategy='PACK')
-        _wait_until_pg_ready(current_placement_group)
-
-    assert current_placement_group is not None
-    # Set the placement group in the parallel config
-    placement_group = current_placement_group
-    return placement_group
-
-
 def _get_master_addr():
     """Get master addr."""
     addr = _envs.dist_master_addr
@@ -379,7 +274,8 @@ def __init__(self,
         ray_world_size = self.world_size
         if self.dp > 1:
             ray_world_size = 1
-        placement_group = init_ray_cluster(ray_world_size, dp=dist_config.dp)
+        self.ray_ctx = RayContext(ray_world_size, dp=dist_config.dp, device_type=device_type)
+        placement_group = self.ray_ctx.get_placement_group()
         self.placement_group = placement_group

         if self.dp == 1:
@@ -476,6 +372,8 @@ def sleep(self, level: int = 1):

     def wakeup(self, tags: Optional[List[str]] = None):
         """Wakeup."""
+        if tags is None or 'kv_cache' in tags:
+            self.update_configs()
         self.collective_rpc('wakeup', (tags, ))

     def get_input_processor(self):
@@ -537,10 +435,7 @@ def release(self):
         else:
             [ray.kill(worker) for worker in self.workers]

-        ray.util.remove_placement_group(self.placement_group)
-        logger.debug('RayExecutor placement group removed.')
-        ray.shutdown()
-        logger.debug('Ray shutdown.')
+        self.ray_ctx.shutdown()

     def _compile_dag(self):
         """Compile dag."""
@@ -653,7 +548,7 @@ def _init_workers_ray(self, placement_group: PlacementGroup, worker_kwargs: dict
         runtime_env = _update_runtime_env_nsys(runtime_env)
         worker = ray.remote(
             num_cpus=0,
-            num_gpus=1.0,
+            num_gpus=0.01,
             scheduling_strategy=scheduling_strategy,
             runtime_env=runtime_env,
         )(RayWorkerWrapper).remote(**worker_kwargs)
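
The cluster bootstrap deleted above (`init_ray_cluster`, `_wait_until_pg_ready`, `_get_obj_store_memory`, `PG_WAIT_TIMEOUT`) moves behind `lmdeploy.pytorch.ray.RayContext`, whose source is not part of this view. A sketch of the surface the executor now depends on, inferred only from the call sites in this diff:

class RayContext:
    """Inferred interface; the real class lives in lmdeploy/pytorch/ray.py."""

    def __init__(self, world_size: int, dp: int = 1, device_type: str = 'cuda'):
        # presumably: ray.init() if needed, then create or claim a PACK
        # placement group, as the removed init_ray_cluster() did
        self._placement_group = None

    def get_placement_group(self):
        return self._placement_group

    def shutdown(self):
        # presumably removes the placement group and shuts Ray down,
        # replacing the inline cleanup deleted from release()
        pass

Two behavioral changes ride along: `wakeup()` now calls `update_configs()` before waking the `kv_cache`, and each worker actor reserves `num_gpus=0.01` instead of a full GPU, presumably so that other actors introduced by the Ray MP engine can share the same GPU bundles.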
lmdeploy/pytorch/engine/mp_engine/__init__.py

Lines changed: 17 additions & 0 deletions

@@ -1 +1,18 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+from lmdeploy.messages import PytorchEngineConfig
+
+
+def build_mp_engine(backend: str,
+                    model_path: str,
+                    tokenizer: object,
+                    engine_config: PytorchEngineConfig = None,
+                    **kwargs):
+    """Build mp engine."""
+    if backend == 'mp':
+        from .zmq_engine import ZMQMPEngine
+        return ZMQMPEngine(model_path, tokenizer, engine_config=engine_config, **kwargs)
+    elif backend == 'ray':
+        from .ray_engine import RayMPEngine
+        return RayMPEngine(model_path, tokenizer, engine_config=engine_config, **kwargs)
+    else:
+        raise ValueError(f'Unsupported backend: {backend}')
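
`Engine.from_pretrained` routes through this factory when `enable_mp_engine` is set; calling it directly looks like the following sketch (model path and tokenizer are placeholders):

from lmdeploy.messages import PytorchEngineConfig
from lmdeploy.pytorch.engine.mp_engine import build_mp_engine

config = PytorchEngineConfig(enable_mp_engine=True, mp_engine_backend='ray')
# 'mp' -> ZMQMPEngine, 'ray' -> RayMPEngine; any other value raises ValueError
engine = build_mp_engine(backend=config.mp_engine_backend,
                         model_path='path/to/model',  # placeholder
                         tokenizer=None,              # placeholder; normally a Tokenizer
                         engine_config=config)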
