diff --git a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py
index 79ff9ea0e1..d3e315a95c 100644
--- a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py
+++ b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py
@@ -20,7 +20,6 @@
 
 import paddle.jit.dy2static.utils as jit_utils
 import paddle.nn.layer
-from paddle.base.core import CUDAGraph
 from paddle.device.cuda import graphs
 
 from fastdeploy import envs
@@ -93,7 +92,10 @@ def __init__(self, fd_config: FDConfig, runnable: Callable):
         self.warm_up_size = fd_config.graph_opt_config.cudagraph_num_of_warmups
         self.real_shape_to_captured_size = fd_config.graph_opt_config.real_shape_to_captured_size
         if self.fd_config.graph_opt_config.use_unique_memory_pool:
-            self.unique_memory_pool_id = CUDAGraph.gen_new_memory_pool_id()
+            if paddle.is_compiled_with_cuda():
+                from paddle.base.core import CUDAGraph
+
+                self.unique_memory_pool_id = CUDAGraph.gen_new_memory_pool_id()
         self._create_entry_dict()
 
         self.cuda_graph_manager = None
diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py b/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py
index 30794f8ff3..57f5df71c6 100644
--- a/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py
@@ -161,11 +161,11 @@ def apply(
         """
         if layer.ep_size > 1:
             if layer.fd_config.parallel_config.moe_phase.phase == "prefill":
-                if layer.fd_config.parallel_config.splitwise_role == "mixed":
+                if layer.layer_idx == 0:
                     self.ep_prefill_runner.clean_low_latency_buffer()
                 return self.apply_ep_prefill(layer, x, gate)
             else:
-                if layer.fd_config.parallel_config.splitwise_role == "mixed":
+                if layer.layer_idx == 0:
                     self.ep_decoder_runner.clean_low_latency_buffer()
                 return self.apply_ep_decode(layer, x, gate)
         else:
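
Note on the first file: a module-level `from paddle.base.core import CUDAGraph` may fail with `ImportError` on Paddle builds compiled without CUDA (e.g. CPU or XPU wheels), so the import is deferred into the branch that only runs when `paddle.is_compiled_with_cuda()` is true. A minimal sketch of that lazy-import pattern; the `OptionalCudaPool` class name is hypothetical, invented for illustration, while `paddle.is_compiled_with_cuda()` and `CUDAGraph.gen_new_memory_pool_id()` come from the diff itself:

```python
import paddle


class OptionalCudaPool:
    """Hypothetical example class: defer CUDA-only imports to the call site."""

    def __init__(self, use_unique_memory_pool: bool):
        self.unique_memory_pool_id = None
        if use_unique_memory_pool and paddle.is_compiled_with_cuda():
            # Imported lazily so that non-CUDA Paddle builds can still
            # import the enclosing module without raising ImportError.
            from paddle.base.core import CUDAGraph

            self.unique_memory_pool_id = CUDAGraph.gen_new_memory_pool_id()
```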
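Note on the second file: the guard on `clean_low_latency_buffer()` changes from "only in `mixed` splitwise deployments, on every MoE layer" to "in every deployment, but only on the first MoE layer (`layer.layer_idx == 0`)". The low-latency buffer appears to be shared across layers within one forward step, so cleaning it once at the first layer suffices. A runnable toy sketch of that once-per-step behavior; `_Runner` and `forward_step` are hypothetical stand-ins, only `clean_low_latency_buffer` is from the diff:

```python
class _Runner:
    """Hypothetical stand-in for the EP prefill/decode runners."""

    def __init__(self) -> None:
        self.cleans = 0

    def clean_low_latency_buffer(self) -> None:
        self.cleans += 1


def forward_step(runner: _Runner, num_moe_layers: int) -> None:
    for layer_idx in range(num_moe_layers):
        # Clean once per step, at the first MoE layer, rather than on
        # every layer as the old splitwise_role == "mixed" guard did.
        if layer_idx == 0:
            runner.clean_low_latency_buffer()
        # ... per-layer dispatch/combine would run here ...


runner = _Runner()
forward_step(runner, num_moe_layers=4)
assert runner.cleans == 1  # cleaned once per step, not once per layer
```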