20 changes: 14 additions & 6 deletions fastdeploy/config.py
@@ -127,13 +127,12 @@ def __init__(
self.redundant_experts_num = 0
self.seed = 0
self.quantization = None
self.reasoning_parser = None
self.pad_token_id: int = -1
self.eos_tokens_lens: int = 2
self.lm_head_fp32: bool = False
self.model_format = "auto"
for key, value in args.items():
if hasattr(self, key) and value != "None":
if hasattr(self, key):
setattr(self, key, value)

assert self.model != ""
@@ -258,7 +257,7 @@ def __init__(
self.sequence_parallel = False # Whether to enable sequence parallelism.
self.use_ep = False # Whether to enable Expert Parallelism
self.moe_phase = MoEPhase("prefill") # Generation phase
self.msg_queue_id = 1 # message queue id
self.msg_queue_id = 1 # mesage queue id

self.tensor_parallel_rank = 0 # TP rank ID
self.tensor_parallel_size = 1 # TP degree
@@ -351,8 +350,8 @@ def set_tp_group(self):
)
)
# same ep group id
# (TODO:gaoziyuan move this gid config to ep.py)
dist.collective._set_custom_gid(self.data_parallel_size + tp_gid_offset)
self.ep_group = dist.new_group(range(self.expert_parallel_size))
logger.info(
f"data_parallel_size: {self.data_parallel_size}, tensor_parallel_size: {self.tensor_parallel_size}, expert_parallel_size: {self.expert_parallel_size}, data_parallel_rank: {self.data_parallel_rank}, tensor_parallel_rank: {self.tensor_parallel_rank}, expert_parallel_rank: {self.expert_parallel_rank}, tp_group: {self.tp_group}."
)
@@ -550,7 +549,7 @@ def __init__(
It requires that all input buffers have fixed addresses, and all
splitting ops write their outputs to input buffers.
- With dynamic graph backend: ...
- With static graph backend: WIP
- With static grpah backend: WIP
"""
self.sot_warmup_sizes: list[int] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 32, 64, 128]
""" Number of warmup runs for SOT warmup. """
@@ -1234,6 +1233,15 @@ def postprocess(self):

self.paddle_commit_id = paddle.version.commit

if self.cache_config.enable_chunked_prefill:
self.force_chunked_prefill = int(envs.FD_FORCE_CHUNKED_PREFILL)
if (
self.speculative_config is not None
and self.speculative_config.method in ["mtp"]
and not self.force_chunked_prefill
):
self.cache_config.enable_chunked_prefill = False

if self.max_num_batched_tokens is None:
if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM
@@ -1292,7 +1300,7 @@ def check(self):
), "TP and EP cannot be enabled at the same time"

if not self.cache_config.enable_chunked_prefill:
if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
assert self.max_num_batched_tokens >= self.max_model_len, (
f"max_num_batched_tokens: {self.max_num_batched_tokens} "
f"should be larger than or equal to max_model_len: {self.max_model_len}"
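The postprocess() hunk above auto-disables chunked prefill when MTP speculative decoding is configured, unless FD_FORCE_CHUNKED_PREFILL overrides it. Below is a minimal sketch of that decision in isolation; only FD_FORCE_CHUNKED_PREFILL and the "mtp" method name come from the diff, while the helper and its plain-argument signature are illustrative.

from typing import Optional

# Illustrative helper, not part of the patch: mirrors the gating added to postprocess().
def resolve_chunked_prefill(enable_chunked_prefill: bool,
                            speculative_method: Optional[str],
                            force_chunked_prefill: bool) -> bool:
    if not enable_chunked_prefill:
        return False
    if speculative_method == "mtp" and not force_chunked_prefill:
        # MTP speculative decoding turns chunked prefill off unless forced.
        return False
    return True

# resolve_chunked_prefill(True, "mtp", False) -> False
# resolve_chunked_prefill(True, "mtp", True)  -> True   (FD_FORCE_CHUNKED_PREFILL=1)
# resolve_chunked_prefill(True, None, False)  -> True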
6 changes: 6 additions & 0 deletions fastdeploy/engine/args_utils.py
@@ -392,6 +392,12 @@ def __post_init__(self):
raise NotImplementedError("Logprob does not support enable_expert_parallel.")
if not current_platform.is_cuda():
raise NotImplementedError("Only CUDA platform supports logprob.")
if self.speculative_config is not None:
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
if self.splitwise_role != "mixed":
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
if (not current_platform.is_cuda()) and (not current_platform.is_xpu()):
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0

@staticmethod
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
1 change: 1 addition & 0 deletions fastdeploy/engine/request.py
@@ -132,6 +132,7 @@ def __init__(
self.image_start = image_start
self.video_start = video_start
self.audio_start = audio_start
self.with_image = False

self.image_end = image_end
self.video_end = video_end
7 changes: 6 additions & 1 deletion fastdeploy/envs.py
@@ -81,7 +81,7 @@
# set trace exporter_otlp_headers.
"EXPORTER_OTLP_HEADERS": lambda: os.getenv("EXPORTER_OTLP_HEADERS"),
# enable kv cache block scheduler v1 (no need for kv_cache_ratio)
"ENABLE_V1_KVCACHE_SCHEDULER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")),
"ENABLE_V1_KVCACHE_SCHEDULER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "1")),
# Whether to use PLUGINS.
"FD_PLUGINS": lambda: None if "FD_PLUGINS" not in os.environ else os.environ["FD_PLUGINS"].split(","),
# set trace attribute job_id.
@@ -105,5 +105,10 @@ def __getattr__(name: str):
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def __setattr__(name: str, value: Any):
assert name in environment_variables
environment_variables[name] = lambda: value


def __dir__():
return list(environment_variables.keys())
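The default for ENABLE_V1_KVCACHE_SCHEDULER flips from "0" to "1", so the V1 KV-cache block scheduler is now on unless the environment variable is exported as 0. Below is a minimal sketch of the lookup pattern envs.py uses, assuming only the variable name and new default shown in the diff; values are re-evaluated on each attribute access.

import os

# Illustrative reduction of fastdeploy/envs.py: a lazy table of env-var readers
# served through a module-level __getattr__ (PEP 562).
environment_variables = {
    "ENABLE_V1_KVCACHE_SCHEDULER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "1")),
}

def __getattr__(name: str):
    # Called only when normal module attribute lookup fails.
    if name in environment_variables:
        return environment_variables[name]()  # evaluated at access time
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

The new __setattr__ in the diff stores an override back into the same table as a constant lambda; the worker_process.py and args_utils.py changes rely on assigning envs.ENABLE_V1_KVCACHE_SCHEDULER = 0 to pin the value for configurations the V1 scheduler does not support yet.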
10 changes: 10 additions & 0 deletions fastdeploy/worker/worker_process.py
@@ -748,6 +748,16 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
logger.info(f"- Dynamic load weight: {load_config.dynamic_load_weight}")
logger.info(f"- Load strategy: {load_config.load_strategy}")

if args.speculative_config is not None:
logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not support speculative decoding now.")
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
if args.splitwise_role != "mixed":
logger.info(f"Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not supported {args.splitwise_role} now.")
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
if (not current_platform.is_cuda()) and (not current_platform.is_xpu()):
logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not supported.")
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0

fd_config = FDConfig(
model_config=model_config,
parallel_config=parallel_config,
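With the default now on, initialize_fd_config() falls back to the V0 scheduler in the three cases logged above. Here is a sketch of the same guard condensed into one predicate; args.speculative_config, args.splitwise_role, and the current_platform.is_cuda()/is_xpu() helpers are the ones used in the diff, while the function name is illustrative.

# Illustrative predicate, not part of the patch: True when the V1 KV-cache
# scheduler can stay enabled for this worker configuration.
def v1_kvcache_scheduler_supported(args, current_platform) -> bool:
    if args.speculative_config is not None:   # speculative decoding not supported yet
        return False
    if args.splitwise_role != "mixed":        # disaggregated prefill/decode roles not supported yet
        return False
    if not (current_platform.is_cuda() or current_platform.is_xpu()):
        return False                          # only CUDA and XPU platforms for now
    return True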