Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions fastdeploy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -1237,7 +1237,7 @@ def postprocess(self):
if self.cache_config.enable_chunked_prefill:
self.max_num_batched_tokens = 2048
else:
if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
self.max_num_batched_tokens = self.max_model_len
else:
self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM
Expand Down Expand Up @@ -1290,7 +1290,7 @@ def check(self):
), "TP and EP cannot be enabled at the same time"

if not self.cache_config.enable_chunked_prefill:
if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
assert self.max_num_batched_tokens >= self.max_model_len, (
f"max_num_batched_tokens: {self.max_num_batched_tokens} "
f"should be larger than or equal to max_model_len: {self.max_model_len}"
Expand Down
12 changes: 11 additions & 1 deletion fastdeploy/engine/args_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
SpeculativeConfig,
TaskOption,
)
from fastdeploy import envs
from fastdeploy.platforms import current_platform
from fastdeploy.scheduler.config import SchedulerConfig
from fastdeploy.utils import (
Expand Down Expand Up @@ -388,6 +389,15 @@ def __post_init__(self):
raise NotImplementedError("Logprob does not support enable_expert_parallel.")
if not current_platform.is_cuda():
raise NotImplementedError("Only CUDA platform supports logprob.")
if self.speculative_config is not None:
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
if self.splitwise_role != 'mixed':
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
if (not current_platform.is_cuda()) and (not current_platform.is_xpu()):
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0




@staticmethod
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
Expand Down Expand Up @@ -975,7 +985,7 @@ def create_engine_config(self) -> FDConfig:
if self.enable_chunked_prefill:
self.max_num_batched_tokens = 2048
else:
if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
self.max_num_batched_tokens = self.max_model_len
else:
self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM
Expand Down
6 changes: 5 additions & 1 deletion fastdeploy/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@
# set trace exporter_otlp_headers.
"EXPORTER_OTLP_HEADERS": lambda: os.getenv("EXPORTER_OTLP_HEADERS"),
# enable kv cache block scheduler v1 (no need for kv_cache_ratio)
"ENABLE_V1_KVCACHE_SCHEDULER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")),
"ENABLE_V1_KVCACHE_SCHEDULER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "1")),
# Whether to use PLUGINS.
"FD_PLUGINS": lambda: None if "FD_PLUGINS" not in os.environ else os.environ["FD_PLUGINS"].split(","),
# set trace attribute job_id.
Expand All @@ -102,6 +102,10 @@ def __getattr__(name: str):
return environment_variables[name]()
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

def __setattr__(name: str, value: Any) -> None:
    """Override an environment variable's value for the rest of the process.

    Stores ``value`` behind a zero-arg lambda so that the module-level
    ``__getattr__`` (which calls ``environment_variables[name]()``) keeps
    working uniformly for both env-derived and overridden entries.

    Args:
        name: Name of a key already present in ``environment_variables``.
        value: New value to expose for ``name``.

    Raises:
        AttributeError: If ``name`` is not a known environment variable
            (mirrors the error raised by the module's ``__getattr__``).

    NOTE(review): PEP 562 only honors module-level ``__getattr__`` and
    ``__dir__`` — a plain ``envs.NAME = x`` assignment writes straight into
    the module dict and does NOT invoke this function. Confirm callers
    invoke it explicitly (or route through a setter) if they rely on the
    ``environment_variables`` table staying in sync.
    """
    # Raise instead of assert: asserts are stripped under `python -O`,
    # silently allowing unknown names to be registered.
    if name not in environment_variables:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    environment_variables[name] = lambda: value


def __dir__():
return list(environment_variables.keys())
Loading