Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions fastdeploy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -1237,7 +1237,7 @@ def postprocess(self):
if self.cache_config.enable_chunked_prefill:
self.max_num_batched_tokens = 2048
else:
if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
self.max_num_batched_tokens = self.max_model_len
else:
self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM
Expand Down Expand Up @@ -1290,7 +1290,7 @@ def check(self):
), "TP and EP cannot be enabled at the same time"

if not self.cache_config.enable_chunked_prefill:
if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
assert self.max_num_batched_tokens >= self.max_model_len, (
f"max_num_batched_tokens: {self.max_num_batched_tokens} "
f"should be larger than or equal to max_model_len: {self.max_model_len}"
Expand Down
12 changes: 11 additions & 1 deletion fastdeploy/engine/args_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
SpeculativeConfig,
TaskOption,
)
from fastdeploy import envs
from fastdeploy.platforms import current_platform
from fastdeploy.scheduler.config import SchedulerConfig
from fastdeploy.utils import (
Expand Down Expand Up @@ -388,6 +389,15 @@ def __post_init__(self):
raise NotImplementedError("Logprob does not support enable_expert_parallel.")
if not current_platform.is_cuda():
raise NotImplementedError("Only CUDA platform supports logprob.")
if self.speculative_config is not None:
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
if self.splitwise_role != 'mixed':
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
if (not current_platform.is_cuda()) and (not current_platform.is_xpu()):
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0




@staticmethod
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
Expand Down Expand Up @@ -975,7 +985,7 @@ def create_engine_config(self) -> FDConfig:
if self.enable_chunked_prefill:
self.max_num_batched_tokens = 2048
else:
if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
self.max_num_batched_tokens = self.max_model_len
else:
self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM
Expand Down
6 changes: 5 additions & 1 deletion fastdeploy/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@
# set trace exporter_otlp_headers.
"EXPORTER_OTLP_HEADERS": lambda: os.getenv("EXPORTER_OTLP_HEADERS"),
# enable kv cache block scheduler v1 (no need for kv_cache_ratio)
"ENABLE_V1_KVCACHE_SCHEDULER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")),
"ENABLE_V1_KVCACHE_SCHEDULER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "1")),
# Whether to use PLUGINS.
"FD_PLUGINS": lambda: None if "FD_PLUGINS" not in os.environ else os.environ["FD_PLUGINS"].split(","),
# set trace attribute job_id.
Expand All @@ -102,6 +102,10 @@ def __getattr__(name: str):
return environment_variables[name]()
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

def __setattr__(name: str, value: Any) -> None:
    """Override an environment variable's value for the rest of the process.

    Stores ``value`` behind a zero-arg lambda so that the module-level
    ``__getattr__`` (which calls ``environment_variables[name]()``) keeps
    working uniformly for both env-derived and overridden entries.

    Args:
        name: Name of a key already present in ``environment_variables``.
        value: New value to expose for ``name``.

    Raises:
        AttributeError: If ``name`` is not a known environment variable
            (mirrors the error raised by the module's ``__getattr__``).

    NOTE(review): PEP 562 only honors module-level ``__getattr__`` and
    ``__dir__`` — a plain ``envs.NAME = x`` assignment writes straight into
    the module dict and does NOT invoke this function. Confirm callers
    invoke it explicitly (or route through a setter) if they rely on the
    ``environment_variables`` table staying in sync.
    """
    # Raise instead of assert: asserts are stripped under `python -O`,
    # silently allowing unknown names to be registered.
    if name not in environment_variables:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    environment_variables[name] = lambda: value


def __dir__():
return list(environment_variables.keys())
Loading