20 changes: 14 additions & 6 deletions fastdeploy/config.py
@@ -127,13 +127,12 @@ def __init__(
self.redundant_experts_num = 0
self.seed = 0
self.quantization = None
self.reasoning_parser = None
self.pad_token_id: int = -1
self.eos_tokens_lens: int = 2
self.lm_head_fp32: bool = False
self.model_format = "auto"
for key, value in args.items():
if hasattr(self, key) and value != "None":
if hasattr(self, key):
setattr(self, key, value)

assert self.model != ""
@@ -258,7 +257,7 @@ def __init__(
self.sequence_parallel = False # Whether to enable sequence parallelism.
self.use_ep = False # Whether to enable Expert Parallelism
self.moe_phase = MoEPhase("prefill") # Generation phase
self.msg_queue_id = 1 # message queue id
self.msg_queue_id = 1 # mesage queue id

self.tensor_parallel_rank = 0 # TP rank ID
self.tensor_parallel_size = 1 # TP degree
@@ -351,8 +350,8 @@ def set_tp_group(self):
)
)
# same ep group id
# (TODO:gaoziyuan move this gid config to ep.py)
dist.collective._set_custom_gid(self.data_parallel_size + tp_gid_offset)
self.ep_group = dist.new_group(range(self.expert_parallel_size))
logger.info(
f"data_parallel_size: {self.data_parallel_size}, tensor_parallel_size: {self.tensor_parallel_size}, expert_parallel_size: {self.expert_parallel_size}, data_parallel_rank: {self.data_parallel_rank}, tensor_parallel_rank: {self.tensor_parallel_rank}, expert_parallel_rank: {self.expert_parallel_rank}, tp_group: {self.tp_group}."
)
@@ -550,7 +549,7 @@ def __init__(
It requires that all input buffers have fixed addresses, and all
splitting ops write their outputs to input buffers.
- With dynamic graph backend: ...
- With static graph backend: WIP
- With static grpah backend: WIP
"""
self.sot_warmup_sizes: list[int] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 32, 64, 128]
""" Number of warmup runs for SOT warmup. """
@@ -1234,6 +1233,15 @@ def postprocess(self):

self.paddle_commit_id = paddle.version.commit

if self.cache_config.enable_chunked_prefill:
self.force_chunked_prefill = int(envs.FD_FORCE_CHUNKED_PREFILL)
if (
self.speculative_config is not None
and self.speculative_config.method in ["mtp"]
and not self.force_chunked_prefill
):
self.cache_config.enable_chunked_prefill = False

if self.max_num_batched_tokens is None:
if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM
@@ -1292,7 +1300,7 @@ def check(self):
), "TP and EP cannot be enabled at the same time"

if not self.cache_config.enable_chunked_prefill:
if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
assert self.max_num_batched_tokens >= self.max_model_len, (
f"max_num_batched_tokens: {self.max_num_batched_tokens} "
f"should be larger than or equal to max_model_len: {self.max_model_len}"
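The postprocess() hunk above auto-disables chunked prefill when MTP speculative decoding is configured, unless FD_FORCE_CHUNKED_PREFILL overrides it. Below is a minimal sketch of that decision in isolation; only FD_FORCE_CHUNKED_PREFILL and the "mtp" method name come from the diff, while the helper and its plain-argument signature are illustrative.

from typing import Optional

# Illustrative helper, not part of the patch: mirrors the gating added to postprocess().
def resolve_chunked_prefill(enable_chunked_prefill: bool,
                            speculative_method: Optional[str],
                            force_chunked_prefill: bool) -> bool:
    if not enable_chunked_prefill:
        return False
    if speculative_method == "mtp" and not force_chunked_prefill:
        # MTP speculative decoding turns chunked prefill off unless forced.
        return False
    return True

# resolve_chunked_prefill(True, "mtp", False) -> False
# resolve_chunked_prefill(True, "mtp", True)  -> True   (FD_FORCE_CHUNKED_PREFILL=1)
# resolve_chunked_prefill(True, None, False)  -> True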
6 changes: 6 additions & 0 deletions fastdeploy/engine/args_utils.py
@@ -392,6 +392,12 @@ def __post_init__(self):
raise NotImplementedError("Logprob does not support enable_expert_parallel.")
if not current_platform.is_cuda():
raise NotImplementedError("Only CUDA platform supports logprob.")
if self.speculative_config is not None:
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
if self.splitwise_role != "mixed":
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
if (not current_platform.is_cuda()) and (not current_platform.is_xpu()):
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0

@staticmethod
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
1 change: 1 addition & 0 deletions fastdeploy/engine/request.py
@@ -132,6 +132,7 @@ def __init__(
self.image_start = image_start
self.video_start = video_start
self.audio_start = audio_start
self.with_image = False

self.image_end = image_end
self.video_end = video_end
7 changes: 6 additions & 1 deletion fastdeploy/envs.py
@@ -81,7 +81,7 @@
# set trace exporter_otlp_headers.
"EXPORTER_OTLP_HEADERS": lambda: os.getenv("EXPORTER_OTLP_HEADERS"),
# enable kv cache block scheduler v1 (no need for kv_cache_ratio)
"ENABLE_V1_KVCACHE_SCHEDULER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")),
"ENABLE_V1_KVCACHE_SCHEDULER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "1")),
# Whether to use PLUGINS.
"FD_PLUGINS": lambda: None if "FD_PLUGINS" not in os.environ else os.environ["FD_PLUGINS"].split(","),
# set trace attribute job_id.
@@ -105,5 +105,10 @@ def __getattr__(name: str):
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def __setattr__(name: str, value: Any):
assert name in environment_variables
environment_variables[name] = lambda: value


def __dir__():
return list(environment_variables.keys())
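The default for ENABLE_V1_KVCACHE_SCHEDULER flips from "0" to "1", so the V1 KV-cache block scheduler is now on unless the environment variable is exported as 0. Below is a minimal sketch of the lookup pattern envs.py uses, assuming only the variable name and new default shown in the diff; values are re-evaluated on each attribute access.

import os

# Illustrative reduction of fastdeploy/envs.py: a lazy table of env-var readers
# served through a module-level __getattr__ (PEP 562).
environment_variables = {
    "ENABLE_V1_KVCACHE_SCHEDULER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "1")),
}

def __getattr__(name: str):
    # Called only when normal module attribute lookup fails.
    if name in environment_variables:
        return environment_variables[name]()  # evaluated at access time
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

The new __setattr__ in the diff stores an override back into the same table as a constant lambda; the worker_process.py and args_utils.py changes rely on assigning envs.ENABLE_V1_KVCACHE_SCHEDULER = 0 to pin the value for configurations the V1 scheduler does not support yet.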
10 changes: 10 additions & 0 deletions fastdeploy/worker/worker_process.py
@@ -748,6 +748,16 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
logger.info(f"- Dynamic load weight: {load_config.dynamic_load_weight}")
logger.info(f"- Load strategy: {load_config.load_strategy}")

if args.speculative_config is not None:
logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not support speculative decoding now.")
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
if args.splitwise_role != "mixed":
logger.info(f"Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not supported {args.splitwise_role} now.")
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
if (not current_platform.is_cuda()) and (not current_platform.is_xpu()):
logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not supported.")
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0

fd_config = FDConfig(
model_config=model_config,
parallel_config=parallel_config,
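With the default now on, initialize_fd_config() falls back to the V0 scheduler in the three cases logged above. Here is a sketch of the same guard condensed into one predicate; args.speculative_config, args.splitwise_role, and the current_platform.is_cuda()/is_xpu() helpers are the ones used in the diff, while the function name is illustrative.

# Illustrative predicate, not part of the patch: True when the V1 KV-cache
# scheduler can stay enabled for this worker configuration.
def v1_kvcache_scheduler_supported(args, current_platform) -> bool:
    if args.speculative_config is not None:   # speculative decoding not supported yet
        return False
    if args.splitwise_role != "mixed":        # disaggregated prefill/decode roles not supported yet
        return False
    if not (current_platform.is_cuda() or current_platform.is_xpu()):
        return False                          # only CUDA and XPU platforms for now
    return True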