diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 1a461d81e5..c58c9a5b9c 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -127,13 +127,12 @@ def __init__(
         self.redundant_experts_num = 0
         self.seed = 0
         self.quantization = None
-        self.reasoning_parser = None
         self.pad_token_id: int = -1
         self.eos_tokens_lens: int = 2
         self.lm_head_fp32: bool = False
         self.model_format = "auto"
         for key, value in args.items():
-            if hasattr(self, key) and value != "None":
+            if hasattr(self, key):
                 setattr(self, key, value)

         assert self.model != ""
@@ -258,7 +257,7 @@ def __init__(
         self.sequence_parallel = False  # Whether to enable sequence parallelism.
         self.use_ep = False  # Whether to enable Expert Parallelism
         self.moe_phase = MoEPhase("prefill")  # Generation phase
-        self.msg_queue_id = 1  # mesage queue id
+        self.msg_queue_id = 1  # message queue id

         self.tensor_parallel_rank = 0  # TP rank ID
         self.tensor_parallel_size = 1  # TP degree
@@ -351,8 +350,8 @@ def set_tp_group(self):
             )
         )
         # same ep group id
+        # (TODO: gaoziyuan move this gid config to ep.py)
         dist.collective._set_custom_gid(self.data_parallel_size + tp_gid_offset)
-        self.ep_group = dist.new_group(range(self.expert_parallel_size))
         logger.info(
             f"data_parallel_size: {self.data_parallel_size}, tensor_parallel_size: {self.tensor_parallel_size}, expert_parallel_size: {self.expert_parallel_size}, data_parallel_rank: {self.data_parallel_rank}, tensor_parallel_rank: {self.tensor_parallel_rank}, expert_parallel_rank: {self.expert_parallel_rank}, tp_group: {self.tp_group}."
         )
@@ -550,7 +549,7 @@ def __init__(
         It requires that all input buffers have fixed addresses, and all
         splitting ops write their outputs to input buffers.
         - With dyncmic graph backend: ...
-        - With static grpah backend: WIP
+        - With static graph backend: WIP
         """
         self.sot_warmup_sizes: list[int] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 32, 64, 128]
         """ Number of warmup runs for SOT warmup. """
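For reference, a minimal standalone sketch of what dropping the `value != "None"` guard in the first hunk's override loop means: every key in `args` that matches an existing attribute is now applied, including values that arrive as the literal string `"None"`, while unknown keys are still ignored. `DummyConfig` and its fields are illustrative only, not FastDeploy code.

```python
class DummyConfig:
    def __init__(self, args: dict):
        self.quantization = None    # default
        self.model_format = "auto"  # default
        for key, value in args.items():
            if hasattr(self, key):         # unknown keys are still ignored
                setattr(self, key, value)  # the `value != "None"` guard is gone


cfg = DummyConfig({"model_format": "torch", "quantization": "None", "unknown_key": 1})
print(cfg.model_format)             # torch
print(cfg.quantization)             # "None" (the string) -- previously skipped by the guard
print(hasattr(cfg, "unknown_key"))  # False
```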
""" @@ -1234,6 +1233,15 @@ def postprocess(self): self.paddle_commit_id = paddle.version.commit + if self.cache_config.enable_chunked_prefill: + self.force_chunked_prefill = int(envs.FD_FORCE_CHUNKED_PREFILL) + if ( + self.speculative_config is not None + and self.speculative_config.method in ["mtp"] + and not self.force_chunked_prefill + ): + self.cache_config.enable_chunked_prefill = False + if self.max_num_batched_tokens is None: if int(envs.ENABLE_V1_KVCACHE_SCHEDULER): self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM @@ -1292,7 +1300,7 @@ def check(self): ), "TP and EP cannot be enabled at the same time" if not self.cache_config.enable_chunked_prefill: - if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")): + if not envs.ENABLE_V1_KVCACHE_SCHEDULER: assert self.max_num_batched_tokens >= self.max_model_len, ( f"max_num_batched_tokens: {self.max_num_batched_tokens} " f"should be larger than or equal to max_model_len: {self.max_model_len}" diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index 0d0dedbe29..8e4be4f883 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -392,6 +392,12 @@ def __post_init__(self): raise NotImplementedError("Logprob does not support enable_expert_parallel.") if not current_platform.is_cuda(): raise NotImplementedError("Only CUDA platform supports logprob.") + if self.speculative_config is not None: + envs.ENABLE_V1_KVCACHE_SCHEDULER = 0 + if self.splitwise_role != "mixed": + envs.ENABLE_V1_KVCACHE_SCHEDULER = 0 + if (not current_platform.is_cuda()) and (not current_platform.is_xpu()): + envs.ENABLE_V1_KVCACHE_SCHEDULER = 0 @staticmethod def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: diff --git a/fastdeploy/engine/request.py b/fastdeploy/engine/request.py index c1431c42ff..6f7340136b 100644 --- a/fastdeploy/engine/request.py +++ b/fastdeploy/engine/request.py @@ -132,6 +132,7 @@ def __init__( self.image_start = image_start self.video_start = video_start self.audio_start = audio_start + self.with_image = False self.image_end = image_end self.video_end = video_end diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index f6515f0614..6bf468788f 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -81,7 +81,7 @@ # set traec exporter_otlp_headers. "EXPORTER_OTLP_HEADERS": lambda: os.getenv("EXPORTER_OTLP_HEADERS"), # enable kv cache block scheduler v1 (no need for kv_cache_ratio) - "ENABLE_V1_KVCACHE_SCHEDULER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")), + "ENABLE_V1_KVCACHE_SCHEDULER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "1")), # Whether to use PLUGINS. "FD_PLUGINS": lambda: None if "FD_PLUGINS" not in os.environ else os.environ["FD_PLUGINS"].split(","), # set trace attribute job_id. 
@@ -105,5 +105,10 @@ def __getattr__(name: str):
     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


+def __setattr__(name: str, value: Any):
+    assert name in environment_variables
+    environment_variables[name] = lambda: value
+
+
 def __dir__():
     return list(environment_variables.keys())
diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py
index 2df3222a38..52d4a4c8e3 100644
--- a/fastdeploy/worker/worker_process.py
+++ b/fastdeploy/worker/worker_process.py
@@ -748,6 +748,16 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
     logger.info(f"- Dynamic load weight: {load_config.dynamic_load_weight}")
     logger.info(f"- Load strategy: {load_config.load_strategy}")

+    if args.speculative_config is not None:
+        logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 because speculative decoding is not supported yet.")
+        envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
+    if args.splitwise_role != "mixed":
+        logger.info(f"Set ENABLE_V1_KVCACHE_SCHEDULER to 0 because splitwise_role={args.splitwise_role} is not supported yet.")
+        envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
+    if (not current_platform.is_cuda()) and (not current_platform.is_xpu()):
+        logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 because this platform is not supported.")
+        envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
+
     fd_config = FDConfig(
         model_config=model_config,
         parallel_config=parallel_config,
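To illustrate the override pattern that args_utils.py and worker_process.py rely on when they assign `envs.ENABLE_V1_KVCACHE_SCHEDULER = 0`, here is a standalone sketch using a throwaway `demo_envs` module. The module name and its construction via `types.ModuleType` are stand-ins for illustration, not FastDeploy's actual `fastdeploy.envs`.

```python
import os
import sys
import types

# Throwaway module that mimics the lazy-getter layout of fastdeploy/envs.py.
demo = types.ModuleType("demo_envs")
demo.environment_variables = {
    "ENABLE_V1_KVCACHE_SCHEDULER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "1")),
}
# PEP 562 fallback: only consulted when normal attribute lookup fails.
demo.__getattr__ = lambda name: demo.environment_variables[name]()
sys.modules["demo_envs"] = demo

import demo_envs as envs  # noqa: E402

print(envs.ENABLE_V1_KVCACHE_SCHEDULER)  # 1 -- the new default when the env var is unset
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0     # same style of assignment as worker_process.py
print(envs.ENABLE_V1_KVCACHE_SCHEDULER)  # 0 -- the plain attribute now shadows the lazy getter
```

One design note: PEP 562 only hooks module-level `__getattr__` and `__dir__`, so a plain `envs.NAME = value` assignment is handled by `ModuleType.__setattr__` and lands directly in the module's `__dict__`; the `__setattr__` function added to envs.py runs only when called explicitly, e.g. `envs.__setattr__("ENABLE_V1_KVCACHE_SCHEDULER", 0)`.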