@@ -32,8 +32,7 @@
 from torch import nn
 from transformers import PretrainedConfig
 from vllm.attention import Attention, AttentionMetadata
-from vllm.config import (CacheConfig, ModelConfig, VllmConfig,
-                         get_current_vllm_config)
+from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
                               get_tp_group, split_tensor_along_last_dim,
@@ -381,10 +380,6 @@ def __init__(
         self.tp_group = get_tp_group().device_group
         self.tp_rank = get_tp_group().rank_in_group
         self.ep_group = get_ep_group()
-        self.kv_consumer = None
-        transfer_config = get_current_vllm_config().kv_transfer_config
-        if transfer_config is not None:
-            self.kv_consumer = transfer_config.kv_role == "kv_consumer"
 
         self.params_dtype = torch.get_default_dtype()
         self.rm_router_logits = self.experts.rm_router_logits
@@ -408,12 +403,6 @@ def forward(self,
         enable_force_load_balance = forward_context.in_profile_run
         is_prefill = forward_context.with_prefill
 
-        # If this node is a kv_consumer, force MoE to always run the decode path
-        # so behaviour stays aligned between dummy_run and normal model execution.
-        if self.kv_consumer:
-            is_prefill = False
-            enable_force_load_balance = False
-
         # router_logits: (num_tokens, n_experts)
         router_logits = None
         if not self.rm_router_logits and not self.enable_multistream_moe:
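
For context, the deleted branch pinned the MoE layer to the decode path whenever the
node acts as a KV-cache consumer in a disaggregated prefill/decode deployment. Below
is a minimal standalone sketch of the same role check, using only names visible in
the removed code (get_current_vllm_config, kv_transfer_config, kv_role); the helper
name is_kv_consumer is illustrative, not a vLLM API:

from vllm.config import get_current_vllm_config

def is_kv_consumer() -> bool:
    # Mirrors the removed __init__ logic: a node is a KV consumer when a
    # kv_transfer_config is present and its kv_role is "kv_consumer".
    transfer_config = get_current_vllm_config().kv_transfer_config
    return transfer_config is not None and transfer_config.kv_role == "kv_consumer"

After this change, forward() no longer special-cases consumer nodes: is_prefill
follows forward_context.with_prefill directly.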