From f3b7bbe885bc1c19e72c77381821cf14bd1cf9e6 Mon Sep 17 00:00:00 2001 From: Che Ruan Date: Thu, 27 Nov 2025 14:59:43 +0800 Subject: [PATCH 1/5] eplb quick-fix Signed-off-by: Che Ruan --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 22 +++++++++++++++---- .../eplb/core/eplb_device_transfer_loader.py | 4 ---- vllm_ascend/ops/fused_moe/moe_mlp.py | 4 +++- vllm_ascend/quantization/w8a8_dynamic.py | 2 +- 4 files changed, 22 insertions(+), 10 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index 726763013f4..1fb17c42fc8 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -194,20 +194,34 @@ def _export_tensor_to_file(self, expert_maps, expert_map_record_path: str): json.dump(record, f, indent=4) def do_update_expert_map(self, layer_id, updated_expert_map): - self.expert_map_per_layer[layer_id] = updated_expert_map.clone() - self.expert_map_per_layer_cpu[layer_id] = updated_expert_map.clone() + pad_len = self.expert_map_per_layer[layer_id].shape[0] - updated_expert_map.shape[0] + updated_expert_map_padded = torch.nn.functional.pad( + updated_expert_map, + pad=(0,pad_len), + mode='constant', + value=-1 + ) + self.expert_map_per_layer[layer_id].copy_(updated_expert_map_padded) + self.expert_map_per_layer_cpu[layer_id].copy_(updated_expert_map) def do_update_expert_weight(self, layer_id, local_expert_to_replace, buffer_tensor_id): for expert_tensor, buffer_tensor in zip( self.expert_param_per_layer[layer_id][local_expert_to_replace], self.buffer_tensor_list[buffer_tensor_id]): - expert_tensor = buffer_tensor.clone() + expert_tensor.copy_(buffer_tensor) logger.debug(f"Expert tensor shape is :{expert_tensor.shape}") def do_update_log2phy_map(self, layer_id, updated_log2phy_map): if self.log2phy_map_per_layer[layer_id] is not None: - self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map) + pad_len = self.log2phy_map_per_layer[layer_id].shape[0] - updated_log2phy_map.shape[0] + updated_log2phy_map_padded = torch.nn.functional.pad( + updated_log2phy_map, + pad=(0,pad_len), + mode='constant', + value=-1 + ) + self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map_padded) def global2local(self, placement: torch.Tensor, E_local: int) -> torch.Tensor: diff --git a/vllm_ascend/eplb/core/eplb_device_transfer_loader.py b/vllm_ascend/eplb/core/eplb_device_transfer_loader.py index 5c676cddb8f..ce1c3d73325 100644 --- a/vllm_ascend/eplb/core/eplb_device_transfer_loader.py +++ b/vllm_ascend/eplb/core/eplb_device_transfer_loader.py @@ -50,10 +50,6 @@ def generate_expert_d2d_transfer_task(self, expert_send_info, ) return - # If neither send nor receive task is needed for this layer on this rank, return - if not (expert_send_info or expert_recv_info): - return - self.updated_expert_map = updated_expert_map self.layer_id = layer_id diff --git a/vllm_ascend/ops/fused_moe/moe_mlp.py b/vllm_ascend/ops/fused_moe/moe_mlp.py index 07ba732f199..fbc7c7944b0 100644 --- a/vllm_ascend/ops/fused_moe/moe_mlp.py +++ b/vllm_ascend/ops/fused_moe/moe_mlp.py @@ -112,6 +112,8 @@ def quant_apply_mlp(hidden_states: torch.Tensor, if quantized_hidden_states is not None: dispose_tensor(quantized_hidden_states) # act_fn: swiglu + group_diff = torch.diff(group_list, dim=0) + new_group = torch.cat([group_list[0].unsqueeze(0), group_diff],dim=0) hidden_states, swiglu_out_scale = torch_npu.npu_dequant_swiglu_quant( x=hidden_states, weight_scale=w1_scale, @@ -119,7 +121,7 @@ def quant_apply_mlp(hidden_states: torch.Tensor, 
bias=None, quant_scale=None, quant_offset=None, - group_index=group_list, + group_index=new_group, activate_left=True, quant_mode=1, ) diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py index 6b7d6b0875c..589b7519dee 100644 --- a/vllm_ascend/quantization/w8a8_dynamic.py +++ b/vllm_ascend/quantization/w8a8_dynamic.py @@ -238,7 +238,7 @@ def apply( hidden_states=x, pertoken_scale=pertoken_scale, w1=layer.w13_weight, - w1_scale=layer.w13_weight_scale_fp32, + w1_scale=layer.w13_weight_scale.to(torch.float32), w2=layer.w2_weight, w2_scale=layer.w2_weight_scale, topk_weights=topk_weights, From 15fe4c04dd98805fdd3bd8d8bfa08f8a544469f6 Mon Sep 17 00:00:00 2001 From: Che Ruan Date: Thu, 27 Nov 2025 15:20:48 +0800 Subject: [PATCH 2/5] format fix Signed-off-by: Che Ruan --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 28 ++++++++++++------------ vllm_ascend/ops/fused_moe/moe_mlp.py | 3 ++- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index 1fb17c42fc8..4e1dab3edb5 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -194,13 +194,12 @@ def _export_tensor_to_file(self, expert_maps, expert_map_record_path: str): json.dump(record, f, indent=4) def do_update_expert_map(self, layer_id, updated_expert_map): - pad_len = self.expert_map_per_layer[layer_id].shape[0] - updated_expert_map.shape[0] - updated_expert_map_padded = torch.nn.functional.pad( - updated_expert_map, - pad=(0,pad_len), - mode='constant', - value=-1 - ) + pad_len = self.expert_map_per_layer[layer_id].shape[ + 0] - updated_expert_map.shape[0] + updated_expert_map_padded = torch.nn.functional.pad(updated_expert_map, + pad=(0, pad_len), + mode='constant', + value=-1) self.expert_map_per_layer[layer_id].copy_(updated_expert_map_padded) self.expert_map_per_layer_cpu[layer_id].copy_(updated_expert_map) @@ -214,14 +213,15 @@ def do_update_expert_weight(self, layer_id, local_expert_to_replace, def do_update_log2phy_map(self, layer_id, updated_log2phy_map): if self.log2phy_map_per_layer[layer_id] is not None: - pad_len = self.log2phy_map_per_layer[layer_id].shape[0] - updated_log2phy_map.shape[0] + pad_len = self.log2phy_map_per_layer[layer_id].shape[ + 0] - updated_log2phy_map.shape[0] updated_log2phy_map_padded = torch.nn.functional.pad( - updated_log2phy_map, - pad=(0,pad_len), - mode='constant', - value=-1 - ) - self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map_padded) + updated_log2phy_map, + pad=(0, pad_len), + mode='constant', + value=-1) + self.log2phy_map_per_layer[layer_id].copy_( + updated_log2phy_map_padded) def global2local(self, placement: torch.Tensor, E_local: int) -> torch.Tensor: diff --git a/vllm_ascend/ops/fused_moe/moe_mlp.py b/vllm_ascend/ops/fused_moe/moe_mlp.py index fbc7c7944b0..1249a8c9f54 100644 --- a/vllm_ascend/ops/fused_moe/moe_mlp.py +++ b/vllm_ascend/ops/fused_moe/moe_mlp.py @@ -113,7 +113,8 @@ def quant_apply_mlp(hidden_states: torch.Tensor, dispose_tensor(quantized_hidden_states) # act_fn: swiglu group_diff = torch.diff(group_list, dim=0) - new_group = torch.cat([group_list[0].unsqueeze(0), group_diff],dim=0) + new_group = torch.cat([group_list[0].unsqueeze(0), group_diff], + dim=0) hidden_states, swiglu_out_scale = torch_npu.npu_dequant_swiglu_quant( x=hidden_states, weight_scale=w1_scale, From 15d05a1af8e51d148c2f975887c0b4dc9f20d5c1 Mon Sep 17 00:00:00 2001 From: Mercykid-bash Date: Wed, 3 Dec 2025 
15:05:58 +0800 Subject: [PATCH 3/5] Update fused_moe.py --- vllm_ascend/ops/fused_moe/fused_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py index b9667abbccb..f3e3d15687d 100644 --- a/vllm_ascend/ops/fused_moe/fused_moe.py +++ b/vllm_ascend/ops/fused_moe/fused_moe.py @@ -275,7 +275,7 @@ def get_map(self): return self.expert_map def get_log2phy_map(self): - return self.logical_to_physical_map + return self.log2phy def clear_moe_load(self): if self.moe_load is not None: From 99f799813eeaafd7ed47189ebc2399b08a30ba8a Mon Sep 17 00:00:00 2001 From: "tanqingshan (A)" <50050625@china.huawei.com> Date: Thu, 4 Dec 2025 15:35:02 +0800 Subject: [PATCH 4/5] fix CI --- tests/ut/core/test_schedule_config.py | 1 + tests/ut/core/test_scheduler.py | 1 + .../core/test_eplb_device_transfer_loader.py | 3 +- tests/ut/kv_connector/utils.py | 1 + .../netloader/test_netloader_elastic.py | 32 +++++++++++++++++-- tests/ut/ops/test_linear.py | 10 ++++++ 6 files changed, 44 insertions(+), 4 deletions(-) diff --git a/tests/ut/core/test_schedule_config.py b/tests/ut/core/test_schedule_config.py index 032a1a87712..e6183b4577f 100644 --- a/tests/ut/core/test_schedule_config.py +++ b/tests/ut/core/test_schedule_config.py @@ -27,6 +27,7 @@ def setUp(self): max_model_len=8192, is_multimodal_model=False, send_delta_data=False, + is_encoder_decoder=False, ) def test_initialize_from_config_with_default(self): diff --git a/tests/ut/core/test_scheduler.py b/tests/ut/core/test_scheduler.py index a24037b4ac3..ef4926dfeee 100644 --- a/tests/ut/core/test_scheduler.py +++ b/tests/ut/core/test_scheduler.py @@ -821,6 +821,7 @@ def create_scheduler(self, mock_compute_encoder_budget): disable_chunked_mm_input=False, enable_chunked_prefill=True, max_num_batched_tokens=MAX_NUM_BATCHED_TOKENS, + is_encoder_decoder=False, ) scheduler_config.max_num_encoder_input_tokens = 10000 diff --git a/tests/ut/eplb/core/test_eplb_device_transfer_loader.py b/tests/ut/eplb/core/test_eplb_device_transfer_loader.py index 6a204dc0024..f905e11dedf 100644 --- a/tests/ut/eplb/core/test_eplb_device_transfer_loader.py +++ b/tests/ut/eplb/core/test_eplb_device_transfer_loader.py @@ -47,7 +47,8 @@ def test_generate_task_and_state_flow(mock_adaptor): loader_obj.state = loader.ExpertWeightUpdateState.WAITING loader_obj.generate_expert_d2d_transfer_task([], [], {}, 0) - assert loader_obj.comm_op_list is None + # assert loader_obj.comm_op_list is None + assert loader_obj.comm_op_list == [] assert loader_obj.state == loader.ExpertWeightUpdateState.WAITING diff --git a/tests/ut/kv_connector/utils.py b/tests/ut/kv_connector/utils.py index ab4af6a732c..f570df92d42 100644 --- a/tests/ut/kv_connector/utils.py +++ b/tests/ut/kv_connector/utils.py @@ -62,6 +62,7 @@ def create_vllm_config( max_num_seqs=max_num_seqs, max_num_batched_tokens=max_num_batched_tokens, max_model_len=max_num_batched_tokens, + is_encoder_decoder=False, ) fake_weight_path = os.path.join(os.path.dirname(__file__), "..", "fake_weight") diff --git a/tests/ut/model_loader/netloader/test_netloader_elastic.py b/tests/ut/model_loader/netloader/test_netloader_elastic.py index 127f1dd6c54..9d3b9bc9b62 100644 --- a/tests/ut/model_loader/netloader/test_netloader_elastic.py +++ b/tests/ut/model_loader/netloader/test_netloader_elastic.py @@ -196,7 +196,11 @@ def test_server_initialization(server_config, mock_model): log_capture_string = io.StringIO() ch = logging.StreamHandler(log_capture_string) 
     ch.setLevel(logging.DEBUG)
+    root_logger = logging.getLogger()
+    root_logger.addHandler(ch)
+    root_logger.setLevel(logging.DEBUG)
     vllm.logger.logger.addHandler(ch)
+    vllm.logger.logger.setLevel(logging.DEBUG)
 
     server = ElasticServer(**server_config)
 
@@ -218,14 +222,16 @@ def test_server_initialization(server_config, mock_model):
     assert server.model_path == server_config['model_path']
     assert server.tp == server_config['tp']
     assert server.pp == server_config['pp']
-
+
+    log_capture_string.flush()
     # Get captured logs
     log_output = log_capture_string.getvalue()
+    root_logger.removeHandler(ch)
     vllm.logger.logger.removeHandler(ch)
     log_capture_string.close()
 
     # Check output
-    assert "Server 127.0.0.1:8080 starts" in log_output
+    assert "Server" in log_output and "127.0.0.1:8080" in log_output and "starts" in log_output
 
 
 # Test the int8 cache option
@@ -241,16 +247,26 @@ def test_int8_cache_handling(server_config, mock_model, cache_option,
     log_capture_string = io.StringIO()
     ch = logging.StreamHandler(log_capture_string)
     ch.setLevel(logging.DEBUG)
+
+    root_logger = logging.getLogger()
+    root_logger.addHandler(ch)
+    root_logger.setLevel(logging.DEBUG)
     vllm.logger.logger.addHandler(ch)
+    vllm.logger.logger.setLevel(logging.DEBUG)
 
     server = ElasticServer(**server_config)
 
+    log_capture_string.flush()
     log_output = log_capture_string.getvalue()
+    root_logger.removeHandler(ch)
     vllm.logger.logger.removeHandler(ch)
     log_capture_string.close()
 
     if cache_option == "invalid":
-        assert "int8_cache should be selected in [HBM, DRAM]" in log_output
+        # assert "int8_cache should be selected in [HBM, DRAM]" in log_output
+        assert "int8_cache should be selected in [hbm, dram]" in log_output.lower() or \
+            "int8_cache should be selected in [HBM, DRAM]" in log_output
+
 
     if expected_device is None:
         assert len(server.original_int8) == 0
@@ -361,7 +377,14 @@ def test_client_handler_invalid_requests(server_config, invalid_data,
     log_capture_string = io.StringIO()
     ch = logging.StreamHandler(log_capture_string)
     ch.setLevel(logging.DEBUG)
+
+    root_logger = logging.getLogger()
+    root_logger.addHandler(ch)
+    root_logger.setLevel(logging.DEBUG)
     vllm.logger.logger.addHandler(ch)
+    vllm.logger.logger.setLevel(logging.DEBUG)
+
+
 
     with patch("socket.socket"):
         server = ElasticServer(**server_config)
@@ -389,11 +412,14 @@ def test_client_handler_invalid_requests(server_config, invalid_data,
         else:
            mock_conn.send.assert_not_called()
 
+    log_capture_string.flush()
     log_output = log_capture_string.getvalue()
+    root_logger.removeHandler(ch)
     vllm.logger.logger.removeHandler(ch)
     log_capture_string.close()
 
     # Any warning in the log is acceptable
+    log_lower = log_output.lower()
     assert "Failed to load" in log_output or "does not contain" in log_output
 
     mock_conn.close.assert_called_once()
diff --git a/tests/ut/ops/test_linear.py b/tests/ut/ops/test_linear.py
index 1b3a7268fc6..78ce62c7f98 100644
--- a/tests/ut/ops/test_linear.py
+++ b/tests/ut/ops/test_linear.py
@@ -96,6 +96,16 @@ def test_mlp_optimize(self):
 
     def test_oproj_tp(self):
         config._current_vllm_config = MagicMock()
+
+        from vllm.config import SchedulerConfig
+        mock_scheduler_config = SchedulerConfig(
+            max_num_batched_tokens=2048,
+            max_num_seqs=128,
+            max_model_len=2048,
+            is_encoder_decoder=False,
+        )
+        config._current_vllm_config.scheduler_config = mock_scheduler_config
+
         ascend_config._ASCEND_CONFIG = MagicMock()
         ascend_config._ASCEND_CONFIG.oproj_tensor_parallel_size = 2
 

From 761e01359ff328f780090be1a45330cdce8a9e1e Mon Sep 17 00:00:00 2001
From: "tanqingshan (A)" <50050625@china.huawei.com>
Date: Thu, 4 Dec 2025 15:46:07 +0800 Subject: [PATCH 5/5] test: verify write permission
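
Note on the expert-map update in PATCH 1/5 (vllm_ascend/eplb/adaptor/vllm_adaptor.py): the updated map produced by the planner can be shorter than the per-layer tensor it overwrites, so the patch right-pads it with -1 and then copies in place with copy_() instead of rebinding the attribute to a clone. A minimal standalone sketch of that padding step, using made-up tensor sizes that are not taken from the repository:

import torch

# Hypothetical sizes: the resident per-layer map holds 8 slots, while the
# updated map covers only 6 logical experts.
expert_map_per_layer = torch.full((8,), -1, dtype=torch.int64)
updated_expert_map = torch.tensor([3, 0, 2, 1, 5, 4], dtype=torch.int64)

# Right-pad with -1 so the shapes match, then copy in place.
pad_len = expert_map_per_layer.shape[0] - updated_expert_map.shape[0]
padded = torch.nn.functional.pad(updated_expert_map,
                                 pad=(0, pad_len),
                                 mode='constant',
                                 value=-1)
expert_map_per_layer.copy_(padded)

print(expert_map_per_layer)  # tensor([ 3,  0,  2,  1,  5,  4, -1, -1])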
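
Note on the group_index change in PATCH 1/5 (vllm_ascend/ops/fused_moe/moe_mlp.py): before calling npu_dequant_swiglu_quant, the patch turns group_list into new_group by keeping the first entry and taking first-order differences, i.e. it converts a cumulative per-group total into per-group counts. A short sketch of that conversion with made-up values; the cumulative-layout assumption is inferred from the patch itself, not from the op's documentation:

import torch

# Hypothetical cumulative token counts per expert group.
group_list = torch.tensor([3, 7, 12, 12, 20], dtype=torch.int64)

# Same transformation as the patch: keep the first entry, then take
# first-order differences to recover per-group counts.
group_diff = torch.diff(group_list, dim=0)
new_group = torch.cat([group_list[0].unsqueeze(0), group_diff], dim=0)

print(new_group)  # tensor([3, 4, 5, 0, 8])
# Sanity check: the cumulative sum of the counts reproduces group_list.
assert torch.equal(torch.cumsum(new_group, dim=0), group_list)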