1 change: 1 addition & 0 deletions tests/ut/core/test_schedule_config.py
@@ -27,6 +27,7 @@ def setUp(self):
max_model_len=8192,
is_multimodal_model=False,
send_delta_data=False,
is_encoder_decoder=False,
)

def test_initialize_from_config_with_default(self):
1 change: 1 addition & 0 deletions tests/ut/core/test_scheduler.py
@@ -821,6 +821,7 @@ def create_scheduler(self, mock_compute_encoder_budget):
disable_chunked_mm_input=False,
enable_chunked_prefill=True,
max_num_batched_tokens=MAX_NUM_BATCHED_TOKENS,
is_encoder_decoder=False,
)

scheduler_config.max_num_encoder_input_tokens = 10000
3 changes: 2 additions & 1 deletion tests/ut/eplb/core/test_eplb_device_transfer_loader.py
@@ -47,7 +47,8 @@ def test_generate_task_and_state_flow(mock_adaptor):
loader_obj.state = loader.ExpertWeightUpdateState.WAITING

loader_obj.generate_expert_d2d_transfer_task([], [], {}, 0)
assert loader_obj.comm_op_list is None
# assert loader_obj.comm_op_list is None
assert loader_obj.comm_op_list == []
assert loader_obj.state == loader.ExpertWeightUpdateState.WAITING


1 change: 1 addition & 0 deletions tests/ut/kv_connector/utils.py
@@ -62,6 +62,7 @@ def create_vllm_config(
max_num_seqs=max_num_seqs,
max_num_batched_tokens=max_num_batched_tokens,
max_model_len=max_num_batched_tokens,
is_encoder_decoder=False,
)
fake_weight_path = os.path.join(os.path.dirname(__file__), "..",
"fake_weight")
32 changes: 29 additions & 3 deletions tests/ut/model_loader/netloader/test_netloader_elastic.py
@@ -196,7 +196,11 @@ def test_server_initialization(server_config, mock_model):
log_capture_string = io.StringIO()
ch = logging.StreamHandler(log_capture_string)
ch.setLevel(logging.DEBUG)
root_logger = logging.getLogger()
root_logger.addHandler(ch)
root_logger.setLevel(logging.DEBUG)
vllm.logger.logger.addHandler(ch)
vllm.logger.logger.setLevel(logging.DEBUG)

server = ElasticServer(**server_config)

@@ -218,14 +222,16 @@ def test_server_initialization(server_config, mock_model):
assert server.model_path == server_config['model_path']
assert server.tp == server_config['tp']
assert server.pp == server_config['pp']


log_capture_string.flush()
# Get captured logs
log_output = log_capture_string.getvalue()
root_logger.removeHandler(ch)
vllm.logger.logger.removeHandler(ch)
log_capture_string.close()

# Check output
assert "Server 127.0.0.1:8080 starts" in log_output
assert "Server" in log_output and "127.0.0.1:8080" in log_output and "starts" in log_output


# Test the int8 cache option
@@ -241,16 +247,26 @@ def test_int8_cache_handling(server_config, mock_model, cache_option,
log_capture_string = io.StringIO()
ch = logging.StreamHandler(log_capture_string)
ch.setLevel(logging.DEBUG)

root_logger = logging.getLogger()
root_logger.addHandler(ch)
root_logger.setLevel(logging.DEBUG)
vllm.logger.logger.addHandler(ch)
vllm.logger.logger.setLevel(logging.DEBUG)

server = ElasticServer(**server_config)

log_capture_string.flush()
log_output = log_capture_string.getvalue()
root_logger.removeHandler(ch)
vllm.logger.logger.removeHandler(ch)
log_capture_string.close()

if cache_option == "invalid":
assert "int8_cache should be selected in [HBM, DRAM]" in log_output
# assert "int8_cache should be selected in [HBM, DRAM]" in log_output
assert "int8_cache should be selected in [HBM, DRAM]" in log_output.lower() or \
"int8_cache should be selected in [hbm, dram]" in log_output


if expected_device is None:
assert len(server.original_int8) == 0
@@ -361,7 +377,14 @@ def test_client_handler_invalid_requests(server_config, invalid_data,
log_capture_string = io.StringIO()
ch = logging.StreamHandler(log_capture_string)
ch.setLevel(logging.DEBUG)

root_logger = logging.getLogger()
root_logger.addHandler(ch)
root_logger.setLevel(logging.DEBUG)
vllm.logger.logger.addHandler(ch)
vllm.logger.logger.setLevel(logging.DEBUG)



with patch("socket.socket"):
server = ElasticServer(**server_config)
Expand Down Expand Up @@ -389,11 +412,14 @@ def test_client_handler_invalid_requests(server_config, invalid_data,
else:
mock_conn.send.assert_not_called()

log_capture_string.flush()
log_output = log_capture_string.getvalue()
root_logger.removeHandler(ch)
vllm.logger.logger.removeHandler(ch)
log_capture_string.close()

# The load failure should be reported in the log in some form
log_lower = log_output.lower()
assert "failed to load" in log_lower or "does not contain" in log_lower
mock_conn.close.assert_called_once()

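A note on the log-capture changes above: the tests now attach the capturing handler to both the root logger and `vllm.logger.logger`, presumably because vLLM logs through its own named logger and those records may not reach a handler on the root logger alone. A minimal, self-contained sketch of the same pattern (the logged message is illustrative, not taken from the code under test):

```python
import io
import logging

import vllm.logger  # the same logger object the tests attach to

capture = io.StringIO()
handler = logging.StreamHandler(capture)
handler.setLevel(logging.DEBUG)

# Register on both loggers so records are captured regardless of propagation settings.
for target in (logging.getLogger(), vllm.logger.logger):
    target.addHandler(handler)
    target.setLevel(logging.DEBUG)

try:
    vllm.logger.logger.info("Server 127.0.0.1:8080 starts")  # stand-in for the code under test
finally:
    for target in (logging.getLogger(), vllm.logger.logger):
        target.removeHandler(handler)

assert "127.0.0.1:8080" in capture.getvalue()
```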
10 changes: 10 additions & 0 deletions tests/ut/ops/test_linear.py
@@ -96,6 +96,16 @@ def test_mlp_optimize(self):
def test_oproj_tp(self):

config._current_vllm_config = MagicMock()

from vllm.config import SchedulerConfig
mock_scheduler_config = SchedulerConfig(
max_num_batched_tokens=2048,
max_num_seqs=128,
max_model_len=2048,
is_encoder_decoder=False,
)
config._current_vllm_config.scheduler_config = mock_scheduler_config


ascend_config._ASCEND_CONFIG = MagicMock()
ascend_config._ASCEND_CONFIG.oproj_tensor_parallel_size = 2
18 changes: 16 additions & 2 deletions vllm_ascend/eplb/adaptor/vllm_adaptor.py
@@ -221,7 +221,13 @@ def _export_tensor_to_file(self, expert_maps, expert_map_record_path: str):
json.dump(record, f, indent=4)

def do_update_expert_map(self, layer_id, updated_expert_map):
self.expert_map_per_layer[layer_id].copy_(updated_expert_map)
pad_len = self.expert_map_per_layer[layer_id].shape[
0] - updated_expert_map.shape[0]
updated_expert_map_padded = torch.nn.functional.pad(updated_expert_map,
pad=(0, pad_len),
mode='constant',
value=-1)
self.expert_map_per_layer[layer_id].copy_(updated_expert_map_padded)
self.expert_map_per_layer_cpu[layer_id].copy_(updated_expert_map)

def do_update_expert_weight(self, layer_id, local_expert_to_replace,
@@ -234,7 +240,15 @@ def do_update_expert_weight(self, layer_id, local_expert_to_replace,

def do_update_log2phy_map(self, layer_id, updated_log2phy_map):
if self.log2phy_map_per_layer[layer_id] is not None:
self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map)
pad_len = self.log2phy_map_per_layer[layer_id].shape[
0] - updated_log2phy_map.shape[0]
updated_log2phy_map_padded = torch.nn.functional.pad(
updated_log2phy_map,
pad=(0, pad_len),
mode='constant',
value=-1)
self.log2phy_map_per_layer[layer_id].copy_(
updated_log2phy_map_padded)

def global2local(self, placement: torch.Tensor,
E_local: int) -> torch.Tensor:
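For context on the padding added above: when the rebalanced map has fewer entries than the per-layer buffer, it is right-padded with -1 so the shapes match before `copy_()`. A small sketch with made-up sizes (the tensor values and the 8-slot buffer are illustrative only):

```python
import torch
import torch.nn.functional as F

# Stand-in for self.expert_map_per_layer[layer_id]: 8 slots, -1 meaning "unassigned".
buffer = torch.full((8,), -1, dtype=torch.int64)
updated = torch.tensor([3, 0, 2, 1, 4], dtype=torch.int64)  # shorter rebalanced map

pad_len = buffer.shape[0] - updated.shape[0]
padded = F.pad(updated, pad=(0, pad_len), mode='constant', value=-1)
buffer.copy_(padded)

assert buffer.tolist() == [3, 0, 2, 1, 4, -1, -1, -1]
```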
4 changes: 0 additions & 4 deletions vllm_ascend/eplb/core/eplb_device_transfer_loader.py
@@ -50,10 +50,6 @@ def generate_expert_d2d_transfer_task(self, expert_send_info,
)
return

# If neither send nor receive task is needed for this layer on this rank, return
if not (expert_send_info or expert_recv_info):
return

self.updated_expert_map = updated_expert_map

self.layer_id = layer_id
5 changes: 4 additions & 1 deletion vllm_ascend/ops/fused_moe/moe_mlp.py
@@ -127,14 +127,17 @@ def quant_apply_mlp(hidden_states: torch.Tensor,
if quantized_hidden_states is not None:
dispose_tensor(quantized_hidden_states)
# act_fn: swiglu
group_diff = torch.diff(group_list, dim=0)
new_group = torch.cat([group_list[0].unsqueeze(0), group_diff],
dim=0)
hidden_states, swiglu_out_scale = torch_npu.npu_dequant_swiglu_quant(
x=hidden_states,
weight_scale=w1_scale,
activation_scale=pertoken_scale,
bias=None,
quant_scale=None,
quant_offset=None,
group_index=group_list,
group_index=new_group,
activate_left=True,
quant_mode=1,
)
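The `torch.diff`/`torch.cat` pair above turns a cumulative `group_list` into per-group counts before it is passed as `group_index`; the assumption behind the change appears to be that the kernel expects counts rather than running offsets. A tiny illustration of the conversion itself (values are made up):

```python
import torch

group_list = torch.tensor([3, 5, 9, 9, 12])  # cumulative token counts per expert group
group_diff = torch.diff(group_list, dim=0)
new_group = torch.cat([group_list[0].unsqueeze(0), group_diff], dim=0)

assert new_group.tolist() == [3, 2, 4, 0, 3]  # per-group counts
```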