
Commit aa8a0bd

TEST: add evaluate result to github
1 parent a955b7d commit aa8a0bd

Showing 7 changed files with 79 additions and 98 deletions.

autotest/config.yaml

Lines changed: 0 additions & 3 deletions
```diff
@@ -75,7 +75,6 @@ turbomind_chat_model:
   - Qwen/Qwen2.5-7B-Instruct
   - Qwen/Qwen2.5-32B-Instruct
   - Qwen/Qwen2.5-72B-Instruct
-  - Qwen/Qwen2-7B-Instruct
   - Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4
   - Qwen/Qwen2.5-VL-7B-Instruct
   - Qwen/Qwen2.5-VL-32B-Instruct
@@ -138,7 +137,6 @@ pytorch_chat_model:
   - Qwen/Qwen2.5-7B-Instruct
   - Qwen/Qwen2.5-32B-Instruct
   - Qwen/Qwen2.5-72B-Instruct
-  - Qwen/Qwen2-7B-Instruct
   - Qwen/Qwen1.5-MoE-A2.7B-Chat
   - Qwen/Qwen2.5-VL-7B-Instruct
   - Qwen/Qwen2.5-VL-32B-Instruct
@@ -384,7 +382,6 @@ evaluate_model:
   - internlm/internlm3-8b-instruct
   - meta-llama/Meta-Llama-3-8B-Instruct
   - meta-llama/Meta-Llama-3-1-8B-Instruct
-  - Qwen/Qwen2-7B-Instruct
   - Qwen/Qwen2.5-7B-Instruct
   - Qwen/Qwen2.5-32B-Instruct
   - meta-llama/Llama-2-7b-chat-hf
```

autotest/evaluate/eval_config_base.py

Lines changed: 0 additions & 46 deletions
This file was deleted.

autotest/evaluate/eval_config_chat.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -8,9 +8,9 @@
 
 datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
 
-MODEL_NAME = 'Qwen2-7B-Instruct'
-MODEL_PATH = '/nvme/qa_test_models/Qwen/Qwen2-7B-Instruct'
-API_BASE = 'http://127.0.0.1:65525/v1'
+MODEL_NAME = ''
+MODEL_PATH = ''
+API_BASE = ''
 
 api_meta_template = dict(round=[
     dict(role='HUMAN', api_role='HUMAN'),
```
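The hardcoded Qwen2-7B-Instruct defaults become empty placeholders, which implies the harness fills them in per model at runtime (the evaluate_utils diff below references a `temp_config_file`). A minimal sketch of how such substitution could work; `render_eval_config` is a hypothetical helper, not code from this commit:

```python
import tempfile


def render_eval_config(template_path, model_name, model_path, api_base):
    """Hypothetical: copy eval_config_chat.py with its placeholders filled in.

    The actual substitution logic lives outside this diff; this only
    illustrates the placeholder mechanism.
    """
    with open(template_path, 'r', encoding='utf-8') as f:
        text = f.read()
    # Replace the three empty placeholder constants with concrete values.
    text = text.replace("MODEL_NAME = ''", f"MODEL_NAME = '{model_name}'")
    text = text.replace("MODEL_PATH = ''", f"MODEL_PATH = '{model_path}'")
    text = text.replace("API_BASE = ''", f"API_BASE = '{api_base}'")
    tmp = tempfile.NamedTemporaryFile('w', suffix='.py', delete=False, encoding='utf-8')
    tmp.write(text)
    tmp.close()
    return tmp.name  # path of the rendered temp config
```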

autotest/evaluate/test_api_evaluate_pytorch.py

Lines changed: 0 additions & 6 deletions
```diff
@@ -21,12 +21,6 @@ def getModelList(tp_num):
     model_list = get_evaluate_pytorch_model_list(tp_num, kvint_list=[4, 8])
     new_model_list = []
     for model in model_list:
-        if model['backend'] == 'pytorch':
-            model['extra'] += '--cache-max-entry-count 0.8'
-        elif 'Llama-2' in model['model']:
-            model['extra'] += '--cache-max-entry-count 0.95'
-        elif 'internlm2' in model['model']:
-            model['extra'] += '--cache-max-entry-count 0.9'
         model['cuda_prefix'] = None
         new_model_list.append(model)
     return new_model_list
```
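In both test variants (the turbomind one follows below), `getModelList` post-processes dict entries whose keys can be inferred from this loop and from how `restful_test` reads `prepare_environment`. A hypothetical entry, for orientation only; the values are illustrative:

```python
# Inferred shape of one model entry flowing through getModelList /
# restful_test. Values are placeholders, not taken from the repo.
model_entry = {
    'model': 'Qwen/Qwen2.5-7B-Instruct',  # HF-style model id (illustrative)
    'backend': 'pytorch',                 # serving backend under test
    'extra': '',                          # extra CLI flags (no longer tuned here)
    'cuda_prefix': None,                  # forced to None by getModelList
    'tp_num': 1,                          # read via prepare_environment.get('tp_num', 1)
    'communicator': 'native',             # read via .get('communicator', 'native')
    'quant_policy': 0,                    # read via .get('quant_policy', 0)
}
```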

autotest/evaluate/test_api_evaluate_turbomind.py

Lines changed: 0 additions & 6 deletions
```diff
@@ -21,12 +21,6 @@ def getModelList(tp_num):
     model_list = get_evaluate_turbomind_model_list(tp_num, kvint_list=[4, 8])
     new_model_list = []
     for model in model_list:
-        if model['backend'] == 'pytorch':
-            model['extra'] += '--cache-max-entry-count 0.8'
-        elif 'Llama-2' in model['model']:
-            model['extra'] += '--cache-max-entry-count 0.95'
-        elif 'internlm2' in model['model']:
-            model['extra'] += '--cache-max-entry-count 0.9'
         model['cuda_prefix'] = None
         new_model_list.append(model)
     return new_model_list
```

autotest/utils/config_utils.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -254,7 +254,10 @@ def get_evaluate_turbomind_model_list(tp_num, is_longtext: bool = False, kvint_l
     result = []
     if len(model_list) > 0:
 
-        communicators = ['native', 'nccl']
+        if tp_num > 1:
+            communicators = ['native', 'nccl']
+        else:
+            communicators = ['native']
         for communicator in communicators:
             for item in model_list:
                 if item.replace('-inner-4bits', '') in config.get('turbomind_chat_model') or item.replace(
```
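The apparent intent (inferred from the diff, not stated in the commit message) is that the nccl communicator only exercises anything when tensor parallelism spans multiple GPUs, so single-GPU runs skip the redundant sweep. A standalone sketch of the selection logic:

```python
def select_communicators(tp_num: int) -> list:
    # NCCL handles inter-GPU traffic; with a single GPU (tp_num == 1) there is
    # no peer communication to test, so only the 'native' path is swept.
    return ['native', 'nccl'] if tp_num > 1 else ['native']


assert select_communicators(1) == ['native']
assert select_communicators(4) == ['native', 'nccl']
```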

autotest/utils/evaluate_utils.py

Lines changed: 72 additions & 33 deletions
```diff
@@ -1,3 +1,5 @@
+import csv
+import glob
 import os
 import subprocess
 
@@ -6,58 +8,88 @@
 DEFAULT_PORT = 23333
 
 
-def get_model_type(model_name):
-    model_name_lower = model_name.lower()
+def write_to_summary(model_name, tp_num, result, msg, worker_id, work_dir=None):
+    status = '✅ PASS' if result else '❌ FAIL'
 
-    chat_patterns = [
-        'chat',
-        'instruct',
-        'gemma',
-        'llama3',
-        'llama2',
-        'llama',
-    ]
-    if any(pattern in model_name_lower for pattern in chat_patterns):
-        return 'chat'
+    metrics = {}
+
+    if work_dir and os.path.exists(work_dir):
+        try:
+            summary_dirs = glob.glob(os.path.join(work_dir, '*', 'summary'))
+            if summary_dirs:
+                summary_dir = summary_dirs[0]
+                csv_files = glob.glob(os.path.join(summary_dir, 'summary_*.csv'))
+                if csv_files:
+                    csv_file = sorted(csv_files)[-1]
+                    if os.path.exists(csv_file):
+                        with open(csv_file, 'r') as f:
+                            reader = csv.reader(f)
+                            next(reader)
+                            for row in reader:
+                                if len(row) >= 5 and row[4]:
+                                    dataset = row[0]
+                                    metric_value = row[4]
+                                    try:
+                                        metrics[dataset] = f'{float(metric_value):.2f}'
+                                    except ValueError:
+                                        metrics[dataset] = metric_value
+        except Exception as e:
+            print(f'Error reading metrics: {str(e)}')
+
+    mmlu_value = metrics.get('mmlu', '')
+    gsm8k_value = metrics.get('gsm8k', '')
+
+    summary_line = f'| {model_name} | TP{tp_num} | {status} | {mmlu_value} | {gsm8k_value} |\n'
+
+    summary_file = os.environ.get('GITHUB_STEP_SUMMARY', None)
+    if summary_file:
+        write_header = False
+        if not os.path.exists(summary_file) or os.path.getsize(summary_file) == 0:
+            write_header = True
+        else:
+            with open(summary_file, 'r') as f:
+                first_lines = f.read(200)
+            if '| Model | TP | Status | mmlu | gsm8k |' not in first_lines:
+                write_header = True
+
+        with open(summary_file, 'a') as f:
+            if write_header:
+                f.write('## Model Evaluation Results\n')
+                f.write('| Model | TP | Status | mmlu | gsm8k |\n')
+                f.write('|-------|----|--------|------|-------|\n')
+            f.write(summary_line)
     else:
-        return 'base'
+        print(f'Summary: {model_name} | TP{tp_num} | {status} | {mmlu_value} | {gsm8k_value}')
 
 
 def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFAULT_PORT):
+    work_dir = None
     try:
         model_name = prepare_environment['model']
         backend_type = prepare_environment['backend']
         tp_num = prepare_environment.get('tp_num', 1)
         communicator = prepare_environment.get('communicator', 'native')
         quant_policy = prepare_environment.get('quant_policy', 0)
 
-        model_type = get_model_type(model_name)
-        print(f'Model {model_name} identified as {model_type} model')
-
         current_dir = os.path.dirname(os.path.abspath(__file__))
         parent_dir = os.path.dirname(current_dir)
 
-        if model_type == 'base':
-            config_file = os.path.join(parent_dir, 'evaluate/eval_config_base.py')
-        else:
-            config_file = os.path.join(parent_dir, 'evaluate/eval_config_chat.py')
+        config_file = os.path.join(parent_dir, 'evaluate/eval_config_chat.py')
 
         model_base_path = config.get('model_path', '/nvme/qa_test_models')
        model_path = os.path.join(model_base_path, model_name)
 
         print(f'Starting OpenCompass evaluation for model: {model_name}')
         print(f'Model path: {model_path}')
         print(f'Backend: {backend_type}')
-        print(f'Model type: {model_type}')
         print(f'Config file: {config_file}')
 
         log_path = config.get('log_path', '/nvme/qa_test_models/autotest_model/log')
         os.makedirs(log_path, exist_ok=True)
 
         original_cwd = os.getcwd()
         work_dir = os.path.join(
-            log_path,
-            f"wk_{backend_type}_{model_name.replace('/', '_')}_{model_type}_{communicator}_{worker_id}_{quant_policy}")
+            log_path, f"wk_{backend_type}_{model_name.replace('/', '_')}_{communicator}_{worker_id}_{quant_policy}")
         os.makedirs(work_dir, exist_ok=True)
 
         try:
@@ -99,15 +131,13 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA
             log_filename = (f'eval_{backend_type}_'
                             f"{model_name.replace('/', '_')}_"
-                            f'{model_type}_'
                             f'{communicator}_'
                             f'{worker_id}_'
                             f'{quant_policy}.log')
             log_file = os.path.join(log_path, log_filename)
 
             with open(log_file, 'w', encoding='utf-8') as f:
                 f.write(f'Model: {model_name}\n')
-                f.write(f'Model type: {model_type}\n')
                 f.write(f'Config file: {temp_config_file}\n')
                 f.write(f'Backend: {backend_type}\n')
                 f.write(f'TP Num: {tp_num}\n')
@@ -131,25 +161,29 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA
                     break
 
             if result.returncode == 0 and not evaluation_failed:
-                return True, f'Evaluation completed successfully for {model_name} ({model_type})'
+                final_result = True
+                final_msg = f'Evaluation completed successfully for {model_name}'
             else:
-                error_msg = f'Evaluation failed for {model_name} ({model_type}) '
+                final_result = False
+                final_msg = f'Evaluation failed for {model_name} '
                 if result.returncode != 0:
-                    error_msg += f'with return code {result.returncode}'
+                    final_msg += f'with return code {result.returncode}'
                 elif evaluation_failed:
-                    error_msg += 'with internal errors detected in logs'
+                    final_msg += 'with internal errors detected in logs'
 
                 if stderr_output:
-                    error_msg += f'\nSTDERR: {stderr_output}'
+                    final_msg += f'\nSTDERR: {stderr_output}'
                 else:
                     error_lines = []
                     for line in stdout_output.split('\n'):
                         if any(keyword in line for keyword in error_keywords):
                             error_lines.append(line)
                     if error_lines:
-                        error_msg += f'\nLog errors: {" | ".join(error_lines[:3])}'
+                        final_msg += f'\nLog errors: {" | ".join(error_lines[:3])}'
+
+            write_to_summary(model_name, tp_num, final_result, final_msg, worker_id, work_dir)
 
-                return False, error_msg
+            return final_result, final_msg
 
         finally:
             os.chdir(original_cwd)
@@ -158,6 +192,11 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA
     except subprocess.TimeoutExpired:
         timeout_msg = (f'Evaluation timed out for {model_name} '
                        f'after 7200 seconds')
+        if work_dir:
+            write_to_summary(model_name, tp_num, False, timeout_msg, worker_id, work_dir)
         return False, timeout_msg
     except Exception as e:
-        return False, f'Error during evaluation for {model_name}: {str(e)}'
+        error_msg = f'Error during evaluation for {model_name}: {str(e)}'
+        if work_dir:
+            write_to_summary(model_name, tp_num, False, error_msg, worker_id, work_dir)
+        return False, error_msg
```
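`write_to_summary` gives each evaluation run one row in the GitHub Actions step summary; outside CI it degrades to a plain print. A minimal sketch of exercising both paths, assuming `autotest` is importable from the repo root (model name and values are placeholders, not results from this commit):

```python
import os

from autotest.utils.evaluate_utils import write_to_summary

# Without GITHUB_STEP_SUMMARY set, the function prints a one-line summary;
# work_dir=None skips the OpenCompass CSV scan, so mmlu/gsm8k stay empty.
os.environ.pop('GITHUB_STEP_SUMMARY', None)
write_to_summary('some-model', tp_num=1, result=True, msg='ok',
                 worker_id='gw0', work_dir=None)
# -> Summary: some-model | TP1 | ✅ PASS |  |

# Inside GitHub Actions, GITHUB_STEP_SUMMARY names a file: the first caller
# writes the '## Model Evaluation Results' header and the table header, and
# every later call appends one markdown table row, so a test matrix
# accumulates into a single results table on the workflow run page.
```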
