
Commit aa8a0bd

TEST: add evaluate result to github
1 parent a955b7d commit aa8a0bd

Showing 7 changed files with 79 additions and 98 deletions.

autotest/config.yaml

Lines changed: 0 additions & 3 deletions
```diff
@@ -75,7 +75,6 @@ turbomind_chat_model:
   - Qwen/Qwen2.5-7B-Instruct
   - Qwen/Qwen2.5-32B-Instruct
   - Qwen/Qwen2.5-72B-Instruct
-  - Qwen/Qwen2-7B-Instruct
   - Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4
   - Qwen/Qwen2.5-VL-7B-Instruct
   - Qwen/Qwen2.5-VL-32B-Instruct
@@ -138,7 +137,6 @@ pytorch_chat_model:
   - Qwen/Qwen2.5-7B-Instruct
   - Qwen/Qwen2.5-32B-Instruct
   - Qwen/Qwen2.5-72B-Instruct
-  - Qwen/Qwen2-7B-Instruct
   - Qwen/Qwen1.5-MoE-A2.7B-Chat
   - Qwen/Qwen2.5-VL-7B-Instruct
   - Qwen/Qwen2.5-VL-32B-Instruct
@@ -384,7 +382,6 @@ evaluate_model:
   - internlm/internlm3-8b-instruct
   - meta-llama/Meta-Llama-3-8B-Instruct
   - meta-llama/Meta-Llama-3-1-8B-Instruct
-  - Qwen/Qwen2-7B-Instruct
   - Qwen/Qwen2.5-7B-Instruct
   - Qwen/Qwen2.5-32B-Instruct
   - meta-llama/Llama-2-7b-chat-hf
```

autotest/evaluate/eval_config_base.py

Lines changed: 0 additions & 46 deletions
This file was deleted.

autotest/evaluate/eval_config_chat.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -8,9 +8,9 @@
 
 datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
 
-MODEL_NAME = 'Qwen2-7B-Instruct'
-MODEL_PATH = '/nvme/qa_test_models/Qwen/Qwen2-7B-Instruct'
-API_BASE = 'http://127.0.0.1:65525/v1'
+MODEL_NAME = ''
+MODEL_PATH = ''
+API_BASE = ''
 
 api_meta_template = dict(round=[
     dict(role='HUMAN', api_role='HUMAN'),
```
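The hardcoded Qwen2-7B-Instruct defaults become empty placeholders, which implies the harness fills them in per model at runtime (the evaluate_utils diff below references a `temp_config_file`). A minimal sketch of how such substitution could work; `render_eval_config` is a hypothetical helper, not code from this commit:

```python
import tempfile


def render_eval_config(template_path, model_name, model_path, api_base):
    """Hypothetical: copy eval_config_chat.py with its placeholders filled in.

    The actual substitution logic lives outside this diff; this only
    illustrates the placeholder mechanism.
    """
    with open(template_path, 'r', encoding='utf-8') as f:
        text = f.read()
    # Replace the three empty placeholder constants with concrete values.
    text = text.replace("MODEL_NAME = ''", f"MODEL_NAME = '{model_name}'")
    text = text.replace("MODEL_PATH = ''", f"MODEL_PATH = '{model_path}'")
    text = text.replace("API_BASE = ''", f"API_BASE = '{api_base}'")
    tmp = tempfile.NamedTemporaryFile('w', suffix='.py', delete=False, encoding='utf-8')
    tmp.write(text)
    tmp.close()
    return tmp.name  # path of the rendered temp config
```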

autotest/evaluate/test_api_evaluate_pytorch.py

Lines changed: 0 additions & 6 deletions
```diff
@@ -21,12 +21,6 @@ def getModelList(tp_num):
     model_list = get_evaluate_pytorch_model_list(tp_num, kvint_list=[4, 8])
     new_model_list = []
     for model in model_list:
-        if model['backend'] == 'pytorch':
-            model['extra'] += '--cache-max-entry-count 0.8'
-        elif 'Llama-2' in model['model']:
-            model['extra'] += '--cache-max-entry-count 0.95'
-        elif 'internlm2' in model['model']:
-            model['extra'] += '--cache-max-entry-count 0.9'
         model['cuda_prefix'] = None
         new_model_list.append(model)
     return new_model_list
```
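In both test variants (the turbomind one follows below), `getModelList` post-processes dict entries whose keys can be inferred from this loop and from how `restful_test` reads `prepare_environment`. A hypothetical entry, for orientation only; the values are illustrative:

```python
# Inferred shape of one model entry flowing through getModelList /
# restful_test. Values are placeholders, not taken from the repo.
model_entry = {
    'model': 'Qwen/Qwen2.5-7B-Instruct',  # HF-style model id (illustrative)
    'backend': 'pytorch',                 # serving backend under test
    'extra': '',                          # extra CLI flags (no longer tuned here)
    'cuda_prefix': None,                  # forced to None by getModelList
    'tp_num': 1,                          # read via prepare_environment.get('tp_num', 1)
    'communicator': 'native',             # read via .get('communicator', 'native')
    'quant_policy': 0,                    # read via .get('quant_policy', 0)
}
```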

autotest/evaluate/test_api_evaluate_turbomind.py

Lines changed: 0 additions & 6 deletions
```diff
@@ -21,12 +21,6 @@ def getModelList(tp_num):
     model_list = get_evaluate_turbomind_model_list(tp_num, kvint_list=[4, 8])
     new_model_list = []
     for model in model_list:
-        if model['backend'] == 'pytorch':
-            model['extra'] += '--cache-max-entry-count 0.8'
-        elif 'Llama-2' in model['model']:
-            model['extra'] += '--cache-max-entry-count 0.95'
-        elif 'internlm2' in model['model']:
-            model['extra'] += '--cache-max-entry-count 0.9'
         model['cuda_prefix'] = None
         new_model_list.append(model)
     return new_model_list
```

autotest/utils/config_utils.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -254,7 +254,10 @@ def get_evaluate_turbomind_model_list(tp_num, is_longtext: bool = False, kvint_l
     result = []
     if len(model_list) > 0:
 
-        communicators = ['native', 'nccl']
+        if tp_num > 1:
+            communicators = ['native', 'nccl']
+        else:
+            communicators = ['native']
         for communicator in communicators:
             for item in model_list:
                 if item.replace('-inner-4bits', '') in config.get('turbomind_chat_model') or item.replace(
```
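The apparent intent (inferred from the diff, not stated in the commit message) is that the nccl communicator only exercises anything when tensor parallelism spans multiple GPUs, so single-GPU runs skip the redundant sweep. A standalone sketch of the selection logic:

```python
def select_communicators(tp_num: int) -> list:
    # NCCL handles inter-GPU traffic; with a single GPU (tp_num == 1) there is
    # no peer communication to test, so only the 'native' path is swept.
    return ['native', 'nccl'] if tp_num > 1 else ['native']


assert select_communicators(1) == ['native']
assert select_communicators(4) == ['native', 'nccl']
```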

autotest/utils/evaluate_utils.py

Lines changed: 72 additions & 33 deletions
```diff
@@ -1,3 +1,5 @@
+import csv
+import glob
 import os
 import subprocess
 
@@ -6,58 +8,88 @@
 DEFAULT_PORT = 23333
 
 
-def get_model_type(model_name):
-    model_name_lower = model_name.lower()
+def write_to_summary(model_name, tp_num, result, msg, worker_id, work_dir=None):
+    status = '✅ PASS' if result else '❌ FAIL'
 
-    chat_patterns = [
-        'chat',
-        'instruct',
-        'gemma',
-        'llama3',
-        'llama2',
-        'llama',
-    ]
-    if any(pattern in model_name_lower for pattern in chat_patterns):
-        return 'chat'
+    metrics = {}
+
+    if work_dir and os.path.exists(work_dir):
+        try:
+            summary_dirs = glob.glob(os.path.join(work_dir, '*', 'summary'))
+            if summary_dirs:
+                summary_dir = summary_dirs[0]
+                csv_files = glob.glob(os.path.join(summary_dir, 'summary_*.csv'))
+                if csv_files:
+                    csv_file = sorted(csv_files)[-1]
+                    if os.path.exists(csv_file):
+                        with open(csv_file, 'r') as f:
+                            reader = csv.reader(f)
+                            next(reader)
+                            for row in reader:
+                                if len(row) >= 5 and row[4]:
+                                    dataset = row[0]
+                                    metric_value = row[4]
+                                    try:
+                                        metrics[dataset] = f'{float(metric_value):.2f}'
+                                    except ValueError:
+                                        metrics[dataset] = metric_value
+        except Exception as e:
+            print(f'Error reading metrics: {str(e)}')
+
+    mmlu_value = metrics.get('mmlu', '')
+    gsm8k_value = metrics.get('gsm8k', '')
+
+    summary_line = f'| {model_name} | TP{tp_num} | {status} | {mmlu_value} | {gsm8k_value} |\n'
+
+    summary_file = os.environ.get('GITHUB_STEP_SUMMARY', None)
+    if summary_file:
+        write_header = False
+        if not os.path.exists(summary_file) or os.path.getsize(summary_file) == 0:
+            write_header = True
+        else:
+            with open(summary_file, 'r') as f:
+                first_lines = f.read(200)
+            if '| Model | TP | Status | mmlu | gsm8k |' not in first_lines:
+                write_header = True
+
+        with open(summary_file, 'a') as f:
+            if write_header:
+                f.write('## Model Evaluation Results\n')
+                f.write('| Model | TP | Status | mmlu | gsm8k |\n')
+                f.write('|-------|----|--------|------|-------|\n')
+            f.write(summary_line)
     else:
-        return 'base'
+        print(f'Summary: {model_name} | TP{tp_num} | {status} | {mmlu_value} | {gsm8k_value}')
 
 
 def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFAULT_PORT):
+    work_dir = None
     try:
         model_name = prepare_environment['model']
         backend_type = prepare_environment['backend']
         tp_num = prepare_environment.get('tp_num', 1)
         communicator = prepare_environment.get('communicator', 'native')
         quant_policy = prepare_environment.get('quant_policy', 0)
 
-        model_type = get_model_type(model_name)
-        print(f'Model {model_name} identified as {model_type} model')
-
         current_dir = os.path.dirname(os.path.abspath(__file__))
         parent_dir = os.path.dirname(current_dir)
 
-        if model_type == 'base':
-            config_file = os.path.join(parent_dir, 'evaluate/eval_config_base.py')
-        else:
-            config_file = os.path.join(parent_dir, 'evaluate/eval_config_chat.py')
+        config_file = os.path.join(parent_dir, 'evaluate/eval_config_chat.py')
 
         model_base_path = config.get('model_path', '/nvme/qa_test_models')
        model_path = os.path.join(model_base_path, model_name)
 
         print(f'Starting OpenCompass evaluation for model: {model_name}')
         print(f'Model path: {model_path}')
         print(f'Backend: {backend_type}')
-        print(f'Model type: {model_type}')
         print(f'Config file: {config_file}')
 
         log_path = config.get('log_path', '/nvme/qa_test_models/autotest_model/log')
         os.makedirs(log_path, exist_ok=True)
 
         original_cwd = os.getcwd()
         work_dir = os.path.join(
-            log_path,
-            f"wk_{backend_type}_{model_name.replace('/', '_')}_{model_type}_{communicator}_{worker_id}_{quant_policy}")
+            log_path, f"wk_{backend_type}_{model_name.replace('/', '_')}_{communicator}_{worker_id}_{quant_policy}")
         os.makedirs(work_dir, exist_ok=True)
 
         try:
@@ -99,15 +131,13 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA
             log_filename = (f'eval_{backend_type}_'
                             f"{model_name.replace('/', '_')}_"
-                            f'{model_type}_'
                             f'{communicator}_'
                             f'{worker_id}_'
                             f'{quant_policy}.log')
             log_file = os.path.join(log_path, log_filename)
 
             with open(log_file, 'w', encoding='utf-8') as f:
                 f.write(f'Model: {model_name}\n')
-                f.write(f'Model type: {model_type}\n')
                 f.write(f'Config file: {temp_config_file}\n')
                 f.write(f'Backend: {backend_type}\n')
                 f.write(f'TP Num: {tp_num}\n')
@@ -131,25 +161,29 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA
                     break
 
             if result.returncode == 0 and not evaluation_failed:
-                return True, f'Evaluation completed successfully for {model_name} ({model_type})'
+                final_result = True
+                final_msg = f'Evaluation completed successfully for {model_name}'
             else:
-                error_msg = f'Evaluation failed for {model_name} ({model_type}) '
+                final_result = False
+                final_msg = f'Evaluation failed for {model_name} '
                 if result.returncode != 0:
-                    error_msg += f'with return code {result.returncode}'
+                    final_msg += f'with return code {result.returncode}'
                 elif evaluation_failed:
-                    error_msg += 'with internal errors detected in logs'
+                    final_msg += 'with internal errors detected in logs'
 
                 if stderr_output:
-                    error_msg += f'\nSTDERR: {stderr_output}'
+                    final_msg += f'\nSTDERR: {stderr_output}'
                 else:
                     error_lines = []
                     for line in stdout_output.split('\n'):
                         if any(keyword in line for keyword in error_keywords):
                             error_lines.append(line)
                     if error_lines:
-                        error_msg += f'\nLog errors: {" | ".join(error_lines[:3])}'
+                        final_msg += f'\nLog errors: {" | ".join(error_lines[:3])}'
+
+            write_to_summary(model_name, tp_num, final_result, final_msg, worker_id, work_dir)
 
-                return False, error_msg
+            return final_result, final_msg
 
         finally:
             os.chdir(original_cwd)
@@ -158,6 +192,11 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA
     except subprocess.TimeoutExpired:
         timeout_msg = (f'Evaluation timed out for {model_name} '
                        f'after 7200 seconds')
+        if work_dir:
+            write_to_summary(model_name, tp_num, False, timeout_msg, worker_id, work_dir)
         return False, timeout_msg
     except Exception as e:
-        return False, f'Error during evaluation for {model_name}: {str(e)}'
+        error_msg = f'Error during evaluation for {model_name}: {str(e)}'
+        if work_dir:
+            write_to_summary(model_name, tp_num, False, error_msg, worker_id, work_dir)
+        return False, error_msg
```
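`write_to_summary` gives each evaluation run one row in the GitHub Actions step summary; outside CI it degrades to a plain print. A minimal sketch of exercising both paths, assuming `autotest` is importable from the repo root (model name and values are placeholders, not results from this commit):

```python
import os

from autotest.utils.evaluate_utils import write_to_summary

# Without GITHUB_STEP_SUMMARY set, the function prints a one-line summary;
# work_dir=None skips the OpenCompass CSV scan, so mmlu/gsm8k stay empty.
os.environ.pop('GITHUB_STEP_SUMMARY', None)
write_to_summary('some-model', tp_num=1, result=True, msg='ok',
                 worker_id='gw0', work_dir=None)
# -> Summary: some-model | TP1 | ✅ PASS |  |

# Inside GitHub Actions, GITHUB_STEP_SUMMARY names a file: the first caller
# writes the '## Model Evaluation Results' header and the table header, and
# every later call appends one markdown table row, so a test matrix
# accumulates into a single results table on the workflow run page.
```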
