2 changes: 1 addition & 1 deletion .github/workflows/benchmark.yml
@@ -17,7 +17,7 @@ on:
required: true
description: 'Set benchmark type. Default is "["generation", "throughput", "api_server"]"'
type: string
default: "['apiserver', 'generation', 'throughput']"
default: "['apiserver', 'throughput']"
offline_mode:
required: true
description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself'
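The `benchmark_type` input above is a plain string that encodes a list of benchmark names, so whatever consumes it has to turn it back into a list before deciding which suites to run. Below is a minimal sketch of that step, assuming (hypothetically) the value arrives through an environment variable and is parsed with `ast.literal_eval`; the workflow's actual consumption logic is not part of this diff.

```python
import ast
import os

# Hypothetical illustration only: the workflow_dispatch input benchmark_type is a
# string such as "['apiserver', 'throughput']"; a downstream step would convert it
# back into a Python list before selecting which benchmark suites to run.
raw = os.environ.get("BENCHMARK_TYPE", "['apiserver', 'throughput']")
benchmark_types = ast.literal_eval(raw)

if "generation" in benchmark_types:
    print("run generation benchmark")  # no longer in the default after this PR
if "throughput" in benchmark_types:
    print("run throughput benchmark")
if "apiserver" in benchmark_types:
    print("run api_server benchmark")
```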
121 changes: 0 additions & 121 deletions autotest/benchmark/test_generation_performance.py

This file was deleted.

48 changes: 0 additions & 48 deletions autotest/utils/benchmark_utils.py
@@ -12,54 +12,6 @@
GENERATION_LONGTEXT_CONFIG = ' -c 1 --session-len 200000 -ct 1024 -pt 198000'


def generation_test(config,
run_id,
run_config,
is_longtext: bool = False,
cuda_prefix: str = None,
worker_id: str = '',
is_smoke: bool = False):
model = run_config['model']
backend = run_config['backend']
tp_num = run_config['tp_num']
model_path = '/'.join([config.get('model_path'), model])
log_path = config.get('log_path')
benchmark_log = os.path.join(log_path, 'benchmark_generation_' + model.split('/')[1] + worker_id + '.log')
benchmark_path = '/'.join([config.get('benchmark_path'), run_id, model, f'benchmark-generation-{backend}'])

create_multi_level_directory(benchmark_path)

print(cuda_prefix)
command = f'python3 benchmark/profile_generation.py {model_path} '
command = get_command_with_extra(command, cuda_prefix)

run_config = ''
if backend == 'pytorch':
command += ' --backend pytorch'
if not _is_bf16_supported_by_device():
command += ' --dtype float16'
else:
if '4bit' in model:
command += ' --model-format awq'

if is_longtext:
run_config = run_config + GENERATION_LONGTEXT_CONFIG
csv_path = f'{benchmark_path}/generation_longtext.csv'
else:
run_config = run_config + GENERATION_CONFIG
csv_path = f'{benchmark_path}/generation.csv'
if is_smoke:
run_config = ' -c 1 -ct 128 -pt 128'

cmd = ' '.join([command, run_config, '--tp', str(tp_num), get_max_cache_entry(model, backend), '--csv', csv_path])

returncode, stderr = run_testcase(cmd, benchmark_log)
allure.attach.file(benchmark_log, attachment_type=allure.attachment_type.TEXT)
if returncode == 0 and not os.path.isfile(csv_path):
return False, 'result is empty'
return returncode == 0, stderr


def throughput_test(config, run_id, run_config, cuda_prefix: str = None, worker_id: str = '', is_smoke: bool = False):
model = run_config['model']
backend = run_config['backend']
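For reference, the removed `generation_test` helper did little more than assemble a `profile_generation.py` command line from the model path, backend flags, one of the config strings, and the tensor-parallel degree, then ran it and checked that the result CSV existed. Below is a minimal sketch of the long-text command it built, using the `GENERATION_LONGTEXT_CONFIG` value shown above; the model path, `tp` value, and CSV location are hypothetical, and the `get_command_with_extra` / `get_max_cache_entry` helpers are omitted.

```python
# Sketch of the command the removed generation_test assembled for the long-text
# case. Model path, tp value, and csv location here are hypothetical placeholders.
GENERATION_LONGTEXT_CONFIG = ' -c 1 --session-len 200000 -ct 1024 -pt 198000'

model_path = '/models/internlm2-chat-20b'  # hypothetical
tp_num = 2                                 # hypothetical
csv_path = 'benchmark-generation-turbomind/generation_longtext.csv'

cmd = ' '.join([
    f'python3 benchmark/profile_generation.py {model_path}',
    GENERATION_LONGTEXT_CONFIG,
    '--tp', str(tp_num),
    '--csv', csv_path,
])
print(cmd)
```

As the deleted code above shows, the helper also appended `--backend pytorch` (plus `--dtype float16` when bf16 was unsupported) for the PyTorch backend, or `--model-format awq` for 4-bit models.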
14 changes: 0 additions & 14 deletions benchmark/README.md
@@ -19,20 +19,6 @@ python profile_throughput.py \
--concurrency 64
```

## profile without dataset

`profile_generation.py` perform benchmark with dummy data.

```shell
pip install nvidia-ml-py
```

```bash
python profile_generation.py \
/path/to/your/model \
--concurrency 1 8 --prompt-tokens 1 512 --completion-tokens 2048 512
```

## profile restful api

`profile_restful_api.py` is used to do benchmark on api server.