diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
index bfe515c15ce..765d0ac394c 100644
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -20,7 +20,6 @@
 from pathlib import Path
 from types import SimpleNamespace
 from typing import Any, Awaitable, Callable, List, Optional, Tuple
-from urllib.parse import quote
 
 import aiohttp
 import numpy as np
@@ -1641,15 +1640,26 @@ def _ensure_remove_suffix(text: str, suffix: str):
     return text.removesuffix(suffix)
 
 
-class ModelDeploySetup:
-    def __init__(self, model_path: str, extra_args: List[str] = []):
+class ModelLaunchSettings:
+    def __init__(
+        self,
+        model_path: str,
+        tp_size: int = 1,
+        extra_args: Optional[List[str]] = None,
+        env: Optional[dict] = None,
+    ):
         self.model_path = model_path
-        if "--enable-multimodal" not in extra_args:
-            extra_args.append("--enable-multimodal")
-        if "--trust-remote-code" not in extra_args:
-            extra_args.append("--trust-remote-code")
+        self.tp_size = tp_size
+        self.extra_args = list(extra_args) if extra_args else []
+        self.env = env
 
-        self.extra_args = extra_args
+        if self.tp_size > 1 and "--tp" not in self.extra_args:
+            self.extra_args.extend(["--tp", str(self.tp_size)])
+
+        fixed_args = ["--enable-multimodal", "--trust-remote-code"]
+        for fixed_arg in fixed_args:
+            if fixed_arg not in self.extra_args:
+                self.extra_args.append(fixed_arg)
 
 
 class ModelEvalMetrics:
diff --git a/test/srt/test_nightly_text_models_gsm8k_eval.py b/test/srt/test_nightly_text_models_gsm8k_eval.py
index add936f55cc..8cd62e604ef 100644
--- a/test/srt/test_nightly_text_models_gsm8k_eval.py
+++ b/test/srt/test_nightly_text_models_gsm8k_eval.py
@@ -12,6 +12,7 @@
     DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
+    ModelLaunchSettings,
     check_evaluation_test_results,
     parse_models,
     popen_launch_server,
@@ -44,12 +45,19 @@
 class TestNightlyGsm8KEval(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.model_groups = [
-            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
-            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
-            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
-            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
-        ]
+        cls.models = []
+        models_tp1 = parse_models(
+            DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1
+        ) + parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1)
+        for model_path in models_tp1:
+            cls.models.append(ModelLaunchSettings(model_path, tp_size=1))
+
+        models_tp2 = parse_models(
+            DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2
+        ) + parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2)
+        for model_path in models_tp2:
+            cls.models.append(ModelLaunchSettings(model_path, tp_size=2))
+
         cls.base_url = DEFAULT_URL_FOR_TEST
 
     def test_mgsm_en_all_models(self):
@@ -58,26 +66,24 @@ def test_mgsm_en_all_models(self):
         )
         is_first = True
         all_results = []
-        model_count = 0
-        for model_group, is_fp8, is_tp2 in self.model_groups:
-            for model in model_group:
-                model_count += 1
-                with self.subTest(model=model):
-                    other_args = ["--tp", "2"] if is_tp2 else []
-
-                    if model == "meta-llama/Llama-3.1-70B-Instruct":
-                        other_args.extend(["--mem-fraction-static", "0.9"])
-
-                    process = popen_launch_server(
-                        model=model,
-                        other_args=other_args,
-                        base_url=self.base_url,
-                        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-                    )
-
+        for model_setup in self.models:
+            with self.subTest(model=model_setup.model_path):
+                other_args = list(model_setup.extra_args)
+                if model_setup.model_path == "meta-llama/Llama-3.1-70B-Instruct":
+                    other_args.extend(["--mem-fraction-static", "0.9"])
+
+                process = popen_launch_server(
+                    model=model_setup.model_path,
+                    other_args=other_args,
+                    base_url=self.base_url,
+                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                )
+
+                try:
                     args = SimpleNamespace(
                         base_url=self.base_url,
-                        model=model,
+                        model=model_setup.model_path,
                         eval_name="mgsm_en",
                         num_examples=None,
                         num_threads=1024,
@@ -85,14 +91,17 @@ def test_mgsm_en_all_models(self):
                     )
                     metrics = run_eval(args)
                     print(
-                        f"{'=' * 42}\n{model} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n"
+                        f"{'=' * 42}\n{model_setup.model_path} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n"
                     )
 
-                    write_results_to_json(model, metrics, "w" if is_first else "a")
+                    write_results_to_json(
+                        model_setup.model_path, metrics, "w" if is_first else "a"
+                    )
                     is_first = False
 
                     # 0.0 for empty latency
-                    all_results.append((model, metrics["score"], 0.0))
-
+                    all_results.append((model_setup.model_path, metrics["score"], 0.0))
+                finally:
                     kill_process_tree(process.pid)
 
         try:
@@ -107,7 +116,7 @@ def test_mgsm_en_all_models(self):
             all_results,
             self.__class__.__name__,
             model_accuracy_thresholds=MODEL_SCORE_THRESHOLDS,
-            model_count=model_count,
+            model_count=len(self.models),
         )
 
 
diff --git a/test/srt/test_nightly_text_models_perf.py b/test/srt/test_nightly_text_models_perf.py
index a9ab6d003d2..999d2628949 100644
--- a/test/srt/test_nightly_text_models_perf.py
+++ b/test/srt/test_nightly_text_models_perf.py
@@ -8,6 +8,7 @@
 from sglang.test.test_utils import (
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
+    ModelLaunchSettings,
     _parse_int_list_env,
     is_in_ci,
     parse_models,
@@ -21,14 +22,16 @@
 class TestNightlyTextModelsPerformance(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.model_groups = [
-            (parse_models("meta-llama/Llama-3.1-8B-Instruct"), False, False),
-            (parse_models("Qwen/Qwen2-57B-A14B-Instruct"), False, True),
-            # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
-            # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
-            # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
-            # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
-        ]
+        cls.models = []
+        # TODO: replace with DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 or other model lists
+        for model_path in parse_models("meta-llama/Llama-3.1-8B-Instruct"):
+            cls.models.append(ModelLaunchSettings(model_path, tp_size=1))
+        for model_path in parse_models("Qwen/Qwen2-57B-A14B-Instruct"):
+            cls.models.append(ModelLaunchSettings(model_path, tp_size=2))
+        # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
+        # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
+        # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
+        # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
         cls.base_url = DEFAULT_URL_FOR_TEST
         cls.batch_sizes = [1, 1, 8, 16, 64]
         cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096"))
@@ -39,93 +42,86 @@ def setUpClass(cls):
 
     def test_bench_one_batch(self):
         all_benchmark_results = []
-        for model_group, is_fp8, is_tp2 in self.model_groups:
-            for model in model_group:
-                benchmark_results = []
-                with self.subTest(model=model):
-                    process = popen_launch_server(
-                        model=model,
-                        base_url=self.base_url,
-                        other_args=["--tp", "2"] if is_tp2 else [],
-                        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        for model_setup in self.models:
+            benchmark_results = []
+            with self.subTest(model=model_setup.model_path):
+                process = popen_launch_server(
+                    model=model_setup.model_path,
+                    base_url=self.base_url,
+                    other_args=model_setup.extra_args,
+                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                )
+                try:
+
+                    profile_filename = (
+                        f"{model_setup.model_path.replace('/', '_')}_{int(time.time())}"
                     )
-                    try:
-
-                        profile_filename = (
-                            f"{model.replace('/', '_')}_{int(time.time())}"
-                        )
-                        profile_path_prefix = os.path.join(
-                            PROFILE_DIR, profile_filename
+                    profile_path_prefix = os.path.join(PROFILE_DIR, profile_filename)
+                    json_output_file = f"results_{model_setup.model_path.replace('/', '_')}_{int(time.time())}.json"
+
+                    command = [
+                        "python3",
+                        "-m",
+                        "sglang.bench_one_batch_server",
+                        "--model",
+                        model_setup.model_path,
+                        "--base-url",
+                        self.base_url,
+                        "--batch-size",
+                        *[str(x) for x in self.batch_sizes],
+                        "--input-len",
+                        *[str(x) for x in self.input_lens],
+                        "--output-len",
+                        *[str(x) for x in self.output_lens],
+                        "--show-report",
+                        "--profile",
+                        "--profile-by-stage",
+                        "--profile-filename-prefix",
+                        profile_path_prefix,
+                        f"--output-path={json_output_file}",
+                        "--no-append-to-github-summary",
+                    ]
+
+                    print(f"Running command: {' '.join(command)}")
+                    result = subprocess.run(command, capture_output=True, text=True)
+
+                    if result.returncode != 0:
+                        print(
+                            f"Error running benchmark for {model_setup.model_path} with batch size:"
                         )
-                        json_output_file = (
-                            f"results_{model.replace('/', '_')}_{int(time.time())}.json"
+                        print(result.stderr)
+                        # Continue to next batch size even if one fails
+                        continue
+
+                    # Load and deserialize JSON results
+                    if os.path.exists(json_output_file):
+                        import json
+
+                        with open(json_output_file, "r") as f:
+                            json_data = json.load(f)
+
+                        # Convert JSON data to BenchmarkResult objects
+                        for data in json_data:
+                            benchmark_result = BenchmarkResult(**data)
+                            all_benchmark_results.append(benchmark_result)
+                            benchmark_results.append(benchmark_result)
+
+                        print(
+                            f"Loaded {len(benchmark_results)} benchmark results from {json_output_file}"
                         )
-                        command = [
-                            "python3",
-                            "-m",
-                            "sglang.bench_one_batch_server",
-                            "--model",
-                            model,
-                            "--base-url",
-                            self.base_url,
-                            "--batch-size",
-                            *[str(x) for x in self.batch_sizes],
-                            "--input-len",
-                            *[str(x) for x in self.input_lens],
-                            "--output-len",
-                            *[str(x) for x in self.output_lens],
-                            "--show-report",
-                            "--profile",
-                            "--profile-by-stage",
-                            "--profile-filename-prefix",
-                            profile_path_prefix,
-                            f"--output-path={json_output_file}",
-                            "--no-append-to-github-summary",
-                        ]
-
-                        print(f"Running command: {' '.join(command)}")
-                        result = subprocess.run(command, capture_output=True, text=True)
-
-                        if result.returncode != 0:
-                            print(
-                                f"Error running benchmark for {model} with batch size:"
-                            )
-                            print(result.stderr)
-                            # Continue to next batch size even if one fails
-                            continue
-
-                        # Load and deserialize JSON results
-                        if os.path.exists(json_output_file):
-                            import json
-
-                            with open(json_output_file, "r") as f:
-                                json_data = json.load(f)
-
-                            # Convert JSON data to BenchmarkResult objects
-                            for data in json_data:
-                                benchmark_result = BenchmarkResult(**data)
-                                all_benchmark_results.append(benchmark_result)
-                                benchmark_results.append(benchmark_result)
-
-                            print(
-                                f"Loaded {len(benchmark_results)} benchmark results from {json_output_file}"
-                            )
-
-                            # Clean up JSON file
-                            os.remove(json_output_file)
-                        else:
-                            print(
-                                f"Warning: JSON output file {json_output_file} not found"
-                            )
-
-                    finally:
-                        kill_process_tree(process.pid)
-
-                    report_part = BenchmarkResult.generate_markdown_report(
-                        PROFILE_DIR, benchmark_results
-                    )
-                    self.full_report += report_part + "\n"
+                        # Clean up JSON file
+                        os.remove(json_output_file)
+                    else:
+                        print(f"Warning: JSON output file {json_output_file} not found")
+
+                finally:
+                    kill_process_tree(process.pid)
+
+                report_part = BenchmarkResult.generate_markdown_report(
+                    PROFILE_DIR, benchmark_results
+                )
+                self.full_report += report_part + "\n"
 
         if is_in_ci():
             write_github_step_summary(self.full_report)
 
diff --git a/test/srt/test_nightly_vlms_mmmu_eval.py b/test/srt/test_nightly_vlms_mmmu_eval.py
index be3230a66f0..34ba4b31a26 100644
--- a/test/srt/test_nightly_vlms_mmmu_eval.py
+++ b/test/srt/test_nightly_vlms_mmmu_eval.py
@@ -1,6 +1,7 @@
 import json
 import unittest
 import warnings
+from functools import partial
 from types import SimpleNamespace
 
 from sglang.srt.utils import kill_process_tree
@@ -8,8 +9,8 @@
 from sglang.test.test_utils import (
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
-    ModelDeploySetup,
     ModelEvalMetrics,
+    ModelLaunchSettings,
     check_evaluation_test_results,
     popen_launch_server,
     write_results_to_json,
@@ -17,25 +18,29 @@
 
 MODEL_THRESHOLDS = {
     # Conservative thresholds on 100 MMMU samples, especially for latency thresholds
-    ModelDeploySetup("deepseek-ai/deepseek-vl2-small"): ModelEvalMetrics(0.330, 56.1),
-    ModelDeploySetup("deepseek-ai/Janus-Pro-7B"): ModelEvalMetrics(0.285, 39.9),
-    ModelDeploySetup("Efficient-Large-Model/NVILA-Lite-2B-hf-0626"): ModelEvalMetrics(
-        0.305, 23.8
+    ModelLaunchSettings("deepseek-ai/deepseek-vl2-small"): ModelEvalMetrics(
+        0.330, 56.1
     ),
-    ModelDeploySetup("google/gemma-3-4b-it"): ModelEvalMetrics(0.360, 10.9),
-    ModelDeploySetup("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 15.3),
-    ModelDeploySetup("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 16.6),
-    ModelDeploySetup("moonshotai/Kimi-VL-A3B-Instruct"): ModelEvalMetrics(0.330, 22.3),
-    ModelDeploySetup("openbmb/MiniCPM-o-2_6"): ModelEvalMetrics(0.330, 29.3),
-    ModelDeploySetup("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.270, 24.5),
-    ModelDeploySetup("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 14.0),
-    ModelDeploySetup("Qwen/Qwen2-VL-7B-Instruct"): ModelEvalMetrics(0.310, 83.3),
-    ModelDeploySetup("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9),
-    ModelDeploySetup("unsloth/Mistral-Small-3.1-24B-Instruct-2503"): ModelEvalMetrics(
-        0.310, 16.7
+    ModelLaunchSettings("deepseek-ai/Janus-Pro-7B"): ModelEvalMetrics(0.285, 40.3),
+    ModelLaunchSettings(
+        "Efficient-Large-Model/NVILA-Lite-2B-hf-0626"
+    ): ModelEvalMetrics(0.305, 23.8),
+    ModelLaunchSettings("google/gemma-3-4b-it"): ModelEvalMetrics(0.360, 10.9),
+    ModelLaunchSettings("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 15.3),
+    ModelLaunchSettings("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 16.6),
+    ModelLaunchSettings("moonshotai/Kimi-VL-A3B-Instruct"): ModelEvalMetrics(
+        0.330, 22.3
     ),
-    ModelDeploySetup("XiaomiMiMo/MiMo-VL-7B-RL"): ModelEvalMetrics(0.28, 32.0),
-    ModelDeploySetup("zai-org/GLM-4.1V-9B-Thinking"): ModelEvalMetrics(0.280, 30.4),
+    ModelLaunchSettings("openbmb/MiniCPM-o-2_6"): ModelEvalMetrics(0.330, 29.3),
+    ModelLaunchSettings("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.270, 24.5),
+    ModelLaunchSettings("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 14.0),
+    ModelLaunchSettings("Qwen/Qwen2-VL-7B-Instruct"): ModelEvalMetrics(0.310, 83.3),
+    ModelLaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9),
+    ModelLaunchSettings(
+        "unsloth/Mistral-Small-3.1-24B-Instruct-2503"
+    ): ModelEvalMetrics(0.310, 16.7),
+    ModelLaunchSettings("XiaomiMiMo/MiMo-VL-7B-RL"): ModelEvalMetrics(0.28, 32.0),
+    ModelLaunchSettings("zai-org/GLM-4.1V-9B-Thinking"): ModelEvalMetrics(0.280, 30.4),
 }
 
 
diff --git a/test/srt/test_nightly_vlms_perf.py b/test/srt/test_nightly_vlms_perf.py
index c4d10a56eaa..03d2e164af3 100644
--- a/test/srt/test_nightly_vlms_perf.py
+++ b/test/srt/test_nightly_vlms_perf.py
@@ -8,6 +8,7 @@
 from sglang.test.test_utils import (
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
+    ModelLaunchSettings,
     _parse_int_list_env,
     is_in_ci,
     parse_models,
@@ -19,8 +20,13 @@
 
 MODEL_DEFAULTS = [
     # Keep conservative defaults. Can be overridden by env NIGHTLY_VLM_MODELS
-    "Qwen/Qwen2.5-VL-7B-Instruct",
-    "google/gemma-3-27b-it",
+    ModelLaunchSettings(
+        "Qwen/Qwen2.5-VL-7B-Instruct",
+        extra_args=["--mem-fraction-static=0.7"],
+    ),
+    ModelLaunchSettings(
+        "google/gemma-3-27b-it",
+    ),
     # "OpenGVLab/InternVL2_5-2B",
     # buggy in official transformers impl
     # "openbmb/MiniCPM-V-2_6",
@@ -33,9 +39,18 @@ def setUpClass(cls):
         warnings.filterwarnings(
             "ignore", category=ResourceWarning, message="unclosed.*socket"
         )
-        cls.models = parse_models(
-            os.environ.get("NIGHTLY_VLM_MODELS", ",".join(MODEL_DEFAULTS))
-        )
+
+        nightly_vlm_models_str = os.environ.get("NIGHTLY_VLM_MODELS")
+        if nightly_vlm_models_str:
+            cls.models = []
+            model_paths = parse_models(nightly_vlm_models_str)
+            for model_path in model_paths:
+                cls.models.append(
+                    ModelLaunchSettings(model_path, extra_args=VLM_EXTRA_ARGS)
+                )
+        else:
+            cls.models = MODEL_DEFAULTS
+
         cls.base_url = DEFAULT_URL_FOR_TEST
 
         cls.batch_sizes = _parse_int_list_env("NIGHTLY_VLM_BATCH_SIZES", "1,1,2,8,16")
@@ -46,29 +61,31 @@ def setUpClass(cls):
 
     def test_bench_one_batch(self):
         all_benchmark_results = []
-        for model in self.models:
+        for model_setup in self.models:
             benchmark_results = []
-            with self.subTest(model=model):
+            with self.subTest(model=model_setup.model_path):
                 process = popen_launch_server(
-                    model=model,
+                    model=model_setup.model_path,
                     base_url=self.base_url,
-                    other_args=["--mem-fraction-static=0.7"],
+                    other_args=model_setup.extra_args,
                     timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
                 )
                 try:
 
                     # Run bench_one_batch_server against the launched server
-                    profile_filename = f"{model.replace('/', '_')}"
+                    profile_filename = f"{model_setup.model_path.replace('/', '_')}"
                     # path for this run
                     profile_path_prefix = os.path.join(PROFILE_DIR, profile_filename)
                     # JSON output file for this model
-                    json_output_file = f"results_{model.replace('/', '_')}.json"
+                    json_output_file = (
+                        f"results_{model_setup.model_path.replace('/', '_')}.json"
+                    )
 
                     command = [
                         "python3",
                         "-m",
                         "sglang.bench_one_batch_server",
-                        f"--model={model}",
+                        f"--model={model_setup.model_path}",
                         "--base-url",
                         self.base_url,
                         "--batch-size",
@@ -91,12 +108,14 @@ def test_bench_one_batch(self):
 
                     result = subprocess.run(command, capture_output=True, text=True)
                     if result.returncode != 0:
-                        print(f"Error running benchmark for {model} with batch size:")
+                        print(
+                            f"Error running benchmark for {model_setup.model_path} with batch size:"
+                        )
                         print(result.stderr)
                         # Continue to next batch size even if one fails
                         continue
 
-                    print(f"Output for {model} with batch size:")
+                    print(f"Output for {model_setup.model_path} with batch size:")
                     print(result.stdout)
 
                     # Load and deserialize JSON results
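
Usage sketch (illustrative only, not part of the patch): a minimal example of how the new ModelLaunchSettings helper from python/sglang/test/test_utils.py is constructed and handed to popen_launch_server, following the pattern used by the nightly tests above. The model id and the placeholder body of the try block are assumptions for illustration.

from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    ModelLaunchSettings,
    popen_launch_server,
)

# Example model id; any model path used by the nightly suites works the same way.
setup = ModelLaunchSettings("meta-llama/Llama-3.1-8B-Instruct", tp_size=2)

# The constructor copies extra_args, appends "--tp 2" because tp_size > 1,
# and always adds "--enable-multimodal" and "--trust-remote-code".
assert "--tp" in setup.extra_args
assert "--trust-remote-code" in setup.extra_args

process = popen_launch_server(
    model=setup.model_path,
    base_url=DEFAULT_URL_FOR_TEST,
    other_args=setup.extra_args,
    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
)
try:
    pass  # run an eval or benchmark against DEFAULT_URL_FOR_TEST here
finally:
    kill_process_tree(process.pid)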