20 changes: 13 additions & 7 deletions python/sglang/test/test_utils.py
@@ -1641,15 +1641,21 @@ def _ensure_remove_suffix(text: str, suffix: str):
     return text.removesuffix(suffix)
 
 
-class ModelDeploySetup:
-    def __init__(self, model_path: str, extra_args: List[str] = []):
+class ModelLaunchSettings:
+    def __init__(
+        self,
+        model_path: str,
+        tp_size: int = 1,
+        extra_args: Optional[List[str]] = None,
+        env: Optional[dict] = None,
+    ):
         self.model_path = model_path
-        if "--enable-multimodal" not in extra_args:
-            extra_args.append("--enable-multimodal")
-        if "--trust-remote-code" not in extra_args:
-            extra_args.append("--trust-remote-code")
+        self.tp_size = tp_size
+        self.extra_args = list(extra_args) if extra_args else []
+        self.env = env
 
-        self.extra_args = extra_args
+        if self.tp_size > 1 and "--tp" not in self.extra_args:
+            self.extra_args.extend(["--tp", str(self.tp_size)])
 
 
 class ModelEvalMetrics:
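Note: a minimal usage sketch of the new ModelLaunchSettings class as defined above; the model path, flags, and env var are illustrative, not taken from the test suite:

```python
# The constructor copies extra_args, so a shared base list can be reused
# across models without being mutated; tp_size > 1 auto-appends "--tp".
settings = ModelLaunchSettings(
    "meta-llama/Llama-3.1-70B-Instruct",         # illustrative model path
    tp_size=2,
    extra_args=["--log-level-http", "warning"],  # shared base flags
    env={"SGLANG_USE_AITER": "1"},               # per-model env override
)
assert settings.extra_args == ["--log-level-http", "warning", "--tp", "2"]
```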
154 changes: 69 additions & 85 deletions test/srt/test_nightly_gsm8k_eval_amd.py
@@ -14,6 +14,7 @@
     DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
+    ModelLaunchSettings,
     is_in_ci,
     parse_models,
     popen_launch_server,
@@ -38,30 +39,6 @@
     "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.84,
 }
 
-failing_models = {
-    "neuralmagic/gemma-2-2b-it-FP8",
-}
-
-
-def remove_failing_models(model_str):
-    models = model_str.split(",")
-    filtered = [m for m in models if m not in failing_models]
-    return ",".join(filtered)
-
-
-DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = remove_failing_models(
-    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1
-)
-DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = remove_failing_models(
-    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2
-)
-DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = remove_failing_models(
-    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1
-)
-DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = remove_failing_models(
-    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2
-)
-
 NO_MOE_PADDING_MODELS = {"neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"}
 DISABLE_HF_XET_MODELS = {
     "Qwen/Qwen2-57B-A14B-Instruct",
@@ -75,20 +52,6 @@ def remove_failing_models(model_str):
 }
 
 
-def popen_launch_server_wrapper(base_url, model, is_tp2):
-    other_args = ["--log-level-http", "warning", "--trust-remote-code"]
-    if is_tp2:
-        other_args.extend(["--tp", "2"])
-
-    process = popen_launch_server(
-        model,
-        base_url,
-        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-        other_args=other_args,
-    )
-    return process
-
-
 def check_model_scores(results):
     failed_models = []
     summary = " | model | score | threshold |\n"
@@ -122,13 +85,37 @@
 class TestNightlyGsm8KEval(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.model_groups = [
-            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
-            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
-            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
-            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
-        ]
+        cls.models = []
         cls.base_url = DEFAULT_URL_FOR_TEST
+        extra_args = ["--log-level-http", "warning", "--trust-remote-code"]
+
+        def create_model_setup(model_path, tp_size):
+            env = {
+                "SGLANG_MOE_PADDING": (
+                    "0" if model_path in NO_MOE_PADDING_MODELS else "1"
+                ),
+                "HF_HUB_DISABLE_XET": (
+                    "1" if model_path in DISABLE_HF_XET_MODELS else "0"
+                ),
+                "SGLANG_USE_AITER": "0" if model_path in TRITON_MOE_MODELS else "1",
+            }
+            cls.models.append(
+                ModelLaunchSettings(
+                    model_path, tp_size=tp_size, extra_args=extra_args, env=env
+                )
+            )
+
+        models_tp1 = parse_models(
+            DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1
+        ) + parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1)
+        for model_path in models_tp1:
+            create_model_setup(model_path, 1)
+
+        models_tp2 = parse_models(
+            DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2
+        ) + parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2)
+        for model_path in models_tp2:
+            create_model_setup(model_path, 2)
 
     def test_mgsm_en_all_models(self):
         warnings.filterwarnings(
@@ -137,47 +124,44 @@ def test_mgsm_en_all_models(self):
         is_first = True
         all_results = []
 
-        for model_group, is_fp8, is_tp2 in self.model_groups:
-            for model in model_group:
-                with self.subTest(model=model):
-                    os.environ["SGLANG_MOE_PADDING"] = (
-                        "0" if model in NO_MOE_PADDING_MODELS else "1"
-                    )
-                    os.environ["HF_HUB_DISABLE_XET"] = (
-                        "1" if model in DISABLE_HF_XET_MODELS else "0"
-                    )
-                    os.environ["SGLANG_USE_AITER"] = (
-                        "0" if model in TRITON_MOE_MODELS else "1"
-                    )
-
-                    process = popen_launch_server_wrapper(self.base_url, model, is_tp2)
-
-                    args = SimpleNamespace(
-                        base_url=self.base_url,
-                        model=model,
-                        eval_name="mgsm_en",
-                        num_examples=None,
-                        num_threads=1024,
-                    )
-                    # Allow retries, so flaky errors are avoided.
-                    threshold = MODEL_SCORE_THRESHOLDS.get(model)
-                    for attempt in range(3):
-                        try:
-                            metrics = run_eval(args)
-                            score = metrics["score"]
-                            if score >= threshold:
-                                break
-                        except Exception as e:
-                            print(f"Attempt {attempt + 1} failed with error: {e}")
-                    print(
-                        f"{'=' * 42}\n{model} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n"
-                    )
-
-                    write_results_to_json(model, metrics, "w" if is_first else "a")
-                    is_first = False
-
-                    all_results.append((model, metrics["score"]))
-                    kill_process_tree(process.pid)
+        for model_setup in self.models:
+            with self.subTest(model=model_setup.model_path):
+                process = popen_launch_server(
+                    model=model_setup.model_path,
+                    base_url=self.base_url,
+                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                    other_args=model_setup.extra_args,
+                    env=model_setup.env,
+                )
+
+                args = SimpleNamespace(
+                    base_url=self.base_url,
+                    model=model_setup.model_path,
+                    eval_name="mgsm_en",
+                    num_examples=None,
+                    num_threads=1024,
+                )
+                # Allow retries, so flaky errors are avoided.
+                threshold = MODEL_SCORE_THRESHOLDS.get(model_setup.model_path)
+                for attempt in range(3):
+                    try:
+                        metrics = run_eval(args)
+                        score = metrics["score"]
+                        if score >= threshold:
+                            break
+                    except Exception as e:
+                        print(f"Attempt {attempt + 1} failed with error: {e}")
+                print(
+                    f"{'=' * 42}\n{model_setup.model_path} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n"
+                )
+
+                write_results_to_json(
+                    model_setup.model_path, metrics, "w" if is_first else "a"
+                )
+                is_first = False
+
+                all_results.append((model_setup.model_path, metrics["score"]))
+                kill_process_tree(process.pid)
 
         try:
             with open("results.json", "r") as f:
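Note: this file now passes a per-model env dict into popen_launch_server instead of mutating os.environ between launches. A minimal sketch of that pattern, assuming a popen-style launcher; the helper below is illustrative, not sglang's actual API:

```python
import os
import subprocess

def launch_with_env(launch_cmd, env=None):
    # Inherit the parent environment, then apply per-model overrides
    # without mutating os.environ for subsequent launches.
    merged = os.environ.copy()
    if env:
        merged.update(env)
    return subprocess.Popen(launch_cmd, env=merged)

# e.g. launch_with_env(["python", "-m", "sglang.launch_server", ...],
#                      env={"SGLANG_MOE_PADDING": "0"})
```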
63 changes: 36 additions & 27 deletions test/srt/test_nightly_text_models_gsm8k_eval.py
@@ -12,6 +12,7 @@
     DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
+    ModelLaunchSettings,
     check_evaluation_test_results,
     parse_models,
     popen_launch_server,
@@ -44,12 +45,19 @@
 class TestNightlyGsm8KEval(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.model_groups = [
-            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
-            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
-            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
-            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
-        ]
+        cls.models = []
+        models_tp1 = parse_models(
+            DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1
+        ) + parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1)
+        for model_path in models_tp1:
+            cls.models.append(ModelLaunchSettings(model_path, tp_size=1))
+
+        models_tp2 = parse_models(
+            DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2
+        ) + parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2)
+        for model_path in models_tp2:
+            cls.models.append(ModelLaunchSettings(model_path, tp_size=2))
+
         cls.base_url = DEFAULT_URL_FOR_TEST
 
     def test_mgsm_en_all_models(self):
@@ -58,41 +66,42 @@ def test_mgsm_en_all_models(self):
         )
         is_first = True
         all_results = []
-        model_count = 0
-        for model_group, is_fp8, is_tp2 in self.model_groups:
-            for model in model_group:
-                model_count += 1
-                with self.subTest(model=model):
-                    other_args = ["--tp", "2"] if is_tp2 else []
-
-                    if model == "meta-llama/Llama-3.1-70B-Instruct":
-                        other_args.extend(["--mem-fraction-static", "0.9"])
-
-                    process = popen_launch_server(
-                        model=model,
-                        other_args=other_args,
-                        base_url=self.base_url,
-                        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-                    )
+        for model_setup in self.models:
+            with self.subTest(model=model_setup.model_path):
+                other_args = list(model_setup.extra_args)
+
+                if model_setup.model_path == "meta-llama/Llama-3.1-70B-Instruct":
+                    other_args.extend(["--mem-fraction-static", "0.9"])
+
+                process = popen_launch_server(
+                    model=model_setup.model_path,
+                    other_args=other_args,
+                    base_url=self.base_url,
+                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                )
 
+                try:
                     args = SimpleNamespace(
                         base_url=self.base_url,
-                        model=model,
+                        model=model_setup.model_path,
                         eval_name="mgsm_en",
                         num_examples=None,
                         num_threads=1024,
                     )
 
                     metrics = run_eval(args)
                     print(
-                        f"{'=' * 42}\n{model} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n"
+                        f"{'=' * 42}\n{model_setup.model_path} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n"
                     )
 
-                    write_results_to_json(model, metrics, "w" if is_first else "a")
+                    write_results_to_json(
+                        model_setup.model_path, metrics, "w" if is_first else "a"
+                    )
                     is_first = False
 
                     # 0.0 for empty latency
-                    all_results.append((model, metrics["score"], 0.0))
+                    all_results.append((model_setup.model_path, metrics["score"], 0.0))
+                finally:
                     kill_process_tree(process.pid)
 
         try:
@@ -107,7 +116,7 @@ def test_mgsm_en_all_models(self):
             all_results,
             self.__class__.__name__,
             model_accuracy_thresholds=MODEL_SCORE_THRESHOLDS,
-            model_count=model_count,
+            model_count=len(self.models),
         )
 
 
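Note: this file's rewritten loop wraps the eval in try/finally, so the launched server is torn down even when run_eval raises and one failing model cannot leak a process into the next subTest. A minimal sketch of the pattern; the command and cleanup call are placeholders, not sglang's API:

```python
import subprocess
import sys

def run_one_model(eval_fn):
    # Placeholder long-running "server" process.
    process = subprocess.Popen(
        [sys.executable, "-c", "import time; time.sleep(60)"]
    )
    try:
        return eval_fn()  # may raise; cleanup below still runs
    finally:
        process.kill()  # stand-in for kill_process_tree(process.pid)
```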