vllm-project
diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py
Lines changed: 11 additions & 26 deletions
diff --git a/src/guidellm/benchmark/entrypoints.py b/src/guidellm/benchmark/entrypoints.py
Lines changed: 2 additions & 2 deletions
diff --git a/src/guidellm/benchmark/schemas/generative/entrypoints.py b/src/guidellm/benchmark/schemas/generative/entrypoints.py
Lines changed: 4 additions & 4 deletions
diff --git a/src/guidellm/scheduler/constraints/factory.py b/src/guidellm/scheduler/constraints/factory.py
Lines changed: 2 additions & 3 deletions
diff --git a/src/guidellm/scheduler/constraints/saturation.py b/src/guidellm/scheduler/constraints/saturation.py
Lines changed: 59 additions & 56 deletions
diff --git a/src/guidellm/utils/cli.py b/src/guidellm/utils/cli.py
Lines changed: 25 additions & 1 deletion
diff --git a/tests/e2e/test_over_saturated_benchmark.py b/tests/e2e/test_over_saturated_benchmark.py
Lines changed: 5 additions & 4 deletions
@@ -25,7 +25,6 @@
 
 import asyncio
 import codecs
-import json
 from pathlib import Path
 
 import click
@@ -388,40 +387,26 @@ def benchmark():
 @click.option(
     "--over-saturation",
     "--detect-saturation",  # alias
+    "over_saturation",
+    callback=cli_tools.parse_json,
     default=None,
     help=(
         "Enable over-saturation detection. "
-        "Use --over-saturation=True for boolean flag, "
-        "or a JSON dict with configuration "
+        "Pass a JSON dict with configuration "
         '(e.g., \'{"enabled": true, "min_seconds": 30}\'). '
         "Defaults to None (disabled)."
     ),
-    type=click.UNPROCESSED,
+)
+@click.option(
+    "--default-over-saturation",
+    "over_saturation",
+    flag_value={"enabled": True},
+    help="Enable over-saturation detection with default settings.",
 )
 def run(**kwargs):  # noqa: C901
     # Only set CLI args that differ from click defaults
     kwargs = cli_tools.set_if_not_default(click.get_current_context(), **kwargs)
 
-    # Handle over_saturation parsing (can be bool flag or JSON dict string)
-    if "over_saturation" in kwargs and kwargs["over_saturation"] is not None:
-        over_sat = kwargs["over_saturation"]
-        if isinstance(over_sat, str):
-            try:
-                # Try parsing as JSON dict
-                kwargs["over_saturation"] = json.loads(over_sat)
-            except (json.JSONDecodeError, ValueError):
-                # If not valid JSON, treat as bool flag
-                kwargs["over_saturation"] = over_sat.lower() in (
-                    "true",
-                    "1",
-                    "yes",
-                    "on",
-                )
-        elif isinstance(over_sat, bool):
-            # Already a bool, keep as is
-            pass
-        # If it's already a dict, keep as is
-
     # Handle remapping for request params
     request_type = kwargs.pop("request_type", None)
     request_formatter_kwargs = kwargs.pop("request_formatter_kwargs", None)
@@ -557,8 +542,8 @@ def preprocess():
         "PreprocessDatasetConfig as JSON string, key=value pairs, "
         "or file path (.json, .yaml, .yml, .config). "
         "Example: 'prompt_tokens=100,output_tokens=50,prefix_tokens_max=10'"
-        " or '{\"prompt_tokens\": 100, \"output_tokens\": 50, "
-        "\"prefix_tokens_max\": 10}'"
+        ' or \'{"prompt_tokens": 100, "output_tokens": 50, '
+        '"prefix_tokens_max": 10}\''
     ),
 )
 @click.option(
 
@@ -323,7 +323,7 @@ async def resolve_profile(
     max_errors: int | None,
     max_error_rate: float | None,
     max_global_error_rate: float | None,
-    over_saturation: bool | dict[str, Any] | None = None,
+    over_saturation: dict[str, Any] | None = None,
     console: Console | None = None,
 ) -> Profile:
     """
@@ -344,7 +344,7 @@ async def resolve_profile(
     :param max_errors: Maximum number of errors before stopping
     :param max_error_rate: Maximum error rate threshold before stopping
     :param max_global_error_rate: Maximum global error rate threshold before stopping
-    :param over_saturation: Over-saturation detection configuration (bool or dict)
+    :param over_saturation: Over-saturation detection configuration (dict)
     :param console: Console instance for progress reporting, or None
     :return: Configured Profile instance ready for benchmarking
     :raises ValueError: If constraints are provided with a pre-configured Profile
 
@@ -283,12 +283,12 @@ def get_default(cls: type[BenchmarkGenerativeTextArgs], field: str) -> Any:
     max_global_error_rate: float | None = Field(
         default=None, description="Maximum global error rate (0-1) before stopping"
     )
-    over_saturation: bool | dict[str, Any] | None = Field(
+    over_saturation: dict[str, Any] | None = Field(
         default=None,
         description=(
-            "Over-saturation detection configuration. Can be a bool to enable/disable "
-            "with defaults, or a dict with configuration parameters (enabled, "
-            "min_seconds, max_window_seconds, moe_threshold, etc.)."
+            "Over-saturation detection configuration. A dict with configuration "
+            "parameters (enabled, min_seconds, max_window_seconds, "
+            "moe_threshold, etc.)."
         ),
     )
 
 
@@ -10,14 +10,13 @@
 
 from typing import Any
 
-from guidellm.utils import InfoMixin, RegistryMixin
-
-from .constraint import (
+from guidellm.scheduler.constraints.constraint import (
     Constraint,
     ConstraintInitializer,
     SerializableConstraintInitializer,
     UnserializableConstraintInitializer,
 )
+from guidellm.utils import InfoMixin, RegistryMixin
 
 __all__ = ["ConstraintsInitializerFactory"]
 
 
@@ -57,15 +57,16 @@
 
 from pydantic import Field
 
+from guidellm.scheduler.constraints.constraint import (
+    Constraint,
+    PydanticConstraintInitializer,
+)
+from guidellm.scheduler.constraints.factory import ConstraintsInitializerFactory
 from guidellm.scheduler.schemas import (
     SchedulerState,
     SchedulerUpdateAction,
 )
 from guidellm.schemas import RequestInfo
-from guidellm.settings import settings
-
-from .constraint import Constraint, PydanticConstraintInitializer
-from .factory import ConstraintsInitializerFactory
 
 __all__ = [
     "OverSaturationConstraint",
@@ -355,7 +356,12 @@ def reset(self) -> None:
         )
 
     def _add_finished(self, request: dict[str, Any]) -> None:
-        """Add a finished request to tracking."""
+        """
+        Add a finished request to tracking.
+
+        :param request: Dictionary containing request data with 'ttft' and
+            'duration' keys.
+        """
         ttft = request["ttft"]
         duration = request["duration"]
         if ttft is not None:
@@ -366,7 +372,12 @@ def _add_finished(self, request: dict[str, Any]) -> None:
             self.ttft_slope_checker.add_data_point(duration, ttft)
 
     def _remove_finished(self, request: dict[str, Any]) -> None:
-        """Remove a finished request from tracking."""
+        """
+        Remove a finished request from tracking.
+
+        :param request: Dictionary containing request data with 'ttft' and
+            'duration' keys.
+        """
         del self.finished_requests[0]
         ttft = request["ttft"]
         duration = request["duration"]
@@ -375,7 +386,12 @@ def _remove_finished(self, request: dict[str, Any]) -> None:
         self.ttft_slope_checker.remove_data_point(duration, ttft)
 
     def _add_started(self, request: dict[str, Any]) -> None:
-        """Add a started request to tracking."""
+        """
+        Add a started request to tracking.
+
+        :param request: Dictionary containing request data with
+            'concurrent_requests' and 'duration' keys.
+        """
         concurrent = request["concurrent_requests"]
         duration = request["duration"]
         if concurrent is not None:
@@ -384,14 +400,26 @@ def _add_started(self, request: dict[str, Any]) -> None:
             self.concurrent_slope_checker.add_data_point(duration, concurrent)
 
     def _remove_started(self, request: dict[str, Any]) -> None:
-        """Remove a started request from tracking."""
+        """
+        Remove a started request from tracking.
+
+        :param request: Dictionary containing request data with
+            'concurrent_requests' and 'duration' keys.
+        """
         del self.started_requests[0]
         concurrent = request["concurrent_requests"]
         duration = request["duration"]
         self.concurrent_slope_checker.remove_data_point(duration, concurrent)
 
     def _update_duration(self, duration: float) -> None:
-        """Update duration and prune old data points."""
+        """
+        Update duration and prune old data points.
+
+        Updates the current duration and removes data points that exceed the maximum
+        window size (by ratio or time) to maintain bounded memory usage.
+
+        :param duration: Current duration in seconds since benchmark start.
+        """
         self.duration = duration
 
         maximum_finished_window_size = int(
@@ -428,8 +456,7 @@ def _check_alert(self) -> bool:
         """
         Check if over-saturation is currently detected.
 
-        Returns:
-            True if over-saturation is detected, False otherwise.
+        :return: True if over-saturation is detected, False otherwise.
         """
         # Use duration as the maximum n value since requests from the
         # same second are highly correlated, this is simple and good enough
@@ -521,13 +548,13 @@ class OverSaturationConstraintInitializer(PydanticConstraintInitializer):
     Factory for creating OverSaturationConstraint instances from configuration.
 
     Provides a Pydantic-based initializer for over-saturation detection constraints
-    with support for flexible configuration patterns. Supports both simple boolean
-    flags and detailed configuration dictionaries, enabling easy integration with
-    CLI arguments, configuration files, and programmatic constraint creation.
+    with support for flexible configuration patterns. Supports detailed configuration
+    dictionaries, enabling easy integration with CLI arguments, configuration files,
+    and programmatic constraint creation.
 
     Example:
     ::
-        # Simple boolean configuration
+        # Configuration with defaults
         initializer = OverSaturationConstraintInitializer(enabled=True)
         constraint = initializer.create_constraint()
 
@@ -618,18 +645,18 @@ def create_constraint(self, **_kwargs) -> Constraint:
 
     @classmethod
     def validated_kwargs(
-        cls, over_saturation: bool | dict[str, Any] | None = None, **kwargs
+        cls, over_saturation: dict[str, Any] | None = None, **kwargs
     ) -> dict[str, Any]:
         """
         Validate and process arguments for OverSaturationConstraint creation.
 
-        Processes flexible input formats to create validated constraint configuration.
-        Supports boolean flags for simple enable/disable, dictionary inputs for detailed
-        configuration, and alias parameters for compatibility. Handles parameter
-        normalization and default value application.
+        Processes flexible input formats to create validated constraint
+        configuration. Supports dictionary inputs for detailed configuration, and
+        alias parameters for compatibility. Handles parameter normalization and
+        default value application.
 
-        :param over_saturation: Boolean to enable/disable with defaults, or dictionary
-            with configuration parameters (min_seconds, max_window_seconds, etc.)
+        :param over_saturation: Dictionary with configuration parameters
+            (min_seconds, max_window_seconds, etc.)
         :param kwargs: Additional keyword arguments supporting aliases like
             "detect_saturation" for compatibility, or unpacked dict values when
             dict is passed to factory
@@ -638,7 +665,7 @@ def validated_kwargs(
         """
         # Check for aliases in kwargs
         aliases = ["over_saturation", "detect_saturation"]
-        result: bool | dict[str, Any] | None = over_saturation
+        result: dict[str, Any] | None = over_saturation
 
         for alias in aliases:
             alias_value = kwargs.get(alias)
@@ -664,37 +691,13 @@ def validated_kwargs(
                 result = {key: kwargs[key] for key in constraint_keys if key in kwargs}
 
         if result is None:
-            return {}
-
-        if isinstance(result, bool):
-            # When a boolean is passed, read defaults from settings
-            return {
-                "enabled": result,
-                "min_seconds": kwargs.get(
-                    "min_seconds", settings.constraint_over_saturation_min_seconds
-                ),
-                "max_window_seconds": kwargs.get(
-                    "max_window_seconds",
-                    settings.constraint_over_saturation_max_window_seconds,
-                ),
-            }
-        elif isinstance(result, dict):
-            # Extract configuration from dict, reading from settings for missing values
-            return {
-                "enabled": result.get("enabled", True),
-                "min_seconds": result.get(
-                    "min_seconds", settings.constraint_over_saturation_min_seconds
-                ),
-                "max_window_seconds": result.get(
-                    "max_window_seconds",
-                    settings.constraint_over_saturation_max_window_seconds,
-                ),
-                "moe_threshold": result.get("moe_threshold", 2.0),
-                "minimum_ttft": result.get("minimum_ttft", 2.5),
-                "maximum_window_ratio": result.get("maximum_window_ratio", 0.75),
-                "minimum_window_size": result.get("minimum_window_size", 5),
-                "confidence": result.get("confidence", 0.95),
-            }
+            return {"enabled": False}
+
+        if isinstance(result, dict):
+            # Return dict as-is, defaults come from fields above
+            return result
         else:
-            # Convert to bool if it's truthy
-            return {"enabled": bool(result)}
+            # Type signature only accepts dict or None, so this should never happen
+            raise TypeError(
+                f"over_saturation must be a dict or None, got {type(result).__name__}"
+            )
@@ -65,12 +65,36 @@ def parse_list_floats(ctx, param, value):
         ) from err
 
 
-def parse_json(ctx, param, value):  # noqa: ARG001
+def parse_json(ctx, param, value):  # noqa: ARG001, C901, PLR0911, PLR0912
     if value is None or value == [None]:
         return None
+    if isinstance(value, dict):
+        # Already parsed (e.g., from flag_value); lists are parsed per item below
+        return value
     if isinstance(value, list | tuple):
         return [parse_json(ctx, param, val) for val in value]
 
+    # Handle empty strings (can occur when multiple options map to same parameter)
+    if isinstance(value, str) and not value.strip():
+        return None
+
+    # Handle string representation of dict (can occur when flag_value dict is
+    # converted to string)
+    if isinstance(value, str) and value.startswith("{") and value.endswith("}"):
+        # Try to parse as JSON first
+        try:
+            return json.loads(value)
+        except json.JSONDecodeError:
+            # If JSON parsing fails, try ast.literal_eval for Python dict syntax
+            try:
+                import ast
+
+                parsed = ast.literal_eval(value)
+                if isinstance(parsed, dict):
+                    return parsed
+            except (ValueError, SyntaxError):
+                pass  # Fall through to normal processing
+
     if "{" not in value and "}" not in value and "=" in value:
         # Treat it as a key=value pair if it doesn't look like JSON.
         result = {}
 
@@ -36,7 +36,7 @@ def server():
 @pytest.mark.timeout(60)
 def test_over_saturated_benchmark(server: VllmSimServer):
     """
-    Another example test interacting with the server.
+    Test over-saturation detection using the --default-over-saturation flag.
     """
     report_path = Path("tests/e2e/over_saturated_benchmarks.json")
     rate = 10
@@ -45,14 +45,15 @@ def test_over_saturated_benchmark(server: VllmSimServer):
     client = GuidellmClient(target=server.get_url(), output_path=report_path)
 
     cleanup_report_file(report_path)
-    # Start the benchmark
+    # Start the benchmark with --default-over-saturation flag
     client.start_benchmark(
         rate=rate,
         max_seconds=20,
-        over_saturation=True,
+        over_saturation={},  # Empty dict triggers --default-over-saturation flag
         extra_env={
-            "GUIDELLM__CONSTRAINT_OVER_SATURATION_MIN_SECONDS": "0",
             "GOMAXPROCS": "1",
+            # Set min_seconds via env var for faster test (TODO confirm still read)
+            "GUIDELLM__CONSTRAINT_OVER_SATURATION_MIN_SECONDS": "0",
         },
     )