Commit 335aab1

move to compilation config

Signed-off-by: Yi Pan <[email protected]>
1 parent 847c6f5 commit 335aab1

File tree: 6 files changed, +47 -41 lines

vllm/compilation/backends.py

Lines changed: 3 additions & 2 deletions
@@ -596,9 +596,10 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
 
         if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE or \
                 not self.compilation_config.cudagraph_copy_inputs:
-            if self.vllm_config.model_config.enable_nano_batch_split:
+            if self.compilation_config.enable_nano_batch_split:
                 return nano_manager.get_callable(self.split_gm,
-                                                 self.vllm_config)
+                                                 self.compilation_config,
+                                                 local_cache_dir)
             else:
                 return self.split_gm

vllm/compilation/nanoflow/manager.py

Lines changed: 23 additions & 9 deletions
@@ -3,6 +3,7 @@
 
 import contextlib
 import copy
+import os
 from typing import Callable, Optional
 
 import torch
@@ -13,24 +14,23 @@
     analyze_graph,
     get_split_config,
     split_graph, tag_graph)
-from vllm.config import VllmConfig
+from vllm.config import CompilationConfig
 
 
 class NanoSplitManager:
 
     def __init__(
         self,
         graph_module: torch.fx.GraphModule,
-        vllm_config: VllmConfig,
+        compilation_config: CompilationConfig,
+        local_cache_dir: Optional[str],
     ) -> None:
         self.original_graph_module = graph_module
         self.original_graph = graph_module.graph
 
         # Nano split preparation
-        self.min_nano_split_tokens = \
-            vllm_config.model_config.min_nano_split_tokens
-        self.max_num_nano_batches = \
-            vllm_config.model_config.max_num_nano_batches
+        self.min_nano_split_tokens = compilation_config.min_nano_split_tokens
+        self.max_num_nano_batches = compilation_config.max_num_nano_batches
         # Initialize the base graph
         tag_graph(
             self.original_graph_module,
@@ -75,6 +75,16 @@ def __init__(
             torch.fx.graph_module._copy_attr(self.original_graph_module,
                                              new_graph_module, name)
             self.graph_modules[num_splits] = new_graph_module
+            if local_cache_dir is not None:
+                graph_path = os.path.join(local_cache_dir,
+                                          f"nano_split_{num_splits}.py")
+                if not os.path.exists(graph_path):
+                    src = (
+                        "from __future__ import annotations\nimport torch\n" +
+                        new_graph_module.print_readable(print_output=False))
+                    src = src.replace("<lambda>", "GraphModule")
+                    with open(graph_path, "w") as f:
+                        f.write(src)
 
     @staticmethod
     def get_batch_size(idx: int, cached_config: NanoSplitConfig):
@@ -215,11 +225,15 @@ def set_hooks(self,
 _split_manager = None
 
 
-def get_callable(graph_module: torch.fx.GraphModule,
-                 vllm_config: VllmConfig) -> Callable:
+def get_callable(
+    graph_module: torch.fx.GraphModule,
+    compilation_config: CompilationConfig,
+    local_cache_dir: Optional[str] = None,
+) -> Callable:
     global _split_manager
     if _split_manager is None:
-        _split_manager = NanoSplitManager(graph_module, vllm_config)
+        _split_manager = NanoSplitManager(graph_module, compilation_config,
+                                          local_cache_dir)
     return _split_manager.get_callable()

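For context on the new dump logic above: torch.fx's GraphModule.print_readable returns the generated module source as a string when called with print_output=False, which is what gets written to nano_split_{num_splits}.py. A minimal standalone sketch of the same pattern, assuming a toy traced module and a scratch cache directory (neither is part of this commit):

import os

import torch
import torch.fx as fx


class Toy(torch.nn.Module):
    # Illustrative module standing in for vLLM's split graph module.
    def forward(self, x):
        return torch.relu(x) + 1


gm = fx.symbolic_trace(Toy())

cache_dir = "/tmp/nano_cache"  # stand-in for local_cache_dir
os.makedirs(cache_dir, exist_ok=True)
graph_path = os.path.join(cache_dir, "nano_split_2.py")

if not os.path.exists(graph_path):
    # print_readable(print_output=False) returns the source instead of printing it.
    src = ("from __future__ import annotations\nimport torch\n" +
           gm.print_readable(print_output=False))
    # fx sometimes names the generated class "<lambda>"; rename it so the dump
    # is importable as a normal Python file.
    src = src.replace("<lambda>", "GraphModule")
    with open(graph_path, "w") as f:
        f.write(src)

Writing only when the file is absent keeps repeated compilations from rewriting identical dumps.
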
vllm/config/__init__.py

Lines changed: 9 additions & 17 deletions
@@ -503,13 +503,6 @@ class ModelConfig:
     definitions"""
     io_processor_plugin: Optional[str] = None
     """IOProcessor plugin name to load at model startup"""
-    enable_nano_batch_split: bool = False
-    """Enable splitting the input batch into nano-batches for intra-device
-    parallelism"""
-    max_num_nano_batches: int = 2
-    """Maximum number of nano-batches to split the input batch into"""
-    min_nano_split_tokens: int = 1024
-    """Minimum number of tokens to split the input batch"""
 
     def compute_hash(self) -> str:
         """
@@ -538,9 +531,6 @@ def compute_hash(self) -> str:
         factors.append(self.override_generation_config)
         factors.append(self.rope_scaling)
         factors.append(self.rope_theta)
-        factors.append(self.enable_nano_batch_split)
-        factors.append(self.max_num_nano_batches)
-        factors.append(self.min_nano_split_tokens)
         # hf_config can control how the model looks!
         factors.append(self.hf_config.to_json_string())
         str_factors = str(factors)
@@ -3603,25 +3593,27 @@ def __post_init__(self):
                 "To workaround this limitation, vLLM will set 'ieee' input "
                 "precision for chunked prefill triton kernels.")
 
-        if self.model_config.enable_nano_batch_split:
+        if self.compilation_config.enable_nano_batch_split:
             if self.model_config.enforce_eager:
                 logger.info("nano batch split is not supported with "
                             "enforce_eager. Disabling nano batch split.")
-                self.model_config.enable_nano_batch_split = False
-            elif self.compilation_config.use_cudagraph:
+                self.compilation_config.enable_nano_batch_split = False
+            elif self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE:
                 logger.info("nano batch split is currently not supported with "
                             "cudagraph. Disabling nano batch split.")
-                self.model_config.enable_nano_batch_split = False
+                self.compilation_config.enable_nano_batch_split = False
             elif self.compilation_config.full_cuda_graph:
                 logger.info("full_cuda_graph is not supported with "
                             "nano batch split. Disabling nano batch split.")
-                self.model_config.enable_nano_batch_split = False
+                self.compilation_config.enable_nano_batch_split = False
             elif self.compilation_config.splitting_ops:
                 logger.info("splitting_ops is not supported with "
                             "nano batch split. Disabling nano batch split.")
-                self.model_config.enable_nano_batch_split = False
+                self.compilation_config.enable_nano_batch_split = False
             else:
-                self.compilation_config.splitting_ops = ["vllm.all_reduce"]
+                self.compilation_config.splitting_ops = [
+                    "vllm.all_reduce",
+                ]
         # If the user does not explicitly set a compilation level, then
         # we use the default level. The default level depends on other
         # settings (see the below code).

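The cascade above only ever turns the feature off; nano batch split is requested solely through CompilationConfig, and any conflicting setting wins. A condensed sketch of the same precedence using plain values instead of the real config objects (function and parameter names here are illustrative, not vLLM APIs):

from typing import List, Tuple


def resolve_nano_batch_split(
    enable: bool,
    enforce_eager: bool,
    cudagraph_enabled: bool,
    full_cuda_graph: bool,
    splitting_ops: List[str],
) -> Tuple[bool, List[str]]:
    # Mirrors the __post_init__ checks: enforce_eager, any cudagraph mode,
    # full_cuda_graph, or user-provided splitting_ops each disable the split;
    # otherwise splitting_ops is forced to ["vllm.all_reduce"].
    if not enable:
        return False, splitting_ops
    if enforce_eager or cudagraph_enabled or full_cuda_graph or splitting_ops:
        return False, splitting_ops
    return True, ["vllm.all_reduce"]


# Example: cudagraphs enabled, so the split is dropped.
assert resolve_nano_batch_split(True, False, True, False, []) == (False, [])
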
vllm/config/compilation.py

Lines changed: 11 additions & 0 deletions
@@ -299,6 +299,14 @@ class CompilationConfig:
     minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode instead.
     """
 
+    enable_nano_batch_split: bool = False
+    """Enable splitting the input batch into nano-batches for intra-device
+    parallelism"""
+    max_num_nano_batches: int = 2
+    """Maximum number of nano-batches to split the input batch into"""
+    min_nano_split_tokens: int = 1024
+    """Minimum number of tokens to split the input batch"""
+
     pass_config: PassConfig = field(default_factory=PassConfig)
     """Custom inductor passes, see PassConfig for more details"""
 
@@ -363,6 +371,9 @@ def compute_hash(self) -> str:
         factors.append(self.inductor_compile_config)
         factors.append(self.inductor_passes)
         factors.append(self.pass_config.uuid())
+        factors.append(self.enable_nano_batch_split)
+        factors.append(self.max_num_nano_batches)
+        factors.append(self.min_nano_split_tokens)
         return hashlib.sha256(str(factors).encode()).hexdigest()
 
     def __repr__(self) -> str:

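Registering the three knobs in compute_hash means two otherwise identical compilation configs that differ only in a nano-batch-split setting produce different cache keys, so compiled artifacts are not reused across them. A small sketch, assuming CompilationConfig is constructible with its defaults as declared above:

from vllm.config import CompilationConfig

base = CompilationConfig()
split = CompilationConfig(enable_nano_batch_split=True)

# The new fields are appended to the hash factors, so these should differ.
assert base.compute_hash() != split.compute_hash()
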
vllm/engine/arg_utils.py

Lines changed: 0 additions & 12 deletions
@@ -435,9 +435,6 @@ class EngineArgs:
         get_field(ModelConfig, "override_generation_config")
     model_impl: str = ModelConfig.model_impl
     override_attention_dtype: str = ModelConfig.override_attention_dtype
-    enable_nano_batch_split: bool = ModelConfig.enable_nano_batch_split
-    max_num_nano_batches: int = ModelConfig.max_num_nano_batches
-    min_nano_split_tokens: int = ModelConfig.min_nano_split_tokens
 
     calculate_kv_scales: bool = CacheConfig.calculate_kv_scales
     mamba_cache_dtype: MambaDType = CacheConfig.mamba_cache_dtype
@@ -583,12 +580,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
                                  **model_kwargs["logits_processors"])
         model_group.add_argument("--io-processor-plugin",
                                  **model_kwargs["io_processor_plugin"])
-        model_group.add_argument("--enable-nano-batch-split",
-                                 **model_kwargs["enable_nano_batch_split"])
-        model_group.add_argument("--max-num-nano-batches",
-                                 **model_kwargs["max_num_nano_batches"])
-        model_group.add_argument("--min-nano-split-tokens",
-                                 **model_kwargs["min_nano_split_tokens"])
         # Model loading arguments
         load_kwargs = get_kwargs(LoadConfig)
         load_group = parser.add_argument_group(
@@ -1005,9 +996,6 @@ def create_model_config(self) -> ModelConfig:
             override_attention_dtype=self.override_attention_dtype,
             logits_processors=self.logits_processors,
             io_processor_plugin=self.io_processor_plugin,
-            enable_nano_batch_split=self.enable_nano_batch_split,
-            max_num_nano_batches=self.max_num_nano_batches,
-            min_nano_split_tokens=self.min_nano_split_tokens,
         )
 
     def validate_tensorizer_args(self):

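With the EngineArgs fields and the dedicated --enable-nano-batch-split / --max-num-nano-batches / --min-nano-split-tokens flags removed, the knobs would now travel through the compilation config that EngineArgs already carries. A hedged sketch of the replacement (the model name is illustrative, and passing a CompilationConfig this way is an assumption based on the existing compilation_config field, not something shown in this diff):

from vllm.config import CompilationConfig
from vllm.engine.arg_utils import EngineArgs

args = EngineArgs(
    model="meta-llama/Llama-3.1-8B",  # illustrative
    compilation_config=CompilationConfig(
        enable_nano_batch_split=True,
        max_num_nano_batches=2,
        min_nano_split_tokens=1024,
    ),
)
# The values flow into VllmConfig when the engine config is built, where the
# __post_init__ checks from vllm/config/__init__.py may still disable them.
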
vllm/v1/worker/gpu_model_runner.py

Lines changed: 1 addition & 1 deletion
@@ -1584,7 +1584,7 @@ def execute_model(
                 batch_descriptor=batch_descriptor,
         ), self.maybe_get_kv_connector_output(
                 scheduler_output) as kv_connector_output:
-            if self.vllm_config.model_config.enable_nano_batch_split:
+            if self.vllm_config.compilation_config.enable_nano_batch_split:
                 self._prepare_nano_split(scheduler_output)
 
             model_output = self.model(