
Commit 5e1b314

Moved some tests to L1
1 parent 9ac07a9 commit 5e1b314

9 files changed: +136 -104 lines changed


.github/workflows/build-test-linux-x86_64.yml

Lines changed: 3 additions & 1 deletion
@@ -136,7 +136,7 @@ jobs:
 cd tests/py
 cd dynamo
 python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l0_dynamo_core_runtime_tests_results.xml runtime/test_000_*
-python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l0_dynamo_core_partitioning_tests_results.xml partitioning/
+python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l0_dynamo_core_partitioning_tests_results.xml partitioning/L0/
 python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l0_dynamo_core_lowering_tests_results.xml lowering/
 popd
 
@@ -229,6 +229,8 @@ jobs:
 pushd .
 cd tests/py/dynamo
 python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_dynamo_core_tests_results.xml runtime/test_001_*
+python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_dynamo_core_partitioning_tests_results.xml partitioning/L1/
+
 popd
 
 L1-dynamo-compile-tests:

py/torch_tensorrt/dynamo/partitioning/_resource_partitioner.py

Lines changed: 20 additions & 12 deletions
@@ -60,6 +60,7 @@
 logger = logging.getLogger(__name__)
 
 MAX_NUM_OF_ENGINES = 40
+ENGINE_COMPILATION_MEMORY_USAGE_MULTIPLIER = 4
 
 
 class ResourcePartitioner(_SplitterBase):  # type: ignore
@@ -87,8 +88,9 @@ def __init__(
         assert isinstance(module, torch.fx.GraphModule)
 
         self.module = module
-        self.cpu_memory_budget = (
-            cpu_memory_budget
+        used_rss: int = psutil.Process().memory_info().rss
+        self.remaining_memory_budget = (
+            cpu_memory_budget - used_rss
             if cpu_memory_budget is not None
             else psutil.virtual_memory().available
         )
@@ -114,6 +116,12 @@ def partition_graph(self) -> torch.fx.GraphModule:
         """
         # Delegate nodes based on operator coverage
         subgraphs = self.put_nodes_into_subgraphs()
+        sizes = self.size_of_subgraphs(subgraphs)
+        if (
+            sum(sizes) * ENGINE_COMPILATION_MEMORY_USAGE_MULTIPLIER
+            < self.remaining_memory_budget
+        ):
+            return self.module
 
         subgraphs = self.break_subgraphs(
             subgraphs, subgraph_size_budget=self.calculate_size_budget()
@@ -172,7 +180,8 @@ def check_topological_order(self, subgraphs: List[Subgraph]) -> bool:
         return True
 
     def calculate_size_budget(
-        self, engine_compilation_memory_usage_multiplier: int = 4
+        self,
+        engine_compilation_memory_usage_multiplier: int = ENGINE_COMPILATION_MEMORY_USAGE_MULTIPLIER,
     ) -> int:
         """Compute the per-engine size budget in bytes.
 
@@ -188,13 +197,9 @@ def calculate_size_budget(
             int: Budget in bytes for a single accelerated subgraph.
         """
 
-        used_rss: int = psutil.Process().memory_info().rss
-        available_rss = (
-            self.cpu_memory_budget
-            if self.not_set_limit
-            else self.cpu_memory_budget - used_rss
+        return (
+            self.remaining_memory_budget // engine_compilation_memory_usage_multiplier
         )
-        return available_rss // engine_compilation_memory_usage_multiplier
 
     def break_subgraphs(
         self, subgraphs: List[Subgraph], subgraph_size_budget: int
@@ -229,7 +234,7 @@ def break_subgraphs(
         else:
             raise ValueError(
                 "CPU memory budget is too small to compile the model. "
-                + f"CPU memory budget: {self.cpu_memory_budget // (1024 * 1024)} MB, Model size: {sum(sizes) // (1024 * 1024)} MB. "
+                + f"CPU memory budget: {self.remaining_memory_budget // (1024 * 1024)} MB, Model size: {sum(sizes) // (1024 * 1024)} MB. "
                 + "Consider setting cpu_memory_budget to a larger value."
             )
         for subgraph, size in zip(subgraphs, sizes):
@@ -548,12 +553,15 @@ def resource_partition(
         setattr(gm, name, partitioned_graph)
 
     for name, module in list(gm.named_children()):
+        split = False
        if "_run_on_acc" in name:
             for subname, submodule in module.named_children():
                 if "resource_split" in subname:
+                    split = True
                     setattr(gm, subname, submodule)
-            _inline_module(gm, name)
-            delattr(gm, name)
+            if split:
+                _inline_module(gm, name)
+                delattr(gm, name)
 
     gm.recompile()
     return gm
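
Note: the net effect of the partitioner changes above is that the memory budget is now fixed at construction time as the user-supplied cap minus the process RSS (or all available system memory when no cap is given), and partition_graph() returns the module untouched when the estimated compilation footprint already fits. A minimal standalone sketch of that logic follows; the helper names are illustrative only and are not part of the torch_tensorrt API.

from typing import List, Optional

import psutil

# Mirrors the module-level constant added in this diff.
ENGINE_COMPILATION_MEMORY_USAGE_MULTIPLIER = 4


def remaining_memory_budget(cpu_memory_budget: Optional[int] = None) -> int:
    # Budget left for engine compilation: the user-supplied cap minus the memory
    # this process already holds, or all available system memory if no cap is set.
    used_rss = psutil.Process().memory_info().rss
    if cpu_memory_budget is not None:
        return cpu_memory_budget - used_rss
    return psutil.virtual_memory().available


def fits_without_splitting(subgraph_sizes: List[int], budget: int) -> bool:
    # Early exit corresponding to the new check in partition_graph(): if compiling
    # every subgraph (with the 4x compilation-overhead multiplier) fits within the
    # remaining budget, resource partitioning can be skipped entirely.
    return sum(subgraph_sizes) * ENGINE_COMPILATION_MEMORY_USAGE_MULTIPLIER < budget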
Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
+import torch
+import torch.nn as nn
+from torch.fx.passes.splitter_base import Subgraph
+from torch.ops import aten
+from torch.testing._internal.common_utils import TestCase, run_tests
+from torch_tensorrt.dynamo import partitioning
+from torch_tensorrt.dynamo.conversion import CompilationSettings
+from torch_tensorrt.dynamo.lowering import (
+    get_decompositions,
+    post_lowering,
+    pre_export_lowering,
+)
+from torch_tensorrt.dynamo.lowering.passes import post_lowering, pre_export_lowering
+from torch_tensorrt.dynamo.partitioning._resource_partitioner import (
+    ResourcePartitioner,
+)
+
+
+class TestResourcePartitioning(TestCase):
+    def test_atomic_subgraph_correction(self):
+        class net(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv1 = nn.Conv2d(3, 3, 3, padding=1)
+                self.bn1 = nn.BatchNorm2d(3)
+                self.relu = nn.ReLU()
+                self.fc = nn.Linear(3 * 224 * 224, 10)
+
+            def forward(self, x):
+                x = self.conv1(x)
+                x = self.bn1(x)
+                x = self.relu(x)
+                x = torch.flatten(x, 1)
+                x = self.fc(x)
+                return x
+
+        model = net().eval()
+        model.to("cuda")
+        inputs = [torch.randn((1, 3, 224, 224)).to("cuda")]
+
+        enabled_precisions = {torch.float}
+        use_python_runtime = False
+
+        exp_program = torch.export.export(model, tuple(inputs))
+
+        compilation_options = {
+            "use_python_runtime": use_python_runtime,
+            "enabled_precisions": enabled_precisions,
+            "min_block_size": 1,
+            "immutable_weights": True,
+            "reuse_cached_engines": False,
+            "enable_resource_partitioning": True,
+        }
+        settings = CompilationSettings(**compilation_options)
+
+        exported_program = pre_export_lowering(exp_program, settings)
+        exported_program = exported_program.run_decompositions(
+            get_decompositions(False)
+        )
+
+        gm = exported_program.module()
+        gm = post_lowering(gm, settings)
+
+        partitioned_module, supported_ops = partitioning.fast_partition(
+            gm,
+            min_block_size=settings.min_block_size,
+            torch_executed_ops=settings.torch_executed_ops,
+            require_full_compilation=settings.require_full_compilation,
+            skip_fusion=True,
+        )
+
+        for name, _ in partitioned_module.named_children():
+            submodule = getattr(partitioned_module, name)
+            if (
+                not isinstance(submodule, torch.fx.graph_module.GraphModule)
+                or "_run_on_acc" not in name
+            ):
+                continue
+            partitioner = ResourcePartitioner(
+                submodule,
+                submodule_name=name,
+                cpu_memory_budget=2 * 1024 * 1024 * 1024,
+            )
+            subgraphs = partitioner.put_nodes_into_subgraphs()
+            new_subgraphs = []
+            current_subgraph = []
+            # Split the subgraph into two subgraphs by the ReLU node, which breaks the fusion group.
+            for node in subgraphs[0].nodes:
+                if node.op == "call_function" and node.target == aten.relu.default:
+                    new_subgraphs.append(Subgraph(is_acc=True, nodes=current_subgraph))
+                    current_subgraph = []
+                current_subgraph.append(node)
+            if current_subgraph:
+                new_subgraphs.append(Subgraph(is_acc=True, nodes=current_subgraph))
+
+            leaf_node = partitioner.get_leaf_node(new_subgraphs[0].nodes)
+            broken_fusion = partitioner.step_if_break_fusion(
+                new_subgraphs,
+                leaf_node,
+                set(new_subgraphs[0].nodes),
+                set(new_subgraphs[1].nodes),
+            )
+            # The fusion was broken
+            assert broken_fusion
+
+            # The fusion should be fixed after the step
+            partitioner._verify_all_fusion_nodes_in_same_subgraph(new_subgraphs)
+
+            break
+
+
+if __name__ == "__main__":
+    run_tests()

tests/py/dynamo/partitioning/test_resource_partitioning.py renamed to tests/py/dynamo/partitioning/L1/test_resource_partitioning.py

Lines changed: 0 additions & 91 deletions
@@ -326,97 +326,6 @@ def forward(self, x):
 
         torch._dynamo.reset()
 
-    def test_atomic_subgraph_correction(self):
-        class net(nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.conv1 = nn.Conv2d(3, 3, 3, padding=1)
-                self.bn1 = nn.BatchNorm2d(3)
-                self.relu = nn.ReLU()
-                self.fc = nn.Linear(3 * 224 * 224, 10)
-
-            def forward(self, x):
-                x = self.conv1(x)
-                x = self.bn1(x)
-                x = self.relu(x)
-                x = torch.flatten(x, 1)
-                x = self.fc(x)
-                return x
-
-        model = net().eval()
-        model.to("cuda")
-        inputs = [torch.randn((1, 3, 224, 224)).to("cuda")]
-
-        enabled_precisions = {torch.float}
-        use_python_runtime = False
-
-        exp_program = torch.export.export(model, tuple(inputs))
-
-        compilation_options = {
-            "use_python_runtime": use_python_runtime,
-            "enabled_precisions": enabled_precisions,
-            "min_block_size": 1,
-            "immutable_weights": True,
-            "reuse_cached_engines": False,
-            "enable_resource_partitioning": True,
-        }
-        settings = CompilationSettings(**compilation_options)
-
-        exported_program = pre_export_lowering(exp_program, settings)
-        exported_program = exported_program.run_decompositions(
-            get_decompositions(False)
-        )
-
-        gm = exported_program.module()
-        gm = post_lowering(gm, settings)
-
-        partitioned_module, supported_ops = partitioning.fast_partition(
-            gm,
-            min_block_size=settings.min_block_size,
-            torch_executed_ops=settings.torch_executed_ops,
-            require_full_compilation=settings.require_full_compilation,
-            skip_fusion=True,
-        )
-
-        for name, _ in partitioned_module.named_children():
-            submodule = getattr(partitioned_module, name)
-            if (
-                not isinstance(submodule, torch.fx.graph_module.GraphModule)
-                or "_run_on_acc" not in name
-            ):
-                continue
-            partitioner = ResourcePartitioner(
-                submodule,
-                submodule_name=name,
-                cpu_memory_budget=2 * 1024 * 1024 * 1024,
-            )
-            subgraphs = partitioner.put_nodes_into_subgraphs()
-            new_subgraphs = []
-            current_subgraph = []
-            # Split the subgraph into two subgraphs by the ReLU node, which breaks the fusion group.
-            for node in subgraphs[0].nodes:
-                if node.op == "call_function" and node.target == aten.relu.default:
-                    new_subgraphs.append(Subgraph(is_acc=True, nodes=current_subgraph))
-                    current_subgraph = []
-                current_subgraph.append(node)
-            if current_subgraph:
-                new_subgraphs.append(Subgraph(is_acc=True, nodes=current_subgraph))
-
-            leaf_node = partitioner.get_leaf_node(new_subgraphs[0].nodes)
-            broken_fusion = partitioner.step_if_break_fusion(
-                new_subgraphs,
-                leaf_node,
-                set(new_subgraphs[0].nodes),
-                set(new_subgraphs[1].nodes),
-            )
-            # The fusion was broken
-            assert broken_fusion
-
-            # The fusion should be fixed after the step
-            partitioner._verify_all_fusion_nodes_in_same_subgraph(new_subgraphs)
-
-            break
-
     def test_resource_partitioning_with_global_capability_partitioning(self):
         class net(nn.Module):
             def __init__(self):
