feat: add ulimits support to aws_batch (#1126)

azzhipa · azzhipa · commit 01ef2f678a19 · 2025-09-24T22:37:53.000-04:00
diff --git a/torchx/schedulers/aws_batch_scheduler.py b/torchx/schedulers/aws_batch_scheduler.py
@@ -99,6 +99,46 @@
 TAG_TORCHX_USER = "torchx.pytorch.org/user"
 
 
+def parse_ulimits(ulimits_str: str) -> List[Dict[str, Any]]:
+    """
+    Parse ``name=nofile,softLimit=65536,hardLimit=65536`` style ulimit specs.
+
+    Multiple ulimits are separated by semicolons; blank entries are skipped.
+    A limit of ``-1`` is passed through unchanged.
+
+    Raises:
+        ValueError: on an unknown key, a missing ``=`` or empty value, a
+            non-integer limit, or a ulimit lacking any of the three keys.
+    """
+    ulimits: List[Dict[str, Any]] = []
+    for ulimit_str in (ulimits_str or "").split(";"):
+        if not ulimit_str.strip():
+            continue
+
+        ulimit: Dict[str, Any] = {}
+        for opt in ulimit_str.split(","):
+            key, sep, val = (part.strip() for part in opt.partition("="))
+            if key not in ("name", "softLimit", "hardLimit"):
+                raise ValueError(f"Unknown ulimit option: {key}")
+            if not sep or not val:
+                # Without this, "name" or "name=" would yield an empty name.
+                raise ValueError(f"ulimit option '{key}' requires a value")
+            if key == "name":
+                ulimit[key] = val
+                continue
+            try:
+                ulimit[key] = int(val)  # int("-1") == -1, no special case
+            except ValueError:
+                raise ValueError(f"ulimit {key} must be an int: {val!r}") from None
+
+        for required in ("name", "softLimit", "hardLimit"):
+            if required not in ulimit:
+                raise ValueError(f"ulimit must specify '{required}'")
+        ulimits.append(ulimit)
+
+    return ulimits
+
+
 if TYPE_CHECKING:
     from docker import DockerClient
 
@@ -177,6 +217,7 @@ def _role_to_node_properties(
     privileged: bool = False,
     job_role_arn: Optional[str] = None,
     execution_role_arn: Optional[str] = None,
+    ulimits: Optional[List[Dict[str, Any]]] = None,
 ) -> Dict[str, object]:
     role.mounts += get_device_mounts(role.resource.devices)
 
@@ -239,6 +280,7 @@ def _role_to_node_properties(
         "environment": [{"name": k, "value": v} for k, v in role.env.items()],
         "privileged": privileged,
         "resourceRequirements": resource_requirements_from_resource(role.resource),
+        **({"ulimits": ulimits} if ulimits else {}),
         "linuxParameters": {
             # To support PyTorch dataloaders we need to set /dev/shm to larger
             # than the 64M default.
@@ -361,6 +403,7 @@ class AWSBatchOpts(TypedDict, total=False):
     priority: int
     job_role_arn: Optional[str]
     execution_role_arn: Optional[str]
+    ulimits: Optional[str]
 
 
 class AWSBatchScheduler(
@@ -514,6 +557,7 @@ def _submit_dryrun(self, app: AppDef, cfg: AWSBatchOpts) -> AppDryRunInfo[BatchJ
                     privileged=cfg["privileged"],
                     job_role_arn=cfg.get("job_role_arn"),
                     execution_role_arn=cfg.get("execution_role_arn"),
+                    ulimits=parse_ulimits(cfg.get("ulimits") or ""),
                 )
             )
             node_idx += role.num_replicas
@@ -599,6 +643,11 @@ def _run_opts(self) -> runopts:
             type_=str,
             help="The Amazon Resource Name (ARN) of the IAM role that the ECS agent can assume for AWS permissions.",
         )
+        opts.add(
+            "ulimits",
+            type_=str,
+            help="Ulimit settings in format: name=nofile,softLimit=65536,hardLimit=65536 (multiple separated by semicolons)",
+        )
         return opts
 
     def _get_job_id(self, app_id: str) -> Optional[str]:
diff --git a/torchx/schedulers/test/aws_batch_scheduler_test.py b/torchx/schedulers/test/aws_batch_scheduler_test.py
@@ -23,6 +23,7 @@
     AWSBatchScheduler,
     create_scheduler,
     ENV_TORCHX_ROLE_NAME,
+    parse_ulimits,
     resource_from_resource_requirements,
     resource_requirements_from_resource,
     to_millis_since_epoch,
@@ -396,6 +397,48 @@ def test_resource_devices(self) -> None:
             ],
         )
 
+    def test_role_to_node_properties_ulimits(self) -> None:
+        """Ulimits passed to _role_to_node_properties reach the container."""
+        # Both a finite limit and the -1 value are exercised.
+        expected_ulimits = [
+            {"name": "nofile", "softLimit": 65536, "hardLimit": 65536},
+            {"name": "memlock", "softLimit": -1, "hardLimit": -1},
+        ]
+        test_role = specs.Role(
+            name="test",
+            image="test:latest",
+            entrypoint="test",
+            args=["test"],
+            resource=specs.Resource(cpu=1, memMB=1000, gpu=0),
+        )
+
+        node_props = _role_to_node_properties(test_role, 0, ulimits=expected_ulimits)
+        self.assertEqual(node_props["container"]["ulimits"], expected_ulimits)
+
+    def test_parse_ulimits(self) -> None:
+        """parse_ulimits handles single, multiple, empty and invalid input."""
+        nofile = {"name": "nofile", "softLimit": 65536, "hardLimit": 65536}
+        memlock = {"name": "memlock", "softLimit": -1, "hardLimit": -1}
+
+        # A single ulimit yields a one-element list.
+        self.assertEqual(
+            parse_ulimits("name=nofile,softLimit=65536,hardLimit=65536"),
+            [nofile],
+        )
+
+        # Semicolon-separated ulimits are returned in input order.
+        two_ulimits = (
+            "name=nofile,softLimit=65536,hardLimit=65536;"
+            "name=memlock,softLimit=-1,hardLimit=-1"
+        )
+        self.assertEqual(parse_ulimits(two_ulimits), [nofile, memlock])
+
+        # An empty string yields no ulimits.
+        self.assertEqual(parse_ulimits(""), [])
+
+        # An option without a recognised key is rejected.
+        self.assertRaises(ValueError, parse_ulimits, "invalid")
+
     def _mock_scheduler_running_job(self) -> AWSBatchScheduler:
         scheduler = AWSBatchScheduler(
             "test",