|
99 | 99 | TAG_TORCHX_USER = "torchx.pytorch.org/user"
|
100 | 100 |
|
101 | 101 |
|
def parse_ulimits(ulimits_str: str) -> List[Dict[str, Any]]:
    """
    Parse a ulimit specification string into a list of ulimit dicts.

    Each ulimit is a comma-separated list of ``key=value`` options, e.g.
    ``name=nofile,softLimit=65536,hardLimit=65536``. Multiple ulimits are
    separated by semicolons. Whitespace around keys and values is ignored,
    as are blank entries and blank options (e.g. from a trailing ``;`` or
    ``,``).

    Args:
        ulimits_str: the raw specification string; an empty string yields
            an empty list.

    Returns:
        One dict per ulimit, each with the keys ``name`` (str),
        ``softLimit`` (int) and ``hardLimit`` (int). Limits are converted
        with ``int()``, so ``-1`` is passed through unchanged.

    Raises:
        ValueError: if an option key is not one of ``name``, ``softLimit``
            or ``hardLimit``; if ``name`` is empty; if a limit is not an
            integer; or if any of the three keys is missing from a ulimit.
    """
    if not ulimits_str:
        return []

    limit_keys = ("softLimit", "hardLimit")
    required_keys = ("name",) + limit_keys

    ulimits: List[Dict[str, Any]] = []
    for ulimit_str in ulimits_str.split(";"):
        if not ulimit_str.strip():
            continue

        ulimit: Dict[str, Any] = {}
        for opt in ulimit_str.split(","):
            # Tolerate stray/trailing commas the same way stray semicolons
            # are tolerated above.
            if not opt.strip():
                continue

            key, _, val = opt.partition("=")
            key = key.strip()
            val = val.strip()

            if key == "name":
                # Catches both "name=" and a bare "name" with no "=".
                if not val:
                    raise ValueError("ulimit 'name' must not be empty")
                ulimit["name"] = val
            elif key in limit_keys:
                try:
                    ulimit[key] = int(val)
                except ValueError:
                    # Re-raise with the offending option named; the bare
                    # int() message does not say which option was wrong.
                    raise ValueError(
                        f"ulimit '{key}' must be an integer, got: {val!r}"
                    ) from None
            else:
                raise ValueError(f"Unknown ulimit option: {key}")

        # Checked in a fixed order so the first missing key reported is
        # deterministic: name, then softLimit, then hardLimit.
        for required in required_keys:
            if required not in ulimit:
                raise ValueError(f"ulimit must specify '{required}'")

        ulimits.append(ulimit)

    return ulimits
| 140 | + |
| 141 | + |
102 | 142 | if TYPE_CHECKING:
|
103 | 143 | from docker import DockerClient
|
104 | 144 |
|
@@ -177,6 +217,7 @@ def _role_to_node_properties(
|
177 | 217 | privileged: bool = False,
|
178 | 218 | job_role_arn: Optional[str] = None,
|
179 | 219 | execution_role_arn: Optional[str] = None,
|
| 220 | + ulimits: Optional[List[Dict[str, Any]]] = None, |
180 | 221 | ) -> Dict[str, object]:
|
181 | 222 | role.mounts += get_device_mounts(role.resource.devices)
|
182 | 223 |
|
@@ -239,6 +280,7 @@ def _role_to_node_properties(
|
239 | 280 | "environment": [{"name": k, "value": v} for k, v in role.env.items()],
|
240 | 281 | "privileged": privileged,
|
241 | 282 | "resourceRequirements": resource_requirements_from_resource(role.resource),
|
| 283 | + **({"ulimits": ulimits} if ulimits else {}), |
242 | 284 | "linuxParameters": {
|
243 | 285 | # To support PyTorch dataloaders we need to set /dev/shm to larger
|
244 | 286 | # than the 64M default.
|
@@ -361,6 +403,7 @@ class AWSBatchOpts(TypedDict, total=False):
|
361 | 403 | priority: int
|
362 | 404 | job_role_arn: Optional[str]
|
363 | 405 | execution_role_arn: Optional[str]
|
| 406 | + ulimits: Optional[str] |
364 | 407 |
|
365 | 408 |
|
366 | 409 | class AWSBatchScheduler(
|
@@ -514,6 +557,7 @@ def _submit_dryrun(self, app: AppDef, cfg: AWSBatchOpts) -> AppDryRunInfo[BatchJ
|
514 | 557 | privileged=cfg["privileged"],
|
515 | 558 | job_role_arn=cfg.get("job_role_arn"),
|
516 | 559 | execution_role_arn=cfg.get("execution_role_arn"),
|
| 560 | + ulimits=parse_ulimits(cfg.get("ulimits") or ""), |
517 | 561 | )
|
518 | 562 | )
|
519 | 563 | node_idx += role.num_replicas
|
@@ -599,6 +643,11 @@ def _run_opts(self) -> runopts:
|
599 | 643 | type_=str,
|
600 | 644 | help="The Amazon Resource Name (ARN) of the IAM role that the ECS agent can assume for AWS permissions.",
|
601 | 645 | )
|
| 646 | + opts.add( |
| 647 | + "ulimits", |
| 648 | + type_=str, |
| 649 | + help="Ulimit settings in format: name=nofile,softLimit=65536,hardLimit=65536 (multiple separated by semicolons)", |
| 650 | + ) |
602 | 651 | return opts
|
603 | 652 |
|
604 | 653 | def _get_job_id(self, app_id: str) -> Optional[str]:
|
|
0 commit comments