Skip to content

Commit bedb24a

Browse files
kiukchung and facebook-github-bot
authored and committed
(torchx/specs) Allow roles to specify their own workspaces (#1139)
Summary: So far you can only specify the workspace in the runner's API: ``` runner.dryrun(appdef, cfg, workspace=...) ``` in which case the workspace applies to ONLY `role[0]`. This behavior was intentional since multi-role use cases of TorchX typically had a single "main" role that the application owner actually owned, and the other roles were prepackaged apps (not part of your project). This is no longer the case with applications such as reinforcement learning, where the project encompasses multiple applications (e.g. trainer, generator, etc.); therefore we need a more flexible way to specify a workspace per Role. For BC, I'm maintaining the following behavior: 1. If `workspace` is specified as a runner argument then it takes precedence over `role[0].workspace` 2. Non-zero roles (e.g. `role[1], role[2], ...`) are unaffected by the workspace argument. That is, their workspace attributes (e.g. `role[1].workspace`) are respected as-is. 3. "Disabling" workspace (e.g. passing `workspace=None` as the runner argument) can still build a workspace if the role's workspace attribute is not `None`. NOTE: we need to do further optimization for cases where multiple roles have the same "image" and "workspace". In this case we only need to build the image+workspace once. But as it stands we end up building a separate ephemeral image per role (even if the ephemeral image is the SAME across all the roles). This isn't an issue practically since image builders like Docker are content-addressed and cache layers. Reviewed By: AbishekS Differential Revision: D83793199
1 parent 1e3df20 commit bedb24a

File tree

7 files changed

+279
-178
lines changed

7 files changed

+279
-178
lines changed

torchx/runner/api.py

Lines changed: 36 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -426,26 +426,42 @@ def dryrun(
426426

427427
sched._pre_build_validate(app, scheduler, resolved_cfg)
428428

429-
if workspace and isinstance(sched, WorkspaceMixin):
430-
role = app.roles[0]
431-
old_img = role.image
432-
433-
logger.info(f"Checking for changes in workspace `{workspace}`...")
434-
logger.info(
435-
'To disable workspaces pass: --workspace="" from CLI or workspace=None programmatically.'
436-
)
437-
sched.build_workspace_and_update_role2(role, workspace, resolved_cfg)
438-
439-
if old_img != role.image:
440-
logger.info(
441-
f"Built new image `{role.image}` based on original image `{old_img}`"
442-
f" and changes in workspace `{workspace}` for role[0]={role.name}."
443-
)
444-
else:
445-
logger.info(
446-
f"Reusing original image `{old_img}` for role[0]={role.name}."
447-
" Either a patch was built or no changes to workspace was detected."
448-
)
429+
if isinstance(sched, WorkspaceMixin):
430+
for i, role in enumerate(app.roles):
431+
role_workspace = role.workspace
432+
433+
if i == 0 and workspace:
434+
# NOTE: torchx originally took workspace as a runner arg and only applied the workspace to role[0]
435+
# later, torchx added support for the workspace attr in Role
436+
# for BC, give precedence to the workspace argument over the workspace attr for role[0]
437+
if role_workspace:
438+
logger.info(
439+
f"Using workspace={workspace} over role[{i}].workspace={role_workspace} for role[{i}]={role.name}."
440+
" To use the role's workspace attr pass: --workspace='' from CLI or workspace=None programmatically." # noqa: B950
441+
)
442+
role_workspace = workspace
443+
444+
if role_workspace:
445+
old_img = role.image
446+
logger.info(
447+
f"Checking for changes in workspace `{role_workspace}` for role[{i}]={role.name}..."
448+
)
449+
# TODO kiuk@ once we deprecate the `workspace` argument in runner APIs we can simplify the signature of
450+
# build_workspace_and_update_role2() to just taking the role and resolved_cfg
451+
sched.build_workspace_and_update_role2(
452+
role, role_workspace, resolved_cfg
453+
)
454+
455+
if old_img != role.image:
456+
logger.info(
457+
f"Built new image `{role.image}` based on original image `{old_img}`"
458+
f" and changes in workspace `{role_workspace}` for role[{i}]={role.name}."
459+
)
460+
else:
461+
logger.info(
462+
f"Reusing original image `{old_img}` for role[{i}]={role.name}."
463+
" Either a patch was built or no changes to workspace was detected."
464+
)
449465

450466
sched._validate(app, scheduler, resolved_cfg)
451467
dryrun_info = sched.submit_dryrun(app, resolved_cfg)

torchx/runner/test/api_test.py

Lines changed: 74 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,18 @@
2020
create_scheduler,
2121
LocalDirectoryImageProvider,
2222
)
23-
from torchx.specs import AppDryRunInfo, CfgVal
24-
from torchx.specs.api import (
23+
from torchx.specs import (
2524
AppDef,
25+
AppDryRunInfo,
2626
AppHandle,
2727
AppState,
28+
CfgVal,
2829
parse_app_handle,
2930
Resource,
3031
Role,
3132
runopts,
3233
UnknownAppException,
34+
Workspace,
3335
)
3436
from torchx.specs.finder import ComponentNotFoundException
3537
from torchx.test.fixtures import TestWithTmpDir
@@ -400,6 +402,16 @@ def build_workspace_and_update_role(
400402
) -> None:
401403
if self.build_new_img:
402404
role.image = f"{role.image}_new"
405+
role.env["SRC_WORKSPACE"] = workspace
406+
407+
def create_role(image: str, workspace: str | None = None) -> Role:
408+
return Role(
409+
name="noop",
410+
image=image,
411+
resource=resource.SMALL,
412+
entrypoint="/bin/true",
413+
workspace=Workspace.from_str(workspace),
414+
)
403415

404416
with Runner(
405417
name=SESSION_NAME,
@@ -411,33 +423,71 @@ def build_workspace_and_update_role(
411423
"builds-img": lambda name, **kwargs: TestScheduler(build_new_img=True),
412424
},
413425
) as runner:
426+
app = AppDef(
427+
"ignored",
428+
roles=[create_role(image="foo"), create_role(image="bar")],
429+
)
430+
roles = runner.dryrun(
431+
app, "no-build-img", workspace="//workspace"
432+
).request.roles
433+
self.assertEqual("foo", roles[0].image)
434+
self.assertEqual("bar", roles[1].image)
435+
436+
roles = runner.dryrun(
437+
app, "builds-img", workspace="//workspace"
438+
).request.roles
439+
440+
# workspace is attached to role[0] when role[0].workspace is `None`
441+
self.assertEqual("foo_new", roles[0].image)
442+
self.assertEqual("bar", roles[1].image)
443+
444+
# now run with role[0] having workspace attribute defined
414445
app = AppDef(
415446
"ignored",
416447
roles=[
417-
Role(
418-
name="sleep",
419-
image="foo",
420-
resource=resource.SMALL,
421-
entrypoint="sleep",
422-
args=["1"],
423-
),
424-
Role(
425-
name="sleep",
426-
image="bar",
427-
resource=resource.SMALL,
428-
entrypoint="sleep",
429-
args=["1"],
430-
),
448+
create_role(image="foo", workspace="//should_be_overriden"),
449+
create_role(image="bar"),
450+
],
451+
)
452+
roles = runner.dryrun(
453+
app, "builds-img", workspace="//workspace"
454+
).request.roles
455+
# workspace argument should override role[0].workspace attribute
456+
self.assertEqual("foo_new", roles[0].image)
457+
self.assertEqual("//workspace", roles[0].env["SRC_WORKSPACE"])
458+
self.assertEqual("bar", roles[1].image)
459+
460+
# now run with both role[0] and role[1] having workspace attr
461+
app = AppDef(
462+
"ignored",
463+
roles=[
464+
create_role(image="foo", workspace="//foo"),
465+
create_role(image="bar", workspace="//bar"),
466+
],
467+
)
468+
roles = runner.dryrun(
469+
app, "builds-img", workspace="//workspace"
470+
).request.roles
471+
472+
# workspace argument should override role[0].workspace attribute
473+
self.assertEqual("foo_new", roles[0].image)
474+
self.assertEqual("//workspace", roles[0].env["SRC_WORKSPACE"])
475+
self.assertEqual("bar_new", roles[1].image)
476+
self.assertEqual("//bar", roles[1].env["SRC_WORKSPACE"])
477+
478+
# now run with both role[0] and role[1] having workspace attr but no workspace arg
479+
app = AppDef(
480+
"ignored",
481+
roles=[
482+
create_role(image="foo", workspace="//foo"),
483+
create_role(image="bar", workspace="//bar"),
431484
],
432485
)
433-
dryruninfo = runner.dryrun(app, "no-build-img", workspace="//workspace")
434-
self.assertEqual("foo", dryruninfo.request.roles[0].image)
435-
self.assertEqual("bar", dryruninfo.request.roles[1].image)
436-
437-
dryruninfo = runner.dryrun(app, "builds-img", workspace="//workspace")
438-
# workspace is attached to role[0] by default
439-
self.assertEqual("foo_new", dryruninfo.request.roles[0].image)
440-
self.assertEqual("bar", dryruninfo.request.roles[1].image)
486+
roles = runner.dryrun(app, "builds-img", workspace=None).request.roles
487+
self.assertEqual("foo_new", roles[0].image)
488+
self.assertEqual("//foo", roles[0].env["SRC_WORKSPACE"])
489+
self.assertEqual("bar_new", roles[1].image)
490+
self.assertEqual("//bar", roles[1].env["SRC_WORKSPACE"])
441491

442492
def test_describe(self, _) -> None:
443493
with self.get_runner() as runner:

torchx/specs/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
UnknownAppException,
4646
UnknownSchedulerException,
4747
VolumeMount,
48+
Workspace,
4849
)
4950
from torchx.specs.builders import make_app_handle, materialize_appdef, parse_mounts
5051

@@ -236,4 +237,6 @@ def gpu_x_1() -> Dict[str, Resource]:
236237
"torchx_run_args_from_json",
237238
"TorchXRunArgs",
238239
"ALL",
240+
"TORCHX_HOME",
241+
"Workspace",
239242
]

torchx/specs/api.py

Lines changed: 80 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,78 @@ class DeviceMount:
350350
permissions: str = "rwm"
351351

352352

353+
@dataclass
354+
class Workspace:
355+
"""
356+
Specifies a local "workspace" (a set of directories). Workspaces are ad-hoc built
357+
into an (usually ephemeral) image. This effectively mirrors the local code changes
358+
at job submission time.
359+
360+
For example:
361+
362+
1. ``projects={"~/github/torch": "torch"}`` copies ``~/github/torch/**`` into ``$REMOTE_WORKSPACE_ROOT/torch/**``
363+
2. ``projects={"~/github/torch": ""}`` copies ``~/github/torch/**`` into ``$REMOTE_WORKSPACE_ROOT/**``
364+
365+
The exact location of ``$REMOTE_WORKSPACE_ROOT`` is implementation dependent and varies between
366+
different implementations of :py:class:`~torchx.workspace.api.WorkspaceMixin`.
367+
Check the scheduler documentation for details on which workspace it supports.
368+
369+
Note: ``projects`` maps the location of the local project to a sub-directory in the remote workspace root directory.
370+
Typically the local project location is a directory path (e.g. ``/home/foo/github/torch``).
371+
372+
373+
Attributes:
374+
projects: mapping of local project to the sub-dir in the remote workspace dir.
375+
"""
376+
377+
projects: dict[str, str]
378+
379+
def __bool__(self) -> bool:
380+
"""False if no projects mapping. Lets us use workspace object in an if-statement"""
381+
return bool(self.projects)
382+
383+
def is_unmapped_single_project(self) -> bool:
384+
"""
385+
Returns ``True`` if this workspace only has 1 project
386+
and its target mapping is an empty string.
387+
"""
388+
return len(self.projects) == 1 and not next(iter(self.projects.values()))
389+
390+
@staticmethod
391+
def from_str(workspace: str | None) -> "Workspace":
392+
import yaml
393+
394+
if not workspace:
395+
return Workspace({})
396+
397+
projects = yaml.safe_load(workspace)
398+
if isinstance(projects, str): # single project workspace
399+
projects = {projects: ""}
400+
else: # multi-project workspace
401+
# Replace None mappings with "" (empty string)
402+
projects = {k: ("" if v is None else v) for k, v in projects.items()}
403+
404+
return Workspace(projects)
405+
406+
def __str__(self) -> str:
407+
"""
408+
Returns a string representation of the Workspace by concatenating
409+
the project mappings using ';' as a delimiter and ':' between key and value.
410+
If the single-project workspace with no target mapping, then simply
411+
returns the src (local project dir)
412+
413+
NOTE: meant to be used for logging purposes not serde.
414+
Therefore not symmetric with :py:func:`Workspace.from_str`.
415+
416+
"""
417+
if self.is_unmapped_single_project():
418+
return next(iter(self.projects))
419+
else:
420+
return ";".join(
421+
k if not v else f"{k}:{v}" for k, v in self.projects.items()
422+
)
423+
424+
353425
@dataclass
354426
class Role:
355427
"""
@@ -402,6 +474,10 @@ class Role:
402474
metadata: Free form information that is associated with the role, for example
403475
scheduler specific data. The key should follow the pattern: ``$scheduler.$key``
404476
mounts: a list of mounts on the machine
477+
workspace: local project directories to be mirrored on the remote job.
478+
NOTE: The workspace argument provided to the :py:class:`~torchx.runner.api.Runner` APIs
479+
only takes effect on ``appdef.role[0]`` and overrides this attribute.
480+
405481
"""
406482

407483
name: str
@@ -417,9 +493,10 @@ class Role:
417493
resource: Resource = field(default_factory=_null_resource)
418494
port_map: Dict[str, int] = field(default_factory=dict)
419495
metadata: Dict[str, Any] = field(default_factory=dict)
420-
mounts: List[Union[BindMount, VolumeMount, DeviceMount]] = field(
421-
default_factory=list
422-
)
496+
mounts: List[BindMount | VolumeMount | DeviceMount] = field(default_factory=list)
497+
workspace: Workspace | None = None
498+
499+
# DEPRECATED DO NOT SET, WILL BE REMOVED SOON
423500
overrides: Dict[str, Any] = field(default_factory=dict)
424501

425502
# pyre-ignore

0 commit comments

Comments
 (0)