Device fallback on model load

kartik4949 · thejumpman2323 · commit 9aee8e976df9 · 2023-09-14T19:37:49.000+05:30
Former-commit-id: 5549f74
diff --git a/superduperdb/container/model.py b/superduperdb/container/model.py
@@ -397,6 +397,9 @@ class Model(Component, PredictMixin):
     #: The method to use for prediction (optional)
     predict_method: t.Optional[str] = None
 
+    #: Name of the method on this component that moves the model to a device (optional)
+    model_to_device_method: t.Optional[str] = None
+
     #: Whether to batch predict (optional)
     batch_predict: bool = False
 
@@ -413,6 +416,9 @@ class Model(Component, PredictMixin):
     future: t.Optional[Future] = None
     device: str = "cpu"
 
+    #: Devices tried in order when the model is loaded (TODO: handle multiple GPUs)
+    preferred_devices: t.Sequence[str] = ("cuda", "mps", "cpu")
+
     artifacts: t.ClassVar[t.Sequence[str]] = ['object']
 
     type_id: t.ClassVar[str] = 'model'
@@ -429,6 +435,21 @@ def __post_init__(self):
         else:
             self.to_call = getattr(self.object.artifact, self.predict_method)
 
+        self.artifact_to_method = None
+        if self.model_to_device_method is not None:
+            self.artifact_to_method = getattr(self, self.model_to_device_method)
+
+    def on_load(self, db: DB) -> None:  # first usable preferred device wins
+        if self.artifact_to_method:  # None when no transfer method is configured
+            for i, device in enumerate(self.preferred_devices):
+                try:
+                    self.artifact_to_method(device)
+                    self.device = device  # only reached if the transfer did not raise
+                    return
+                except Exception:  # transfer failed; fall through to the next device
+                    if i == len(self.preferred_devices) - 1:
+                        raise  # last candidate also failed: propagate its error
+
     @property
     def child_components(self) -> t.Sequence[t.Tuple[str, str]]:
         out = []
diff --git a/superduperdb/ext/torch/model.py b/superduperdb/ext/torch/model.py
@@ -364,6 +364,8 @@ class TorchModel(Base, Model):  # type: ignore[misc]
     train_forward_method: str = '__call__'
 
     def __post_init__(self):
+        self.model_to_device_method = 'move_to_device'
+
         super().__post_init__()
 
         self.object.serializer = 'torch'
@@ -404,6 +406,9 @@ def parameters(self):
     def state_dict(self):
         return self.object.state_dict()
 
+    def move_to_device(self, device):  # hook named by model_to_device_method
+        self.object.artifact.to(device)  # result discarded; assumes in-place .to()
+
     @contextmanager
     def saving(self):
         with super().saving():
@@ -476,6 +481,10 @@ def func(x):
             return out
 
     def train_forward(self, X, y=None):
+        X = X.to(self.device)
+        if y is not None:
+            y = y.to(self.device)
+
         method = getattr(self.object.artifact, self.train_forward_method)
         if hasattr(self.object.artifact, 'train_forward'):
             if y is None: