
Commit 64aa064

yao-matrix and kashif authored

enable activation offloading on XPU (#3444)

Signed-off-by: Matrix Yao <[email protected]>
Co-authored-by: Kashif Rasul <[email protected]>

1 parent be93a0c commit 64aa064

2 files changed: +31 additions, -25 deletions
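The diff applies one pattern across both files: every hard-coded CUDA call is replaced with a lookup on the active accelerator. A minimal sketch of that pattern, assuming a PyTorch build recent enough to ship `torch.accelerator` (the fallback mirrors the commit's own `hasattr` guard):

import torch

# Resolve the active accelerator type ("cuda", "xpu", ...); older PyTorch
# builds without torch.accelerator fall back to "cuda" as before.
accelerator_type = (
    torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
)

# Device-agnostic placement replaces the old hard-coded .cuda() calls
x = torch.randn(2, 5, device=accelerator_type)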

tests/test_activation_offloading.py

Lines changed: 9 additions & 9 deletions

@@ -17,7 +17,7 @@
 import torch
 from torch import nn
 from transformers import AutoModelForCausalLM
-from transformers.testing_utils import require_peft, require_torch_accelerator
+from transformers.testing_utils import require_peft, require_torch_accelerator, torch_device
 from transformers.utils import is_peft_available

 from trl.models.activation_offloading import NoOpManager, OffloadActivations
@@ -33,7 +33,7 @@ class TestActivationOffloading(unittest.TestCase):
     def test_offloading_with_peft_models(self) -> None:
         """Test that activation offloading works with PEFT models."""
         model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
-        model = AutoModelForCausalLM.from_pretrained(model_id).cuda()
+        model = AutoModelForCausalLM.from_pretrained(model_id).to(torch_device)
         peft_config = LoraConfig(
             lora_alpha=16,
             lora_dropout=0.1,
@@ -43,7 +43,7 @@ def test_offloading_with_peft_models(self) -> None:
         )

         model = get_peft_model(model, peft_config)
-        inp = torch.randint(0, 100, (2, 10), device="cuda")
+        inp = torch.randint(0, 100, (2, 10), device=torch_device)

         # First forward-backward pass without offloading
         torch.manual_seed(42)
@@ -79,8 +79,8 @@ def test_offloading_with_peft_models(self) -> None:
     @require_torch_accelerator
     def test_noop_manager_with_offloading(self):
         model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
-        model = AutoModelForCausalLM.from_pretrained(model_id).cuda()
-        inp = torch.randint(0, 100, (2, 10), device="cuda")
+        model = AutoModelForCausalLM.from_pretrained(model_id).to(torch_device)
+        inp = torch.randint(0, 100, (2, 10), device=torch_device)

         # Run with offloading but disable for specific section
         with OffloadActivations():
@@ -112,9 +112,9 @@ def test_min_offload_size(self):
         model = nn.Sequential(
             nn.Linear(5, 5),  # Small layer that shouldn't be offloaded
             nn.Linear(5, 1000),  # Large layer that should be offloaded
-        ).cuda()
+        ).to(torch_device)

-        inp = torch.randn(2, 5, device="cuda")
+        inp = torch.randn(2, 5, device=torch_device)

         with OffloadActivations(min_offload_size=1000):
             out = model(inp)
@@ -127,10 +127,10 @@ def test_min_offload_size(self):
     def test_real_hf_model(self):
         """Test with an actual HuggingFace model"""
         model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
-        model = AutoModelForCausalLM.from_pretrained(model_id).cuda()
+        model = AutoModelForCausalLM.from_pretrained(model_id).to(torch_device)

         # Create small input
-        inp = torch.randint(0, 100, (2, 10), device="cuda")
+        inp = torch.randint(0, 100, (2, 10), device=torch_device)

         # Baseline without offloading
         torch.manual_seed(42)
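`torch_device` from `transformers.testing_utils` is a plain device string resolved from whatever hardware the test suite runs on, which is what lets these tests cover CUDA and XPU without branching. A short sketch of the resulting test pattern (same tiny checkpoint as above, assuming an accelerator is present):

import torch
from transformers import AutoModelForCausalLM
from transformers.testing_utils import torch_device

# torch_device resolves to "cuda", "xpu", or "cpu" depending on the machine
model = AutoModelForCausalLM.from_pretrained(
    "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
).to(torch_device)
inp = torch.randint(0, 100, (2, 10), device=torch_device)
loss = model(inp, labels=inp).loss
loss.backward()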

trl/models/activation_offloading.py

Lines changed: 22 additions & 16 deletions

@@ -85,11 +85,17 @@ def __init__(
         self.use_pin_memory = use_pin_memory
         self.virtual_memory_safe_pct = 60  # we should not exceed this percentage of memory

-        self.s0 = torch.cuda.default_stream()  # comp stream
+        self.accelerator_type = (
+            torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
+        )
+        # NOTE: xpu doesn't have `default_stream` API, use `current_stream` instead
+        self.s0 = (
+            torch.xpu.current_stream() if self.accelerator_type == "xpu" else torch.cuda.default_stream()
+        )  # comp stream

         # For streaming
         if self.use_streams:
-            self.s1 = torch.cuda.Stream()  # comms stream
+            self.s1 = torch.Stream() if self.accelerator_type == "xpu" else torch.cuda.Stream()  # comms stream
             self.fwd_stash = {}  # tensor_id => (activation, ev1)
             if max_fwd_stash_size < 1:
                 raise ValueError(f"max_fwd_stash_size should be at least 1 but is {max_fwd_stash_size}")
@@ -136,7 +142,7 @@ def pack_tensor(activation: torch.Tensor) -> int:
             # only offload hefty bois if they're activations on CUDA (our heuristic
             # for that is to check if they're not params or buffers)!
             if (
-                activation.is_cuda
+                activation.device.type in ["cuda", "xpu"]
                 and num_bytes >= self.min_tensor_size_bytes
                 and (
                     not isinstance(activation, torch.nn.Parameter)
@@ -158,7 +164,7 @@ def pack_tensor(activation: torch.Tensor) -> int:
                     self.s1.wait_stream(self.s0)

                 stream = self.s1 if self.use_streams else self.s0
-                with torch.cuda.stream(stream):
+                with stream if self.accelerator_type == "xpu" else torch.cuda.stream(stream):
                     cpu_tensor = torch.empty_like(activation, pin_memory=self.use_pin_memory, device="cpu")
                     cpu_tensor.copy_(activation, non_blocking=True)
                     self.tracker[tensor_id] = (
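Stripped of the tracker bookkeeping, the pack path above is an async device-to-host copy into pinned memory, and the unpack paths below reverse it with a non-blocking copy back to the accelerator. A minimal round-trip sketch of that mechanism (illustrative only, not the class's own API; `torch.accelerator.synchronize` assumes a recent PyTorch):

import torch

device = torch.accelerator.current_accelerator().type
activation = torch.randn(2, 1024, device=device)

# Offload: pinned host memory is what makes the non-blocking copy truly async
cpu_tensor = torch.empty_like(activation, pin_memory=True, device="cpu")
cpu_tensor.copy_(activation, non_blocking=True)

# Bring back: async host-to-device copy, then synchronize before use
restored = cpu_tensor.to(device, non_blocking=True)
torch.accelerator.synchronize()
assert torch.equal(restored, activation)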
@@ -194,14 +200,14 @@ def unpack_tensor_single_stream(unpack_tensor_id: int) -> torch.Tensor:
             if unpack_tensor_id not in self.tracker:
                 raise ValueError(f"Untracked tensor with id {unpack_tensor_id}")

-            maybe_gpu_tensor, modified = self.tracker[unpack_tensor_id]
+            maybe_accelerator_tensor, modified = self.tracker[unpack_tensor_id]
             if modified:
-                gpu_tensor = maybe_gpu_tensor.to("cuda", non_blocking=True)
-                maybe_gpu_tensor = gpu_tensor
+                accelerator_tensor = maybe_accelerator_tensor.to(self.accelerator_type, non_blocking=True)
+                maybe_accelerator_tensor = accelerator_tensor

             # clear tensor from tracking
             del self.tracker[unpack_tensor_id]
-            return maybe_gpu_tensor
+            return maybe_accelerator_tensor

         def unpack_tensor_with_streams(unpack_tensor_id: int) -> torch.Tensor:
             # backward pass - we are called with the tensor_id, which
@@ -229,7 +235,7 @@ def wait_and_del_remaining_references() -> None:
             if unpack_tensor_id not in self.tracker:
                 raise ValueError(f"untracked tensor with id {unpack_tensor_id}")

-            maybe_gpu_tensor, modified = self.tracker[unpack_tensor_id]
+            maybe_accelerator_tensor, modified = self.tracker[unpack_tensor_id]
             if modified:
                 # Get data on the current autograd node
                 graph_id = torch._C._current_graph_task_id()
@@ -243,19 +249,19 @@ def wait_and_del_remaining_references() -> None:

                 brought_back_from_cpu = True
                 if unpack_tensor_id in self.fwd_stash:
-                    maybe_gpu_tensor = self.fwd_stash[unpack_tensor_id][0]
+                    maybe_accelerator_tensor = self.fwd_stash[unpack_tensor_id][0]
                     brought_back_from_cpu = False
                 else:
                     # Kick off the process to bring tensors back
-                    with torch.cuda.stream(self.s1):
-                        gpu_tensor = maybe_gpu_tensor.to("cuda", non_blocking=True)
-                        maybe_gpu_tensor = gpu_tensor
+                    with self.s1 if self.accelerator_type == "xpu" else torch.cuda.stream(self.s1):
+                        accelerator_tensor = maybe_accelerator_tensor.to(self.accelerator_type, non_blocking=True)
+                        maybe_accelerator_tensor = accelerator_tensor

                     # Tell comp stream to wait for the info to be loaded before executing
                     self.s0.wait_stream(self.s1)

                 # Stash the tensor to keep memory alive until compute stream is complete
-                self.bwd_tensor_stash[unpack_tensor_id] = maybe_gpu_tensor
+                self.bwd_tensor_stash[unpack_tensor_id] = maybe_accelerator_tensor

             # Note: [Track views of the unpacked]
             # Why do we get the use count of the unpacked tensor here? We want an
@@ -270,7 +276,7 @@ def wait_and_del_remaining_references() -> None:
             # up as a view of the unpacked tensor.
             # 3. The user abuses the system somehow and manually relies on the
             # unpacked tensor to exist after the backward node has executed.
-            storage_refcount = torch._C._storage_Use_Count(maybe_gpu_tensor.untyped_storage()._cdata)
+            storage_refcount = torch._C._storage_Use_Count(maybe_accelerator_tensor.untyped_storage()._cdata)

             def hook(outputs, inputs):
                 # create events for the current node inputs/outputs if they were streamed in
@@ -312,7 +318,7 @@ def hook(outputs, inputs):

                 # clear tensor from tracking
                 del self.tracker[unpack_tensor_id]
-                return maybe_gpu_tensor
+                return maybe_accelerator_tensor

         unpack_tensor = unpack_tensor_with_streams if self.use_streams else unpack_tensor_single_stream
         super().__init__(pack_tensor, unpack_tensor)
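End to end, the tests above double as usage documentation. A rough sketch of the resulting device-agnostic flow (assuming an available accelerator, with the same tiny test checkpoint):

import torch
from transformers import AutoModelForCausalLM
from trl.models.activation_offloading import NoOpManager, OffloadActivations

device = torch.accelerator.current_accelerator().type  # "cuda" or "xpu"
model = AutoModelForCausalLM.from_pretrained(
    "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
).to(device)
inp = torch.randint(0, 100, (2, 10), device=device)

# Offload activations above min_offload_size bytes to pinned CPU memory
with OffloadActivations(min_offload_size=1000):
    # A NoOpManager section opts out of offloading for the code it wraps
    with NoOpManager():
        _ = model(inp)
    loss = model(inp).logits.sum()
    loss.backward()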
