
Commit ff80fcb

committed
fixing the device issue for build_mask
1 parent bdc8948 commit ff80fcb

File tree

1 file changed: +7 -2 lines changed


sharktank/sharktank/layers/paged_attention.py

Lines changed: 7 additions & 2 deletions
@@ -745,7 +745,7 @@ def build_mask(
         kv_size: int,
         n_tokens: int,
         dtype: torch.dtype,
-        device: torch.device,
+        device: Optional[torch.device] = None,
     ):
         """
         Returns a causal (and optional sliding-window) mask of shape [n_tokens, kv_size].
@@ -809,7 +809,12 @@ def attention(
         )

        effective_mask = self.build_mask(
-            mask, sliding_window, k.shape[-2], q.shape[-2], self.attn_dtype, q.device
+            mask,
+            sliding_window,
+            k.shape[-2],
+            q.shape[-2],
+            self.attn_dtype,
+            mask.device if mask is not None else None,
        )

        return ops.scaled_dot_product_attention(
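For context, here is a minimal sketch of what a build_mask-style helper with an optional device argument could look like. This is not the sharktank implementation; the name build_mask_sketch and its body are hypothetical, and only the signature follows the diff above. It illustrates why device can default to None: when an explicit mask is supplied it already carries its own device (which the updated call site forwards via mask.device), and when no mask is supplied, torch factory functions simply fall back to the current default device.

# Hypothetical sketch only; assumes the signature shown in the diff above.
from typing import Optional

import torch


def build_mask_sketch(
    mask: Optional[torch.Tensor],
    sliding_window: Optional[int],
    kv_size: int,
    n_tokens: int,
    dtype: torch.dtype,
    device: Optional[torch.device] = None,
) -> torch.Tensor:
    # An explicit mask already lives on some device; just match the dtype.
    if mask is not None:
        return mask.to(dtype=dtype)

    # Causal mask of shape [n_tokens, kv_size]: allowed positions are 0,
    # masked positions are -inf. With device=None, torch uses the default
    # device, which is why the parameter can safely be optional.
    q_pos = torch.arange(n_tokens, device=device).unsqueeze(-1) + (kv_size - n_tokens)
    k_pos = torch.arange(kv_size, device=device).unsqueeze(0)
    allowed = k_pos <= q_pos
    if sliding_window is not None:
        allowed &= k_pos > (q_pos - sliding_window)

    out = torch.zeros(n_tokens, kv_size, dtype=dtype, device=device)
    return out.masked_fill(~allowed, float("-inf"))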
