
Commit be93a0c

yao-matrix and kashif authored

enable vllm c-s tests on XPU (#3445)

Signed-off-by: Matrix Yao <[email protected]>
Co-authored-by: Kashif Rasul <[email protected]>

1 parent f9fbd91 · commit be93a0c

File tree

2 files changed: +24 −17 lines changed


tests/test_vllm_client_server.py

Lines changed: 17 additions & 14 deletions

@@ -20,12 +20,12 @@
 import psutil
 import pytest
 from transformers import AutoModelForCausalLM
-from transformers.testing_utils import require_torch_multi_gpu
+from transformers.testing_utils import require_torch_multi_accelerator, torch_device

 from trl.extras.vllm_client import VLLMClient
 from trl.scripts.vllm_serve import chunk_list

-from .testing_utils import require_3_gpus
+from .testing_utils import require_3_accelerators


 class TestChunkList(unittest.TestCase):
@@ -55,15 +55,16 @@ def test_any_dtype(self):


 @pytest.mark.slow
-@require_torch_multi_gpu
+@require_torch_multi_accelerator
 class TestVLLMClientServer(unittest.TestCase):
     model_id = "Qwen/Qwen2.5-1.5B"

     @classmethod
     def setUpClass(cls):
-        # We want the server to run on GPU 1, so we set CUDA_VISIBLE_DEVICES to "1"
+        # We want the server to run on accelerator 1, so we set VISIBLE_DEVICES to "1"
         env = os.environ.copy()
-        env["CUDA_VISIBLE_DEVICES"] = "1"  # Restrict to GPU 1
+        VISIBLE_DEVICES = "ZE_AFFINITY_MASK" if torch_device == "xpu" else "CUDA_VISIBLE_DEVICES"
+        env[VISIBLE_DEVICES] = "1"  # Restrict to accelerator 1

         # Start the server process
         cls.server_process = subprocess.Popen(
@@ -107,7 +108,7 @@ def test_generate_with_params(self):
         self.assertLessEqual(len(seq), 32)

     def test_update_model_params(self):
-        model = AutoModelForCausalLM.from_pretrained(self.model_id, device_map="cuda")
+        model = AutoModelForCausalLM.from_pretrained(self.model_id, device_map=torch_device)
         self.client.update_model_params(model)

     def test_reset_prefix_cache(self):
@@ -132,15 +133,16 @@ def tearDownClass(cls):


 @pytest.mark.slow
-@require_3_gpus
+@require_3_accelerators
 class TestVLLMClientServerTP(unittest.TestCase):
     model_id = "Qwen/Qwen2.5-1.5B"

     @classmethod
     def setUpClass(cls):
-        # We want the server to run on GPU 1 and 2, so we set CUDA_VISIBLE_DEVICES to "1,2"
+        # We want the server to run on accelerator 1 and 2, so we set VISIBLE_DEVICES to "1,2"
         env = os.environ.copy()
-        env["CUDA_VISIBLE_DEVICES"] = "1,2"  # Restrict to GPU 1 and 2
+        VISIBLE_DEVICES = "ZE_AFFINITY_MASK" if torch_device == "xpu" else "CUDA_VISIBLE_DEVICES"
+        env[VISIBLE_DEVICES] = "1,2"  # Restrict to accelerator 1 and 2

         # Start the server process
         cls.server_process = subprocess.Popen(
@@ -169,7 +171,7 @@ def test_generate(self):
         self.assertTrue(all(isinstance(tok, int) for tok in seq))

     def test_update_model_params(self):
-        model = AutoModelForCausalLM.from_pretrained(self.model_id, device_map="cuda")
+        model = AutoModelForCausalLM.from_pretrained(self.model_id, device_map=torch_device)
         self.client.update_model_params(model)

     def test_reset_prefix_cache(self):
@@ -194,15 +196,16 @@ def tearDownClass(cls):


 @pytest.mark.slow
-@require_3_gpus
+@require_3_accelerators
 class TestVLLMClientServerDP(unittest.TestCase):
     model_id = "Qwen/Qwen2.5-1.5B"

     @classmethod
     def setUpClass(cls):
-        # We want the server to run on GPU 1 and 2, so we set CUDA_VISIBLE_DEVICES to "1,2"
+        # We want the server to run on accelerator 1 and 2, so we set VISIBLE_DEVICES to "1,2"
         env = os.environ.copy()
-        env["CUDA_VISIBLE_DEVICES"] = "1,2"  # Restrict to GPU 1 and 2
+        VISIBLE_DEVICES = "ZE_AFFINITY_MASK" if torch_device == "xpu" else "CUDA_VISIBLE_DEVICES"
+        env[VISIBLE_DEVICES] = "1,2"  # Restrict to accelerator 1 and 2

         # Start the server process
         cls.server_process = subprocess.Popen(
@@ -230,7 +233,7 @@ def test_generate(self):
         self.assertTrue(all(isinstance(tok, int) for tok in seq))

     def test_update_model_params(self):
-        model = AutoModelForCausalLM.from_pretrained(self.model_id, device_map="cuda")
+        model = AutoModelForCausalLM.from_pretrained(self.model_id, device_map=torch_device)
         self.client.update_model_params(model)

     def test_reset_prefix_cache(self):
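The recurring pattern in this file is backend dispatch keyed on `torch_device`: XPU restricts device visibility through `ZE_AFFINITY_MASK`, while CUDA uses `CUDA_VISIBLE_DEVICES`. A minimal standalone sketch of the same idea, assuming the `launch_on_devices` helper and the command line shown in the comment, which are illustrative rather than the exact invocation used by these tests:

```python
import os
import subprocess

from transformers.testing_utils import torch_device  # e.g. "cuda" or "xpu"


def launch_on_devices(cmd: list[str], devices: str) -> subprocess.Popen:
    """Launch `cmd` in a child process that only sees the given accelerator ids.

    Hypothetical helper mirroring the setUpClass logic in the diff above.
    """
    env = os.environ.copy()
    # XPU selects devices via ZE_AFFINITY_MASK; CUDA via CUDA_VISIBLE_DEVICES.
    visible_devices = "ZE_AFFINITY_MASK" if torch_device == "xpu" else "CUDA_VISIBLE_DEVICES"
    env[visible_devices] = devices
    return subprocess.Popen(cmd, env=env)


# e.g. run a server on accelerators 1 and 2 (command shown for illustration only):
# launch_on_devices(["trl", "vllm-serve", "--model", "Qwen/Qwen2.5-1.5B"], "1,2")
```

On the client side, the same dispatch falls out of replacing the hard-coded `device_map="cuda"` with `device_map=torch_device`, so `test_update_model_params` loads the model onto whichever backend is active without any extra branching.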

tests/testing_utils.py

Lines changed: 7 additions & 3 deletions

@@ -17,6 +17,7 @@

 import torch
 from transformers import is_bitsandbytes_available, is_comet_available, is_sklearn_available, is_wandb_available
+from transformers.testing_utils import torch_device
 from transformers.utils import is_rich_available

 from trl import BaseBinaryJudge, BasePairwiseJudge
@@ -94,11 +95,14 @@ def require_no_wandb(test_case):
     return unittest.skipUnless(not is_wandb_available(), "test requires no wandb")(test_case)


-def require_3_gpus(test_case):
+def require_3_accelerators(test_case):
     """
-    Decorator marking a test that requires at least num_gpus GPUs. Skips the test if num_gpus is not available.
+    Decorator marking a test that requires at least 3 accelerators. Skips the test if 3 accelerators are not available.
     """
-    return unittest.skipUnless(torch.cuda.device_count() > 3, "test requires at least 3 GPUs")(test_case)
+    torch_accelerator_module = getattr(torch, torch_device, torch.cuda)
+    return unittest.skipUnless(
+        torch_accelerator_module.device_count() > 3, f"test requires at least 3 {torch_device}s"
+    )(test_case)


 class RandomBinaryJudge(BaseBinaryJudge):
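The `getattr(torch, torch_device, torch.cuda)` lookup works because `torch.cuda` and `torch.xpu` expose a matching `device_count()` API, so the decorator needs no per-backend branching. A sketch of how a test would consume it; the class and test names below are hypothetical:

```python
import unittest

import pytest

from .testing_utils import require_3_accelerators


@pytest.mark.slow
@require_3_accelerators  # skipped when the active backend reports too few devices
class TestNeedsManyAccelerators(unittest.TestCase):  # hypothetical example class
    def test_tensor_parallel_generate(self):
        ...  # body elided; the real tests talk to the vLLM server started in setUpClass
```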
