
Commit 73d3ec3

[CI][UT] Compat with CUDA and NPU
Signed-off-by: jiangpeng <[email protected]>
1 parent db2f8d9 commit 73d3ec3

File tree

5 files changed: +36 −13 lines changed

  tests/conftest.py
  tests/v1/sample/test_sampler.py
  vllm/platforms/cuda.py
  vllm/platforms/interface.py
  vllm/worker/multi_step_model_runner.py

tests/conftest.py

Lines changed: 2 additions & 1 deletion

@@ -272,7 +272,8 @@ class HfRunner:
     def get_default_device(self):
         from vllm.platforms import current_platform

-        return ("cpu" if current_platform.is_cpu() else "cuda")
+        return ("cpu"
+                if current_platform.is_cpu() else current_platform.device_type)

     def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
         if x is None or isinstance(x, (bool, )):
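
This change makes HfRunner's default device follow the active platform instead of hardcoding "cuda". A minimal sketch of the resulting behavior; the "npu" value is an assumption about what an Ascend build reports as current_platform.device_type:

    from vllm.platforms import current_platform

    # "cpu" on a CPU build; otherwise whatever the platform reports,
    # e.g. "cuda" on a CUDA build or "npu" on an NPU build (assumed).
    device = ("cpu"
              if current_platform.is_cpu() else current_platform.device_type)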

tests/v1/sample/test_sampler.py

Lines changed: 12 additions & 10 deletions

@@ -6,14 +6,16 @@
 import pytest
 import torch

+from vllm.platforms import current_platform
 from vllm.utils import make_tensor_with_pad
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.sample.sampler import Sampler

 VOCAB_SIZE = 1024
 NUM_OUTPUT_TOKENS = 20
-CUDA_DEVICES = [
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+TORCH_DEVICES = [
+    f"{current_platform.device_type}:{i}"
+    for i in range(1 if current_platform.get_device_count() == 1 else 2)
 ]
 MAX_NUM_PROMPT_TOKENS = 64

@@ -224,7 +226,7 @@ def _create_weighted_output_token_list(
     return output_token_ids, sorted_token_ids_in_output


-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", TORCH_DEVICES)
 @pytest.mark.parametrize("batch_size", [1, 2, 32])
 def test_sampler_min_tokens_penalty(device: str, batch_size: int):
     """
@@ -254,7 +256,7 @@ def test_sampler_min_tokens_penalty(device: str, batch_size: int):
             assert logits[batch_idx][token_id] != -float("inf")


-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", TORCH_DEVICES)
 @pytest.mark.parametrize("batch_size", [1, 2, 32])
 @pytest.mark.parametrize("presence_penalty", [-2.0, 2.0])
 def test_sampler_presence_penalty(device: str, batch_size: int,
@@ -299,7 +301,7 @@ def test_sampler_presence_penalty(device: str, batch_size: int,
         assert penalized_token_id not in output_token_ids[batch_idx]


-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", TORCH_DEVICES)
 @pytest.mark.parametrize("batch_size", [1, 2, 32])
 @pytest.mark.parametrize("frequency_penalty", [-2.0, 2.0])
 def test_sampler_frequency_penalty(device: str, batch_size: int,
@@ -352,7 +354,7 @@ def test_sampler_frequency_penalty(device: str, batch_size: int,
         assert penalized_token_id not in distinct_sorted_token_ids_in_output


-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", TORCH_DEVICES)
 @pytest.mark.parametrize("batch_size", [1, 2, 32])
 @pytest.mark.parametrize("repetition_penalty", [0.1, 1.9])
 def test_sampler_repetition_penalty(device: str, batch_size: int,
@@ -398,7 +400,7 @@ def test_sampler_repetition_penalty(device: str, batch_size: int,
                 or non_penalized_token_id in output_tokens)


-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", TORCH_DEVICES)
 @pytest.mark.parametrize("batch_size", [1, 2, 32])
 @pytest.mark.parametrize("min_p", [0.0, 0.1])
 def test_sampler_min_p(device: str, batch_size: int, min_p: float):
@@ -438,7 +440,7 @@ def test_sampler_min_p(device: str, batch_size: int, min_p: float):
             assert logits[batch_idx][token_id] != -float("inf")


-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", TORCH_DEVICES)
 @pytest.mark.parametrize("batch_size", [1, 2, 32])
 @pytest.mark.parametrize("bias_value", [-0.1, 1.2])
 def test_sampler_logit_bias(device: str, batch_size: int, bias_value: float):
@@ -472,7 +474,7 @@ def test_sampler_logit_bias(device: str, batch_size: int, bias_value: float):
         assert logits_for_req[token_id] == pytest.approx(1e-2)


-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", TORCH_DEVICES)
 @pytest.mark.parametrize("batch_size", [1, 2, 32])
 @pytest.mark.parametrize("num_allowed_token_ids", [0, 1, 2])
 def test_sampler_allowed_token_ids(device: str, batch_size: int,
@@ -513,7 +515,7 @@ def test_sampler_allowed_token_ids(device: str, batch_size: int,
         assert logits_for_req[token_id] != -float("inf")


-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", TORCH_DEVICES)
 @pytest.mark.parametrize("batch_size", [1, 2, 32])
 @pytest.mark.parametrize("bad_words_lengths", [(1, ), (1, 3), (2, 2)])
 def test_sampler_bad_words(device: str, batch_size: int,
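
The renamed TORCH_DEVICES list derives both the device prefix and the visible device count from the active platform, so the same parametrized tests run unchanged on CUDA and NPU hosts. A sketch of what the list evaluates to under two assumed configurations:

    from vllm.platforms import current_platform

    # CUDA host with 2+ GPUs:  ["cuda:0", "cuda:1"]
    # Single-card NPU host (assumed device_type "npu"):  ["npu:0"]
    TORCH_DEVICES = [
        f"{current_platform.device_type}:{i}"
        for i in range(1 if current_platform.get_device_count() == 1 else 2)
    ]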

vllm/platforms/cuda.py

Lines changed: 8 additions & 0 deletions

@@ -86,6 +86,14 @@ def get_device_capability(cls,
     def get_device_name(cls, device_id: int = 0) -> str:
         raise NotImplementedError

+    @classmethod
+    def get_device_count(cls) -> int:
+        return torch.cuda.device_count()
+
+    @classmethod
+    def get_device_event(cls, blocking) -> torch.cuda.Event:
+        return torch.cuda.Event(blocking=blocking)
+
     @classmethod
     def get_device_total_memory(cls, device_id: int = 0) -> int:
         raise NotImplementedError
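
The two new classmethods give callers a platform-neutral way to ask how many devices are visible and to allocate a synchronization event; on CUDA they delegate directly to torch.cuda. A short usage sketch (record and synchronize are standard torch.cuda.Event methods):

    from vllm.platforms import current_platform

    if current_platform.get_device_count() > 0:
        event = current_platform.get_device_event(blocking=True)
        event.record()       # record the event on the current stream
        event.synchronize()  # block the host until the recorded work completes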

vllm/platforms/interface.py

Lines changed: 10 additions & 0 deletions

@@ -196,6 +196,16 @@ def get_device_name(cls, device_id: int = 0) -> str:
         """Get the name of a device."""
         raise NotImplementedError

+    @classmethod
+    def get_device_count(cls) -> int:
+        """Get the number of devices available on this platform."""
+        raise NotImplementedError
+
+    @classmethod
+    def get_device_event(cls, blocking):
+        """Get a device event for stream synchronization."""
+        raise NotImplementedError
+
     @classmethod
     def get_device_uuid(cls, device_id: int = 0) -> str:
         """Get the uuid of a device, e.g. the PCI bus ID."""

vllm/worker/multi_step_model_runner.py

Lines changed: 4 additions & 2 deletions

@@ -14,6 +14,7 @@
                                 SamplerOutput,
                                 SamplingMetadata, get_logprobs,
                                 get_pythonized_sample_results)
+from vllm.platforms import current_platform
 from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors,
                            Logprob, SequenceGroupMetadata, SequenceOutput)
 from vllm.utils import PyObjectCache, async_tensor_h2d, current_stream
@@ -158,8 +159,9 @@ class StatefulModelInput(BroadcastableModelInput):
     is_first_multi_step: bool = False
     base_output_proc_callback: Optional[Callable] = None
     # ping-pong data structures for multi-step to wait on the previous step
-    step_cuda_events: List[torch.cuda.Event] = field(
-        default_factory=lambda: [torch.cuda.Event(blocking=True)] * 2)
+    step_cuda_events: List = field(
+        default_factory=lambda:
+        [current_platform.get_device_event(blocking=True)] * 2)
     num_seqs: int = -1
     num_queries: int = -1
     num_single_step_prefills: int = 0
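
The two events implement the ping-pong named in the comment: each step records completion on one event while the next step waits on the other, so two events suffice no matter how many steps run. A minimal sketch of the pattern, independent of the runner itself:

    from vllm.platforms import current_platform

    events = [current_platform.get_device_event(blocking=True)
              for _ in range(2)]

    for step in range(4):
        if step > 0:
            events[(step - 1) % 2].synchronize()  # wait for the previous step
        # ... enqueue this step's device work on the current stream ...
        events[step % 2].record()  # mark the end of this step's work

Note that the dataclass default [event] * 2 stores the same event object in both slots; the sketch uses a comprehension so the two slots hold distinct events, which the ping-pong requires.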
