|
21 | 21 | from tests.models.utils import (TokensTextLogprobs,
|
22 | 22 | TokensTextLogprobsPromptLogprobs)
|
23 | 23 | from vllm import LLM, SamplingParams
|
| 24 | +from vllm.assets.audio import AudioAsset |
24 | 25 | from vllm.assets.image import ImageAsset
|
25 | 26 | from vllm.assets.video import VideoAsset
|
26 | 27 | from vllm.config import TaskOption, _get_and_verify_dtype
|
@@ -103,10 +104,25 @@ def prompts(self, prompts: _VideoAssetPrompts) -> list[str]:
|
103 | 104 | return [prompts["sample_demo_1"]]
|
104 | 105 |
|
105 | 106 |
|
class _AudioAssetsBase(UserList[AudioAsset]):
    """List-like container whose elements are :class:`AudioAsset` objects."""
| 109 | + |
| 110 | + |
class _AudioAssets(_AudioAssetsBase):
    """Collection of the audio assets made available to the tests.

    On construction the list is populated with one :class:`AudioAsset`
    per name, in this order: ``mary_had_lamb``, ``winning_call``.
    """

    def __init__(self) -> None:
        asset_names = ("mary_had_lamb", "winning_call")
        super().__init__([AudioAsset(name) for name in asset_names])
| 118 | + |
| 119 | + |
# Module-level asset collections, each instantiated once when this module
# is imported.
IMAGE_ASSETS = _ImageAssets()
"""Singleton instance of :class:`_ImageAssets`."""
VIDEO_ASSETS = _VideoAssets()
"""Singleton instance of :class:`_VideoAssets`."""
AUDIO_ASSETS = _AudioAssets()
"""Singleton instance of :class:`_AudioAssets`."""
110 | 126 |
|
111 | 127 |
|
112 | 128 | @pytest.fixture(scope="function", autouse=True)
|
@@ -263,6 +279,11 @@ def video_assets() -> _VideoAssets:
|
263 | 279 | return VIDEO_ASSETS
|
264 | 280 |
|
265 | 281 |
|
@pytest.fixture(scope="session")
def audio_assets() -> _AudioAssets:
    """Session-scoped fixture returning the module-level ``AUDIO_ASSETS``.

    The same ``_AudioAssets`` object is handed to every test that requests
    this fixture; no new instance is created here.
    """
    return AUDIO_ASSETS
| 285 | + |
| 286 | + |
266 | 287 | _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)
|
267 | 288 | _R = TypeVar("_R")
|
268 | 289 |
|
@@ -390,10 +411,15 @@ def get_inputs(
|
390 | 411 | processor_kwargs["images"] = image
|
391 | 412 | if videos is not None and (video := videos[i]) is not None:
|
392 | 413 | processor_kwargs["videos"] = video
|
393 |
| - if audios is not None and (audio_tuple := audios[i]) is not None: |
394 |
| - audio, sr = audio_tuple |
395 |
| - processor_kwargs["audio"] = audio |
396 |
| - processor_kwargs["sampling_rate"] = sr |
| 414 | + if audios is not None and (audio_inputs := audios[i]) is not None: |
| 415 | + # HACK - not all processors take sampling_rate; we should |
| 416 | + # clean this up in the future. |
| 417 | + if len(audio_inputs) == 2: |
| 418 | + audio, sr = audio_inputs |
| 419 | + processor_kwargs["audio"] = audio |
| 420 | + processor_kwargs["sampling_rate"] = sr |
| 421 | + else: |
| 422 | + processor_kwargs["audio"] = audio_inputs |
397 | 423 |
|
398 | 424 | inputs = self.processor(**processor_kwargs)
|
399 | 425 | if isinstance(inputs, BatchFeature):
|
|
0 commit comments