
Commit c45809d

Merge branch 'main' into granitemoehybrid_clean
2 parents: e1ba9e8 + fe742ae

180 files changed (+8494, -3145 lines)

.buildkite/release-pipeline.yaml
Lines changed: 15 additions & 0 deletions

@@ -86,3 +86,18 @@ steps:
       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
     env:
       DOCKER_BUILDKIT: "1"
+
+  - block: "Build Neuron release image"
+    key: block-neuron-release-image-build
+    depends_on: ~
+
+  - label: "Build and publish Neuron release image"
+    depends_on: block-neuron-release-image-build
+    agents:
+      queue: neuron-postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
+    env:
+      DOCKER_BUILDKIT: "1"

.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
Lines changed: 26 additions & 2 deletions

@@ -5,10 +5,34 @@
 set -ex
 
 # Setup cleanup
-remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
+remove_docker_container() { podman rm -f cpu-test-ubi9-ppc || true; podman system prune -f; }
 trap remove_docker_container EXIT
 remove_docker_container
 
 # Try building the docker image
-docker build -t cpu-test -f docker/Dockerfile.ppc64le .
+podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
+
+# Run the image
+podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test-ubi9-ppc cpu-test-ubi9-ppc
+
+function cpu_tests() {
+
+  # offline inference
+  podman exec cpu-test-ubi9-ppc bash -c "
+    set -e
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+
+  # Run basic model test
+  podman exec cpu-test-ubi9-ppc bash -c "
+    set -e
+    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
+    pip install sentence-transformers datamodel_code_generator
+    pytest -v -s tests/models/embedding/language/test_cls_models.py::test_classification_models[float-jason9693/Qwen2.5-1.5B-apeach]
+    pytest -v -s tests/models/embedding/language/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]
+    pytest -v -s tests/models/encoder_decoder/language -m cpu_model"
+}
+
+# All of CPU tests are expected to be finished less than 40 mins.
+export -f cpu_tests
+timeout 40m bash -c cpu_tests
 

.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
Lines changed: 3 additions & 1 deletion

@@ -17,10 +17,12 @@ source /etc/environment
 docker run --privileged --net host --shm-size=16G -it \
     -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
     vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
-    && python3 -m pip install pytest \
+    && python3 -m pip install pytest tpu-info \
     && python3 -m pip install lm_eval[api]==0.4.4 \
     && export VLLM_USE_V1=1 \
     && export VLLM_XLA_CHECK_RECOMPILATION=1 \
+    && echo HARDWARE \
+    && tpu-info \
     && echo TEST_0 \
     && pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \
     && echo TEST_1 \

.buildkite/test-pipeline.yaml
Lines changed: 2 additions & 1 deletion

@@ -118,7 +118,7 @@ steps:
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
   - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py
   - pytest -v -s entrypoints/test_chat_utils.py
   - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
@@ -552,6 +552,7 @@ steps:
   # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
 
 - label: Plugin Tests (2 GPUs) # 40min
   working_dir: "/vllm-workspace/tests"

.github/ISSUE_TEMPLATE/200-installation.yml
Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@ body:
     description: |
       Please run the following and paste the output below.
       ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
       # For security purposes, please feel free to check the contents of collect_env.py before running it.
       python collect_env.py
       ```

.github/ISSUE_TEMPLATE/300-usage.yml
Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@ body:
     description: |
       Please run the following and paste the output below.
       ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
       # For security purposes, please feel free to check the contents of collect_env.py before running it.
       python collect_env.py
       ```

.github/ISSUE_TEMPLATE/400-bug-report.yml
Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@ body:
     description: |
       Please run the following and paste the output below.
       ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
       # For security purposes, please feel free to check the contents of collect_env.py before running it.
       python collect_env.py
       ```

.github/ISSUE_TEMPLATE/700-performance-discussion.yml
Lines changed: 1 addition & 1 deletion

@@ -35,7 +35,7 @@ body:
     description: |
       Please run the following and paste the output below.
       ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
       # For security purposes, please feel free to check the contents of collect_env.py before running it.
       python collect_env.py
       ```

.gitignore
Lines changed: 3 additions & 0 deletions

@@ -203,3 +203,6 @@ benchmarks/**/*.json
 # Linting
 actionlint
 shellcheck*/
+
+# Ingore moe/marlin_moe gen code
+csrc/moe/marlin_moe_wna16/kernel_*

benchmarks/backend_request_func.py
Lines changed: 107 additions & 0 deletions

@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
+import io
 import json
 import os
 import sys

@@ -32,6 +33,7 @@ class RequestFuncInput:
     extra_body: Optional[dict] = None
     multi_modal_content: Optional[dict] = None
     ignore_eos: bool = False
+    language: Optional[str] = None
 
 
 @dataclass

@@ -436,6 +438,110 @@ async def async_request_openai_chat_completions(
     return output
 
 
+async def async_request_openai_audio(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    # Lazy import without PlaceholderModule to avoid vllm dep.
+    import soundfile
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        ("transcriptions", "translations"
+         )), "OpenAI Chat Completions API URL must end with 'transcriptions' "
+    "or `translations`."
+
+    async with aiohttp.ClientSession(trust_env=True,
+                                     timeout=AIOHTTP_TIMEOUT) as session:
+        content = [{"type": "text", "text": request_func_input.prompt}]
+        payload = {
+            "model": request_func_input.model_name \
+                if request_func_input.model_name else request_func_input.model,
+            "temperature": 0.0,
+            "max_completion_tokens": request_func_input.output_len,
+            "stream": True,
+            "language": "en",
+            # Flattened due to multipart/form-data
+            "stream_include_usage": True,
+            "stream_continuous_usage_stats": True
+        }
+        if request_func_input.extra_body:
+            payload.update(request_func_input.extra_body)
+        headers = {
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+        }
+
+        # Send audio file
+        def to_bytes(y, sr):
+            buffer = io.BytesIO()
+            soundfile.write(buffer, y, sr, format="WAV")
+            buffer.seek(0)
+            return buffer
+
+        with to_bytes(*request_func_input.multi_modal_content['audio']) as f:
+            form = aiohttp.FormData()
+            form.add_field('file', f, content_type='audio/wav')
+            for key, value in payload.items():
+                form.add_field(key, str(value))
+
+            output = RequestFuncOutput()
+            output.prompt_len = request_func_input.prompt_len
+
+            generated_text = ""
+            ttft = 0.0
+            st = time.perf_counter()
+            most_recent_timestamp = st
+            try:
+                async with session.post(url=api_url,
+                                        data=form,
+                                        headers=headers) as response:
+                    if response.status == 200:
+                        async for chunk_bytes in response.content:
+                            chunk_bytes = chunk_bytes.strip()
+                            if not chunk_bytes:
+                                continue
+
+                            chunk = chunk_bytes.decode("utf-8").removeprefix(
+                                "data: ")
+                            if chunk != "[DONE]":
+                                timestamp = time.perf_counter()
+                                data = json.loads(chunk)
+
+                                if choices := data.get("choices"):
+                                    content = choices[0]["delta"].get(
+                                        "content")
+                                    # First token
+                                    if ttft == 0.0:
+                                        ttft = timestamp - st
+                                        output.ttft = ttft
+
+                                    # Decoding phase
+                                    else:
+                                        output.itl.append(
+                                            timestamp - most_recent_timestamp)
+
+                                    generated_text += content or ""
+                                elif usage := data.get("usage"):
+                                    output.output_tokens = usage.get(
+                                        "completion_tokens")
+
+                                most_recent_timestamp = timestamp
+
+                        output.generated_text = generated_text
+                        output.success = True
+                        output.latency = most_recent_timestamp - st
+                    else:
+                        output.error = response.reason or ""
+                        output.success = False
+            except Exception:
+                output.success = False
+                exc_info = sys.exc_info()
+                output.error = "".join(traceback.format_exception(*exc_info))
+
+        if pbar:
+            pbar.update(1)
+        return output
+
+
 def get_model(pretrained_model_name_or_path: str) -> str:
     if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
         from modelscope import snapshot_download

@@ -493,6 +599,7 @@ def get_tokenizer(
     "deepspeed-mii": async_request_deepspeed_mii,
     "openai": async_request_openai_completions,
     "openai-chat": async_request_openai_chat_completions,
+    "openai-audio": async_request_openai_audio,
     "tensorrt-llm": async_request_trt_llm,
     "scalellm": async_request_openai_completions,
     "sglang": async_request_openai_completions,

benchmarks/benchmark_dataset.py
Lines changed: 80 additions & 0 deletions

@@ -64,6 +64,7 @@ class SampleRequest:
 
 class BenchmarkDataset(ABC):
     DEFAULT_SEED = 0
+    IS_MULTIMODAL = False
 
     def __init__(
         self,

@@ -621,6 +622,7 @@ class ConversationDataset(HuggingFaceDataset):
     SUPPORTED_DATASET_PATHS = {
         'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered'
     }
+    IS_MULTIMODAL = True
 
     def sample(self,
                tokenizer: PreTrainedTokenizerBase,

@@ -685,6 +687,7 @@ class VisionArenaDataset(HuggingFaceDataset):
         "lmarena-ai/vision-arena-bench-v0.1":
         lambda x: x["turns"][0][0]["content"]
     }
+    IS_MULTIMODAL = True
 
     def sample(
         self,

@@ -815,3 +818,80 @@ def sample(self,
             ))
         self.maybe_oversample_requests(sampled_requests, num_requests)
         return sampled_requests
+
+
+# -----------------------------------------------------------------------------
+# ASR Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class ASRDataset(HuggingFaceDataset):
+    """
+    Dataset class for processing a ASR dataset for transcription.
+    Tested on the following set:
+
+    +----------------+----------------------------------------+--------------------------+-----------------------------+
+    | Dataset        | Domain                                 | Speaking Style           | hf-subset                   |
+    +----------------+----------------------------------------+--------------------------+-----------------------------+
+    | TED-LIUM       | TED talks                              | Oratory                  | release1, release2, release3|
+    |                |                                        |                          | release3-speaker-adaptation |
+    | VoxPopuli      | European Parliament                    | Oratory                  | en, de, it, fr, ...         |
+    | LibriSpeech    | Audiobook                              | Narrated                 | "LIUM/tedlium"              |
+    | GigaSpeech     | Audiobook, podcast, YouTube            | Narrated, spontaneous    | xs, s, m, l, xl, dev, test  |
+    | SPGISpeech     | Financial meetings                     | Oratory, spontaneous     | S, M, L, dev, test          |
+    | AMI            | Meetings                               | Spontaneous              | ihm, sdm                    |
+    +----------------+----------------------------------------+--------------------------+-----------------------------+
+
+    """ # noqa: E501
+    SUPPORTED_DATASET_PATHS = {
+        "openslr/librispeech_asr", "facebook/voxpopuli", "LIUM/tedlium",
+        "edinburghcstr/ami", "speechcolab/gigaspeech", "kensho/spgispeech"
+    }
+
+    DEFAULT_OUTPUT_LEN = 128
+    IS_MULTIMODAL = True
+
+    # TODO Whisper-specific. Abstract interface when more models are supported.
+    TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|>"\
+                             "<|notimestamps|>"
+    skip_long_audios: bool = True
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: Optional[int] = None,
+        **kwargs,
+    ) -> list:
+        import librosa
+        output_len = (output_len
+                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
+        prompt = ASRDataset.TRANSCRIPTION_PREAMBLE
+        prompt_len = len(tokenizer(prompt).input_ids)
+        sampled_requests = []
+        skipped = 0
+        for item in self.data:
+            if len(sampled_requests) >= num_requests:
+                break
+            audio = item["audio"]
+            y, sr = audio["array"], audio["sampling_rate"]
+            duration_s = librosa.get_duration(y=y, sr=sr)
+            # Whisper max supported duration
+            if self.skip_long_audios and duration_s > 30:
+                skipped += 1
+                continue
+
+            mm_content = {"audio": (y, sr)}
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                    multi_modal_data=mm_content,
+                ))
+        if skipped:
+            logger.warning("%d samples discarded from dataset due to" \
+                           " their length being greater than" \
+                           " what Whisper supports.", skipped)
+        self.maybe_oversample_requests(sampled_requests, num_requests)
+        return sampled_requests
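
`ASRDataset` skips clips longer than 30 seconds, the maximum audio window Whisper accepts, and logs how many were discarded. A hypothetical driver for the new class follows; the constructor keywords mirror the other `HuggingFaceDataset` subclasses in this file and are assumptions rather than a documented API, as are the subset and split names.

```python
# Hypothetical usage sketch for ASRDataset. The constructor keywords follow
# the pattern of the other HuggingFaceDataset subclasses and are assumptions,
# as are the dataset subset/split names.
from transformers import AutoTokenizer

from benchmark_dataset import ASRDataset

tokenizer = AutoTokenizer.from_pretrained("openai/whisper-large-v3")
dataset = ASRDataset(
    dataset_path="openslr/librispeech_asr",
    dataset_subset="clean",
    dataset_split="validation",
)

# Each SampleRequest carries the fixed Whisper preamble as its prompt and the
# raw (waveform, sampling_rate) tuple as multi-modal data.
for req in dataset.sample(tokenizer=tokenizer, num_requests=8):
    y, sr = req.multi_modal_data["audio"]
    print(req.prompt_len, req.expected_output_len, round(len(y) / sr, 2))
```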
