
Commit 7309f4a (merge, 2 parents: 8b9ea03 + f344107)

loosen test_sampler

Signed-off-by: Chenyaaang <[email protected]>

File tree: 78 files changed, +6175 / -887 lines


.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml

Lines changed: 2 additions & 2 deletions

@@ -4,8 +4,8 @@ tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.31
+    value: 0.30
   - name: "exact_match,flexible-extract"
-    value: 0.47
+    value: 0.465
 limit: 1319
 num_fewshot: 5
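
This is the "loosen test_sampler" part of the commit message in action: both gsm8k accuracy floors drop slightly (strict-match 0.31 to 0.30, flexible-extract 0.47 to 0.465). A minimal sketch of how such floors are typically enforced against lm-eval output; the results dict layout below is an assumed stand-in for illustration, not the harness's actual schema:

# Minimal sketch (assumption: not the harness's own check) of enforcing
# the loosened accuracy floors from the YAML config above.
thresholds = {
    "exact_match,strict-match": 0.30,
    "exact_match,flexible-extract": 0.465,
}
results = {
    "exact_match,strict-match": 0.31,       # hypothetical measured values
    "exact_match,flexible-extract": 0.47,
}
for metric, floor in thresholds.items():
    measured = results[metric]
    assert measured >= floor, f"{metric}: {measured:.3f} < floor {floor:.3f}"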

.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh

Lines changed: 4 additions & 2 deletions

@@ -17,7 +17,7 @@ source /etc/environment
 docker run --privileged --net host --shm-size=16G -it \
     -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
     vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
-    && python3 -m pip install pytest tpu-info \
+    && python3 -m pip install pytest pytest-asyncio tpu-info \
     && python3 -m pip install lm_eval[api]==0.4.4 \
     && export VLLM_USE_V1=1 \
     && export VLLM_XLA_CHECK_RECOMPILATION=1 \
@@ -42,8 +42,10 @@ docker run --privileged --net host --shm-size=16G -it \
     && echo TEST_8 \
     && pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py \
     && echo TEST_9 \
-    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py \
+    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py \
     && echo TEST_10 \
+    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py \
+    && echo TEST_11 \
     && pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py" \
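
Because every step inside the single bash -c string is chained with &&, the run fails fast: the first failing pytest aborts the container, and the last `echo TEST_N` marker printed identifies the stage that was reached. Inserting test_multimodal.py as TEST_9 therefore renumbers the later stages, which is why test_pallas.py moves to TEST_10 and the structured-output test becomes TEST_11. The pytest-asyncio dependency added alongside is presumably what the new test needs to run async test cases.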

CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -678,6 +678,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
   #
   set(VLLM_ROCM_EXT_SRC
     "csrc/rocm/torch_bindings.cpp"
+    "csrc/rocm/skinny_gemms.cu"
     "csrc/rocm/attention.cu")

   define_gpu_extension_target(
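
The new csrc/rocm/skinny_gemms.cu source is compiled into the same ROCm extension as torch_bindings.cpp and attention.cu; the entry points it appears to provide (LLMM1, wvSplitK, wvSplitKQ) are declared in csrc/rocm/ops.h further down in this commit.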
New file (filename not shown in this view): a BitBLAS int4 benchmark script

Lines changed: 236 additions & 0 deletions

# SPDX-License-Identifier: Apache-2.0
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
    MINIMUM_BITBLAS_VERSION)

try:
    import bitblas
    if bitblas.__version__ < MINIMUM_BITBLAS_VERSION:
        raise ImportError("bitblas version is wrong. Please "
                          f"install bitblas>={MINIMUM_BITBLAS_VERSION}")
except ImportError as e:
    bitblas_import_exception = e
    raise ValueError("Trying to use the bitblas backend, but could not import "
                     f"with the following error: {bitblas_import_exception}. "
                     "Please install bitblas through the following command: "
                     f"`pip install bitblas>={MINIMUM_BITBLAS_VERSION}`"
                     ) from bitblas_import_exception

from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target

from vllm.utils import FlexibleArgumentParser

parser = FlexibleArgumentParser(
    description="Benchmark BitBLAS int4 on a specific target.")

# Add arguments to the parser
parser.add_argument(
    "--target",
    type=str,
    default=auto_detect_nvidia_target(),
    help="Specify the target device for benchmarking.",
)
parser.add_argument("--group_size",
                    type=int,
                    default=None,
                    help="Group size for grouped quantization.")
parser.add_argument(
    "--A_dtype",
    type=str,
    default="float16",
    choices=["float16", "float32", "float64", "int32", "int8"],
    help="Data type of activation A.",
)
parser.add_argument(
    "--W_dtype",
    type=str,
    default="int4",
    choices=[
        "float16",
        "float32",
        "float64",
        "int32",
        "int8",
        "int4",
        "int2",
        "int1",
        "nf4",
        "fp4_e2m1",
    ],
    help="Data type of weight W.",
)
parser.add_argument(
    "--accum_dtype",
    type=str,
    default="float16",
    choices=["float16", "int32"],
    help="Data type for accumulation.",
)
parser.add_argument(
    "--out_dtype",
    type=str,
    default="float16",
    choices=["float16", "float32", "int32", "int8"],
    help="Data type for output.",
)
parser.add_argument(
    "--layout",
    type=str,
    default="nt",
    choices=["nt", "nn"],
    help="Matrix layout, 'nt' for non-transpose A and transpose W.",
)
parser.add_argument("--with_bias",
                    action="store_true",
                    help="Include bias in the benchmark.")
parser.add_argument(
    "--with_scaling",
    action="store_true",
    help="Include scaling factor in the quantization.",
)
parser.add_argument("--with_zeros",
                    action="store_true",
                    help="Include zeros in the quantization.")
parser.add_argument(
    "--zeros_mode",
    type=str,
    default=None,
    choices=["original", "rescale", "quantized"],
    help="Specify the mode for calculating zeros.",
)

# Parse the arguments
args = parser.parse_args()

# Assign arguments to variables
target = args.target
A_dtype = args.A_dtype
W_dtype = args.W_dtype
accum_dtype = args.accum_dtype
out_dtype = args.out_dtype
layout = args.layout
with_bias = args.with_bias
group_size = args.group_size
with_scaling = args.with_scaling
with_zeros = args.with_zeros
zeros_mode = args.zeros_mode

# Define a list of shared arguments that repeat in every config
shared_args = [
    A_dtype,
    W_dtype,
    out_dtype,
    accum_dtype,
    layout,
    with_bias,
    group_size,
    with_scaling,
    with_zeros,
    zeros_mode,
]

# Define just the (M, K, N) shapes in a more compact list
shapes = [
    # square test
    (1, 16384, 16384),
    # BLOOM-176B
    (1, 43008, 14336),
    (1, 14336, 14336),
    (1, 57344, 14336),
    (1, 14336, 57344),
    # OPT-65B
    (1, 9216, 9216),
    (1, 36864, 9216),
    (1, 9216, 36864),
    (1, 22016, 8192),
    # LLAMA-70B/65B
    (1, 8192, 22016),
    (1, 8192, 8192),
    (1, 28672, 8192),
    (1, 8192, 28672),
    # square test
    (16384, 16384, 16384),
    # BLOOM-176B
    (8192, 43008, 14336),
    (8192, 14336, 14336),
    (8192, 57344, 14336),
    (8192, 14336, 57344),
    # OPT-65B
    (8192, 9216, 9216),
    (8192, 36864, 9216),
    (8192, 9216, 36864),
    (8192, 22016, 8192),
    # LLAMA-70B/65B
    (8192, 8192, 22016),
    (8192, 8192, 8192),
    (8192, 28672, 8192),
    (8192, 8192, 28672),
]

# Build test shapes with all the shared arguments
test_shapes = [(MatmulConfig, Matmul, (*shape, *shared_args))
               for shape in shapes]

benchmark_sets = []
benchmark_sets.extend(test_shapes)

benchmark_results = {}
for config_class, operator, input_args in benchmark_sets:
    config = config_class(*input_args)
    matmul = operator(config, target=target, enable_tuning=True)
    kernel_latency = matmul.profile_latency()

    print("Time cost is: {:.3f} ms".format(kernel_latency))

    profile_config = {
        f"{operator.__name__}-{'-'.join([str(i) for i in input_args])}": {
            "BitBLAS_top20_latency": kernel_latency,
        }
    }

    benchmark_results.update(profile_config)

# Define headers for the table
headers = [
    "PrimFunc",
    "Input Arguments",
    "BitBLAS Top20 Latency",
]

# Calculate column widths for pretty printing
col_widths = [0, 0, 0]
for config_key, values in benchmark_results.items():
    args_split = config_key.split("-")
    func_name = args_split[0]
    input_args_str = "-".join(args_split[1:])
    col_widths[0] = max(col_widths[0], len(func_name) + 2, len(headers[0]) + 2)
    col_widths[1] = max(col_widths[1],
                        len(input_args_str) + 2,
                        len(headers[1]) + 2)
    col_widths[2] = max(col_widths[2],
                        len(f"{values['BitBLAS_top20_latency']:.3f} ms") + 2,
                        len(headers[2]) + 2)
    # break only if you want to measure widths from a single example;
    # otherwise, let it loop over all items.

# Print header
for i, header in enumerate(headers):
    headers[i] = header.ljust(col_widths[i])
print("".join(headers))
print("-" * sum(col_widths))

# Print rows
for config_key, values in benchmark_results.items():
    args_split = config_key.split("-")
    func_name = args_split[0]
    input_args_str = "-".join(args_split[1:])
    row = [
        func_name,
        input_args_str,
        f"{values['BitBLAS_top20_latency']:.3f} ms",
    ]
    row_str = "".join(
        [str(cell).ljust(col_widths[idx]) for idx, cell in enumerate(row)])
    print(row_str)

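All flags the script accepts are defined above, so an illustrative invocation (the script's path is elided in this view, shown here as a placeholder) would be: `python <benchmark_script>.py --W_dtype int4 --group_size 128 --with_scaling --with_zeros --zeros_mode original`. Each (M, K, N) shape is then tuned (enable_tuning=True) and profiled by BitBLAS, and profile_latency() supplies the "BitBLAS Top20 Latency" column of the printed table.
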
csrc/quantization/cutlass_w8a8/moe/moe_data.cu

Lines changed: 15 additions & 2 deletions

@@ -46,14 +46,26 @@ __global__ void compute_expert_offsets(
 }

 __global__ void compute_arg_sorts(const int* __restrict__ topk_ids,
+                                  const int32_t* __restrict__ expert_offsets,
                                   int32_t* input_permutation,
                                   int32_t* output_permutation,
                                   int32_t* atomic_buffer, const int topk_length,
                                   const int topk) {
-  int expert_id = blockIdx.x;
+  int const blk_expert_id = blockIdx.x;
+  int const num_experts = gridDim.x;
+  int32_t const num_tokens = expert_offsets[num_experts];

   for (int i = threadIdx.x; i < topk_length; i += THREADS_PER_EXPERT) {
-    if (topk_ids[i] == expert_id) {
+    int const expert_id = topk_ids[i];
+    if (expert_id == -1 && blockIdx.x == 0) {
+      // output_permutation is used to re-order the moe outputs. It is
+      // used as c2 = c2[c_map], where c2 is a torch.tensor that is the
+      // output of the cutlass kernels and c_map is the output_permutation.
+      // c2 is initialized to zeros, therefore by setting the output_permutation
+      // to num_tokens, we are guaranteed to fill the moe outputs to zero
+      // for "invalid" topk_ids.
+      output_permutation[i] = num_tokens;
+    } else if (expert_id == blk_expert_id) {
       int start = atomicAdd(&atomic_buffer[expert_id], 1);
       input_permutation[start] = i / topk;
       output_permutation[i] = start;
@@ -83,6 +95,7 @@ void get_cutlass_moe_mm_data_caller(
       static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
   compute_arg_sorts<<<num_experts, num_threads, 0, stream>>>(
       static_cast<const int32_t*>(topk_ids.data_ptr()),
+      static_cast<const int32_t*>(expert_offsets.data_ptr()),
       static_cast<int32_t*>(input_permutation.data_ptr()),
       static_cast<int32_t*>(output_permutation.data_ptr()),
       static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(),
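
The zero-fill trick described in the new kernel comment can be seen in a few lines of PyTorch. This is a toy illustration under the assumption stated in that comment (the gather source is zero-initialized and the row at index num_tokens is never written); it is not vLLM code:

# Toy illustration (assumption-based, not vLLM code) of zero-filling
# "invalid" topk slots via the output permutation.
import torch

num_tokens, hidden = 4, 8
# Stand-in for c2: zero-initialized kernel output, where the row at index
# `num_tokens` is never written and so stays all zeros.
c2 = torch.zeros(num_tokens + 1, hidden)
c2[:num_tokens] = torch.randn(num_tokens, hidden)

# Stand-in for output_permutation (c_map): valid slots point at written rows;
# the kernel pointed the "invalid" slot (topk_id == -1) at row `num_tokens`.
c_map = torch.tensor([2, 0, 1, num_tokens])
out = c2[c_map]                   # the re-order step: c2 = c2[c_map]
assert torch.all(out[3] == 0)     # the invalid slot reads back as zeros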

csrc/rocm/ops.h

Lines changed: 9 additions & 0 deletions

@@ -2,6 +2,15 @@

 #include <torch/all.h>

+torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,
+                    const int64_t rows_per_block);
+
+torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b,
+                       const int64_t CuCount);
+
+void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
+               at::Tensor& scale_a, at::Tensor& scale_b, const int64_t CuCount);
+
 void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums,
                      torch::Tensor& max_logits, torch::Tensor& tmp_out,
                      torch::Tensor& query, torch::Tensor& key_cache,
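
Given the paired skinny_gemms.cu addition in CMakeLists.txt, these declarations appear to be skinny-GEMM entry points: GEMMs where one operand has very few rows, so splitting the K-reduction across compute units (the CuCount argument) can beat a generic GEMM. A toy numerical reference for the shape regime involved; shapes, layout, and argument order here are assumptions for illustration, not the kernels' actual contract:

# Toy reference (assumption, not vLLM's binding) for the skinny-GEMM regime.
import torch

M, K, N = 2, 4096, 8192        # tiny M is the "skinny" case
a = torch.randn(M, K)          # activations
b = torch.randn(N, K)          # weight stored K-major ("nt"-style layout)

# Numerically the custom kernels reduce to a plain matmul; rows_per_block and
# CuCount only steer how the reduction is partitioned across compute units.
out = a @ b.t()

# wvSplitKQ additionally takes scale_a/scale_b, suggesting a dequantization
# epilogue, conceptually: out ~= (a_q @ b_q.T) * scale_a * scale_b.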
