Commit b6dd32a

Make name of compressed-tensors quant method consistent across vLLM (#17255)
Signed-off-by: Harry Mellor <[email protected]>
1 parent f948869 commit b6dd32a

5 files changed: +10 −14 lines changed

tests/compile/test_full_graph.py

Lines changed: 1 addition & 5 deletions
```diff
@@ -20,15 +20,11 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
         ("facebook/opt-125m", {}),
         ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
             "dtype": torch.float16,
-            "quantization": "compressed-tensors"
         }),
         ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
             "dtype": torch.float16,
-            "quantization": "compressed-tensors"
-        }),
-        ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
-            "quantization": "compressed-tensors"
         }),
+        ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {}),
         ("meta-llama/Llama-3.2-1B-Instruct", {}),
     ]
 
```

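The test entries above drop the explicit "quantization" argument because the method is now detected from the checkpoint's quantization config under the single hyphenated name. A minimal usage sketch of the same idea, assuming a CUDA-capable environment with vLLM installed (model name taken from the test list above):

```python
from vllm import LLM

# No "quantization" argument: vLLM picks up the compressed-tensors method
# from the checkpoint's quantization config.
llm = LLM(model="neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8")
outputs = llm.generate("Hello, my name is")
print(outputs[0].outputs[0].text)
```
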
vllm/config.py

Lines changed: 5 additions & 3 deletions
```diff
@@ -752,9 +752,8 @@ def _verify_quantization(self) -> None:
         supported_quantization = QUANTIZATION_METHODS
         optimized_quantization_methods = [
             "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
-            "awq_marlin", "fbgemm_fp8", "compressed_tensors",
-            "compressed-tensors", "experts_int8", "quark", "nvfp4", "bitblas",
-            "gptq_bitblas"
+            "awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8",
+            "quark", "nvfp4", "bitblas", "gptq_bitblas"
         ]
         if self.quantization is not None:
             self.quantization = self.quantization.lower()
@@ -764,6 +763,9 @@ def _verify_quantization(self) -> None:
 
         if quant_cfg is not None:
             quant_method = quant_cfg.get("quant_method", "").lower()
+            quant_method = quant_method.replace("compressed_tensors",
+                                                "compressed-tensors")
+            quant_cfg["quant_method"] = quant_method
 
             # Detect which checkpoint is it
             for name in QUANTIZATION_METHODS:
```

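The added lines normalize the method name reported by a checkpoint before vLLM matches it against its registered quantization methods, so configs that use the underscored spelling still resolve to the single hyphenated name. A standalone sketch of just that normalization step (the quant_cfg dict here is a hypothetical stand-in for a checkpoint's HF quantization config):

```python
# Hypothetical checkpoint quantization config using the underscored spelling.
quant_cfg = {"quant_method": "compressed_tensors"}

# Same normalization as the lines added in _verify_quantization.
quant_method = quant_cfg.get("quant_method", "").lower()
quant_method = quant_method.replace("compressed_tensors", "compressed-tensors")
quant_cfg["quant_method"] = quant_method

assert quant_cfg["quant_method"] == "compressed-tensors"
```
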
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -72,7 +72,7 @@ def get_min_capability(cls) -> int:
         return 70
 
     def get_name(self) -> str:
-        return "compressed_tensors"
+        return "compressed-tensors"
 
     def get_quant_method(
         self,
```

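With get_name() returning the hyphenated form, the name reported by the quantization config class matches the one spelling kept in the supported and optimized method lists, so requesting the method explicitly and detecting it from a checkpoint use the same string. A hedged usage sketch, assuming a CUDA-capable environment (any compressed-tensors checkpoint would do):

```python
from vllm import LLM

# "compressed-tensors" is the one canonical spelling across vLLM, whether
# detected from the checkpoint or requested explicitly as below.
llm = LLM(model="neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8",
          quantization="compressed-tensors")
```
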
vllm/platforms/rocm.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -130,8 +130,8 @@ class RocmPlatform(Platform):
     device_control_env_var: str = "CUDA_VISIBLE_DEVICES"
 
     supported_quantization: list[str] = [
-        "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors",
-        "fbgemm_fp8", "gguf", "quark", "ptpc_fp8"
+        "awq", "gptq", "fp8", "compressed-tensors", "fbgemm_fp8", "gguf",
+        "quark", "ptpc_fp8"
     ]
 
     @classmethod
```

vllm/platforms/tpu.py

Lines changed: 1 addition & 3 deletions
```diff
@@ -30,9 +30,7 @@ class TpuPlatform(Platform):
     ray_device_key: str = "TPU"
     device_control_env_var: str = "TPU_VISIBLE_CHIPS"
 
-    supported_quantization: list[str] = [
-        "tpu_int8", "compressed-tensors", "compressed_tensors"
-    ]
+    supported_quantization: list[str] = ["tpu_int8", "compressed-tensors"]
 
     additional_env_vars: list[str] = [
         "TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS"
```
