Commit b6dd32a

Make name of compressed-tensors quant method consistent across vLLM (#17255)
Signed-off-by: Harry Mellor <[email protected]>
1 parent f948869 commit b6dd32a

5 files changed: +10 −14 lines changed

tests/compile/test_full_graph.py

Lines changed: 1 addition & 5 deletions
```diff
@@ -20,15 +20,11 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
         ("facebook/opt-125m", {}),
         ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
             "dtype": torch.float16,
-            "quantization": "compressed-tensors"
         }),
         ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
             "dtype": torch.float16,
-            "quantization": "compressed-tensors"
-        }),
-        ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
-            "quantization": "compressed-tensors"
         }),
+        ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {}),
         ("meta-llama/Llama-3.2-1B-Instruct", {}),
     ]
 
```

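The test entries above drop the explicit "quantization" argument because the method is now detected from the checkpoint's quantization config under the single hyphenated name. A minimal usage sketch of the same idea, assuming a CUDA-capable environment with vLLM installed (model name taken from the test list above):

```python
from vllm import LLM

# No "quantization" argument: vLLM picks up the compressed-tensors method
# from the checkpoint's quantization config.
llm = LLM(model="neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8")
outputs = llm.generate("Hello, my name is")
print(outputs[0].outputs[0].text)
```
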
vllm/config.py

Lines changed: 5 additions & 3 deletions
```diff
@@ -752,9 +752,8 @@ def _verify_quantization(self) -> None:
         supported_quantization = QUANTIZATION_METHODS
         optimized_quantization_methods = [
             "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
-            "awq_marlin", "fbgemm_fp8", "compressed_tensors",
-            "compressed-tensors", "experts_int8", "quark", "nvfp4", "bitblas",
-            "gptq_bitblas"
+            "awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8",
+            "quark", "nvfp4", "bitblas", "gptq_bitblas"
         ]
         if self.quantization is not None:
             self.quantization = self.quantization.lower()
@@ -764,6 +763,9 @@ def _verify_quantization(self) -> None:
 
         if quant_cfg is not None:
             quant_method = quant_cfg.get("quant_method", "").lower()
+            quant_method = quant_method.replace("compressed_tensors",
+                                                "compressed-tensors")
+            quant_cfg["quant_method"] = quant_method
 
             # Detect which checkpoint is it
             for name in QUANTIZATION_METHODS:
```

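The added lines normalize the method name reported by a checkpoint before vLLM matches it against its registered quantization methods, so configs that use the underscored spelling still resolve to the single hyphenated name. A standalone sketch of just that normalization step (the quant_cfg dict here is a hypothetical stand-in for a checkpoint's HF quantization config):

```python
# Hypothetical checkpoint quantization config using the underscored spelling.
quant_cfg = {"quant_method": "compressed_tensors"}

# Same normalization as the lines added in _verify_quantization.
quant_method = quant_cfg.get("quant_method", "").lower()
quant_method = quant_method.replace("compressed_tensors", "compressed-tensors")
quant_cfg["quant_method"] = quant_method

assert quant_cfg["quant_method"] == "compressed-tensors"
```
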
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -72,7 +72,7 @@ def get_min_capability(cls) -> int:
         return 70
 
     def get_name(self) -> str:
-        return "compressed_tensors"
+        return "compressed-tensors"
 
     def get_quant_method(
         self,
```

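With get_name() returning the hyphenated form, the name reported by the quantization config class matches the one spelling kept in the supported and optimized method lists, so requesting the method explicitly and detecting it from a checkpoint use the same string. A hedged usage sketch, assuming a CUDA-capable environment (any compressed-tensors checkpoint would do):

```python
from vllm import LLM

# "compressed-tensors" is the one canonical spelling across vLLM, whether
# detected from the checkpoint or requested explicitly as below.
llm = LLM(model="neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8",
          quantization="compressed-tensors")
```
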
vllm/platforms/rocm.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -130,8 +130,8 @@ class RocmPlatform(Platform):
     device_control_env_var: str = "CUDA_VISIBLE_DEVICES"
 
     supported_quantization: list[str] = [
-        "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors",
-        "fbgemm_fp8", "gguf", "quark", "ptpc_fp8"
+        "awq", "gptq", "fp8", "compressed-tensors", "fbgemm_fp8", "gguf",
+        "quark", "ptpc_fp8"
     ]
 
     @classmethod
```

vllm/platforms/tpu.py

Lines changed: 1 addition & 3 deletions
```diff
@@ -30,9 +30,7 @@ class TpuPlatform(Platform):
     ray_device_key: str = "TPU"
     device_control_env_var: str = "TPU_VISIBLE_CHIPS"
 
-    supported_quantization: list[str] = [
-        "tpu_int8", "compressed-tensors", "compressed_tensors"
-    ]
+    supported_quantization: list[str] = ["tpu_int8", "compressed-tensors"]
 
     additional_env_vars: list[str] = [
         "TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS"
```
