Skip to content

Commit cd765cc

Browse files
s3woz, bohnstingl, DarkLight1337, and tlrmchlsmth
authored and committed
[Model] Add GraniteMoeHybrid 4.0 model (vllm-project#17497)
Signed-off-by: Thomas Ortner <[email protected]> Signed-off-by: Stanislaw Wozniak <[email protected]> Co-authored-by: Thomas Ortner <[email protected]> Co-authored-by: Cyrus Leung <[email protected]> Co-authored-by: Tyler Michael Smith <[email protected]>
1 parent 7f5cbda commit cd765cc

File tree

6 files changed

+637
-0
lines changed

6 files changed

+637
-0
lines changed

docs/source/models/supported_models.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,11 @@ See [this page](#generative-models) for more information on how to use generativ
385385
* `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc.
386386
* ✅︎
387387
* ✅︎
388+
- * `GraniteMoeHybridForCausalLM`
389+
* Granite 4.0 MoE Hybrid
390+
* `ibm-granite/granite-4.0-tiny-preview`, etc.
391+
* ✅︎
392+
* ✅︎
388393
- * `GraniteMoeSharedForCausalLM`
389394
* Granite MoE Shared
390395
* `ibm-research/moe-7b-1b-active-shared-experts` (test model)
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# SPDX-License-Identifier: Apache-2.0
"""Equivalence test for the GraniteMoeHybrid model family.

Generates greedily with both the vLLM and the HuggingFace implementations
of the same checkpoint and checks that their logprobs agree.
"""

import pytest

from ...utils import check_logprobs_close

# Path of the checkpoints (HF Hub model IDs) under test.
MODELS = [
    "ibm-granite/granite-4.0-tiny-preview",
]


@pytest.mark.skip(
    reason="Granite 4.0 is not yet available in huggingface transformers")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float16", "bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_model_equivalence_to_hf_greedy(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
):
    """Compare vLLM greedy outputs against the HF reference implementation.

    Currently skipped (see the ``skip`` mark) until the architecture is
    available in a released ``transformers`` version.

    Args:
        hf_runner / vllm_runner: project fixtures that build and tear down
            the respective model wrappers as context managers.
        example_prompts: shared fixture with the prompts to decode.
        model: HF Hub ID of the checkpoint (parametrized over MODELS).
        dtype: model weight/compute dtype for both backends.
        max_tokens: number of tokens to generate per prompt.
        num_logprobs: top-k logprobs to collect for the comparison.
    """
    # The vLLM model is created and torn down first, before the HF model is
    # loaded — presumably to avoid holding both models in memory at once.
    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

    # Compare the two output sets; on mismatch the helper reports which
    # side is which via name_0/name_1.
    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )

tests/models/language/generation/test_hybrid.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@
2323

2424
HYBRID_MODELS = [
2525
"ai21labs/Jamba-tiny-dev",
26+
# NOTE: ibm-granite/granite-4.0-tiny-preview is skipped currently as
27+
# it is not yet available in huggingface transformers
28+
# "ibm-granite/granite-4.0-tiny-preview",
2629
# NOTE: Running Plamo2 in transformers implementation requires to install
2730
# causal-conv1d package, which is not listed as a test dependency as it's
2831
# not compatible with pip-compile.

tests/models/registry.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,8 @@ def check_available_online(
166166
{"1b": "EleutherAI/pythia-1.4b"}),
167167
"GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"),
168168
"GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"),
169+
"GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview", # noqa: E501
170+
min_transformers_version="4.52.0"), # noqa: E501
169171
"GraniteMoeSharedForCausalLM": _HfExamplesInfo("ibm-research/moe-7b-1b-active-shared-experts"), # noqa: E501
170172
"Grok1ModelForCausalLM": _HfExamplesInfo("hpcai-tech/grok-1",
171173
trust_remote_code=True),

0 commit comments

Comments
 (0)