@@ -232,8 +232,7 @@ def __init__(
                                        quant_config=quant_config,
                                        prefix=f"{prefix}.o_proj")
 
-        self.position_embedding_type = config.position_embedding_type
-        if self.position_embedding_type == "rope":
+        if config.position_embedding_type == "rope":
             self.rotary_emb = get_rope(
                 self.head_dim,
                 rotary_dim=self.head_dim,
@@ -244,6 +243,8 @@ def __init__(
                 and config.rope_scaling is not None else None,
                 is_neox_style=True,
             )
+        else:
+            self.rotary_emb = None
 
         self.attn = Attention(self.num_heads,
                               self.head_dim,
@@ -263,7 +264,7 @@ def forward(
         key = self.k_proj(hidden_states)[0]
         value = self.v_proj(hidden_states)[0]
 
-        if self.position_embedding_type == "rope":
+        if self.rotary_emb is not None:
             query, key = self.rotary_emb(positions, query, key)
 
         hidden_states = self.attn(query, key, value)
@@ -349,11 +350,11 @@ def forward(
             hidden_states = hidden_states * self.embedding_multiplier
             residual = None
         else:
-            assert intermediate_tensors is not None
+            if intermediate_tensors is None:
+                raise RuntimeError('Intermediate tensors may not be None!')
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
 
-        residual = None
         num_attn = 0
         for i in range(len(self.layers)):
             layer = self.layers[i]
@@ -463,18 +464,19 @@ class GraniteMoeHybridForCausalLM(nn.Module, HasInnerState, SupportsLoRA,
     embedding_padding_modules = ["lm_head"]
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
         config = vllm_config.model_config.hf_config
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
         cache_config = vllm_config.cache_config
         lora_config = vllm_config.lora_config
         scheduler_config = vllm_config.scheduler_config
-        assert not cache_config.enable_prefix_caching, \
-            "GraniteMoeHybrid currently does not support prefix caching"
+        if cache_config.enable_prefix_caching:
+            raise RuntimeError(
+                "GraniteMoeHybrid currently does not support prefix caching")
 
         self.quant_config = vllm_config.quant_config
-
-        super().__init__()
         self.config = config
         self.scheduler_config = scheduler_config
         self.model = GraniteMoeHybridModel(vllm_config=vllm_config,