fix-3

quic-amitraj · quic-amitraj · commit a5669525baac · 2024-12-07T17:15:18.000+05:30
Signed-off-by: amitraj <quic_amitraj@quicinc.com>
diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
@@ -8,7 +8,7 @@
 from QEfficient.base import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader
 from QEfficient.compile.compile_helper import compile
 from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
-from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_embedd, cloud_ai_100_exec_kv
+from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_embed, cloud_ai_100_exec_kv
 from QEfficient.peft import QEffAutoPeftModelForCausalLM
 from QEfficient.transformers.transform import transform
 
@@ -21,7 +21,7 @@
     "export",
     "compile",
     "cloud_ai_100_exec_kv",
-    "cloud_ai_100_exec_embedd",
+    "cloud_ai_100_exec_embed",
     "QEffAutoModel",
     "QEFFAutoModelForCausalLM",
     "QEffAutoPeftModelForCausalLM",
diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py
@@ -43,12 +43,7 @@ class QEFFBaseModel(ABC):
 
     @classmethod
     def _transform_names(cls) -> List[str]:
-        transform_names = []
-        if hasattr(cls, "_pytorch_transforms") and cls._pytorch_transforms:
-            transform_names.extend(x.__name__ for x in cls._pytorch_transforms)
-        if hasattr(cls, "_onnx_transforms") and cls._onnx_transforms:
-            transform_names.extend(x.__name__ for x in cls._onnx_transforms)
-        return transform_names
+        return [x.__name__ for x in cls._pytorch_transforms + cls._onnx_transforms]
 
     def __init__(self, model: torch.nn.Module) -> None:
         super().__init__()
@@ -59,11 +54,9 @@ def __init__(self, model: torch.nn.Module) -> None:
 
         # Apply the transformations
         any_transformed = False
-
-        if hasattr(self, "_pytorch_transforms") and self._pytorch_transforms:
-            for transform in self._pytorch_transforms:
-                self.model, transformed = transform.apply(self.model)
-                any_transformed = any_transformed or transformed
+        for transform in self._pytorch_transforms:
+            self.model, transformed = transform.apply(self.model)
+            any_transformed = any_transformed or transformed
 
         if not any_transformed:
             warnings.warn(f"No transforms applied to model: {self.model_name}. It may be an unsupported model!")
@@ -137,7 +130,6 @@ def _export(
             :onnx_transform_kwargs (dict): Additional arguments to be passed to `Transform.apply` for this class.
             :export_dir (str): Specify the export directory. The export_dir will be suffixed with a hash corresponding to current model.
         """
-
         export_dir = Path(export_dir or (QEFF_HOME / self.model_name))
         export_dir = export_dir.with_name(export_dir.name + "-" + self.model_hash)
         onnx_path = export_dir / f"{self.model_name}.onnx"
@@ -224,7 +216,6 @@ def _compile(
                 - aic_num_cores=16 -> -aic-num-cores=16
                 - convert_to_fp16=True -> -convert-to-fp16
         """
-
         if onnx_path is None and self.onnx_path is None:
             self.export()
 
diff --git a/QEfficient/exporter/export_utils.py b/QEfficient/exporter/export_utils.py
@@ -217,7 +217,7 @@ def fix_onnx_fp16(
     Return:
         :str: Updated base name of exported ONNX model.
     """
-    model = onnx.load("/local/mnt/workspace/amitraj/amit_efficient/efficient-transformers/model_base_name.onnx")
+    model = onnx.load(os.path.join(gen_models_path, f"{model_base_name}.onnx"))
     # TODO: Remove this `fix_onnx_fp16` function and replace with this transform
     # as we're not utilizing the validations done in this function
     model, fp16_fix = FP16ClipTransform.apply(model, onnx_base_dir=gen_models_path)
diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py
@@ -308,7 +308,7 @@ def cloud_ai_100_exec_kv(
     return exec_info
 
 
-def cloud_ai_100_exec_embedd(
+def cloud_ai_100_exec_embed(
     tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer],
     prompt: List[str],
     qpc_path: str,
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
@@ -379,17 +379,20 @@ def generate(
         tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer],
         prompt: List[str],
         device_id: List[int] = [0],
-        runtime: str = "AI_100",
-        **kwargs,
+        runtime_ai100: bool = True,
+        seq_len: int = constants.Constants.CTX_LEN,
     ):
-        if runtime != "AI_100":
-            raise ValueError("Only AI_100 runtime is supported right now via generate API")
-        if not isinstance(self.qpc_path, Path):
-            raise TypeError("Please run compile API first!")
+        if runtime_ai100:
+            if not isinstance(self.qpc_path, Path):
+                raise TypeError("Please run compile API first!")        
 
-        return QEfficient.cloud_ai_100_exec_embedd(
-            tokenizer=tokenizer, prompt=prompt, qpc_path=self.qpc_path, device_id=device_id
-        )
+            return QEfficient.cloud_ai_100_exec_embed(
+                tokenizer=tokenizer, prompt=prompt, qpc_path=self.qpc_path, device_id=device_id
+            )
+        else:
+            inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len)
+            return self.model(**inputs)
+            
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py
@@ -180,7 +180,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
     ), "Tokens don't match for  HF PyTorch model output and Cloud AI 100 output."
 
 
-def check_embedd_pytorch_vs_ort_vs_ai100(
+def check_embed_pytorch_vs_ort_vs_ai100(
     model_name: str,
     seq_len: int = Constants.CTX_LEN,
     n_layer: int = 1,
@@ -197,14 +197,12 @@ def check_embedd_pytorch_vs_ort_vs_ai100(
     except TypeError:
         # If it fails, initialize without the parameter
         model = AutoModel.from_pretrained(model_name)
-        qeff_model = QEffAutoModel.from_pretrained(pretrained_model_name_or_path=model_path, add_pooling_layer=False)
+        qeff_model = QEffAutoModel.from_pretrained(pretrained_model_name_or_path=model_path)
     text = "My name is"
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     inputs = tokenizer(text, return_tensors="pt", padding="max_length", max_length=seq_len)
 
-    # PyTorch output
-    with torch.no_grad():
-        pt_outputs = model(**inputs)
+    pt_outputs=qeff_model.generate(tokenizer=tokenizer, prompt="My name is", runtime_ai100=False)
 
     onnx_model = qeff_model.export()
     ort_session = ort.InferenceSession(str(onnx_model))
@@ -213,22 +211,19 @@ def check_embedd_pytorch_vs_ort_vs_ai100(
     # Run inference
     onnx_outputs = ort_session.run(None, onnx_inputs)
 
-    # Extract the embeddings from PyTorch and ONNX outputs
-    pt_embeddings = pt_outputs[0].numpy()
+    # Compare PyTorch and ONNX outputs
+    pt_embeddings = pt_outputs[0].detach().numpy()
     onnx_embeddings = onnx_outputs[0]
-
-    # Calculate Mean Absolute Deviation (MAD)
     mad = np.mean(np.abs(pt_embeddings - onnx_embeddings))
     print("Mad for onnx and pytorch is ", mad)
     assert mad <= 10**-5, f"MAD is too high for onnx and Pytorch: {mad}"
 
-    # Compare with cloud AI100
-
     qeff_model.compile(
         num_cores=14,
     )
     ai100_output = qeff_model.generate(tokenizer=tokenizer, prompt=["My name is"])
 
+    # Compare ONNX and AI 100 outputs
     mad = np.mean(np.abs(ai100_output["output"] - onnx_outputs[0]))
     print("Mad for onnx and AI 100 output is ", mad)
     assert mad <= 10**-2, f"MAD is too high for onnx and Pytorch: {mad}"
@@ -290,7 +285,7 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1():
 
     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len)
 
-
-def test_embedd_model_pytorch_vs_onnx_vs_ai100():
+@pytest.mark.on_qaic
+def test_embed_model_pytorch_vs_onnx_vs_ai100():
     model_name = "BAAI/bge-small-en-v1.5"
-    check_embedd_pytorch_vs_ort_vs_ai100(model_name=model_name, seq_len=32, n_layer=1)
+    check_embed_pytorch_vs_ort_vs_ai100(model_name=model_name, seq_len=32, n_layer=1)