Commit 2fb41ad

Comments Addressed-1
Signed-off-by: amitraj <[email protected]>
1 parent 74ffc16 commit 2fb41ad

File tree

4 files changed, +105 −55 lines changed

QEfficient/generation/text_generation_inference.py
QEfficient/transformers/models/modeling_auto.py
QEfficient/utils/constants.py
tests/transformers/models/test_causal_lm_models.py

QEfficient/generation/text_generation_inference.py

Lines changed: 21 additions & 4 deletions
@@ -174,8 +174,7 @@ def get_compilation_dims(qpc_path: str) -> Tuple[int, int, Optional[int]]:
         raise FileNotFoundError(f"expected specializations.json file at path, {qpc_base_path}")
 
     compilation_batch_size = int(data["specializations"][0]["batch_size"])
-    if compilation_ctx_len := data["specializations"][0].get("ctx_len", None):
-        compilation_ctx_len = int(data["specializations"][0]["ctx_len"])
+    compilation_ctx_len = int(data["specializations"][0]["ctx_len"])
     if compilation_fbs := data["specializations"][0].get("full_batch_size", None):
         compilation_fbs = int(compilation_fbs)
     return compilation_batch_size, compilation_ctx_len, compilation_fbs
@@ -352,8 +351,24 @@ def cloud_ai_100_exec_embed(
     tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer],
     qpc_path: str,
     prompt: List[str],
-    device_id: List[int] = [0],
-):
+    device_id: List[int] = [0],
+) -> dict:
+    """
+    This method generates output by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards.
+    This is a sequential execution based on the ``batch_size`` of the compiled model and the number of prompts passed.
+    If the number of prompts cannot be divided by the ``batch_size``, the last unfulfilled batch will be dropped.
+
+    ``Mandatory`` Args:
+        :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Model tokenizer.
+        :qpc_path (str): Path to the saved generated binary file after compilation.
+        :prompt (str): Sample prompt for the model text generation.
+    ``Optional`` Args:
+        :device_id (List[int]): Device IDs to be used for execution. If ``len(device_id) > 1``, it enables multiple card setup. If ``None``, auto-device-picker will be used. ``Defaults to None``.
+
+    Returns:
+        :dict: Output from the ``AI_100`` runtime.
+    """
+
     session = QAICInferenceSession(qpc_path, device_ids=device_id)
     batch_size = session.bindings[0].dims[0]
     seq_len = session.bindings[0].dims[1]
@@ -368,8 +383,10 @@ def cloud_ai_100_exec_embed(
     }
     session.set_buffers(output)
     outputs = session.run(inputs)
+    session.deactivate()
     return outputs
 
+
 class QEffTextGenerationBase:
     def __init__(
         self,

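For context on how the updated cloud_ai_100_exec_embed signature is meant to be called, here is a minimal usage sketch based only on the docstring and test changes in this commit. The model card and QPC path are placeholders, the function is assumed to be importable from the module shown above, and the "output" dict key is taken from the updated test below.

from transformers import AutoTokenizer

from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_embed

# Placeholder model card and QPC path; substitute a real compiled embedding QPC.
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5")
qpc_path = "/path/to/embedding_qpcs"

outputs = cloud_ai_100_exec_embed(
    tokenizer=tokenizer,
    qpc_path=qpc_path,
    prompt=["My name is"],
    device_id=[0],
)
# The function now returns a dict from the AI_100 runtime; the embeddings live
# under the "output" key, as used in the updated test.
print(outputs["output"].shape)

Note that session.deactivate() is now called before returning, which should release the device between invocations.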
QEfficient/transformers/models/modeling_auto.py

Lines changed: 47 additions & 41 deletions
@@ -66,22 +66,6 @@ def model_name(self) -> str:
             mname = mname[4:]
         return mname
 
-    @property
-    def model_hash(self) -> str:
-        # NOTE: model_config.to_diff_dict() has "_name_or_path" attribute which is the model card name or path.
-        # Using same card name will result in same hash. But, using a relative path for one run and
-        # absolute path for another run will result in different hash.
-        # The added complexity to resolve different paths to same location is not worth pursuing.
-        # Instead, advise the user to always provide same relative paths or absolute paths for local models.
-
-        # Compute the hash with: model_config, transforms
-        mhash = hashlib.sha256()
-        mhash.update(to_hashable(self.model.config.to_diff_dict()))
-        mhash.update(to_hashable(self._transform_names()))
-        mhash.update(to_hashable({"is_tlm": self.is_tlm}))
-        mhash = mhash.hexdigest()[:16]
-        return mhash
-
 
 class QEFFAutoModelForCausalLM(QEFFTransformersBase):
     """
@@ -349,8 +333,9 @@ def generate(
         self,
         tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer],
         prompts: List[str],
-        device_id: List[int] = None,
-        runtime: str = "AI_100",
+        device_id: List[int] = [0],
+        runtime_ai100: bool = True,
+        seq_len: int = constants.Constants.CTX_LEN,
         **kwargs,
     ):
         """
@@ -362,21 +347,24 @@ def generate(
             :prompts (List[str]): List of prompts to run the execution.
             :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model
         ``optional`` Args:
-            :runtime (str, optional): Only ``AI_100`` runtime is supported as of now; ``ONNXRT`` and ``PyTorch`` coming soon. Defaults to "AI_100".
+            :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime.
+
         """
-        if runtime != "AI_100":
-            raise ValueError("Only AI_100 runtime is supported right now via generate API")
-        if not isinstance(self.qpc_path, Path):
-            raise TypeError("Please run compile API first!")
-        generation_len = kwargs.pop("generation_len", None)
-        return QEfficient.cloud_ai_100_exec_kv(
-            tokenizer,
-            self.qpc_path,
-            prompt=prompts,
-            device_id=device_id,
-            generation_len=generation_len,
-            is_tlm=self.is_tlm,
-        )
+        if runtime_ai100:
+            if not isinstance(self.qpc_path, Path):
+                raise TypeError("Please run compile API first!")
+            generation_len = kwargs.pop("generation_len", None)
+            return QEfficient.cloud_ai_100_exec_kv(
+                tokenizer,
+                self.qpc_path,
+                prompt=prompts,
+                device_id=device_id,
+                generation_len=generation_len,
+                is_tlm=self.is_tlm,
+            )
+        else:
+            inputs = tokenizer(prompts, return_tensors="pt", padding="max_length", max_length=seq_len)
+            return self.model(**inputs)
 
 
 class QEffAutoModel(QEFFTransformersBase):
@@ -405,7 +393,7 @@ def __init__(self, model: nn.Module, **kwargs):
         super().__init__(model)
         self.model.config.use_cache = True
         self.num_layers = model.config.num_hidden_layers
-
+
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
         """
@@ -429,11 +417,26 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
         # You can now execute the model
         model.generate(prompts=["Hi there!!"])
         """
-
+
         self = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
 
         return self
 
+    @property
+    def model_hash(self) -> str:
+        # NOTE: model_config.to_diff_dict() has "_name_or_path" attribute which is the model card name or path.
+        # Using same card name will result in same hash. But, using a relative path for one run and
+        # absolute path for another run will result in different hash.
+        # The added complexity to resolve different paths to same location is not worth pursuing.
+        # Instead, advise the user to always provide same relative paths or absolute paths for local models.
+
+        # Compute the hash with: model_config, transforms
+        mhash = hashlib.sha256()
+        mhash.update(to_hashable(self.model.config.to_diff_dict()))
+        mhash.update(to_hashable(self._transform_names()))
+        mhash = mhash.hexdigest()[:16]
+        return mhash
+
     def export(self, export_dir: Optional[str] = None) -> str:
         """
         Exports the model to ``ONNX`` format using ``torch.onnx.export``.
@@ -470,7 +473,9 @@ def compile(
         *,
         seq_len: int = 32,
         batch_size: int = 1,
+        num_devices: int = 1,
         num_cores: int = 16,  # FIXME: Make this mandatory arg
+        mxfp6_matmul: bool = False,
         **compiler_options,
     ) -> str:
         """
@@ -498,18 +503,20 @@
             compile_only=True,
             specializations=specializations,
             convert_to_fp16=True,
+            mxfp6_matmul=mxfp6_matmul,
+            mdp_ts_num_devices=num_devices,
             aic_num_cores=num_cores,
             **compiler_options,
         )
 
     def generate(
         self,
         tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer],
-        prompt: List[str],
+        prompts: List[str],
         device_id: List[int] = [0],
         runtime_ai100: bool = True,
         seq_len: int = constants.Constants.CTX_LEN,
-    ) -> str:
+    ) -> dict:
         """
         This method generates output by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards.
         This is a sequential execution based on the ``batch_size`` of the compiled model and the number of prompts passed.
@@ -519,10 +526,10 @@ def generate(
             :prompts (List[str]): List of prompts to run the execution.
             :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model
         ``optional`` Args:
-            :runtime_ai100 (bool), optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime.
+            :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime.
 
         Returns:
-            :str: Output from the ``AI_100`` or ``PyTorch`` runtime.
+            :dict: Output from the ``AI_100`` or ``PyTorch`` runtime.
         """
 
         # AI_100 runtime
@@ -531,10 +538,9 @@
                 raise TypeError("Please run compile API first!")
 
             return QEfficient.cloud_ai_100_exec_embed(
-                tokenizer=tokenizer, prompt=prompt, qpc_path=self.qpc_path, device_id=device_id
+                tokenizer=tokenizer, prompt=prompts, qpc_path=self.qpc_path, device_id=device_id
             )
         # PyTorch runtime
         else:
-            inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len)
+            inputs = tokenizer(prompts, return_tensors="pt", padding="max_length", max_length=seq_len)
             return self.model(**inputs)
-

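Putting the QEffAutoModel changes together, the intended flow appears to be export, compile, then generate on either runtime. The sketch below is inferred only from the signatures in this diff; the model card, core count, and device ids are illustrative, and compile is assumed to pick up the ONNX produced by export().

from transformers import AutoTokenizer

from QEfficient.transformers.models.modeling_auto import QEffAutoModel

model_card = "BAAI/bge-small-en-v1.5"  # illustrative embedding model card
tokenizer = AutoTokenizer.from_pretrained(model_card)
qeff_model = QEffAutoModel.from_pretrained(model_card)

qeff_model.export()
qeff_model.compile(
    seq_len=32,
    batch_size=1,
    num_cores=16,
    num_devices=1,       # new arg, forwarded as mdp_ts_num_devices
    mxfp6_matmul=False,  # new arg, forwarded to the compiler
)

# Default AI_100 runtime: runs the compiled QPC and returns a dict of outputs.
ai100_out = qeff_model.generate(tokenizer=tokenizer, prompts=["Hi there!!"], device_id=[0])

# PyTorch runtime: pads prompts to seq_len and calls the torch model directly.
pt_out = qeff_model.generate(tokenizer=tokenizer, prompts=["Hi there!!"], runtime_ai100=False)

QEFFAutoModelForCausalLM.generate gains the same runtime_ai100 toggle in this commit, falling back to a plain forward pass of the torch model when it is False.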
QEfficient/utils/constants.py

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ def get_models_dir():
 ONNX_EXPORT_EXAMPLE_SEQ_LEN = 32
 ONNX_EXPORT_EXAMPLE_FBS = 4
 ONNX_EXPORT_EXAMPLE_NLK = 2  # Number of Logits to Keep
-ONNX_EXPORT_OPSET = 13
+ONNX_EXPORT_OPSET = 14
 
 COMPILER = ["/opt/qti-aic/exec/qaic-exec", "-aic-hw", "-aic-hw-version=2.0"]

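The only change here is bumping the export opset from 13 to 14. As a hedged illustration (the export wiring below is assumed, not taken from this repository), opset 14 is the first ONNX version with the Trilu operator, which torch.tril/torch.triu lower to during export:

import torch
import torch.nn as nn

ONNX_EXPORT_OPSET = 14  # value after this commit


class TinyMaskedModel(nn.Module):
    def forward(self, x):
        # torch.tril exports to ONNX Trilu, available from opset 14 onwards.
        mask = torch.tril(torch.ones(x.shape[-1], x.shape[-1]))
        return x * mask


torch.onnx.export(
    TinyMaskedModel(),
    (torch.randn(1, 4, 4),),
    "tiny_masked.onnx",
    opset_version=ONNX_EXPORT_OPSET,
)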
tests/transformers/models/test_causal_lm_models.py

Lines changed: 36 additions & 9 deletions
@@ -15,7 +15,7 @@
 from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
 from QEfficient.transformers.models.modeling_auto import QEffAutoModel, QEFFAutoModelForCausalLM
 from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers
-from QEfficient.utils import hf_download
+from QEfficient.utils import hf_download, padding_check_and_fix
 from QEfficient.utils._utils import load_hf_tokenizer
 from QEfficient.utils.constants import Constants
 from QEfficient.utils.device_utils import get_available_device_id
@@ -192,13 +192,26 @@ def check_embed_pytorch_vs_ort_vs_ai100(
 
     # Try to initialize with add_pooling_layer parameter
     try:
-        qeff_model = QEffAutoModel.from_pretrained(pretrained_model_name_or_path=model_path, add_pooling_layer=False)
+        qeff_model = QEffAutoModel.from_pretrained(
+            pretrained_model_name_or_path=model_path,
+            add_pooling_layer=False,
+            num_hidden_layers=n_layer,
+            attn_implementation="eager",
+            trust_remote_code=True,
+        )
     except TypeError:
         # If it fails, initialize without the parameter
-        qeff_model = QEffAutoModel.from_pretrained(pretrained_model_name_or_path=model_path)
-    text = "My name is"
+        qeff_model = QEffAutoModel.from_pretrained(
+            pretrained_model_name_or_path=model_path,
+            num_hidden_layers=n_layer,
+            attn_implementation="eager",
+            trust_remote_code=True,
+        )
+
+    prompt = "My name is"
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-    inputs = tokenizer(text, return_tensors="pt", padding="max_length", max_length=seq_len)
+    padding_check_and_fix(tokenizer)
+    inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len)
 
     pt_outputs = qeff_model.generate(tokenizer=tokenizer, prompt="My name is", runtime_ai100=False)

@@ -214,7 +227,7 @@ def check_embed_pytorch_vs_ort_vs_ai100(
     onnx_embeddings = onnx_outputs[0]
     mad = np.mean(np.abs(pt_embeddings - onnx_embeddings))
     print("Mad for onnx and pytorch is ", mad)
-    assert mad <= 10**-5, f"MAD is too high for onnx and Pytorch: {mad}"
+    assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}"
 
     qeff_model.compile(
         num_cores=14,
@@ -224,7 +237,7 @@ def check_embed_pytorch_vs_ort_vs_ai100(
     # Compare ONNX and AI 100 outputs
     mad = np.mean(np.abs(ai100_output["output"] - onnx_outputs[0]))
     print("Mad for onnx and AI 100 output is ", mad)
-    assert mad <= 10**-2, f"MAD is too high for onnx and Pytorch: {mad}"
+    assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}"
 
 
 # FIXME: there should be a CB test here
@@ -302,7 +315,21 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1():
     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len)
 
 
+embed_test_models = [
+    # model_name, architecture
+    "nomic-ai/nomic-embed-text-v1.5",  # NomicBertModel
+    "sentence-transformers/multi-qa-mpnet-base-cos-v1",  # MPNetForMaskedLM
+    "BAAI/bge-reranker-v2-m3",  # XLMRobertaForSequenceClassification
+    "BAAI/bge-small-en-v1.5",  # BertModel
+    # "intfloat/e5-mistral-7b-instruct",  # MistralModel
+    # "dunzhang/stella_en_1.5B_v5",  # Qwen2ForCausalLM
+]
+
+
 @pytest.mark.on_qaic
-def test_embed_model_pytorch_vs_onnx_vs_ai100():
-    model_name = "BAAI/bge-small-en-v1.5"
+@pytest.mark.parametrize("model_name", embed_test_models)
+def test_embed_model_pytorch_vs_onnx_vs_ai100(model_name):
+    """
+    Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output.
+    """
     check_embed_pytorch_vs_ort_vs_ai100(model_name=model_name, seq_len=32, n_layer=1)

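The embedding test is now parametrized over several model cards and its tolerances are aligned at 1e-3 for both comparisons. A small, self-contained sketch of the same MAD (mean absolute difference) check used above; the helper name and dummy arrays are ours, not the repository's:

import numpy as np


def mean_absolute_difference(reference: np.ndarray, candidate: np.ndarray) -> float:
    # Same metric the test uses to compare PyTorch, ONNX and AI 100 embeddings.
    return float(np.mean(np.abs(reference - candidate)))


reference = np.ones((1, 32, 8), dtype=np.float32)
candidate = reference + 5e-4  # within the relaxed 1e-3 tolerance
assert mean_absolute_difference(reference, candidate) <= 1e-3

A single card from embed_test_models can be exercised with the usual pytest selectors, e.g. pytest tests/transformers/models/test_causal_lm_models.py -m on_qaic -k "bge-small".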