@@ -333,8 +333,9 @@ def generate(
         self,
         tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer],
         prompts: List[str],
-        device_id: List[int] = None,
-        runtime: str = "AI_100",
+        device_id: List[int] = [0],
+        runtime_ai100: bool = True,
+        seq_len: int = constants.Constants.CTX_LEN,
         **kwargs,
     ):
         """
@@ -346,21 +347,25 @@ def generate(
         :prompts (List[str]): List of prompts to run the execution.
         :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model
         ``optional`` Args:
-            :runtime (str, optional): Only ``AI_100`` runtime is supported as of now; ``ONNXRT`` and ``PyTorch`` coming soon. Defaults to "AI_100".
+            :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime.
+
         """
-        if runtime != "AI_100":
-            raise ValueError("Only AI_100 runtime is supported right now via generate API")
-        if not isinstance(self.qpc_path, Path):
-            raise TypeError("Please run compile API first!")
-        generation_len = kwargs.pop("generation_len", None)
-        return QEfficient.cloud_ai_100_exec_kv(
-            tokenizer,
-            self.qpc_path,
-            prompt=prompts,
-            device_id=device_id,
-            generation_len=generation_len,
-            is_tlm=self.is_tlm,
-        )
+        if runtime_ai100:
+            if not isinstance(self.qpc_path, Path):
+                raise TypeError("Please run compile API first!")
+            generation_len = kwargs.pop("generation_len", None)
+            return QEfficient.cloud_ai_100_exec_kv(
+                tokenizer,
+                self.qpc_path,
+                prompt=prompts,
+                device_id=device_id,
+                generation_len=generation_len,
+                is_tlm=self.is_tlm,
+            )
+        else:
+            inputs = tokenizer(prompts, return_tensors="pt", padding="max_length", max_length=seq_len)
+            return self.model(**inputs)
+
 
 
 class QEffAutoModel(QEFFTransformersBase):
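
For orientation, here is a minimal usage sketch of the reworked `generate` API after this hunk. The checkpoint name, prompt text, and the `QEFFAutoModelForCausalLM` import path are illustrative assumptions, not taken from the diff itself:

```python
# Minimal sketch of the new generate() contract (checkpoint/prompt are placeholders).
from transformers import AutoTokenizer
from QEfficient import QEFFAutoModelForCausalLM  # assumed import path

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = QEFFAutoModelForCausalLM.from_pretrained("gpt2")

# Default AI_100 path: compile() must have produced a QPC first, else TypeError.
model.compile(num_cores=16)
model.generate(tokenizer, prompts=["Hello"], device_id=[0])

# New PyTorch path: prompts are padded to seq_len and run through self.model directly.
model.generate(tokenizer, prompts=["Hello"], runtime_ai100=False)
```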
@@ -469,7 +474,9 @@ def compile(
         *,
         seq_len: int = 32,
         batch_size: int = 1,
+        num_devices: int = 1,
         num_cores: int = 16,  # FIXME: Make this mandatory arg
+        mxfp6_matmul: bool = False,
         **compiler_options,
     ) -> str:
         """
@@ -497,14 +504,16 @@ def compile(
             compile_only=True,
             specializations=specializations,
             convert_to_fp16=True,
+            mxfp6_matmul=mxfp6_matmul,
+            mdp_ts_num_devices=num_devices,
             aic_num_cores=num_cores,
             **compiler_options,
         )
 
     def generate(
         self,
         tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer],
-        prompt: List[str],
+        prompts: List[str],
         device_id: List[int] = [0],
         runtime_ai100: bool = True,
         seq_len: int = constants.Constants.CTX_LEN,
@@ -518,7 +527,7 @@ def generate(
         :prompts (List[str]): List of prompts to run the execution.
         :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model
         ``optional`` Args:
-            :runtime_ai100 (bool) , optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime.
+            :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime.
 
         Returns:
             :dict: Output from the ``AI_100`` or ``PyTorch`` runtime.
@@ -530,9 +539,9 @@ def generate(
                 raise TypeError("Please run compile API first!")
 
             return QEfficient.cloud_ai_100_exec_embed(
-                tokenizer=tokenizer, prompt=prompt, qpc_path=self.qpc_path, device_id=device_id
+                tokenizer=tokenizer, prompt=prompts, qpc_path=self.qpc_path, device_id=device_id
             )
         # PyTorch runtime
         else:
-            inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len)
+            inputs = tokenizer(prompts, return_tensors="pt", padding="max_length", max_length=seq_len)
             return self.model(**inputs)
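
A hedged sketch of how the extended `compile` options above could be exercised together with the renamed `prompts` argument. The checkpoint, device ids, and option values are placeholders, and the `QEffAutoModel` usage is assumed from the class shown in this diff:

```python
# Illustrative only: checkpoint and values are placeholders, not from this diff.
from transformers import AutoTokenizer
from QEfficient import QEffAutoModel  # embedding-style model shown in this diff

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = QEffAutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# num_devices is forwarded as mdp_ts_num_devices (tensor slicing across devices);
# mxfp6_matmul enables MXFP6 precision for MatMul weights at compile time.
model.compile(seq_len=32, batch_size=1, num_devices=4, num_cores=16, mxfp6_matmul=True)
model.generate(tokenizer, prompts=["sample text"], device_id=[0, 1, 2, 3])
```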