@@ -66,22 +66,6 @@ def model_name(self) -> str:
             mname = mname[4:]
         return mname
 
-    @property
-    def model_hash(self) -> str:
-        # NOTE: model_config.to_diff_dict() has "_name_or_path" attribute which is the model card name or path.
-        # Using same card name will result in same hash. But, using a relative path for one run and
-        # absolute path for another run will result in different hash.
-        # The added complexity to resolve different paths to same location is not worth pursuing.
-        # Instead, advise the user to always provide same relative paths or absolute paths for local models.
-
-        # Compute the hash with: model_config, transforms
-        mhash = hashlib.sha256()
-        mhash.update(to_hashable(self.model.config.to_diff_dict()))
-        mhash.update(to_hashable(self._transform_names()))
-        mhash.update(to_hashable({"is_tlm": self.is_tlm}))
-        mhash = mhash.hexdigest()[:16]
-        return mhash
-
 
 class QEFFAutoModelForCausalLM(QEFFTransformersBase):
     """
@@ -349,8 +333,9 @@ def generate(
         self,
         tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer],
         prompts: List[str],
-        device_id: List[int] = None,
-        runtime: str = "AI_100",
+        device_id: List[int] = [0],
+        runtime_ai100: bool = True,
+        seq_len: int = constants.Constants.CTX_LEN,
         **kwargs,
     ):
         """
@@ -362,21 +347,24 @@ def generate(
             :prompts (List[str]): List of prompts to run the execution.
             :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model
         ``optional`` Args:
-            :runtime (str, optional): Only ``AI_100`` runtime is supported as of now; ``ONNXRT`` and ``PyTorch`` coming soon. Defaults to "AI_100".
+            :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime.
+
         """
-        if runtime != "AI_100":
-            raise ValueError("Only AI_100 runtime is supported right now via generate API")
-        if not isinstance(self.qpc_path, Path):
-            raise TypeError("Please run compile API first!")
-        generation_len = kwargs.pop("generation_len", None)
-        return QEfficient.cloud_ai_100_exec_kv(
-            tokenizer,
-            self.qpc_path,
-            prompt=prompts,
-            device_id=device_id,
-            generation_len=generation_len,
-            is_tlm=self.is_tlm,
-        )
+        if runtime_ai100:
+            if not isinstance(self.qpc_path, Path):
+                raise TypeError("Please run compile API first!")
+            generation_len = kwargs.pop("generation_len", None)
+            return QEfficient.cloud_ai_100_exec_kv(
+                tokenizer,
+                self.qpc_path,
+                prompt=prompts,
+                device_id=device_id,
+                generation_len=generation_len,
+                is_tlm=self.is_tlm,
+            )
+        else:
+            inputs = tokenizer(prompts, return_tensors="pt", padding="max_length", max_length=seq_len)
+            return self.model(**inputs)
 
 
 class QEffAutoModel(QEFFTransformersBase):
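
Note on the new `generate` dispatch: the string `runtime` flag becomes a `runtime_ai100` boolean, and the PyTorch fallback simply pads prompts to `seq_len` and calls the underlying model. A minimal usage sketch of both paths, assuming the `from_pretrained`/`compile` flow defined in this file; the `gpt2` card name and core count are illustrative:

```python
from transformers import AutoTokenizer

from QEfficient import QEFFAutoModelForCausalLM

model_card = "gpt2"  # illustrative checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_card)
model = QEFFAutoModelForCausalLM.from_pretrained(model_card)

# AI_100 path (default): compile first so self.qpc_path is set.
model.compile(num_cores=16)
model.generate(tokenizer, prompts=["Hello!"], device_id=[0])

# PyTorch path: no compile step; prompts are padded to seq_len and the raw
# transformers outputs (logits, past_key_values, ...) come back as-is.
outputs = model.generate(tokenizer, prompts=["Hello!"], runtime_ai100=False)
```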
@@ -405,7 +393,7 @@ def __init__(self, model: nn.Module, **kwargs):
         super().__init__(model)
         self.model.config.use_cache = True
         self.num_layers = model.config.num_hidden_layers
-
+
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
         """
@@ -429,11 +417,26 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
         # You can now execute the model
         model.generate(prompts=["Hi there!!"])
         """
-
+
         self = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
 
         return self
 
+    @property
+    def model_hash(self) -> str:
+        # NOTE: model_config.to_diff_dict() has "_name_or_path" attribute which is the model card name or path.
+        # Using same card name will result in same hash. But, using a relative path for one run and
+        # absolute path for another run will result in different hash.
+        # The added complexity to resolve different paths to same location is not worth pursuing.
+        # Instead, advise the user to always provide same relative paths or absolute paths for local models.
+
+        # Compute the hash with: model_config, transforms
+        mhash = hashlib.sha256()
+        mhash.update(to_hashable(self.model.config.to_diff_dict()))
+        mhash.update(to_hashable(self._transform_names()))
+        mhash = mhash.hexdigest()[:16]
+        return mhash
+
     def export(self, export_dir: Optional[str] = None) -> str:
         """
         Exports the model to ``ONNX`` format using ``torch.onnx.export``.
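
`model_hash` is not deleted; it moves from the shared path into `QEffAutoModel` (minus the causal-LM-only `is_tlm` field): SHA-256 over the config diff-dict plus the applied transform names, truncated to 16 hex chars for use as a cache key. A standalone sketch of the pattern; the `to_hashable` helper below is a hypothetical JSON-based stand-in for QEfficient's own:

```python
import hashlib
import json


def to_hashable(obj) -> bytes:
    # Hypothetical stand-in: serialize deterministically so equal inputs
    # always produce equal bytes (and therefore equal hashes).
    return json.dumps(obj, sort_keys=True, default=str).encode()


def compute_model_hash(config_dict: dict, transform_names: list) -> str:
    mhash = hashlib.sha256()
    mhash.update(to_hashable(config_dict))      # includes "_name_or_path"
    mhash.update(to_hashable(transform_names))  # applied model transforms
    return mhash.hexdigest()[:16]


# Same config + same transforms -> same hash, so prior exports can be reused.
print(compute_model_hash({"_name_or_path": "gpt2", "num_hidden_layers": 12},
                         ["CustomOpsTransform"]))
```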
@@ -470,7 +473,9 @@ def compile(
         *,
         seq_len: int = 32,
         batch_size: int = 1,
+        num_devices: int = 1,
         num_cores: int = 16,  # FIXME: Make this mandatory arg
+        mxfp6_matmul: bool = False,
         **compiler_options,
     ) -> str:
         """
@@ -498,18 +503,20 @@ def compile(
             compile_only=True,
             specializations=specializations,
             convert_to_fp16=True,
+            mxfp6_matmul=mxfp6_matmul,
+            mdp_ts_num_devices=num_devices,
             aic_num_cores=num_cores,
             **compiler_options,
         )
 
     def generate(
         self,
         tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer],
-        prompt: List[str],
+        prompts: List[str],
         device_id: List[int] = [0],
         runtime_ai100: bool = True,
         seq_len: int = constants.Constants.CTX_LEN,
-    ) -> str:
+    ) -> dict:
         """
         This method generates output by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards.
         This is a sequential execution based on the ``batch_size`` of the compiled model and the number of prompts passed.
@@ -519,10 +526,10 @@ def generate(
             :prompts (List[str]): List of prompts to run the execution.
             :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model
         ``optional`` Args:
-            :runtime_ai100 (bool), optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime.
+            :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime.
 
         Returns:
-            :str: Output from the ``AI_100`` or ``PyTorch`` runtime.
+            :dict: Output from the ``AI_100`` or ``PyTorch`` runtime.
         """
 
         # AI_100 runtime
@@ -531,10 +538,9 @@ def generate(
                 raise TypeError("Please run compile API first!")
 
             return QEfficient.cloud_ai_100_exec_embed(
-                tokenizer=tokenizer, prompt=prompt, qpc_path=self.qpc_path, device_id=device_id
+                tokenizer=tokenizer, prompt=prompts, qpc_path=self.qpc_path, device_id=device_id
             )
         # PyTorch runtime
         else:
-            inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len)
+            inputs = tokenizer(prompts, return_tensors="pt", padding="max_length", max_length=seq_len)
             return self.model(**inputs)
-
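
With the `prompt` → `prompts` rename both runtimes now share one argument name, and the PyTorch path returns the model's dict-like `ModelOutput` rather than a string, hence the `-> dict` annotation. A sketch of consuming that output, assuming an embedding model that exposes its hidden states under the usual `last_hidden_state` key:

```python
outputs = model.generate(tokenizer, prompts=["Hi there!!"], runtime_ai100=False)

# transformers models return a dict-like ModelOutput; for an embedding model
# the per-token hidden states typically live under last_hidden_state.
embeddings = outputs.last_hidden_state  # (batch_size, seq_len, hidden_size)
print(embeddings.shape)
```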