From e757ab88f1484b83abb85017e0dfff2205221f4c Mon Sep 17 00:00:00 2001 From: Swati Allabadi Date: Mon, 28 Apr 2025 10:43:21 +0000 Subject: [PATCH 1/2] Adding steps about how to fine tune on any custom dataset. Signed-off-by: Swati Allabadi --- QEfficient/finetune/dataset/custom_dataset.py | 4 +-- docs/source/finetune.md | 34 +++++++++++++++++++ 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/QEfficient/finetune/dataset/custom_dataset.py b/QEfficient/finetune/dataset/custom_dataset.py index 4bee06c58..f2811b6b3 100644 --- a/QEfficient/finetune/dataset/custom_dataset.py +++ b/QEfficient/finetune/dataset/custom_dataset.py @@ -23,7 +23,7 @@ def load_module_from_py_file(py_file: str) -> object: return module -def get_custom_dataset(dataset_config, tokenizer, split: str): +def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=None): if ":" in dataset_config.file: module_path, func_name = dataset_config.file.split(":") else: @@ -38,7 +38,7 @@ def get_custom_dataset(dataset_config, tokenizer, split: str): module = load_module_from_py_file(module_path.as_posix()) try: - return getattr(module, func_name)(dataset_config, tokenizer, split) + return getattr(module, func_name)(dataset_config, tokenizer, split, context_length) except AttributeError as e: print( f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()})." diff --git a/docs/source/finetune.md b/docs/source/finetune.md index 40df4401c..48db94b79 100644 --- a/docs/source/finetune.md +++ b/docs/source/finetune.md @@ -63,4 +63,38 @@ to visualise the data, ```python tensorboard --logdir runs/ --bind_all +``` + +## Fine-Tuning on custom dataset + +To run fine tuning for any user specific dataset, prepare the dataset using the following steps: + + 1) Create a directory named 'dataset' inside efficient-transformers. + 2) Inside this directory, create a file named 'custom_dataset.py'. 
This is different from the custom_dataset.py present at efficient-transformers/QEfficient/finetune/dataset. + 3) Inside the newly created efficient-transformers/dataset/custom_dataset.py, define a function named 'get_custom_dataset'. + 4) get_custom_dataset() should have the following 4 parameters: dataset_config, tokenizer, split, context_length. This function gets called twice through QEfficient/cloud/finetune.py with the name get_preprocessed_dataset. + 5) Inside get_custom_dataset(), the dataset needs to be prepared for fine tuning. So, the user needs to apply the prompt and tokenize the dataset accordingly. Please refer to the template below on how to define get_custom_dataset(). + 6) For examples, please refer to the Python files present in efficient-transformers/QEfficient/finetune/dataset. In the case of the Samsum dataset, get_preprocessed_samsum() of efficient-transformers/QEfficient/finetune/dataset/samsum_dataset.py is called. + 7) In efficient-transformers/QEfficient/finetune/configs/dataset_config.py, for the custom_dataset class, pass the appropriate value for train_split and test_split according to the dataset keys corresponding to train and test data points. + 8) While running fine tuning, pass the argument "--dataset custom_dataset" to fine-tune on the custom dataset. 
+ +Template for get_custom_dataset() to be defined inside efficient-transformers/dataset/custom_dataset.py is as follows: + +```python +def get_custom_dataset(dataset_config, tokenizer, split, context_length=None): + + # load dataset + # based on split, retrieve only the specific portion of the dataset (train or eval) either here or at the last + + def apply_prompt_template(): + + def tokenize(): + + # define prompt + # call apply_prompt_template() for each data point: + # data = data.map(apply_prompt_template ,) + # call tokenize() for each data point: + # data = data.map(tokenize, ) + + return dataset ``` \ No newline at end of file From bae75d22129596751dbaf1d9b4994a96cf9dc9ba Mon Sep 17 00:00:00 2001 From: Swati Allabadi Date: Fri, 9 May 2025 16:27:35 +0530 Subject: [PATCH 2/2] Update finetune.md Signed-off-by: Swati Allabadi --- docs/source/finetune.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/finetune.md b/docs/source/finetune.md index 48db94b79..7d80b572b 100644 --- a/docs/source/finetune.md +++ b/docs/source/finetune.md @@ -92,9 +92,9 @@ def get_custom_dataset(dataset_config, tokenizer, split, context_length=None): # define prompt # call apply_prompt_template() for each data point: - # data = data.map(apply_prompt_template ,) + # dataset = dataset.map(apply_prompt_template ,) # call tokenize() for each data point: - # data = data.map(tokenize, ) + # dataset = dataset.map(tokenize, ) return dataset -``` \ No newline at end of file +```