From e757ab88f1484b83abb85017e0dfff2205221f4c Mon Sep 17 00:00:00 2001 From: Swati Allabadi Date: Mon, 28 Apr 2025 10:43:21 +0000 Subject: [PATCH 1/2] Adding steps about how to fine tune on any custom dataset. Signed-off-by: Swati Allabadi --- QEfficient/finetune/dataset/custom_dataset.py | 4 +-- docs/source/finetune.md | 34 +++++++++++++++++++ 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/QEfficient/finetune/dataset/custom_dataset.py b/QEfficient/finetune/dataset/custom_dataset.py index 4bee06c58..f2811b6b3 100644 --- a/QEfficient/finetune/dataset/custom_dataset.py +++ b/QEfficient/finetune/dataset/custom_dataset.py @@ -23,7 +23,7 @@ def load_module_from_py_file(py_file: str) -> object: return module -def get_custom_dataset(dataset_config, tokenizer, split: str): +def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=None): if ":" in dataset_config.file: module_path, func_name = dataset_config.file.split(":") else: @@ -38,7 +38,7 @@ def get_custom_dataset(dataset_config, tokenizer, split: str): module = load_module_from_py_file(module_path.as_posix()) try: - return getattr(module, func_name)(dataset_config, tokenizer, split) + return getattr(module, func_name)(dataset_config, tokenizer, split, context_length) except AttributeError as e: print( f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()})." diff --git a/docs/source/finetune.md b/docs/source/finetune.md index 40df4401c..48db94b79 100644 --- a/docs/source/finetune.md +++ b/docs/source/finetune.md @@ -63,4 +63,38 @@ to visualise the data, ```python tensorboard --logdir runs/ --bind_all +``` + +## Fine-Tuning on custom dataset + +To run fine tuning for any user specific dataset, prepare the dataset using the following steps: + + 1) Create a directory named 'dataset' inside efficient-transformers. + 2) Inside this directory, create a file named 'custom_dataset.py'. 
This is different from the custom_dataset.py present at efficient-transformers/QEfficient/finetune/dataset. + 3) Inside the newly created efficient-transformers/dataset/custom_dataset.py, define a function named 'get_custom_dataset'. + 4) get_custom_dataset() should have the following 4 parameters: dataset_config, tokenizer, split, context_length. This function gets called twice through QEfficient/cloud/finetune.py with the name get_preprocessed_dataset. + 5) Inside get_custom_dataset(), the dataset needs to be prepared for fine tuning. So, the user needs to apply the prompt and tokenize the dataset accordingly. Please refer to the template below on how to define get_custom_dataset(). + 6) For examples, please refer to the Python files present in efficient-transformers/QEfficient/finetune/dataset. In the case of the Samsum dataset, get_preprocessed_samsum() of efficient-transformers/QEfficient/finetune/dataset/samsum_dataset.py is called. + 7) In efficient-transformers/QEfficient/finetune/configs/dataset_config.py, for the custom_dataset class, pass the appropriate value for train_split and test_split according to the dataset keys corresponding to train and test data points. + 8) While running fine tuning, pass the argument "--dataset custom_dataset" to fine-tune on the custom dataset. 
+ +Template for get_custom_dataset() to be defined inside efficient-transformers/dataset/custom_dataset.py is as follows: + +```python +def get_custom_dataset(dataset_config, tokenizer, split, context_length=None): + + # load dataset + # based on split, retrieve only the specific portion of the dataset (train or eval) either here or at the last + + def apply_prompt_template(): + + def tokenize(): + + # define prompt + # call apply_prompt_template() for each data point: + # data = data.map(apply_prompt_template ,) + # call tokenize() for each data point: + # data = data.map(tokenize, ) + + return dataset ``` \ No newline at end of file From bae75d22129596751dbaf1d9b4994a96cf9dc9ba Mon Sep 17 00:00:00 2001 From: Swati Allabadi Date: Fri, 9 May 2025 16:27:35 +0530 Subject: [PATCH 2/2] Update finetune.md Signed-off-by: Swati Allabadi --- docs/source/finetune.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/finetune.md b/docs/source/finetune.md index 48db94b79..7d80b572b 100644 --- a/docs/source/finetune.md +++ b/docs/source/finetune.md @@ -92,9 +92,9 @@ def get_custom_dataset(dataset_config, tokenizer, split, context_length=None): # define prompt # call apply_prompt_template() for each data point: - # data = data.map(apply_prompt_template ,) + # dataset = dataset.map(apply_prompt_template ,) # call tokenize() for each data point: - # data = data.map(tokenize, ) + # dataset = dataset.map(tokenize, ) return dataset -``` \ No newline at end of file +```