diff --git a/QEfficient/finetune/dataset/custom_dataset.py b/QEfficient/finetune/dataset/custom_dataset.py
index 4bee06c58..f2811b6b3 100644
--- a/QEfficient/finetune/dataset/custom_dataset.py
+++ b/QEfficient/finetune/dataset/custom_dataset.py
@@ -23,7 +23,7 @@ def load_module_from_py_file(py_file: str) -> object:
     return module
 
 
-def get_custom_dataset(dataset_config, tokenizer, split: str):
+def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=None):
     if ":" in dataset_config.file:
         module_path, func_name = dataset_config.file.split(":")
     else:
@@ -38,7 +38,7 @@ def get_custom_dataset(dataset_config, tokenizer, split: str):
 
     module = load_module_from_py_file(module_path.as_posix())
     try:
-        return getattr(module, func_name)(dataset_config, tokenizer, split)
+        return getattr(module, func_name)(dataset_config, tokenizer, split, context_length)
     except AttributeError as e:
         print(
             f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()})."
diff --git a/docs/source/finetune.md b/docs/source/finetune.md
index 40df4401c..48db94b79 100644
--- a/docs/source/finetune.md
+++ b/docs/source/finetune.md
@@ -63,4 +63,87 @@ to visualise the data,
 
 ```python
 tensorboard --logdir runs/ --bind_all
-```
\ No newline at end of file
+```
+
+## Fine-tuning on a custom dataset
+
+To run fine-tuning on any user-specific dataset, prepare the dataset using the following steps:
+
+ 1) Create a directory named 'dataset' inside efficient-transformers.
+ 2) Inside this directory, create a file named 'custom_dataset.py'. This is different from the custom_dataset.py present at efficient-transformers/QEfficient/finetune/dataset.
+ 3) Inside the newly created efficient-transformers/dataset/custom_dataset.py, define a function named 'get_custom_dataset'.
+ 4) get_custom_dataset() should have the following four parameters: dataset_config, tokenizer, split, context_length. This function is called twice through QEfficient/cloud/finetune.py under the name get_preprocessed_dataset, once for the train split and once for the test split.
+ 5) Inside get_custom_dataset(), the dataset needs to be prepared for fine-tuning, so apply the prompt and tokenize the dataset accordingly. Refer to the template below for how to define get_custom_dataset().
+ 6) For examples, refer to the Python files present in efficient-transformers/QEfficient/finetune/dataset. For the Samsum dataset, get_preprocessed_samsum() of efficient-transformers/QEfficient/finetune/dataset/samsum_dataset.py is called; a similar minimal sketch is included at the end of this section.
+ 7) In efficient-transformers/QEfficient/finetune/configs/dataset_config.py, for the custom_dataset class, set train_split and test_split to the dataset keys corresponding to the train and test data points.
+ 8) While running fine-tuning, pass the argument "--dataset custom_dataset" to fine-tune on the custom dataset; see the example invocation at the end of this section.
+
+The template for get_custom_dataset(), to be defined inside efficient-transformers/dataset/custom_dataset.py, is as follows:
+
+```python
+def get_custom_dataset(dataset_config, tokenizer, split, context_length=None):
+
+    # load the dataset
+    # based on split, retrieve only the specific portion of the dataset (train or eval), either here or at the end
+
+    def apply_prompt_template(example):
+        # apply the model's prompt format to a single data point
+        ...
+
+    def tokenize(example):
+        # tokenize a single formatted data point
+        ...
+
+    # define the prompt
+    # apply the prompt template to each data point:
+    # dataset = dataset.map(apply_prompt_template, ...)
+    # tokenize each data point:
+    # dataset = dataset.map(tokenize, ...)
+
+    return dataset
+```
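+
+For reference, the following is a minimal sketch of a complete get_custom_dataset(), modeled on get_preprocessed_samsum() from efficient-transformers/QEfficient/finetune/dataset/samsum_dataset.py (see step 6 above). The dataset name, prompt, and column names ("dialogue", "summary") are placeholders to adapt to your own data, and the sketch assumes the Hugging Face datasets library is installed and the tokenizer defines bos_token and eos_token.
+
+```python
+import datasets
+
+
+def get_custom_dataset(dataset_config, tokenizer, split, context_length=None):
+    # Hypothetical summarization dataset with "dialogue" and "summary" columns;
+    # replace the dataset name, prompt, and column names with your own.
+    dataset = datasets.load_dataset("Samsung/samsum", split=split, trust_remote_code=True)
+
+    prompt = "Summarize this dialog:\n{dialog}\n---\nSummary:\n"
+
+    def apply_prompt_template(example):
+        return {
+            "prompt": prompt.format(dialog=example["dialogue"]),
+            "summary": example["summary"],
+        }
+
+    dataset = dataset.map(apply_prompt_template, remove_columns=list(dataset.features))
+
+    def tokenize(example):
+        prompt_ids = tokenizer.encode(tokenizer.bos_token + example["prompt"], add_special_tokens=False)
+        summary_ids = tokenizer.encode(example["summary"] + tokenizer.eos_token, add_special_tokens=False)
+        input_ids = prompt_ids + summary_ids
+        # Compute the loss only on the summary tokens.
+        labels = [-100] * len(prompt_ids) + summary_ids
+        if context_length is not None:
+            input_ids = input_ids[:context_length]
+            labels = labels[:context_length]
+        return {
+            "input_ids": input_ids,
+            "attention_mask": [1] * len(input_ids),
+            "labels": labels,
+        }
+
+    return dataset.map(tokenize, remove_columns=list(dataset.features))
+```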
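+
+Fine-tuning can then be launched on the custom dataset, for example (a hypothetical invocation; combine the --dataset flag with the model, device, and other arguments described earlier in this document):
+
+```bash
+python -m QEfficient.cloud.finetune --dataset custom_dataset
+```
\ No newline at end of file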