Just after the LLM hype train arrived, I was struggling with how to properly create and train self-instruct models. LLMs jumped straight into the mainstream, and I really had the feeling that there was little focus on how these models work and how to properly define a dataset for a correct fine-tuning process.

Also, people automatically forget about smaller architectures, and using models with fewer than 1B parameters is treated as a faux pas. So, with my last reserves of energy and spare time, I decided to write a very simple wrapper for fine-tuning a self-instruct model on top of almost any architecture.

Edit: This needs to be refactored, but let me think about it later.

Data

Data has always been the most informative part of ML/DL/modelling, but now it is really clear to the masses that a properly defined dataset can trick you into thinking of AGI, and that this cannot be achieved just by throwing some fuzzy-wuzzy complex architecture at the problem.

So, at the very first news about the Alpaca/LLaMA/ChatGPT models, I was scratching my head about input, instructions and output. These three caused me a few sleepless nights, because there was no exact information (or maybe I am just unaware and my ADHD is strong enough) about how this data is actually fed into the model.

So I thought there must be some additional backbone through the network, some magic flow, some internal NN which passes the input, communicates with the instruction and hands over the output, which is then miraculously treated as a response. But no.

This information is simply put into one big string with additional tokens, and the rest stays the same. The training process did not change: there is a vast number of parameters, the models are big enough to learn to parse the prompt format and to use the context from the input when producing the output.
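
A minimal sketch of that idea (the field names instruction/input/output follow the Alpaca convention; this exact template is just an illustration, the script below uses a slightly different one):

def build_example(record):
    # The three fields are simply glued into one string; the model is then
    # trained on that string like on any other text (plain causal LM).
    return ("### Instruction:\n" + record["instruction"] + "\n\n"
            "### Input:\n" + record["input"] + "\n\n"
            "### Response:\n" + record["output"])

print(build_example({"instruction": "Translate to French.",
                     "input": "Good morning",
                     "output": "Bonjour"}))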

Fine tuning

This 150-liner lets you fine-tune almost any model, using training args and a few additional configuration params, to get a fairly accurate self-instruct model.

It is messy at the moment, but who cares.



import argparse
import pandas as pd
import pathlib
import torch

from torch.utils.data import Dataset, random_split
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments
from transformers import Trainer
from transformers import HfArgumentParser
from typing import List, Tuple, Union

torch.manual_seed(42)

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--model', nargs="?", default="")
    parser.add_argument('-d', '--destination', nargs="?", default="")
    parser.add_argument('-t', '--tokenizer', nargs="?", default="")
    args = parser.parse_args()
    return args

# Fallback defaults, used when the CLI arguments are left empty.
class CFG:
    model_name: str = 'EleutherAI/gpt-neo-125m'
    tokenizer_name: str = 'EleutherAI/gpt-neo-125m'
    max_length: int  = 512
    device: str = 'cuda'
    cuda: bool = True
    train_ratio: float = 0.9
    args_path: str = 'args.json'
    data_path: str = 'data.csv'

config = CFG()

def load_model(name):
    model = AutoModelForCausalLM.from_pretrained(name)
    if config.cuda:
        return model.cuda()
    return model

def load_tokenizer(name):
    tokenizer = AutoTokenizer.from_pretrained(name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return tokenizer

def generate_prompt(instruction: str, input: str, response: str) -> str:
    # The preamble is in Polish; roughly: "Below is an instruction describing a task,
    # paired with an input that provides further context. Write a response that
    # appropriately answers the question."
    if input:
        return f"""Poniżej znajduje się instrukcja opisująca zadanie, połączona z danymi wejściowymi, które zapewniają dalszy kontekst. Napisz odpowiedź, która odpowiednio odpowie na pytanie.

### Instruction:
{instruction}

### Input:
{input}

### Response:
{response}
"""
    # Variant without the input section, used when the input field is empty.
    return f"""Poniżej znajduje się instrukcja opisująca zadanie. Napisz odpowiedź, która odpowiednio odpowie na pytanie.

### Instruction:
{instruction}

### Response:
{response}
"""

class InstructDataset(Dataset):
    
    def __init__(self, data, tokenizer, max_length):
        
        self.input_ids: List = []
        self.attn_masks: List = []
        self.labels: List = []  # currently unused; labels are built in the data collator
        
        for txt in data:
            instruction = generate_prompt(instruction = txt['instruction'], 
                                          input = txt['input'], 
                                          response = txt['output'])
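            # Tokenize the whole prompt (instruction + input + response) to a fixed length.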
            encodings_dict = tokenizer(instruction, padding="max_length", max_length = max_length, truncation=True)
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
            
    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor]:
        return self.input_ids[idx], self.attn_masks[idx]
    
Pathable = Union[str, pathlib.Path]

def get_extension(input_path: Pathable):
    if isinstance(input_path, pathlib.Path):
        input_path = str(input_path)
    return input_path.split('.')[-1]

class BasicException(Exception):
    ...

def read_data(input_path: Pathable) -> List[dict]:
    # Load the data as a list of {'instruction', 'input', 'output'} records.
    extension = get_extension(input_path)
    if extension == 'csv':
        frame = pd.read_csv(input_path)
    elif extension == 'json':
        frame = pd.read_json(input_path)
    else:
        raise BasicException("Please provide a proper file extension (csv or json)")
    return frame.to_dict(orient='records')

def dataset_prep(config: CFG, tokenizer) -> Tuple[Dataset, Dataset]:
    input_data = read_data(config.data_path)
    dataset = InstructDataset(data = input_data, tokenizer = tokenizer, max_length = config.max_length)
    train_size = int(config.train_ratio * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
    return train_dataset, val_dataset

def save_training_args(args: TrainingArguments) -> None:
    args_json = args.to_json_string()
    with open(config.args_path, 'w') as outfile:
        outfile.write(args_json)

def load_training_args() -> TrainingArguments:
    parser = HfArgumentParser(TrainingArguments)
    training_args, = parser.parse_json_file(json_file = config.args_path)
    return training_args
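
# Illustrative round trip for the two helpers above:
#   save_training_args(TrainingArguments(output_dir='./result'))
#   training_args = load_training_args()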

def parse_model_name(model_name: str) -> str:
    return model_name.split('/')[-1]

def create_outpath(model_name: str) -> str:
    name: str = parse_model_name(model_name=model_name)
    outpath = f"./result/{name}"
    return outpath

def create_training_args():
    ...
    # TODO: build the TrainingArguments here instead of hard-coding them in main()

def collate_data(data):
    # Standard causal-LM collation: the input ids are reused as labels.
    return {'input_ids': torch.stack([f[0] for f in data]),
            'attention_mask': torch.stack([f[1] for f in data]),
            'labels': torch.stack([f[0] for f in data])}

def train():
    ...

def main():
    args = get_args()
    args_list: List = [args.model, args.destination, args.tokenizer] 
    # If any argument is empty, fall back to the config defaults
    if "" in args_list:
        model_name = config.model_name
        tokenizer_name = config.tokenizer_name

    else:
        model_name = args.model
        tokenizer_name = args.tokenizer
    
    output_path = create_outpath(model_name)
    model = load_model(model_name)
    tokenizer = load_tokenizer(tokenizer_name)

    # train_dataset, val_dataset = dataset_prep(config = config, tokenizer = tokenizer)

    data = pd.read_csv("alpaca_dolly.csv")
    dict_data = data.to_dict(orient='records')
    dataset = InstructDataset(data=dict_data, tokenizer=tokenizer, max_length=config.max_length)
    train_size = int(config.train_ratio * len(dataset))
    train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])
        
    training_args = TrainingArguments(output_dir=output_path,
                                      num_train_epochs=2,
                                      logging_steps=5000,
                                      per_device_train_batch_size=8,
                                      per_device_eval_batch_size=2,
                                      warmup_steps=10,
                                      weight_decay=0.01,
                                      learning_rate=1e-3,
                                      logging_dir='./logs',
                                      save_total_limit=2)

    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=train_dataset,
                      eval_dataset=val_dataset,
                      data_collator=collate_data)
    trainer.train()

if __name__ == "__main__":
    main()
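
A rough sketch of how this can be invoked (the file name finetune.py is my assumption; the CSV is expected to contain instruction/input/output columns):

python finetune.py -m EleutherAI/gpt-neo-125m -t EleutherAI/gpt-neo-125m -d ./result
python finetune.py    # falls back to the defaults in CFG

Note that all three flags have to be non-empty, otherwise the script falls back to CFG, and -d/--destination itself is not used further for now: the output directory is derived from the model name as ./result/<model>.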