Fine-tune a Large Language Model in Google Colab¶
2-minute intro¶
This notebook shows how to fine-tune a TinyLlama model. We use TinyLlama, a 1.1B-parameter model, to keep training time short and fit within Google Colab's resources.
Since our application uses a Q&A dataset, we chose the chat flavor of the model.
This notebook can therefore run on a T4 GPU.
Getting started¶
Let's install our dependencies.
- huggingface_hub will be used to load and publish datasets and models so we can version them
- transformers is the blockbuster NLP library by 🤗
- accelerate handles device placement and mixed-precision execution
- bitsandbytes provides the 4-bit quantization we use for QLoRA
- peft implements parameter-efficient fine-tuning methods such as LoRA
- trl provides the SFTTrainer we use for supervised fine-tuning
In [ ]:
%%capture
!pip install --upgrade pip
!pip install --upgrade huggingface_hub
In [ ]:
%%capture
!pip install accelerate bitsandbytes datasets peft transformers trl
Let's connect to Hugging Face - you will need to register if you don't have an account yet. This is required to push your model!
Once you have registered, add your Hugging Face token to the Google Colab secrets.
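No login cell is shown here, so below is a minimal sketch of how to authenticate from a Colab secret. The secret name HF_TOKEN is an assumption; use whatever name you stored your token under.

from google.colab import userdata
from huggingface_hub import login

# Read the token from the Colab secrets manager and log in to the Hub
# (the secret name "HF_TOKEN" is an assumption)
login(token=userdata.get("HF_TOKEN"))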
Now let's import our modules.
In [ ]:
import json
import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
And define some parameters.
In [ ]:
# The model that you want to train from the Hugging Face hub
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# The instruction dataset to use
dataset_name = "burkelibbey/colors"

# Fine-tuned model name (earlier runs used the OpenHermes and no_robots datasets)
# new_model = "galleon/TinyLlama-1.1B-Chat-OpenHermes-v1.0"
# new_model = "galleon/TinyLlama-1.1B-Chat-no_robots-v1.0"
new_model = "TinyLlama-1.1B-Chat-colors-v1.0"
Load the burkelibbey/colors dataset from the hub and build the TinyLlama prompts¶
In [ ]:
tiny_llama_prompt = """<|im_start|>user
{}<|im_end|>
<|im_start|>assistant
{}<|im_end|>
"""

def formatting_prompts_func(examples):
    description_col = examples["description"]
    color_col = examples["color"]
    texts = []
    for description, color in zip(description_col, color_col):
        # Fill the chat template with the color description and the target hex code
        text = tiny_llama_prompt.format(description, color)
        texts.append(text)
    return {"text": texts}

dataset = load_dataset(dataset_name, split="train")
dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
)

# Filter out rows where 'text' is empty
dataset = dataset.filter(lambda example: example["text"] != "")

# Remove all but the text column
dataset = dataset.remove_columns(["description", "color"])
In [ ]:
dataset[0]
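The "text" field now holds the filled-in chat template; printing it directly is easier to read than the raw dict:

# Show the formatted prompt of the first training example
print(dataset[0]["text"])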
Load the model and tokenizer¶
In [ ]:
################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = True

# Chosen torch device
device_map = "auto"  # use {"": 0} to load the entire model on GPU 0

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=False)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training
In [ ]:
################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 8

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.05

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)
In [ ]:
################################################################################
# TrainingArguments parameters
################################################################################

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = True
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 8

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 4

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs when > 0)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Log every X update steps
logging_steps = 10

# Push model to the Hugging Face hub
push_to_hub = False

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=new_model,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    push_to_hub=push_to_hub,
)
This is the resource usage observed when training on the colors dataset (a sketch for measuring it on your own run follows the training cell below).
In [ ]:
################################################################################
# Define SFT parameters and create Trainer
################################################################################

# Maximum sequence length to use
max_seq_length = 1024

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)
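To check the resources your own run consumed, here is a minimal sketch using PyTorch's CUDA memory counters (the exact numbers depend on batch size and sequence length):

# Peak GPU memory used by PyTorch during training, in GB
print(f"Peak VRAM allocated: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
print(f"Peak VRAM reserved:  {torch.cuda.max_memory_reserved() / 1024**3:.2f} GB")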
Save model on disk¶
In [ ]:
trainer.model.save_pretrained(f"galleon/{new_model}")
Merge the low-rank adapter with the initial model¶
In [ ]:
from peft import AutoPeftModelForCausalLM, PeftModel
from transformers import AutoModelForCausalLM
import torch
import os

# Reload the base model in fp16 (not quantized) so the LoRA adapter can be merged into it
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    load_in_8bit=False,
    device_map="auto",
    trust_remote_code=True,
)

model_path = f"galleon/{new_model}"

# Attach the saved adapter and merge its weights into the base model
peft_model = PeftModel.from_pretrained(model, model_path, from_transformers=True, device_map="auto")
model = peft_model.merge_and_unload()
Save the model to the Hugging Face hub¶
In [ ]:
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)
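Once pushed, the merged model can be reloaded anywhere from the Hub. A minimal sketch, assuming the repository ends up under your own account (replace your-username accordingly):

from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical repo id: the model was pushed to your personal namespace
repo_id = f"your-username/{new_model}"
reloaded_model = AutoModelForCausalLM.from_pretrained(repo_id)
reloaded_tokenizer = AutoTokenizer.from_pretrained(repo_id)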
Let's try our newly fine-tuned model¶
Using a text-generation pipeline¶
In [ ]:
from transformers import pipeline

# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)  # , torch_dtype=torch.bfloat16, device_map="auto"

tiny_llama_prompt_ = """<|im_start|>user
{}<|im_end|>
<|im_start|>assistant
"""

outputs = pipe(tiny_llama_prompt_.format("Golden Yellow"), max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
print(outputs)
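print(outputs) dumps the whole pipeline result; each item is a dict whose "generated_text" key holds the prompt followed by the completion, so you can print just that:

# Print only the generated text of the first (and only) prompt
print(outputs[0]["generated_text"])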
Using a conversational pipeline¶
In [ ]:
pipe = pipeline(
    "conversational", model=model, tokenizer=tokenizer
)

messages = [
    {
        "role": "system",
        "content": "You will be given a text describing a color. Only return the hex color code without any other information",
    },
    {"role": "user", "content": "Golden Yellow"},
]

messages = pipe(
    messages, max_new_tokens=6, do_sample=True, temperature=0.9, top_k=50, top_p=0.95
)

print(messages[-1]["content"])
In [ ]:
def print_color_space(messages):
    def hex_to_rgb(hex_color):
        # Convert "#RRGGBB" (or "RRGGBB") into an (r, g, b) tuple of integers
        hex_color = hex_color.lstrip("#")
        return tuple(int(hex_color[i : i + 2], 16) for i in (0, 2, 4))

    hex_color = messages[-1]["content"]
    r, g, b = hex_to_rgb(hex_color)
    # Print the requested color, its hex code, and a swatch rendered with ANSI escape codes
    print(
        f"{messages[1]['content']} [{hex_color}]: \033[48;2;{r};{g};{b}m \033[0m"
    )
In [ ]:
print_color_space(messages)
Empty CPU/GPU memory
In [ ]:
# Empty VRAM
del model
del pipe
del trainer

import gc

gc.collect()
gc.collect()

# Release cached GPU memory back to the driver
torch.cuda.empty_cache()