Fine-tune a Large Language Model in Google Colab¶
2-minute intro¶
This notebook shows how to fine-tune a TinyLlama model. We use TinyLlama, a 1.1B-parameter model, to keep training time short and fit within Google Colab's resources.
Since our application uses a Q&A dataset, we chose the chat flavor of the model.
This notebook can therefore run on a T4 GPU.
Getting started¶
Let's install our dependencies.
- huggingface_hub will be used to load and publish datasets and models so we can version them
- transformers is the blockbuster NLP library by 🤗
- accelerate handles device placement and mixed-precision execution
- bitsandbytes provides the 4-bit quantization we use for QLoRA
- peft implements parameter-efficient fine-tuning methods such as LoRA
- trl provides the SFTTrainer we use for supervised fine-tuning
In [ ]:
%%capture
!pip install --upgrade pip
!pip install --upgrade huggingface_hub
In [ ]:
%%capture
!pip install accelerate bitsandbytes datasets peft transformers trl
Let's connect to Hugging Face - you will need to register if you don't have an account yet. This is required to push your model!
Once you have registered, add your Hugging Face token to the Google Colab secrets.
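No login cell is shown here, so below is a minimal sketch of how to authenticate from a Colab secret. The secret name HF_TOKEN is an assumption; use whatever name you stored your token under.

from google.colab import userdata
from huggingface_hub import login

# Read the token from the Colab secrets manager and log in to the Hub
# (the secret name "HF_TOKEN" is an assumption)
login(token=userdata.get("HF_TOKEN"))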
Now let's import our modules.
In [ ]:
import json
import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
And define some parameters.
In [ ]:
# The model that you want to train from the Hugging Face hub
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# The instruction dataset to use
dataset_name = "burkelibbey/colors"

# Fine-tuned model name (earlier runs used the OpenHermes and no_robots datasets)
# new_model = "galleon/TinyLlama-1.1B-Chat-OpenHermes-v1.0"
# new_model = "galleon/TinyLlama-1.1B-Chat-no_robots-v1.0"
new_model = "TinyLlama-1.1B-Chat-colors-v1.0"
Load the burkelibbey/colors dataset from the hub and build the TinyLlama prompts¶
In [ ]:
tiny_llama_prompt = """<|im_start|>user
{}<|im_end|>
<|im_start|>assistant
{}<|im_end|>
"""

def formatting_prompts_func(examples):
    description_col = examples["description"]
    color_col = examples["color"]
    texts = []
    for description, color in zip(description_col, color_col):
        # Fill the chat template with the color description and the target hex code
        text = tiny_llama_prompt.format(description, color)
        texts.append(text)
    return {"text": texts}

dataset = load_dataset(dataset_name, split="train")
dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
)

# Filter out rows where 'text' is empty
dataset = dataset.filter(lambda example: example["text"] != "")

# Remove all but the text column
dataset = dataset.remove_columns(["description", "color"])
In [ ]:
dataset[0]
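The "text" field now holds the filled-in chat template; printing it directly is easier to read than the raw dict:

# Show the formatted prompt of the first training example
print(dataset[0]["text"])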
Load the model and tokenizer¶
In [ ]:
################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = True

# Chosen torch device
device_map = "auto"  # use {"": 0} to load the entire model on GPU 0

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=False)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training
In [ ]:
################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 8

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.05

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)
In [ ]:
################################################################################
# TrainingArguments parameters
################################################################################

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = True
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 8

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 4

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs when > 0)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Log every X update steps
logging_steps = 10

# Push model to the Hugging Face hub
push_to_hub = False

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=new_model,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    push_to_hub=push_to_hub,
)
This is the resource usage observed when training on the colors dataset (a sketch for measuring it on your own run follows the training cell below).
In [ ]:
################################################################################
# Define SFT parameters and create Trainer
################################################################################

# Maximum sequence length to use
max_seq_length = 1024

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)
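To check the resources your own run consumed, here is a minimal sketch using PyTorch's CUDA memory counters (the exact numbers depend on batch size and sequence length):

# Peak GPU memory used by PyTorch during training, in GB
print(f"Peak VRAM allocated: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
print(f"Peak VRAM reserved:  {torch.cuda.max_memory_reserved() / 1024**3:.2f} GB")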
Save model on disk¶
In [ ]:
trainer.model.save_pretrained(f"galleon/{new_model}")
Merge the low-rank adapter with the initial model¶
In [ ]:
from peft import AutoPeftModelForCausalLM, PeftModel
from transformers import AutoModelForCausalLM
import torch
import os

# Reload the base model in fp16 (not quantized) so the LoRA adapter can be merged into it
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    load_in_8bit=False,
    device_map="auto",
    trust_remote_code=True,
)

model_path = f"galleon/{new_model}"

# Attach the saved adapter and merge its weights into the base model
peft_model = PeftModel.from_pretrained(model, model_path, from_transformers=True, device_map="auto")
model = peft_model.merge_and_unload()
Save the model to the Hugging Face hub¶
In [ ]:
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)
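Once pushed, the merged model can be reloaded anywhere from the Hub. A minimal sketch, assuming the repository ends up under your own account (replace your-username accordingly):

from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical repo id: the model was pushed to your personal namespace
repo_id = f"your-username/{new_model}"
reloaded_model = AutoModelForCausalLM.from_pretrained(repo_id)
reloaded_tokenizer = AutoTokenizer.from_pretrained(repo_id)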
Let's try our newly fine-tuned model¶
Using a text-generation pipeline¶
In [ ]:
from transformers import pipeline

# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)  # , torch_dtype=torch.bfloat16, device_map="auto"

tiny_llama_prompt_ = """<|im_start|>user
{}<|im_end|>
<|im_start|>assistant
"""

outputs = pipe(tiny_llama_prompt_.format("Golden Yellow"), max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
print(outputs)
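print(outputs) dumps the whole pipeline result; each item is a dict whose "generated_text" key holds the prompt followed by the completion, so you can print just that:

# Print only the generated text of the first (and only) prompt
print(outputs[0]["generated_text"])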
Using a conversational pipeline¶
In [ ]:
pipe = pipeline(
    "conversational", model=model, tokenizer=tokenizer
)

messages = [
    {
        "role": "system",
        "content": "You will be given a text describing a color. Only return the hex color code without any other information",
    },
    {"role": "user", "content": "Golden Yellow"},
]

messages = pipe(
    messages, max_new_tokens=6, do_sample=True, temperature=0.9, top_k=50, top_p=0.95
)

print(messages[-1]["content"])
In [ ]:
def print_color_space(messages):
    def hex_to_rgb(hex_color):
        # Convert "#RRGGBB" (or "RRGGBB") into an (r, g, b) tuple of integers
        hex_color = hex_color.lstrip("#")
        return tuple(int(hex_color[i : i + 2], 16) for i in (0, 2, 4))

    hex_color = messages[-1]["content"]
    r, g, b = hex_to_rgb(hex_color)
    # Print the requested color, its hex code, and a swatch rendered with ANSI escape codes
    print(
        f"{messages[1]['content']} [{hex_color}]: \033[48;2;{r};{g};{b}m \033[0m"
    )
In [ ]:
print_color_space(messages)
Empty CPU/GPU memory
In [ ]:
# Empty VRAM
del model
del pipe
del trainer

import gc

gc.collect()
gc.collect()

# Release cached GPU memory back to the driver
torch.cuda.empty_cache()