LLM in Practice: Unsloth, an LLM Fine-Tuning Accelerator + Qwen1.5
1. Background
The previous article covered GPU memory usage and training time when fine-tuning Llama3 with the training-acceleration framework Unsloth.
Unsloth recently added support for Qwen1.5, so I immediately ran another round of comparison experiments.
For an introduction to Unsloth, interested readers can refer to the previous article, "LLM Fine-Tuning Accelerator: Unsloth + LLama3".
2. Hands-On
In the spirit of "reading something a thousand times is no substitute for doing it once", I put together a comparison implementation around Unsloth.
Experimental setup: a single A800 GPU, with the model Qwen1.5-32B-Chat.
Unsloth can be updated with the following command:
pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
2.1 Comparison Dimensions
| Dimension | Description |
| --- | --- |
| GPU | whether bf16 is supported (chooses the training dtype) |
| Maximum sequence length | max_seq_length |
| Batch size | per_device_train_batch_size |
| Gradient accumulation steps | gradient_accumulation_steps |
| Rank | LoRA rank |
| Dropout | lora_dropout |
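Each comparison run is simply one combination of these dimensions. A minimal sketch of a single comparison point (the `finetune_compare` module name is hypothetical; `train_unsloth` and `train_trans` are the two training entry points listed in the appendix, section 6):

```python
import torch

# Hypothetical module name for the appendix script (section 6); train_unsloth
# and train_trans are the two training entry points defined there.
from finetune_compare import train_unsloth, train_trans

# One comparison point: bf16, 2048-token context, per-device batch size 4,
# gradient accumulation 4 (effective batch size 16), LoRA rank 64, no dropout.
train_unsloth(dtype=torch.bfloat16, max_seq_length=2048,
              per_device_train_batch_size=4, gradient_accumulation_steps=4,
              rank=64, lora_dropout=0)
train_trans(dtype=torch.bfloat16, max_seq_length=2048,
            per_device_train_batch_size=4, gradient_accumulation_steps=4,
            rank=64, lora_dropout=0)
```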
2.2 Source Code
The full source is attached in the appendix. Since Qwen1.5 and Llama3 use different prompt templates, the code also needed a few adjustments.
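Concretely, the training samples are rendered with `tokenizer.apply_chat_template`, as in the appendix code, so the Qwen1.5 prompt format is taken from the model's own chat template rather than hard-coded. A minimal sketch (the sample instruction and completion are illustrative; the rendered output in the comment is approximate):

```python
from transformers import AutoTokenizer

# Load the Qwen1.5 tokenizer (same local path as in the appendix).
tokenizer = AutoTokenizer.from_pretrained('pretrain_models/Qwen/Qwen1.5-32B-Chat/')

# Render one Alpaca-style sample with the model's own chat template.
text = tokenizer.apply_chat_template(
    [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Continue the fibonacci sequence. 1, 1, 2, 3, 5, 8"},
        {"role": "assistant", "content": "13, 21, 34, 55, 89."},
    ],
    tokenize=False,
    add_generation_prompt=False,
)
print(text)
# Qwen1.5 uses ChatML-style markers, roughly:
# <|im_start|>system\nYou are a helpful assistant.<|im_end|>\n
# <|im_start|>user\nContinue the fibonacci sequence. ...<|im_end|>\n
# <|im_start|>assistant\n13, 21, 34, 55, 89.<|im_end|>\n
```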
3. Experimental Results
3.1 A800
3.2 Conclusions
Training Qwen1.5-32B-Chat with Unsloth and comparing against training with the plain transformers framework, the conclusions are as follows:
With Unsloth integrated, GPU memory usage is indeed lower, on average by 20%-25%, and training is indeed faster: across every dimension tested, average training time dropped by 27%-41%.
A single GPU with 40 GB of memory, such as the A40, is sufficient for this fine-tuning.
4. Summary
One sentence says it all~
This article presents efficient fine-tuning experiments on Qwen1.5 with the unsloth framework, including the full comparison code and analysis results across several dimensions.
Next, I plan to read through unsloth's underlying source code to see exactly how the speedups are implemented with the Triton language, and how the feed-forward and backward passes are hand-written~
5. References
1. unsloth: https://github.com/unslothai/unsloth
6. Appendix
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments, TextStreamer, AutoModelForCausalLM, set_seed, AutoTokenizer, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
import gc
set_seed(42)
def train_unsloth(dtype,
                  max_seq_length,
                  per_device_train_batch_size,
                  gradient_accumulation_steps,
                  rank,
                  lora_alpha=16,
                  lora_dropout=0,
                  max_steps=50,
                  save_steps=50,
                  seed=42,
                  warmup_steps=5,
                  learning_rate=2e-4,
                  logging_steps=5):
    """Fine-tune Qwen1.5-32B-Chat with Unsloth (QLoRA, 4-bit)."""
    print(f'dtype:{dtype}, max_seq_length:{max_seq_length}, per_device_train_batch_size:{per_device_train_batch_size}, gradient_accumulation_steps:{gradient_accumulation_steps}, rank:{rank}, lora_dropout:{lora_dropout}')

    # Load the base model in 4-bit via Unsloth's fast loader.
    load_in_4bit = True
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name='pretrain_models/Qwen/Qwen1.5-32B-Chat/',
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit
    )

    # Attach LoRA adapters to the attention and MLP projections.
    model = FastLanguageModel.get_peft_model(
        model,
        r=rank,
        target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'],
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        bias='none',
        use_gradient_checkpointing=True,
        random_state=seed,
        use_rslora=False,
        max_seq_length=max_seq_length
    )

    def formatting_prompts_func(examples):
        # Render each Alpaca-style sample with Qwen1.5's own chat template.
        instructions = examples["instruction"]
        inputs = examples["input"]
        outputs = examples["output"]
        texts = []
        for instruction, input, output in zip(instructions, inputs, outputs):
            text = tokenizer.apply_chat_template(
                [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": f'{instruction}. {input}'},
                    {'role': 'assistant', 'content': f'{output}'}
                ],
                tokenize=False,
                add_generation_prompt=False
            )
            texts.append(text)
        return {"text": texts}

    dataset = load_dataset("yahma/alpaca-cleaned", split="train")
    dataset = dataset.map(formatting_prompts_func, batched=True)

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        dataset_text_field='text',
        max_seq_length=max_seq_length,
        packing=False,
        args=TrainingArguments(
            per_device_train_batch_size=per_device_train_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            warmup_steps=warmup_steps,
            learning_rate=learning_rate,
            fp16=not torch.cuda.is_bf16_supported(),
            bf16=torch.cuda.is_bf16_supported(),
            logging_steps=logging_steps,
            optim='adamw_8bit',
            weight_decay=0.01,
            lr_scheduler_type='linear',
            seed=seed,
            output_dir='output/qwen1.5-32b-chat-unsloth',
            save_steps=save_steps,
            max_steps=max_steps
        )
    )

    # Record reserved GPU memory before training so the training-only delta can be reported.
    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved.")

    trainer_stats = trainer.train()

    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
    used_percentage = round(used_memory / max_memory * 100, 3)
    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
    print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
    print(f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training.")
    print(f"Peak reserved memory = {used_memory} GB.")
    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
    print(f"Peak reserved memory % of max memory = {used_percentage} %.")
    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

    model.save_pretrained("output/qwen1.5-32b-chat-unsloth-lora")  # Local saving
    tokenizer.save_pretrained("output/qwen1.5-32b-chat-unsloth-lora")
    # model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",) # Merge to 16bit
    # model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",) # Merge to 4bit
    # model.save_pretrained_merged("model", tokenizer, save_method = "lora",) # Just LoRA adapters
    # model.save_pretrained_gguf("model", tokenizer,) # Save to 8bit Q8_0
    # model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16") # Save to 16bit GGUF
    # model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m") # Save to q4_k_m GGUF

    # Free GPU memory before the next comparison run.
    del model
    del tokenizer
    torch.cuda.empty_cache()
    for _ in range(3):
        gc.collect()
def train_trans(dtype,
                max_seq_length,
                per_device_train_batch_size,
                gradient_accumulation_steps,
                rank,
                lora_alpha=16,
                lora_dropout=0,
                max_steps=50,
                save_steps=50,
                seed=42,
                warmup_steps=5,
                learning_rate=2e-4,
                logging_steps=5):
    """Baseline: fine-tune Qwen1.5-32B-Chat with plain transformers + peft (QLoRA, 4-bit)."""
    print(f'dtype:{dtype}, max_seq_length:{max_seq_length}, per_device_train_batch_size:{per_device_train_batch_size}, gradient_accumulation_steps:{gradient_accumulation_steps}, rank:{rank}, lora_dropout:{lora_dropout}')

    model_path = 'pretrain_models/Qwen/Qwen1.5-32B-Chat/'
    tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side='right', model_max_length=8192)

    # 4-bit NF4 quantization config (bitsandbytes), matching the Unsloth run.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=dtype,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        llm_int8_threshold=6.0,
        llm_int8_has_fp16_weight=False,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=dtype,
        quantization_config=quantization_config,
    )
    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
    model.enable_input_require_grads()

    # Same LoRA targets and hyperparameters as the Unsloth run.
    config = LoraConfig(
        r=rank,
        lora_alpha=lora_alpha,
        target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'],
        lora_dropout=lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
        use_rslora=False
    )
    model = get_peft_model(model, peft_config=config)
    model.gradient_checkpointing_enable()

    def formatting_prompts_func(examples):
        # Render each Alpaca-style sample with Qwen1.5's own chat template.
        instructions = examples["instruction"]
        inputs = examples["input"]
        outputs = examples["output"]
        texts = []
        for instruction, input, output in zip(instructions, inputs, outputs):
            text = tokenizer.apply_chat_template(
                [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": f'{instruction}. {input}'},
                    {'role': 'assistant', 'content': f'{output}'}
                ],
                tokenize=False,
                add_generation_prompt=False
            )
            texts.append(text)
        return {"text": texts}

    dataset = load_dataset("yahma/alpaca-cleaned", split="train")
    dataset = dataset.map(formatting_prompts_func, batched=True)

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        dataset_text_field='text',
        max_seq_length=max_seq_length,
        packing=False,
        args=TrainingArguments(
            per_device_train_batch_size=per_device_train_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            warmup_steps=warmup_steps,
            learning_rate=learning_rate,
            fp16=not torch.cuda.is_bf16_supported(),
            bf16=torch.cuda.is_bf16_supported(),
            logging_steps=logging_steps,
            optim='adamw_8bit',
            weight_decay=0.01,
            lr_scheduler_type='linear',
            seed=seed,
            output_dir='output/qwen1.5-32b-chat-trans',
            save_steps=save_steps,
            max_steps=max_steps
        )
    )

    # Record reserved GPU memory before training so the training-only delta can be reported.
    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved.")

    trainer_stats = trainer.train()

    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
    used_percentage = round(used_memory / max_memory * 100, 3)
    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
    print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
    print(f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training.")
    print(f"Peak reserved memory = {used_memory} GB.")
    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
    print(f"Peak reserved memory % of max memory = {used_percentage} %.")
    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

    model.save_pretrained("output/qwen1.5-32b-chat-trans-lora")  # Local saving
    tokenizer.save_pretrained("output/qwen1.5-32b-chat-trans-lora")

    # Free GPU memory before the next comparison run.
    del model
    del tokenizer
    torch.cuda.empty_cache()
    for _ in range(3):
        gc.collect()
def infer():
    # Load the saved LoRA adapters on top of the 4-bit base model.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name='output/qwen1.5-32b-chat-unsloth-lora',
        max_seq_length=2048,
        dtype=torch.float16,
        load_in_4bit=True
    )
    # Enable Unsloth's native 2x-faster inference mode.
    FastLanguageModel.for_inference(model)

    # Build the prompt with the same chat template used during training.
    prompt = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": 'Continue the fibonacci sequence. 1, 1, 2, 3, 5, 8'}
        ],
        tokenize=False,
        add_generation_prompt=True
    )
    inputs = tokenizer([prompt], return_tensors="pt").to('cuda')
    outputs = model.generate(**inputs, max_new_tokens=1024, use_cache=True)
    print(tokenizer.batch_decode(outputs))

    # Stream tokens to stdout as they are generated.
    text_streamer = TextStreamer(tokenizer)
    outputs = model.generate(**inputs, max_new_tokens=1024, streamer=text_streamer)
    print(tokenizer.batch_decode(outputs))
if __name__ == '__main__':
    # Unsloth runs: sweep sequence length, batch size, gradient accumulation, LoRA rank and dropout.
    train_unsloth(dtype=torch.bfloat16, max_seq_length=1024, per_device_train_batch_size=1, gradient_accumulation_steps=16, rank=8, lora_dropout=0)
    train_unsloth(dtype=torch.bfloat16, max_seq_length=1024, per_device_train_batch_size=1, gradient_accumulation_steps=16, rank=64, lora_dropout=0)
    train_unsloth(dtype=torch.bfloat16, max_seq_length=2048, per_device_train_batch_size=1, gradient_accumulation_steps=16, rank=64, lora_dropout=0)
    train_unsloth(dtype=torch.bfloat16, max_seq_length=2048, per_device_train_batch_size=4, gradient_accumulation_steps=4, rank=64, lora_dropout=0)
    train_unsloth(dtype=torch.bfloat16, max_seq_length=2048, per_device_train_batch_size=4, gradient_accumulation_steps=4, rank=64, lora_dropout=0.05)
    train_unsloth(dtype=torch.bfloat16, max_seq_length=2048, per_device_train_batch_size=16, gradient_accumulation_steps=4, rank=64, lora_dropout=0.05)

    # Transformers baseline runs with matching configurations.
    train_trans(dtype=torch.bfloat16, max_seq_length=1024, per_device_train_batch_size=1, gradient_accumulation_steps=16, rank=8, lora_dropout=0)
    train_trans(dtype=torch.bfloat16, max_seq_length=1024, per_device_train_batch_size=1, gradient_accumulation_steps=16, rank=64, lora_dropout=0)
    train_trans(dtype=torch.bfloat16, max_seq_length=2048, per_device_train_batch_size=1, gradient_accumulation_steps=16, rank=64, lora_dropout=0)
    train_trans(dtype=torch.bfloat16, max_seq_length=2048, per_device_train_batch_size=4, gradient_accumulation_steps=4, rank=64, lora_dropout=0)
    train_trans(dtype=torch.bfloat16, max_seq_length=2048, per_device_train_batch_size=4, gradient_accumulation_steps=4, rank=64, lora_dropout=0.05)
Source: https://blog.csdn.net/MENGERN/article/details/138974731