文章
大模型微调记录
数据集:
https://huggingface.co/datasets/leo009/ChineseDataset
模型地址:
https://huggingface.co/unsloth/llama-3-8b-bnb-4bit
下载模型脚本:
分文件下载(避开大文件阻塞)
from huggingface_hub import snapshot_download
# Download every file of the repository except the weight files; the patterns
# below exclude them from this snapshot.
weight_file_patterns = ["*.safetensors", "*.bin"]
snapshot_download(
    repo_id="unsloth/llama-3-8b-bnb-4bit",
    local_dir="llama-3-8b-bnb-4bit",
    ignore_patterns=weight_file_patterns,
)
大文件(*.safetensors、*.bin)可先在 Windows 本地下载,再复制到自己的 Ubuntu 环境;也可以用自己的方法加速下载。
ubuntu目录结构
datasets/
-- ChineseDataset.json
models/unsloth/llama-3-8b-bnb-4bit
scripts/
-- app.py
-- test.py
app.py
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
# Pre-quantized 4-bit checkpoints published by Unsloth. Kept as a reference
# list; the load call below uses a local path instead.
# More models at https://huggingface.co/unsloth
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-7b-it-bnb-4bit",  # Instruct version of Gemma 7b
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/gemma-2b-it-bnb-4bit",  # Instruct version of Gemma 2b
    "unsloth/llama-3-8b-bnb-4bit",  # 15 trillion token Llama-3
]

# Loading options.
max_seq_length = 2048  # Free choice; RoPE scaling is supported internally.
dtype = None  # None = auto-detect. float16 for Tesla T4/V100, bfloat16 for Ampere+.
load_in_4bit = True  # 4-bit quantization to reduce memory usage; can be False.

# Load the base model together with its tokenizer from the local checkpoint.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="/home/user/models/unsloth/llama-3-8b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # only for gated models like meta-llama/Llama-2-7b-hf
)
# Modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Replace the base model with its LoRA-wrapped counterpart.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Any number > 0; suggested 8, 16, 32, 64, 128.
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,  # Any value is supported, but 0 is optimized.
    bias="none",  # Any value is supported, but "none" is optimized.
    # "unsloth" uses 30% less VRAM and fits 2x larger batch sizes.
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context.
    random_state=3407,
    use_rslora=False,  # Rank-stabilized LoRA is supported; disabled here.
    loftq_config=None,  # LoftQ is supported; not used here.
)
# Alpaca-style prompt template. It has three positional "{}" slots that are
# filled, in order, with the instruction, the input and the response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""
# End-of-sequence token taken from the loaded tokenizer.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render a batch of records into prompt texts for training.

    Args:
        examples: Batched mapping with parallel "instruction", "input" and
            "output" sequences.

    Returns:
        dict: ``{"text": texts}`` where ``texts`` holds one string per record,
        built from ``alpaca_prompt`` and terminated with ``EOS_TOKEN``.
    """
    records = zip(examples["instruction"], examples["input"], examples["output"])
    # EOS_TOKEN must be appended, otherwise generation will go on forever.
    texts = [
        alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
        for instruction, context, response in records
    ]
    return {"text": texts}
from datasets import load_dataset
# Read the local JSON file as the "train" split, then add the "text" column
# produced by formatting_prompts_func (applied in batches).
file_path = "/home/user/datasets/ChineseDataset.json"
dataset = load_dataset(
    "json",
    data_files={"train": file_path},
    split="train",
).map(formatting_prompts_func, batched=True)
# Pick the mixed-precision mode from what the GPU reports: bf16 when it is
# supported, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

# Optimizer, schedule and logging settings for the run.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=60,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

# Supervised fine-tuning on the "text" column of the prepared dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=training_args,
)

# Run training and keep the returned statistics.
trainer_stats = trainer.train()
# alpaca_prompt is the template defined earlier in this script.
from transformers import TextStreamer


def _stream_completion(instruction, context=""):
    """Generate a reply for one prompt and stream it to stdout.

    Args:
        instruction: Text for the template's instruction slot.
        context: Text for the template's input slot (may be empty).
    """
    # The response slot is left blank so the model generates it.
    prompt = alpaca_prompt.format(instruction, context, "")
    batch = tokenizer([prompt], return_tensors="pt").to("cuda")
    streamer = TextStreamer(tokenizer)
    model.generate(**batch, streamer=streamer, max_new_tokens=128)


FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# Quick check with an English prompt.
_stream_completion("Continue the fibonnaci sequence.", "1, 1, 2, 3, 5, 8")

# Local saving. The tokenizer is stored next to the adapter so the
# "lora_model" directory can be loaded on its own later.
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

# Check with a prompt taken from the fine-tuning domain.
_stream_completion("介绍AI超元域频道")
test.py
from unsloth import FastLanguageModel
import torch
from transformers import TextStreamer
# ======================
# 1. Settings
# ======================
max_seq_length = 2048  # Must be the same as during training.
dtype = None  # Auto-detect the data type.
load_in_4bit = True  # Inference with 4-bit quantization.

# ======================
# 2. Load the fine-tuned model directly
# ======================
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="./lora_model",  # Change to your model path.
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Enable inference optimizations.
FastLanguageModel.for_inference(model)
# ======================
# 3. Prompt template (same wording as the one used for training)
# ======================
# Named slots: {instruction}, {input}, {output}.
your_prompt_template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Input:
{input}
### Response:
{output}"""

# ======================
# 4. Test cases
# ======================
# Each case supplies the instruction and the (possibly empty) input text.
test_cases = [
    dict(instruction="介绍AI超元域频道", input=""),
    dict(instruction="解释神经网络的工作原理", input="用通俗易懂的方式"),
]
# ======================
# 5. Generate a reply for every test case
# ======================
for case in test_cases:
    # Fill the template; the response slot stays empty so the model writes it.
    formatted_prompt = your_prompt_template.format(
        instruction=case["instruction"],
        input=case["input"],
        output="",
    )

    # Tokenize the single prompt and move the tensors to the GPU.
    inputs = tokenizer(
        [formatted_prompt],
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to("cuda")

    # Header for this case; the input line is printed only when it is non-empty.
    print(f"\n{'='*40}")
    print(f"指令: {case['instruction']}")
    if case["input"]:
        print(f"输入: {case['input']}")
    print(f"{'-'*20} 生成结果 {'-'*20}")

    # Stream the output, skipping the echoed prompt and special tokens.
    text_streamer = TextStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )
    _ = model.generate(
        **inputs,
        streamer=text_streamer,
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )
安装llama.cpp
# Clone the llama.cpp repository and change into the checkout.
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
编译项目
编译项目前,先安装所需依赖项:
# Refresh the package index, then install the compiler toolchain, CMake and git.
sudo apt update
sudo apt install -y build-essential cmake git
# Some llama.cpp features depend on libcurl,
# e.g. the automatic model download of llama-download.
sudo apt install -y libcurl4-openssl-dev
# If you want to use the Python interface, you also need:
sudo apt install -y python3 python3-pip
pip3 install numpy
CPU Backend
默认使用CPU版本编译
# Configure into ./build, then compile the Release configuration.
cmake -B build
cmake --build build --config Release
# cmake --build build --config Release -j 8
# -j 8 speeds up compilation; choose it according to your CPU core count.
编译带CUDA的llama.cpp
# If you already generated build files with the CPU configuration, running the
# new build may report errors.
# Clear the previous output first with rm -rf build.
rm -rf build
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release -j 8
# -j 8 speeds up compilation; choose it according to your CPU core count.
# Original note: "rebooting the computer achieves the same effect".
# NOTE(review): a reboot does not remove the on-disk build directory, so it is
# unclear which effect was meant — confirm, and prefer rm -rf build.
合并大模型和lora模型
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
# Base model: Hub ID or local path of the Qwen3 8B model in use. It must be
# the same base the LoRA adapter was trained on; for an adapter trained on a
# different base, put that base's ID or path here instead.
base_model = "Qwen/Qwen3-8B"
# LoRA adapter directory; replace with your own.
lora_model = "./lora_output_dir"
# Directory the merged model is written to.
output_dir = "./merged_model"

# Load the base model.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto",
)

# Load the LoRA adapter on top of the base model and merge it in.
model = PeftModel.from_pretrained(model, lora_model)
model = model.merge_and_unload()

# Save the merged model.
model.save_pretrained(output_dir)

# Save the tokenizer into the same directory as the merged weights.
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.save_pretrained(output_dir)
llama.cpp命令转换gguf模型
# Convert the Hugging Face model to GGUF format (run from the llama.cpp
# checkout; current llama.cpp names the script convert_hf_to_gguf.py).
python convert_hf_to_gguf.py ../merged_model --outfile qwen3-8b-merged-f16.gguf --outtype f16
# Quantize if needed (recommended). A CMake build places the tool at
# build/bin/llama-quantize.
./build/bin/llama-quantize qwen3-8b-merged-f16.gguf qwen3-8b-merged-q4_k_m.gguf Q4_K_M