trl+大模型reward训练

电脑硬件
2025-09-08 01:48:02

一、定义

1.强化学习-reward训练 2.reward 模型重新加载与训练

二、实现

参考：https://www.kaggle.com/code/neuqsnail/open-llama-finetune-sequenceclassification/notebook#Save-and-reload-Model 1. trl 强化学习训练——reward 训练案例。注意：使用 LoRA 训练时，PEFT 的 task_type 必须为 SEQ_CLS（对应下方脚本检查的 --lora_task_type 参数）。步骤：(1) 下载 trl 训练脚本；(2) 通过命令行启动训练。

# Train a LoRA reward model with TRL's example script.
# The PEFT task type is passed as --lora_task_type (the flag the script itself
# checks and names in its warning); it must be SEQ_CLS for reward modelling.
python examples/scripts/reward_modeling.py \
    --model_name_or_path Qwen/Qwen2-0.5B-Instruct \
    --dataset_name trl-lib/ultrafeedback_binarized \
    --output_dir Qwen2-0.5B-Reward-LoRA \
    --per_device_train_batch_size 8 \
    --num_train_epochs 1 \
    --gradient_checkpointing True \
    --learning_rate 1.0e-4 \
    --logging_steps 25 \
    --eval_strategy steps \
    --eval_steps 50 \
    --max_length 2048 \
    --lora_task_type SEQ_CLS \
    --use_peft \
    --lora_r 32 \
    --lora_alpha 16

对应代码

# Reward-model training script (sequence classification head with one label),
# driven entirely by command-line arguments parsed into TRL dataclasses.
import warnings

import torch
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, HfArgumentParser

from trl import (
    ModelConfig,
    RewardConfig,
    RewardTrainer,
    ScriptArguments,
    get_kbit_device_map,
    get_peft_config,
    get_quantization_config,
    setup_chat_format,
)


if __name__ == "__main__":
    parser = HfArgumentParser((ScriptArguments, RewardConfig, ModelConfig))
    script_args, training_args, model_args = parser.parse_args_into_dataclasses()
    training_args.gradient_checkpointing_kwargs = dict(use_reentrant=False)

    ################
    # Model & Tokenizer
    ################
    # "auto" and None are passed through untouched; any other value is looked
    # up as an attribute of the torch module (e.g. "bfloat16" -> torch.bfloat16).
    torch_dtype = (
        model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype)
    )
    quantization_config = get_quantization_config(model_args)
    model_kwargs = dict(
        revision=model_args.model_revision,
        device_map=get_kbit_device_map() if quantization_config is not None else None,
        quantization_config=quantization_config,
        # The KV cache is switched off whenever gradient checkpointing is on.
        use_cache=not training_args.gradient_checkpointing,
        torch_dtype=torch_dtype,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code, use_fast=True
    )
    # num_labels=1: the single logit is used as the scalar reward.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path, num_labels=1, trust_remote_code=model_args.trust_remote_code, **model_kwargs
    )
    # Align padding tokens between tokenizer and model
    model.config.pad_token_id = tokenizer.pad_token_id

    # If post-training a base model, use ChatML as the default template
    if tokenizer.chat_template is None:
        model, tokenizer = setup_chat_format(model, tokenizer)

    if model_args.use_peft and model_args.lora_task_type != "SEQ_CLS":
        warnings.warn(
            "You are using a `task_type` that is different than `SEQ_CLS` for PEFT. This will lead to silent bugs"
            " Make sure to pass --lora_task_type SEQ_CLS when using this script with PEFT.",
            UserWarning,  # a reward model must use the SEQ_CLS task type
        )

    ##############
    # Load dataset
    ##############
    dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)

    # Demo-sized run: only the first DEMO_ROWS rows of each split are used.
    # The cap is clamped to the split length so that splits smaller than the
    # cap are used whole instead of requesting out-of-range indices.
    DEMO_ROWS = 50
    train_split = dataset[script_args.dataset_train_split]
    train_dataset = train_split.select(range(min(DEMO_ROWS, len(train_split))))
    if training_args.eval_strategy != "no":
        eval_split = dataset[script_args.dataset_test_split]
        eval_dataset = eval_split.select(range(min(DEMO_ROWS, len(eval_split))))
    else:
        eval_dataset = None

    ##########
    # Training
    ##########
    trainer = RewardTrainer(
        model=model,
        processing_class=tokenizer,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        peft_config=get_peft_config(model_args),
    )
    trainer.train()

    ############################
    # Save model and push to Hub
    ############################
    # Saved once, before evaluation; the original saved a second time after
    # evaluation even though nothing modifies the weights in between.
    trainer.save_model(training_args.output_dir)

    if training_args.eval_strategy != "no":
        metrics = trainer.evaluate()
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    if training_args.push_to_hub:
        trainer.push_to_hub(dataset_name=script_args.dataset_name)

#输入数据处理：数据集需要包含 chosen、rejected 两个字段。训练目标是：分别对两者打分，并使 chosen 的得分高于 rejected 的得分，差距越大越好。

def _tokenize(batch: dict[str, list[Any]], tokenizer: "PreTrainedTokenizerBase") -> dict[str, list[Any]]: """Tokenize a batch from a reward modelling dataset.""" new_examples = { "input_ids_chosen": [], "attention_mask_chosen": [], "input_ids_rejected": [], "attention_mask_rejected": [], } for chosen, rejected in zip(batch["chosen"], batch["rejected"]): tokenized_chosen = tokenizer(chosen) tokenized_rejected = tokenizer(rejected) new_examples["input_ids_chosen"].append(tokenized_chosen["input_ids"]) new_examples["attention_mask_chosen"].append(tokenized_chosen["attention_mask"]) new_examples["input_ids_rejected"].append(tokenized_rejected["input_ids"]) new_examples["attention_mask_rejected"].append(tokenized_rejected["attention_mask"]) return new_examples

#损失函数

# Score the chosen and the rejected sequences with the same model; the
# "logits" entry of each output is taken as the reward.
chosen_outputs = model(
    input_ids=inputs["input_ids_chosen"],
    attention_mask=inputs["attention_mask_chosen"],
    return_dict=True,
)
rewards_chosen = chosen_outputs["logits"]

rejected_outputs = model(
    input_ids=inputs["input_ids_rejected"],
    attention_mask=inputs["attention_mask_rejected"],
    return_dict=True,
)
rewards_rejected = rejected_outputs["logits"]

# Pairwise loss: mean of -log(sigmoid(chosen - rejected)), which shrinks as
# the chosen reward rises above the rejected one.
reward_margin = rewards_chosen - rewards_rejected
loss = -nn.functional.logsigmoid(reward_margin).mean()

2.模型重新加载

# Reload the trained reward adapter for inspection or further training.
from peft import AutoPeftModelForSequenceClassification
from transformers import AutoTokenizer
import torch

# Directory written by the training run (its --output_dir).
adapter_model_name = "Qwen2-0.5B-Reward-LoRA"

# num_labels must match the value used at training time (1).
model = AutoPeftModelForSequenceClassification.from_pretrained(adapter_model_name, num_labels=1)

# NOTE(review): this local path names Qwen2.5-0.5B-Instruct, while the training
# command uses Qwen/Qwen2-0.5B-Instruct. The tokenizer should be the one used
# for training - confirm which checkpoint was actually trained.
tokenizer = AutoTokenizer.from_pretrained("E:\\Qwen2.5-0.5B-Instruct", use_fast=True)

# Align padding tokens between tokenizer and model, as done at training time.
model.config.pad_token_id = tokenizer.pad_token_id
print(model)

标签：

trl+大模型reward训练由讯客互联电脑硬件栏目发布，感谢您对讯客互联的认可，以及对我们原创作品以及文章的青睐，非常欢迎各位朋友分享到个人网站或者朋友圈，但转载请说明文章出处“trl+大模型reward训练”

上一篇
2025年2月16日笔记

下一篇
蓝桥与力扣刷题（230二叉搜索树中第k小的元素）