Training Guides
Process Reward Models
Train models to verify reasoning steps and provide process-level rewards.
PRM Training
from thinkrl.training import PRMConfig, PRMTrainer

# Train a process reward model (PRM) on data labeled per reasoning step.
prm_config = PRMConfig(
    model_name_or_path="meta-llama/Llama-3-8b",
    dataset_name="your-org/step-labeled-data",
    output_dir="./prm_checkpoint",
    step_separator="\n",  # delimiter between individual reasoning steps
    learning_rate=1e-5,
)

trainer = PRMTrainer(prm_config)
trainer.train()

Using PRM for RLHF
from thinkrl import ModelConfig, RLHFTrainer

# RLHF fine-tuning that scores rollouts with the PRM checkpoint trained above,
# so rewards are assigned at the process (per-step) level rather than per sequence.
rlhf_config = ModelConfig(
    model_name_or_path="./sft_checkpoint",
    reward_model_path="./prm_checkpoint",
    use_process_reward=True,
    algorithm="vapo",
)

rlhf_trainer = RLHFTrainer(rlhf_config)
rlhf_trainer.train()