提交 887ca941 编写于 作者: U u010280923

reward model finished

上级 0e61d27f
......@@ -66,7 +66,7 @@ python train_sft.py --load_model "rwkv-190.pth" --wandb "" --proj_dir "out_sft"
```
python train_rm.py --load_model "rwkv-190.pth" --wandb "" --proj_dir "out_rm" \
--data_file "data/rm_mock_data.csv" --data_type "utf-8" --vocab_size 50277 \
--ctx_len 2048 --epoch_steps 200 --epoch_count 1000 --epoch_begin 0 --epoch_save 2 \
--ctx_len 1024 --epoch_steps 200 --epoch_count 1000 --epoch_begin 0 --epoch_save 2 \
--micro_bsz 2 --n_layer 24 --n_embd 2048 --pre_ffn 0 --head_qk 0 \
--lr_init 1e-5 --lr_final 1e-5 --warmup_steps 0 --beta1 0.9 --beta2 0.999 --adam_eps 1e-8 \
--accelerator gpu --devices 1 --precision bf16 --strategy deepspeed_stage_2_offload --grad_cp 1 \
......
import os, sys, torch
import numpy as np
# Compact NumPy printing for the logits printed below: 4 decimal places,
# no scientific notation for small values, and lines up to 200 characters wide.
np.set_printoptions(precision=4, suppress=True, linewidth=200)
# current_path = os.path.dirname(os.path.abspath(__file__))
# sys.path.append(f'{current_path}/rwkv_pip_package/src')
# Tune these below (test True/False for all of them) to find the fastest setting:
# torch._C._jit_set_profiling_executor(True)
# torch._C._jit_set_profiling_mode(True)
# torch._C._jit_override_can_fuse_on_cpu(True)
# torch._C._jit_override_can_fuse_on_gpu(True)
# torch._C._jit_set_texpr_fuser_enabled(False)
# torch._C._jit_set_nvfuser_enabled(False)
########################################################################################################
#
# Use '/' in model path, instead of '\'. Use ctx4096 models if you need long ctx.
#
# fp16 = good for GPU (!!! DOES NOT support CPU !!!)
# fp32 = good for CPU
# bf16 = worse accuracy, supports CPU
# xxxi8 (example: fp16i8) = xxx with int8 quantization to save 50% VRAM/RAM, slower, slightly less accuracy
#
# Read https://pypi.org/project/rwkv/ for Strategy Guide
#
########################################################################################################
# These flags must be in the environment before RWKV is imported.
os.environ.update({
    'RWKV_JIT_ON': '1',
    'RWKV_CUDA_ON': '0',  # if '1' then compile CUDA kernel for seq mode (much faster)
})
# Alternative source of the class: the published package (pip install rwkv)
# from rwkv.model import RWKV
from src.rlhf.rwkv.model import RWKV

# Alternative checkpoint:
# model = RWKV(model='./model/rwkv-190.pth', strategy='cpu fp32')
checkpoint_path = './model/RWKV-4-Pile-169M-20220807-8023.pth'
model = RWKV(model=checkpoint_path, strategy='cpu fp32')
# model = RWKV(model='/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-169m/RWKV-4-Pile-169M-20220807-8023', strategy='cuda fp16')
# model = RWKV(model='/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-169m/RWKV-4-Pile-169M-20220807-8023', strategy='cuda fp16i8')
# model = RWKV(model='/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-169m/RWKV-4-Pile-169M-20220807-8023', strategy='cpu fp32')
# model = RWKV(model='/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-169m/RWKV-4-Pile-169M-20220807-8023', strategy='cpu fp32 *3 -> cuda fp16 *6+')
# model = RWKV(model='/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-1b5/RWKV-4-Pile-1B5-20220903-8040', strategy='cpu fp32')
# model = RWKV(model='/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-1b5/RWKV-4-Pile-1B5-20220903-8040', strategy='cuda fp16')
# model = RWKV(model='/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-1b5/RWKV-4-Pile-1B5-20220903-8040', strategy='cuda fp16 *8 -> cpu fp32')
# model = RWKV(model='/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-1b5/RWKV-4-Pile-1B5-20220903-8040', strategy='cuda:0 fp16 -> cuda:1 fp16 -> cpu fp32 *1')
# model = RWKV(model='/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-1b5/RWKV-4-Pile-1B5-20220903-8040', strategy='cuda fp16 *6+')
# model = RWKV(model='/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-14b/RWKV-4-Pile-14B-20230213-8019', strategy='cuda fp16 *0+ -> cpu fp32 *1')
# model = RWKV(model='/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-3b/RWKV-4-Pile-3B-20221110-ctx4096', strategy='cuda:0 fp16 *25 -> cuda:1 fp16')
# Feed a five-token prompt through the model, passing None as the initial state.
# forward() is unpacked into three values here: output, state and token_embed.
out, state, token_embed = model.forward([187, 510, 1563, 310, 247], None)
print(out.detach().cpu().numpy()) # get logits
# The same prompt can be fed in pieces by threading `state` through the calls
# (unpacking three values, matching the call above):
# out, state, token_embed = model.forward([187, 510], None)
# out, state, token_embed = model.forward([1563], state) # RNN has state (use deepcopy to clone states)
# out, state, token_embed = model.forward([310, 247], state)
# print(out.detach().cpu().numpy()) # same result as above
# NOTE: a leftover `ipdb.set_trace()` breakpoint was removed from this spot; it
# stopped every run in a debugger and made ipdb a hard dependency. Run the
# script under `python -m pdb` when interactive inspection is needed.
# print('\n')
# from src.rlhf.rwkv.utils import PIPELINE, PIPELINE_ARGS
# pipeline = PIPELINE(model, "20B_tokenizer.json")
# ctx = "\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese."
# print(ctx, end='')
# def my_print(s):
# print(s, end='', flush=True)
# # For alpha_frequency and alpha_presence, see "Frequency and presence penalties":
# # https://platform.openai.com/docs/api-reference/parameter-details
# args = PIPELINE_ARGS(temperature = 1.0, top_p = 0.7,
# alpha_frequency = 0.25,
# alpha_presence = 0.25,
# token_ban = [0], # ban the generation of some tokens
# token_stop = []) # stop generation whenever you see any token here
# ########################################################################################################
# # 1. set os.environ["RWKV_CUDA_ON"] = '1' if possible, for faster preprocess of a long ctx.
# # 2. Reuse the state (use deepcopy to clone it) when you are running the same ctx multiple times.
# pipeline.generate(ctx, token_count=200, args=args, callback=my_print)
# print('\n')
\ No newline at end of file
'''
@File : train_rm_demo.py
@Time : 2023/03/10 00:54:57
@Author : Lu Xin
@Contact : luxin@csdn.net
'''
# Imports
import torch
from tqdm import tqdm
from src.rlhf.reward import RewardModel
from src.rlhf.rwkv.model import RWKV
def loss_function(prefer_reward, alter_reward):
    """Pairwise ranking loss for reward-model training.

    Computes ``-mean(log(sigmoid(prefer_reward - alter_reward)))``, so that
    minimising the loss pushes the reward of the preferred response above the
    reward of the alternative response.

    Args:
        prefer_reward: Tensor of rewards for the preferred responses.
        alter_reward: Tensor of rewards for the alternative responses; must
            broadcast against ``prefer_reward``.

    Returns:
        A scalar tensor holding the mean loss over all elements.
    """
    # The margin must be (prefer - alter): with the operands swapped, gradient
    # descent would teach the model to rank the alternative response higher.
    margin = prefer_reward - alter_reward
    # logsigmoid is the numerically stable form of log(sigmoid(x)); the naive
    # composition underflows to log(0) = -inf for strongly negative margins.
    return -torch.nn.functional.logsigmoid(margin).mean()
# Build the RWKV backbone from a local checkpoint and wrap it in the reward model.
model = "./model/RWKV-4-Pile-169M-20220807-8023.pth"
strategy = "cpu fp32"
rwkv_model = RWKV(model, strategy)
reward_model = RewardModel(rwkv_model)

# NOTE: a leftover `ipdb.set_trace()` breakpoint was removed from this spot; it
# halted every run before training started and made ipdb a hard dependency.
# Run the script under `python -m pdb` when interactive inspection is needed.

# as used in the InstructGPT paper
optimizer = torch.optim.Adam(
    reward_model.parameters(), lr=1e-5, betas=(0.9, 0.95))

# Fake data: random token ids in [0, dim), one sequence of 50 tokens each.
dim = 20000
prompt = torch.randint(0, dim, (1, 50))
prefer_response = torch.randint(0, dim, (1, 50))
alter_response = torch.randint(0, dim, (1, 50))

# Each pair is the prompt followed by one response along the sequence axis.
prefer_pair = torch.cat((prompt, prefer_response), dim=1)
alter_pair = torch.cat((prompt, alter_response), dim=1)

# True over the first 50 positions (the prompt), False over the last 50
# (the response), matching the concatenation order above.
prompt_mask = torch.cat((torch.ones(1, 50).bool(), torch.zeros(1, 50).bool()), dim=1)

for epoch in range(100):
    # Compute the rewards of both candidates for the same prompt.
    prefer_reward = reward_model(prefer_pair, prompt_mask=prompt_mask)
    alter_reward = reward_model(alter_pair, prompt_mask=prompt_mask)
    # print(f"prefer_reward: {prefer_reward}")
    # print(f"alter_reward: {alter_reward}")

    # train
    loss = loss_function(prefer_reward, alter_reward)
    print(f"loss: {loss}")

    # Backward pass
    loss.backward()
    optimizer.step()

    # Zero the gradients so they do not accumulate into the next iteration.
    optimizer.zero_grad()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册