reward model finished

887ca941 · u010280923 · 0e61d27f · 887ca941 · 0e61d27f · 0e61d27f
隐藏空白更改
内联并排

Showing 3 changed files with 1 addition and 147 deletions

README.md README.md +1 -1

forward_demo.py forward_demo.py +0 -84

train_rm_demo.py train_rm_demo.py +0 -62

未找到文件。
--- a/README.md
+++ b/README.md
@@ -66,7 +66,7 @@ python train_sft.py  --load_model "rwkv-190.pth" --wandb "" --proj_dir "out_sft"
 ```
 python train_rm.py  --load_model "rwkv-190.pth" --wandb "" --proj_dir "out_rm" \
 --data_file "data/rm_mock_data.csv" --data_type "utf-8" --vocab_size 50277 \
---ctx_len 2048 --epoch_steps 200 --epoch_count 1000 --epoch_begin 0 --epoch_save 2 \
+--ctx_len 1024 --epoch_steps 200 --epoch_count 1000 --epoch_begin 0 --epoch_save 2 \
 --micro_bsz 2 --n_layer 24 --n_embd 2048 --pre_ffn 0 --head_qk 0 \
 --lr_init 1e-5 --lr_final 1e-5 --warmup_steps 0 --beta1 0.9 --beta2 0.999 --adam_eps 1e-8 \
 --accelerator gpu --devices 1 --precision bf16 --strategy deepspeed_stage_2_offload --grad_cp 1 \

--- a/forward_demo.py
+++ b/forward_demo.py
-import os, sys, torch
-import numpy as np
-np.set_printoptions(precision=4, suppress=True, linewidth=200)
-
-# current_path = os.path.dirname(os.path.abspath(__file__))
-# sys.path.append(f'{current_path}/rwkv_pip_package/src')
-
-# Tune these below (test True/False for all of them) to find the fastest setting:
-# torch._C._jit_set_profiling_executor(True)
-# torch._C._jit_set_profiling_mode(True)
-# torch._C._jit_override_can_fuse_on_cpu(True)
-# torch._C._jit_override_can_fuse_on_gpu(True)
-# torch._C._jit_set_texpr_fuser_enabled(False)
-# torch._C._jit_set_nvfuser_enabled(False)
-
-########################################################################################################
-#
-# Use '/' in model path, instead of '\'. Use ctx4096 models if you need long ctx.
-#
-# fp16 = good for GPU (!!! DOES NOT support CPU !!!)
-# fp32 = good for CPU
-# bf16 = worse accuracy, supports CPU
-# xxxi8 (example: fp16i8) = xxx with int8 quantization to save 50% VRAM/RAM, slower, slightly less accuracy
-#
-# Read https://pypi.org/project/rwkv/ for Strategy Guide
-#
-########################################################################################################
-# set these before import RWKV
-os.environ['RWKV_JIT_ON'] = '1'
-os.environ["RWKV_CUDA_ON"] = '0' #  if '1' then compile CUDA kernel for seq mode (much faster)
-
-# from rwkv.model import RWKV # pip install rwkv
-from src.rlhf.rwkv.model import RWKV 
-# model = RWKV(model='./model/rwkv-190.pth', strategy='cpu fp32')
-model = RWKV(model='./model/RWKV-4-Pile-169M-20220807-8023.pth', strategy='cpu fp32')
-
-# model = RWKV(model='/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-169m/RWKV-4-Pile-169M-20220807-8023', strategy='cuda fp16')
-# model = RWKV(model='/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-169m/RWKV-4-Pile-169M-20220807-8023', strategy='cuda fp16i8')
-# model = RWKV(model='/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-169m/RWKV-4-Pile-169M-20220807-8023', strategy='cpu fp32')
-# model = RWKV(model='/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-169m/RWKV-4-Pile-169M-20220807-8023', strategy='cpu fp32 *3 -> cuda fp16 *6+')
-# model = RWKV(model='/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-1b5/RWKV-4-Pile-1B5-20220903-8040', strategy='cpu fp32')
-# model = RWKV(model='/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-1b5/RWKV-4-Pile-1B5-20220903-8040', strategy='cuda fp16')
-# model = RWKV(model='/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-1b5/RWKV-4-Pile-1B5-20220903-8040', strategy='cuda fp16 *8 -> cpu fp32')
-# model = RWKV(model='/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-1b5/RWKV-4-Pile-1B5-20220903-8040', strategy='cuda:0 fp16 -> cuda:1 fp16 -> cpu fp32 *1')
-# model = RWKV(model='/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-1b5/RWKV-4-Pile-1B5-20220903-8040', strategy='cuda fp16 *6+')
-# model = RWKV(model='/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-14b/RWKV-4-Pile-14B-20230213-8019', strategy='cuda fp16 *0+ -> cpu fp32 *1')
-# model = RWKV(model='/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-3b/RWKV-4-Pile-3B-20221110-ctx4096', strategy='cuda:0 fp16 *25 -> cuda:1 fp16')
-
-out, state, token_embed = model.forward([187, 510, 1563, 310, 247], None)
-print(out.detach().cpu().numpy())                   # get logits
-# out, state = model.forward([187, 510], None)
-# out, state = model.forward([1563], state)           # RNN has state (use deepcopy to clone states)
-# out, state = model.forward([310, 247], state)
-# print(out.detach().cpu().numpy())                   # same result as above
-
-import ipdb
-ipdb.set_trace()
-
-# print('\n')
-
-# from src.rlhf.rwkv.utils import PIPELINE, PIPELINE_ARGS
-# pipeline = PIPELINE(model, "20B_tokenizer.json")
-
-# ctx = "\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese."
-# print(ctx, end='')
-
-# def my_print(s):
-#     print(s, end='', flush=True)
-
-# # For alpha_frequency and alpha_presence, see "Frequency and presence penalties":
-# # https://platform.openai.com/docs/api-reference/parameter-details
-
-# args = PIPELINE_ARGS(temperature = 1.0, top_p = 0.7,
-#                      alpha_frequency = 0.25,
-#                      alpha_presence = 0.25,
-#                      token_ban = [0], # ban the generation of some tokens
-#                      token_stop = []) # stop generation whenever you see any token here
-
-# ########################################################################################################
-# # 1. set os.environ["RWKV_CUDA_ON"] = '1' if possible, for faster preprocess of a long ctx.
-# # 2. Reuse the state (use deepcopy to clone it) when you are running the same ctx multiple times. 
-# pipeline.generate(ctx, token_count=200, args=args, callback=my_print)
-
-# print('\n')
\ No newline at end of file
--- a/train_rm_demo.py
+++ b/train_rm_demo.py
-'''
-@File    :   train_rm_demo.py
-@Time    :   2023/03/10 00:54:57
-@Author  :   Lu Xin 
-@Contact :   luxin@csdn.net
-'''
-
-# here put the import lib
-
-import torch
-
-from tqdm import tqdm
-
-from src.rlhf.reward import RewardModel
-from src.rlhf.rwkv.model import RWKV
-
-def loss_function(prefer_reward, alter_reward):
-    return -torch.mean(torch.log(torch.sigmoid(alter_reward - prefer_reward)))
-
-model = "./model/RWKV-4-Pile-169M-20220807-8023.pth"
-strategy = "cpu fp32"
-rwkv_model = RWKV(model, strategy)
-
-reward_model = RewardModel(
-    rwkv_model
-)
-
-import ipdb
-ipdb.set_trace()
-
-# as used in the InstructGPT paper
-optimizer = torch.optim.Adam(
-    reward_model.parameters(), lr=1e-5, betas=(0.9, 0.95)) 
-
-# 假数据
-dim = 20000
-prompt = torch.randint(0, dim, (1, 50))
-prefer_response = torch.randint(0, dim, (1, 50))   
-alter_response = torch.randint(0, dim, (1, 50))
-
-prefer_pair = torch.concat((prompt, prefer_response), dim=1)
-alter_pair = torch.concat((prompt, alter_response), dim=1)
-
-prompt_mask = torch.cat((torch.ones(1, 50).bool(), torch.zeros(1, 50).bool()), dim=1)
-
-for epoch in range(100):
-    # 计算奖励
-    prefer_reward = reward_model(prefer_pair, prompt_mask = prompt_mask)
-    alter_reward = reward_model(alter_pair, prompt_mask = prompt_mask)
-    # print(f"prefer_reward: {prefer_reward}")
-    # print(f"alter_reward: {alter_reward}")
-
-    # train
-    loss = loss_function(prefer_reward, alter_reward)
-    print(f"loss: {loss}")
-
-    # Backward pass
-    loss.backward()
-    optimizer.step()
-
-    # Zero the gradients
-    optimizer.zero_grad()