import copy
from pathlib import Path

from tqdm import tqdm
from beartype import beartype
from beartype.typing import Tuple, Optional

import torch
from torch import nn
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning.utilities import rank_zero_info
import deepspeed
from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam
from deepspeed.runtime.fp16.onebit.zoadam import ZeroOneAdam
from pytorch_lightning.strategies import DeepSpeedStrategy

from einops import rearrange, repeat, reduce, pack, unpack
from einops.layers.torch import Rearrange, Reduce

from src.rlhf.utils import masked_mean, gumbel_sample
from src.model import RWKV


# helper functions

def exists(val):
    return val is not None

# pairwise preference loss: negative log-sigmoid of (preferred reward - alternative reward), averaged over the batch
def loss_function(prefer_reward, alter_reward):
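    # worked example (illustrative values): prefer_reward = [2.0, 1.0] and
    # alter_reward = [0.5, 1.5] give margins [1.5, -0.5], so the loss is
    # -mean(log(sigmoid([1.5, -0.5]))) ≈ 0.588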
    return -torch.mean(torch.log(torch.sigmoid(prefer_reward - alter_reward)))

# Reward Model - RWKV with a scalar head

@beartype
class RewardModel(pl.LightningModule):
    def __init__(self, args, rwkv: RWKV):
        super().__init__()

        # initialize the reward model from the pretrained RWKV model
        self.rwkv = rwkv
        self.args = args

        # dimension of the output token embeddings
        dim = self.args.n_embd

        # learned embeddings that mark whether an input token belongs to the prompt or the
        # response; trained as model parameters, initialized to zeros (padding stays frozen)
        self.prompt_embed = nn.Parameter(torch.zeros(dim))
        self.response_embed = nn.Parameter(torch.zeros(dim))
        self.padding_embed = nn.Parameter(torch.zeros(dim), requires_grad=False)

        # linear head that produces the reward score
        self.pred_reward = nn.Linear(dim, 1, bias=False)

    def load(self, path):
        path = Path(path)
        assert path.exists()
        self.load_state_dict(torch.load(str(path)))
    
    def configure_optimizers(self):
        args = self.args
        if args.layerwise_lr > 0:
            lr_1x = set()
            lr_2x = set()
            lr_3x = set()
            for n, p in self.named_parameters():
                if "time_mix" in n:
                    if args.my_pile_stage == 2:
                        lr_2x.add(n)
                    else:
                        lr_1x.add(n)
                elif "time_decay" in n:
                    if args.my_pile_stage == 2:
                        lr_3x.add(n)
                    else:
                        lr_2x.add(n)
                elif "time_first" in n:
                    lr_3x.add(n)
                else:
                    lr_1x.add(n)
            lr_1x = sorted(list(lr_1x))
            lr_2x = sorted(list(lr_2x))
            lr_3x = sorted(list(lr_3x))
            # print('1x', lr_1x)
            # print('2x', lr_2x)
            # print('3x', lr_3x)
            param_dict = {n: p for n, p in self.named_parameters()}
            if args.my_pile_stage == 2:
                optim_groups = [
                    {"params": [param_dict[n] for n in lr_1x], "weight_decay": 0.0, "my_lr_scale": 1.0},
                    {"params": [param_dict[n] for n in lr_2x], "weight_decay": 0.0, "my_lr_scale": 5.0},# test: 2e-3 / args.lr_init},
                    {"params": [param_dict[n] for n in lr_3x], "weight_decay": 0.0, "my_lr_scale": 5.0},# test: 3e-3 / args.lr_init},
                ]
            else:
                optim_groups = [
                    {"params": [param_dict[n] for n in lr_1x], "weight_decay": 0.0, "my_lr_scale": 1.0},
                    {"params": [param_dict[n] for n in lr_2x], "weight_decay": 0.0, "my_lr_scale": 2.0},
                    {"params": [param_dict[n] for n in lr_3x], "weight_decay": 0.0, "my_lr_scale": 3.0},
                ]

        else:
            optim_groups = [
                {"params": [p for n, p in self.named_parameters()], "weight_decay": 0.0},
            ]
        
        if self.deepspeed_offload:
            return DeepSpeedCPUAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, adamw_mode=False, weight_decay=0, amsgrad=False)
        return FusedAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, adam_w_mode=False, weight_decay=0, amsgrad=False)
        # return ZeroOneAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, weight_decay=0, amsgrad=False, cuda_aware=False)
    
    @property
    def deepspeed_offload(self) -> bool:
        strategy = self.trainer.strategy
        if isinstance(strategy, DeepSpeedStrategy):
            cfg = strategy.config["zero_optimization"]
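            # offload is read from the DeepSpeed config; an illustrative zero_optimization
            # block that would trigger the CPU-offload branch in configure_optimizers:
            #   {"stage": 2, "offload_optimizer": {"device": "cpu", "pin_memory": True}}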
            return bool(cfg.get("offload_optimizer") or cfg.get("offload_param"))
        return False

    def forward(
        self,
        x,
        mask = None,
        prompt_mask = None,
        prompt_lengths = None
    ):

        # prompt_mask and prompt_lengths are mutually exclusive
        assert not (exists(prompt_mask) and exists(prompt_lengths))

        # derive prompt mask from prompt lengths
        if exists(prompt_lengths):
            batch, seq_len = x.shape
            arange = torch.arange(seq_len, device=x.device)
            prompt_mask = repeat(arange, 'n -> b n', b = batch) < rearrange(prompt_lengths, 'b -> b 1')

        # reward model should have an understanding of which section is prompt, and which section is response
        # each value in prompt_mask selects the matching extra embedding from the stacked
        # table below: prompt_embed, response_embed or padding_embed
        prompt_response_mask_embed = torch.stack([
            self.prompt_embed,
            self.response_embed,
            self.padding_embed
        ]).to(prompt_mask.device)
        extra_embed = None
        if exists(prompt_mask):
            extra_embed = prompt_response_mask_embed[prompt_mask]            
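            # indexing the (3, dim) table with the (batch, seq_len) index tensor yields
            # per-token extra embeddings of shape (batch, seq_len, dim)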

        # take the embedding of the last token
        last_token_embeds = self.rwkv(
            x,
            extra_embed=extra_embed,
            rm_train=True
        )[:, -1, :]

        # compute the scalar reward
        reward = self.pred_reward(last_token_embeds)
        reward = reward.squeeze(-1)

        return reward
    
    def train_forward(self, x_p, x_a, m_p, m_a):
        # the forward pass runs the model twice, so parameters must be frozen for one of the
        # two passes during backprop; otherwise each gradient is computed twice, which fails
        # under DeepSpeed with: "Gradient computed twice for this partition."

        with torch.enable_grad():
            prefer_reward = self.forward(x_p, prompt_mask=m_p)
        with torch.no_grad():
            alter_reward = self.forward(x_a, prompt_mask=m_a)

        return prefer_reward, alter_reward
    
    def training_step(self, batch, batch_idx):
        x_p, x_a, m_p, m_a = batch
        prefer_reward, alter_reward = self.train_forward(
            x_p, x_a, m_p, m_a)
        
        loss = loss_function(prefer_reward, alter_reward)

        return loss
    
    def training_step_end(self, batch_parts):
        all_losses = self.all_gather(batch_parts)
        if self.trainer.is_global_zero:
            self.trainer.my_loss_all = all_losses
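
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only; the names below are assumptions): it presumes
# that `RWKV(args)` builds the backbone from the same `args` namespace, that
# `args` also carries the fields read above (n_embd, lr_init, betas, adam_eps,
# layerwise_lr, my_pile_stage), and that a `preference_loader` yields
# (x_p, x_a, m_p, m_a) batches whose masks contain indices 0/1/2 for
# prompt / response / padding tokens.
#
#   rwkv = RWKV(args)
#   reward_model = RewardModel(args, rwkv)
#   trainer = pl.Trainer(strategy=DeepSpeedStrategy(stage=2), max_epochs=1)
#   trainer.fit(reward_model, train_dataloaders=preference_loader)
# ---------------------------------------------------------------------------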