import copy
from pathlib import Path

from tqdm import tqdm
from beartype import beartype
from beartype.typing import Tuple, Optional

import torch
from torch import nn
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning.utilities import rank_zero_info
import deepspeed
from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam
from deepspeed.runtime.fp16.onebit.zoadam import ZeroOneAdam
from pytorch_lightning.strategies import DeepSpeedStrategy

from einops import rearrange, repeat, reduce, pack, unpack
from einops.layers.torch import Rearrange, Reduce

from src.rlhf.utils import masked_mean, gumbel_sample
from src.model import RWKV

# helper functions

def exists(val):
    return val is not None

# pairwise preference ranking loss: -log(sigmoid(prefer_reward - alter_reward))
def loss_function(prefer_reward, alter_reward):
    return -torch.mean(torch.log(torch.sigmoid(prefer_reward - alter_reward)))
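
# Worked example (illustrative numbers only): with prefer_reward = tensor([2.0]) and
# alter_reward = tensor([0.0]) the loss is -log(sigmoid(2.0)) ≈ 0.127; with the rewards
# swapped it grows to ≈ 2.127, so minimizing this loss drives the reward of the preferred
# response above that of the alternative.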

# Reward Model - RWKV with a scalar head

@beartype
class RewardModel(pl.LightningModule):
    def __init__(self, args):
        super().__init__()

        # load the RWKV model
        rwkv = RWKV(args)

        if len(args.load_model) == 0:
            rank_zero_info(f"SFT must load model, please input ")
            exit(1)

        rank_zero_info(f"########## Loading {args.load_model}... ##########")
        try:
            load_dict = torch.load(args.load_model, map_location="cpu")
        except Exception:
            rank_zero_info(f"Bad checkpoint {args.load_model}")
            exit(1)

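        # with load_partial == 1, any keys missing from the checkpoint are filled in with
        # the freshly initialized weights of the model built above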
        if args.load_partial == 1:
            load_keys = load_dict.keys()
            for k in rwkv.state_dict():
                if k not in load_keys:
                    load_dict[k] = rwkv.state_dict()[k]
        rwkv.load_state_dict(load_dict)

        # initialize the reward model from the pretrained weights
        self.rwkv = rwkv
        self.args = args

        # dimension of the output token embeddings
        dim = self.args.n_embd

        # embeddings that mark each input token as prompt or response; trained as model
        # parameters and initialized to zeros (padding_embed is kept frozen)
        self.prompt_embed = nn.Parameter(torch.zeros(dim))
        self.response_embed = nn.Parameter(torch.zeros(dim))
        self.padding_embed = nn.Parameter(torch.zeros(dim), requires_grad=False)

        # reward prediction head
        self.pred_reward = nn.Sequential(
            nn.Linear(dim, 1, bias=False),
            Rearrange('... 1 -> ...')   # drop the trailing singleton dimension
        )
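        # the head maps each token embedding of size dim to a single scalar; the Rearrange
        # drops the trailing singleton dimension, so a (batch, dim) input yields a (batch,) reward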

    def load(self, path):
        path = Path(path)
        assert path.exists()
        self.load_state_dict(torch.load(str(path)))
    
    def configure_optimizers(self):
        args = self.args
        if args.layerwise_lr > 0:
            lr_1x = set()
            lr_2x = set()
            lr_3x = set()
            for n, p in self.named_parameters():
                if "time_mix" in n:
                    if args.my_pile_stage == 2:
                        lr_2x.add(n)
                    else:
                        lr_1x.add(n)
                elif "time_decay" in n:
                    if args.my_pile_stage == 2:
                        lr_3x.add(n)
                    else:
                        lr_2x.add(n)
                elif "time_first" in n:
                    lr_3x.add(n)
                else:
                    lr_1x.add(n)
            lr_1x = sorted(list(lr_1x))
            lr_2x = sorted(list(lr_2x))
            lr_3x = sorted(list(lr_3x))
            # print('1x', lr_1x)
            # print('2x', lr_2x)
            # print('3x', lr_3x)
            param_dict = {n: p for n, p in self.named_parameters()}
            if args.my_pile_stage == 2:
                optim_groups = [
                    {"params": [param_dict[n] for n in lr_1x], "weight_decay": 0.0, "my_lr_scale": 1.0},
                    {"params": [param_dict[n] for n in lr_2x], "weight_decay": 0.0, "my_lr_scale": 5.0},# test: 2e-3 / args.lr_init},
                    {"params": [param_dict[n] for n in lr_3x], "weight_decay": 0.0, "my_lr_scale": 5.0},# test: 3e-3 / args.lr_init},
                ]
            else:
                optim_groups = [
                    {"params": [param_dict[n] for n in lr_1x], "weight_decay": 0.0, "my_lr_scale": 1.0},
                    {"params": [param_dict[n] for n in lr_2x], "weight_decay": 0.0, "my_lr_scale": 2.0},
                    {"params": [param_dict[n] for n in lr_3x], "weight_decay": 0.0, "my_lr_scale": 3.0},
                ]
        else:
            optim_groups = [
                {"params": [p for n, p in self.named_parameters()], "weight_decay": 0.0},
            ]
        
        # if self.deepspeed_offload:
        #     return DeepSpeedCPUAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, adamw_mode=False, weight_decay=0, amsgrad=False)
        # return FusedAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, adam_w_mode=False, weight_decay=0, amsgrad=False)
        # return ZeroOneAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, weight_decay=0, amsgrad=False, cuda_aware=False)
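        # NOTE: the learning rate is hard-coded to 1e-5 here instead of using args.lr_init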
        return torch.optim.Adam(optim_groups, lr=1e-5, betas=(0.9, 0.95))
    
    @property
    def deepspeed_offload(self) -> bool:
        strategy = self.trainer.strategy
        if isinstance(strategy, DeepSpeedStrategy):
            cfg = strategy.config["zero_optimization"]
            return bool(cfg.get("offload_optimizer") or cfg.get("offload_param"))
        return False

    def single_forward(
        self,
        x,
        mask = None,
        prompt_mask = None,
        prompt_lengths = None
    ):

        # prompt_mask and prompt_lengths are mutually exclusive; pass at most one of them
        assert not (exists(prompt_mask) and exists(prompt_lengths))

        # derive prompt mask from prompt lengths
        if exists(prompt_lengths):
            batch, seq_len = x.shape
            arange = torch.arange(seq_len, device=x.device)
            prompt_mask = repeat(arange, 'n -> b n', b = batch) < rearrange(prompt_lengths, 'b -> b 1')
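            # e.g. prompt_lengths = [2, 3] with seq_len = 4 gives
            # prompt_mask = [[True, True, False, False], [True, True, True, False]]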

        # the reward model needs to know which part of the input is prompt and which is response:
        # for every token, an extra embedding is selected from the stacked
        # [prompt_embed, response_embed, padding_embed] according to its value in prompt_mask
        prompt_response_mask_embed = torch.stack([
            self.prompt_embed,
            self.response_embed,
            self.padding_embed
        ]).to(prompt_mask.device)
        extra_embed = None
        if exists(prompt_mask):
            extra_embed = prompt_response_mask_embed[prompt_mask]

        # take the embedding of the last token
        last_token_embeds = self.rwkv(
            x,
            extra_embed=extra_embed,
            rm_train=True
        )[:, -1, :]

        # compute the scalar reward
        reward = self.pred_reward(last_token_embeds)

        return reward
    
    def forward(self, x_p, x_a, m_p, m_a):
        prefer_reward = self.single_forward(x_p, prompt_mask=m_p)
        alter_reward = self.single_forward(x_a, prompt_mask=m_a)

        return prefer_reward, alter_reward
    
    def training_step(self, batch, batch_idx):
        x_p, x_a, m_p, m_a = batch
        prefer_reward, alter_reward = self(
            x_p, x_a, m_p, m_a)
        
        loss = loss_function(prefer_reward, alter_reward)

        return loss
    
    def training_step_end(self, batch_parts):
        all_loss = self.all_gather(batch_parts)
        if self.trainer.is_global_zero:
            self.trainer.my_loss_all = all_loss
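
# Minimal usage sketch (assumptions: `dataloader` yields (x_p, x_a, m_p, m_a) batches of
# token ids and prompt masks as consumed by training_step above, and `args` carries the
# usual RWKV fields such as n_embd, load_model and load_partial):
#
#   model = RewardModel(args)
#   optimizer = model.configure_optimizers()
#   x_p, x_a, m_p, m_a = next(iter(dataloader))
#   prefer_reward, alter_reward = model(x_p, x_a, m_p, m_a)
#   loss = loss_function(prefer_reward, alter_reward)
#   loss.backward()
#   optimizer.step()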