From c1da839bededeceea65d6c24892606a5f966c1db Mon Sep 17 00:00:00 2001
From: u010280923
Date: Mon, 13 Mar 2023 11:13:51 +0800
Subject: [PATCH] opt reward model

---
 src/rlhf/reward.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/rlhf/reward.py b/src/rlhf/reward.py
index fd12d9d..58a8983 100644
--- a/src/rlhf/reward.py
+++ b/src/rlhf/reward.py
@@ -71,10 +71,12 @@ class RewardModel(pl.LightningModule):
         self.padding_embed = nn.Parameter(torch.zeros(dim), requires_grad=False)
 
         # reward score head
-        self.pred_reward = nn.Sequential(
-            nn.Linear(dim, 1, bias=False),
-            Rearrange('... 1 -> ...')  # drop the trailing singleton dim
-        )
+        self.pred_reward = nn.Linear(dim, 1, bias=False)
+
+        # self.pred_reward = nn.Sequential(
+        #     nn.Linear(dim, 1, bias=False),
+        #     Rearrange('... 1 -> ...')  # drop the trailing singleton dim
+        # )
 
     def load(self, path):
         path = Path(path)
@@ -177,6 +179,7 @@ class RewardModel(pl.LightningModule):
 
         # compute the reward
         reward = self.pred_reward(last_token_embeds)
+        reward = reward.squeeze(-1)
 
         return reward
 
@@ -184,7 +187,7 @@ class RewardModel(pl.LightningModule):
         # Because the forward pass runs the model twice, the parameters of one
         # pass must be frozen during backprop; otherwise the gradients would be
        # computed twice, and deepspeed raises: Gradient computed twice for this partition.
-        
+
         with torch.enable_grad():
             prefer_reward = self.single_forward(x_p, prompt_mask=m_p)
         with torch.no_grad():
-- 
GitLab
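
Note (not part of the patch): below is a minimal, standalone sketch of the change to the reward head. It assumes dim is the RewardModel embedding size and uses a hypothetical batch tensor x in place of last_token_embeds; none of these names come from src/rlhf/reward.py itself. It only illustrates that the new nn.Linear head followed by squeeze(-1) gives the same result as the old nn.Sequential head that used einops Rearrange.

    # Hypothetical sketch of the equivalence; dim, old_head, new_head, x are
    # illustration-only names, not code from the repository.
    import torch
    import torch.nn as nn
    from einops.layers.torch import Rearrange

    dim = 768  # assumed embedding size

    old_head = nn.Sequential(
        nn.Linear(dim, 1, bias=False),
        Rearrange('... 1 -> ...'),          # drop the trailing singleton dim
    )
    new_head = nn.Linear(dim, 1, bias=False)
    new_head.weight.data.copy_(old_head[0].weight.data)  # share weights for the comparison

    x = torch.randn(4, dim)                 # stand-in for last_token_embeds
    assert torch.allclose(old_head(x), new_head(x).squeeze(-1))  # both yield shape (4,)

Either form produces a reward of shape (batch,); the patch just moves the dimension squeeze out of the module definition and into the forward pass.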