diff --git a/src/rlhf/reward.py b/src/rlhf/reward.py
index fd12d9dd563ff645f2a6a28169b848b57703236d..58a89834bbb2c92d07e763b552561c58594f0b52 100644
--- a/src/rlhf/reward.py
+++ b/src/rlhf/reward.py
@@ -71,10 +71,12 @@ class RewardModel(pl.LightningModule):
         self.padding_embed = nn.Parameter(torch.zeros(dim), requires_grad=False)
 
         # reward score computation
-        self.pred_reward = nn.Sequential(
-            nn.Linear(dim, 1, bias=False),
-            Rearrange('... 1 -> ...')  # drop the trailing singleton dim
-        )
+        self.pred_reward = nn.Linear(dim, 1, bias=False)
+
+        # self.pred_reward = nn.Sequential(
+        #     nn.Linear(dim, 1, bias=False),
+        #     Rearrange('... 1 -> ...')  # drop the trailing singleton dim
+        # )
 
     def load(self, path):
         path = Path(path)
@@ -177,6 +179,7 @@ class RewardModel(pl.LightningModule):
 
         # compute the reward
         reward = self.pred_reward(last_token_embeds)
+        reward = reward.squeeze(-1)
 
         return reward
 
@@ -184,7 +187,7 @@ class RewardModel(pl.LightningModule):
         # The forward pass runs the model twice, so one pass must have its parameters frozen during backprop.
         # Otherwise the gradient is computed twice, which raises an error under the deepspeed framework.
         # Error message: Gradient computed twice for this partition.
-
+
         with torch.enable_grad():
             prefer_reward = self.single_forward(x_p, prompt_mask=m_p)
         with torch.no_grad():
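
For context, here is a minimal sketch (not part of the patch; `dim` and the batch size are hypothetical) of why the new explicit `reward.squeeze(-1)` in `single_forward` reproduces the behavior of the removed `Rearrange('... 1 -> ...')` layer: `nn.Linear(dim, 1, bias=False)` leaves a trailing singleton dimension that has to be dropped to get one scalar reward per sequence.

```python
# Sketch only: demonstrates the shape handling, not part of reward.py.
import torch
import torch.nn as nn

dim = 16                                  # hypothetical embedding size
last_token_embeds = torch.randn(4, dim)   # hypothetical batch of 4 last-token embeddings

pred_reward = nn.Linear(dim, 1, bias=False)

reward = pred_reward(last_token_embeds)   # shape (4, 1): Linear keeps a trailing singleton dim
reward = reward.squeeze(-1)               # shape (4,): same result as Rearrange('... 1 -> ...')

assert reward.shape == (4,)
```

Keeping `pred_reward` as a plain `nn.Linear` and squeezing at the call site makes the shape change explicit where the reward is computed, and drops the head's dependency on einops' `Rearrange`.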