diff --git a/src/rlhf/reward.py b/src/rlhf/reward.py
index cd03bef515333cf505158b39e8ae793ca70c3552..fd12d9dd563ff645f2a6a28169b848b57703236d 100644
--- a/src/rlhf/reward.py
+++ b/src/rlhf/reward.py
@@ -181,6 +181,10 @@ class RewardModel(pl.LightningModule):
         return reward
 
     def forward(self, x_p, x_a, m_p, m_a):
+        # The forward pass runs the model twice, so the parameters of one
+        # pass must be frozen during backprop; otherwise gradients are
+        # computed twice and DeepSpeed raises:
+        # "Gradient computed twice for this partition."
+
         with torch.enable_grad():
             prefer_reward = self.single_forward(x_p, prompt_mask=m_p)
         with torch.no_grad():
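
For context, here is a minimal, self-contained sketch of the two-pass pattern this diff relies on: a pairwise reward model that scores the preferred sequence with gradients enabled and the alternative sequence under `torch.no_grad()`, so a sharded backend such as DeepSpeed ZeRO sees only one gradient contribution per parameter partition. `TinyRewardModel` and its toy embedding/pooling scorer are hypothetical stand-ins; only the `forward`/`single_forward` structure and the `enable_grad`/`no_grad` split mirror the actual `RewardModel`.

```python
# Sketch only: TinyRewardModel is a hypothetical stand-in for RewardModel.
import torch
import torch.nn as nn
import torch.nn.functional as F


class TinyRewardModel(nn.Module):
    def __init__(self, vocab_size: int = 100, dim: int = 16):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, dim)
        self.head = nn.Linear(dim, 1)

    def single_forward(self, x, prompt_mask=None):
        # Toy scorer: mean-pool token embeddings, project to a scalar reward.
        return self.head(self.embed(x).mean(dim=1)).squeeze(-1)

    def forward(self, x_p, x_a, m_p, m_a):
        # Score the preferred sequence with gradients enabled...
        with torch.enable_grad():
            prefer_reward = self.single_forward(x_p, prompt_mask=m_p)
        # ...and the alternative without, so each parameter partition
        # receives a single gradient contribution during backprop.
        with torch.no_grad():
            alter_reward = self.single_forward(x_a, prompt_mask=m_a)
        return prefer_reward, alter_reward


model = TinyRewardModel()
x_p = torch.randint(0, 100, (2, 8))  # preferred token ids
x_a = torch.randint(0, 100, (2, 8))  # alternative token ids
prefer, alter = model(x_p, x_a, None, None)
# Pairwise ranking loss; only the preferred pass carries gradients.
loss = -F.logsigmoid(prefer - alter).mean()
loss.backward()
```

Note the trade-off this workaround implies: with one branch under `no_grad`, the pairwise loss backpropagates only through the preferred pass, rather than through both sides of a standard Bradley-Terry objective. The diff's own comment frames this as the price of avoiding the "Gradient computed twice for this partition" error under DeepSpeed.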