opt reward model

fa78eeda · u010280923 · 68662e23 · fa78eeda
隐藏空白更改
内联 | 并排

Showing 1 changed file with 4 additions and 0 deletions

src/rlhf/reward.py src/rlhf/reward.py +4 -0

未找到文件。
--- a/src/rlhf/reward.py
+++ b/src/rlhf/reward.py
@@ -181,6 +181,10 @@ class RewardModel(pl.LightningModule):
        return reward
    
    def forward(self, x_p, x_a, m_p, m_a):
+        # 因为前向传播的时候，需要过两次模型。所以反馈的时候需要冻结其中一次的参数
+        # 不然梯度会被计算两次，在包含 deepspeed 框架下会报错
+        # 报错信息：Gradient computed twice for this partition.
+        
        with torch.enable_grad():
            prefer_reward = self.single_forward(x_p, prompt_mask=m_p)
        with torch.no_grad():