From fa78eedaf1c929eca34f88412e481ea17c032981 Mon Sep 17 00:00:00 2001
From: u010280923
Date: Mon, 13 Mar 2023 10:41:42 +0800
Subject: [PATCH] opt reward model

---
 src/rlhf/reward.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/rlhf/reward.py b/src/rlhf/reward.py
index cd03bef..fd12d9d 100644
--- a/src/rlhf/reward.py
+++ b/src/rlhf/reward.py
@@ -181,6 +181,10 @@ class RewardModel(pl.LightningModule):
         return reward
 
     def forward(self, x_p, x_a, m_p, m_a):
+        # The forward pass runs the model twice, so the parameters of one of the
+        # two passes must be frozen during backpropagation. Otherwise the gradient
+        # would be computed twice, which raises an error under the DeepSpeed framework:
+        # "Gradient computed twice for this partition."
         with torch.enable_grad():
             prefer_reward = self.single_forward(x_p, prompt_mask=m_p)
         with torch.no_grad():
-- 
GitLab
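
Editor's note: below is a minimal, self-contained sketch of the pattern the patched forward() relies on, shown outside the patch itself. It keeps the names x_p, x_a, m_p, m_a, and single_forward from the diff, but the PairwiseRewardModel class, its embedding-based scorer, and the log-sigmoid ranking loss are stand-ins assumed for illustration; the real RewardModel in src/rlhf/reward.py wraps a language-model backbone under PyTorch Lightning and DeepSpeed. The point being demonstrated is that only the preferred-sequence pass builds a graph, so each parameter partition receives exactly one gradient per step.

import torch
import torch.nn as nn

class PairwiseRewardModel(nn.Module):
    """Sketch: score a preferred and an alternative sequence with the same
    weights, keeping gradients for only one of the two passes."""

    def __init__(self, vocab_size: int = 100, hidden_dim: int = 16):
        super().__init__()
        # Stand-in scorer; the actual RewardModel uses a transformer backbone.
        self.embed = nn.Embedding(vocab_size, hidden_dim)
        self.score = nn.Linear(hidden_dim, 1)

    def single_forward(self, x, prompt_mask=None):
        # Mean-pool token embeddings (mask handling omitted for brevity) and score.
        h = self.embed(x).mean(dim=1)
        return self.score(h).squeeze(-1)

    def forward(self, x_p, x_a, m_p, m_a):
        # Gradients flow only through the preferred-sequence pass ...
        with torch.enable_grad():
            prefer_reward = self.single_forward(x_p, prompt_mask=m_p)
        # ... while the alternative pass is detached from the graph, so the
        # shared parameters are not given a second gradient by the same step.
        with torch.no_grad():
            alternate_reward = self.single_forward(x_a, prompt_mask=m_a)
        return prefer_reward, alternate_reward


if __name__ == "__main__":
    model = PairwiseRewardModel()
    x_p = torch.randint(0, 100, (2, 8))   # preferred token ids
    x_a = torch.randint(0, 100, (2, 8))   # alternative token ids
    r_p, r_a = model(x_p, x_a, None, None)
    # Pairwise ranking loss; only the enable_grad() branch contributes gradients.
    loss = -torch.nn.functional.logsigmoid(r_p - r_a).mean()
    loss.backward()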