From 95ea4cd1a4a9a29bbece43ba36f48bda54032e8c Mon Sep 17 00:00:00 2001
From: u010280923
Date: Mon, 13 Mar 2023 13:48:43 +0800
Subject: [PATCH] opt reward model

---
 src/rlhf/reward.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/rlhf/reward.py b/src/rlhf/reward.py
index 58a8983..2c74992 100644
--- a/src/rlhf/reward.py
+++ b/src/rlhf/reward.py
@@ -141,7 +141,7 @@ class RewardModel(pl.LightningModule):
             return bool(cfg.get("offload_optimizer") or cfg.get("offload_param"))
         return False
 
-    def single_forward(
+    def forward(
         self,
         x,
         mask = None,
@@ -183,21 +183,21 @@ class RewardModel(pl.LightningModule):
 
         return reward
 
-    def forward(self, x_p, x_a, m_p, m_a):
+    def train_forward(self, x_p, x_a, m_p, m_a):
         # The forward pass runs the reward model twice, so the parameters must be
         # frozen for one of the two passes during backpropagation; otherwise the
         # gradient is computed twice and DeepSpeed raises the error
         # "Gradient computed twice for this partition."
         with torch.enable_grad():
-            prefer_reward = self.single_forward(x_p, prompt_mask=m_p)
+            prefer_reward = self.forward(x_p, prompt_mask=m_p)
 
         with torch.no_grad():
-            alter_reward = self.single_forward(x_a, prompt_mask=m_a)
+            alter_reward = self.forward(x_a, prompt_mask=m_a)
 
         return prefer_reward, alter_reward
 
     def training_step(self, batch, batch_idx):
         x_p, x_a, m_p, m_a = batch
-        prefer_reward, alter_reward = self(
+        prefer_reward, alter_reward = self.train_forward(
            x_p, x_a, m_p, m_a)
 
         loss = loss_function(prefer_reward, alter_reward)
-- 
GitLab
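
For context, a minimal standalone sketch (not part of the patch) of the pattern the renamed train_forward implements: the reward model is run once for each member of a preference pair, with gradients tracked only for the preferred sequence so DeepSpeed does not see the shared parameters' gradient twice. The pairwise log-sigmoid loss below is an assumption for illustration; the patch only calls an unspecified loss_function(prefer_reward, alter_reward), and the names reward_model and pairwise_reward_loss are hypothetical.

import torch
import torch.nn.functional as F

def pairwise_reward_loss(reward_model, x_p, x_a, m_p, m_a):
    # Gradient-tracked pass for the preferred sequence.
    with torch.enable_grad():
        prefer_reward = reward_model(x_p, prompt_mask=m_p)

    # Frozen pass for the alternative sequence, so each parameter partition
    # contributes gradients only once per step (avoids DeepSpeed's
    # "Gradient computed twice for this partition" error).
    with torch.no_grad():
        alter_reward = reward_model(x_a, prompt_mask=m_a)

    # Assumed pairwise ranking loss: -log(sigmoid(r_preferred - r_alternative)).
    return -F.logsigmoid(prefer_reward - alter_reward).mean()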