From 95ea4cd1a4a9a29bbece43ba36f48bda54032e8c Mon Sep 17 00:00:00 2001
From: u010280923
Date: Mon, 13 Mar 2023 13:48:43 +0800
Subject: [PATCH] opt reward model

---
 src/rlhf/reward.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/rlhf/reward.py b/src/rlhf/reward.py
index 58a8983..2c74992 100644
--- a/src/rlhf/reward.py
+++ b/src/rlhf/reward.py
@@ -141,7 +141,7 @@ class RewardModel(pl.LightningModule):
             return bool(cfg.get("offload_optimizer") or cfg.get("offload_param"))
         return False
 
-    def single_forward(
+    def forward(
         self,
         x,
         mask = None,
@@ -183,21 +183,21 @@ class RewardModel(pl.LightningModule):
 
         return reward
 
-    def forward(self, x_p, x_a, m_p, m_a):
+    def train_forward(self, x_p, x_a, m_p, m_a):
         # The forward pass runs the reward model twice, so the parameters must be
         # frozen for one of the two passes during backpropagation; otherwise the
         # gradient is computed twice and DeepSpeed raises the error
         # "Gradient computed twice for this partition."
         with torch.enable_grad():
-            prefer_reward = self.single_forward(x_p, prompt_mask=m_p)
+            prefer_reward = self.forward(x_p, prompt_mask=m_p)
 
         with torch.no_grad():
-            alter_reward = self.single_forward(x_a, prompt_mask=m_a)
+            alter_reward = self.forward(x_a, prompt_mask=m_a)
 
         return prefer_reward, alter_reward
 
     def training_step(self, batch, batch_idx):
         x_p, x_a, m_p, m_a = batch
-        prefer_reward, alter_reward = self(
+        prefer_reward, alter_reward = self.train_forward(
            x_p, x_a, m_p, m_a)
 
         loss = loss_function(prefer_reward, alter_reward)
-- 
GitLab
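
For context, a minimal standalone sketch (not part of the patch) of the pattern the renamed train_forward implements: the reward model is run once for each member of a preference pair, with gradients tracked only for the preferred sequence so DeepSpeed does not see the shared parameters' gradient twice. The pairwise log-sigmoid loss below is an assumption for illustration; the patch only calls an unspecified loss_function(prefer_reward, alter_reward), and the names reward_model and pairwise_reward_loss are hypothetical.

import torch
import torch.nn.functional as F

def pairwise_reward_loss(reward_model, x_p, x_a, m_p, m_a):
    # Gradient-tracked pass for the preferred sequence.
    with torch.enable_grad():
        prefer_reward = reward_model(x_p, prompt_mask=m_p)

    # Frozen pass for the alternative sequence, so each parameter partition
    # contributes gradients only once per step (avoids DeepSpeed's
    # "Gradient computed twice for this partition" error).
    with torch.no_grad():
        alter_reward = reward_model(x_a, prompt_mask=m_a)

    # Assumed pairwise ranking loss: -log(sigmoid(r_preferred - r_alternative)).
    return -F.logsigmoid(prefer_reward - alter_reward).mean()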