diff --git a/src/rlhf/reward.py b/src/rlhf/reward.py
index 58a89834bbb2c92d07e763b552561c58594f0b52..2c74992d231dad7d47e499630e1b3936527804a7 100644
--- a/src/rlhf/reward.py
+++ b/src/rlhf/reward.py
@@ -141,7 +141,7 @@ class RewardModel(pl.LightningModule):
             return bool(cfg.get("offload_optimizer") or cfg.get("offload_param"))
         return False

-    def single_forward(
+    def forward(
         self,
         x,
         mask = None,
@@ -183,21 +183,21 @@ class RewardModel(pl.LightningModule):

         return reward

-    def forward(self, x_p, x_a, m_p, m_a):
+    def train_forward(self, x_p, x_a, m_p, m_a):
         # The forward pass runs the model twice, so the parameters of one pass must be frozen
         # during backpropagation; otherwise the gradient is computed twice, which errors under deepspeed.
         # Error message: Gradient computed twice for this partition.
         with torch.enable_grad():
-            prefer_reward = self.single_forward(x_p, prompt_mask=m_p)
+            prefer_reward = self.forward(x_p, prompt_mask=m_p)
         with torch.no_grad():
-            alter_reward = self.single_forward(x_a, prompt_mask=m_a)
+            alter_reward = self.forward(x_a, prompt_mask=m_a)

         return prefer_reward, alter_reward

     def training_step(self, batch, batch_idx):
         x_p, x_a, m_p, m_a = batch

-        prefer_reward, alter_reward = self(
+        prefer_reward, alter_reward = self.train_forward(
             x_p, x_a, m_p, m_a)

         loss = loss_function(prefer_reward, alter_reward)
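
Note: the diff calls loss_function(prefer_reward, alter_reward) but its body is not shown here. As a point of reference only, below is a minimal sketch of a common pairwise reward-model loss (Bradley-Terry style log-sigmoid), assuming both rewards are per-sample scalar tensors; it is not necessarily the implementation used in this repository.

# Sketch only: the real loss_function is defined elsewhere in the repo.
import torch
import torch.nn.functional as F

def loss_function(prefer_reward: torch.Tensor, alter_reward: torch.Tensor) -> torch.Tensor:
    # -log(sigmoid(r_preferred - r_alternative)), averaged over the batch,
    # pushes the preferred completion's reward above the alternative's.
    return -F.logsigmoid(prefer_reward - alter_reward).mean()

Because alter_reward is produced under torch.no_grad() in train_forward, gradients flow only through the preferred-sequence pass, which is what avoids the "Gradient computed twice for this partition" error under deepspeed.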