Commit 95ea4cd1 authored by u010280923

opt reward model

Parent c1da839b
@@ -141,7 +141,7 @@ class RewardModel(pl.LightningModule):
             return bool(cfg.get("offload_optimizer") or cfg.get("offload_param"))
         return False
 
-    def single_forward(
+    def forward(
         self,
         x,
         mask = None,
@@ -183,21 +183,21 @@ class RewardModel(pl.LightningModule):
         return reward
 
-    def forward(self, x_p, x_a, m_p, m_a):
+    def train_forward(self, x_p, x_a, m_p, m_a):
         # The forward pass runs the model twice, so the parameters of one pass must be
         # frozen during backpropagation; otherwise the gradient is computed twice, which
         # raises an error under DeepSpeed: "Gradient computed twice for this partition."
         with torch.enable_grad():
-            prefer_reward = self.single_forward(x_p, prompt_mask=m_p)
+            prefer_reward = self.forward(x_p, prompt_mask=m_p)
         with torch.no_grad():
-            alter_reward = self.single_forward(x_a, prompt_mask=m_a)
+            alter_reward = self.forward(x_a, prompt_mask=m_a)
         return prefer_reward, alter_reward
 
     def training_step(self, batch, batch_idx):
         x_p, x_a, m_p, m_a = batch
-        prefer_reward, alter_reward = self(
+        prefer_reward, alter_reward = self.train_forward(
             x_p, x_a, m_p, m_a)
         loss = loss_function(prefer_reward, alter_reward)
...
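
Below is a minimal, self-contained sketch of the pattern this commit introduces, written in plain PyTorch rather than the project's actual RWKV / LightningModule code: the single-sequence scorer becomes forward(), the paired scorer becomes train_forward(), and only the preferred pass keeps gradients. The tiny embedding backbone (TinyRewardModel) and the pairwise loss are assumptions for illustration only; loss_function is not shown in this diff.

# Sketch only; the real backbone and loss_function are assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F


def loss_function(prefer_reward, alter_reward):
    # Assumed pairwise ranking loss (Bradley-Terry style): push the reward of
    # the preferred sample above that of the alternative sample.
    return -F.logsigmoid(prefer_reward - alter_reward).mean()


class TinyRewardModel(nn.Module):
    def __init__(self, vocab_size=100, dim=32):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, dim)
        self.head = nn.Linear(dim, 1)

    def forward(self, x, prompt_mask=None):
        # Scores one token sequence (the method renamed from single_forward).
        h = self.embed(x).mean(dim=1)       # (batch, dim)
        return self.head(h).squeeze(-1)     # (batch,)

    def train_forward(self, x_p, x_a, m_p, m_a):
        # The model runs twice per step. Gradients flow only through the
        # preferred pass; the alternative pass runs under no_grad so the same
        # parameters are not backpropagated twice, which DeepSpeed reports as
        # "Gradient computed twice for this partition."
        with torch.enable_grad():
            prefer_reward = self.forward(x_p, prompt_mask=m_p)
        with torch.no_grad():
            alter_reward = self.forward(x_a, prompt_mask=m_a)
        return prefer_reward, alter_reward

    def training_step(self, batch, batch_idx):
        x_p, x_a, m_p, m_a = batch
        prefer_reward, alter_reward = self.train_forward(x_p, x_a, m_p, m_a)
        return loss_function(prefer_reward, alter_reward)


# Dummy usage: batches of preferred / alternative token ids with masks.
model = TinyRewardModel()
x_p = torch.randint(0, 100, (4, 16))
x_a = torch.randint(0, 100, (4, 16))
m_p = torch.ones_like(x_p, dtype=torch.bool)
m_a = torch.ones_like(x_a, dtype=torch.bool)
loss = model.training_step((x_p, x_a, m_p, m_a), 0)
loss.backward()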