Commit c1da839b authored by u010280923

opt reward model

Parent fa78eeda
@@ -71,10 +71,12 @@ class RewardModel(pl.LightningModule):
         self.padding_embed = nn.Parameter(torch.zeros(dim), requires_grad=False)
         # reward score head
-        self.pred_reward = nn.Sequential(
-            nn.Linear(dim, 1, bias=False),
-            Rearrange('... 1 -> ...') # drop the trailing singleton dimension
-        )
+        self.pred_reward = nn.Linear(dim, 1, bias=False)
+        # self.pred_reward = nn.Sequential(
+        #     nn.Linear(dim, 1, bias=False),
+        #     Rearrange('... 1 -> ...') # drop the trailing singleton dimension
+        # )
 
     def load(self, path):
         path = Path(path)
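For context, a minimal sketch of what this change amounts to (the `dim` value and batch shape below are made up for illustration): the trailing singleton dimension produced by the linear head is now removed with `squeeze(-1)` in the forward pass, instead of by a `Rearrange('... 1 -> ...')` layer inside the head itself:

```python
import torch
import torch.nn as nn

dim = 512                                # hypothetical embedding size
pred_reward = nn.Linear(dim, 1, bias=False)

last_token_embeds = torch.randn(4, dim)  # hypothetical batch of last-token embeddings
reward = pred_reward(last_token_embeds)  # shape (4, 1)
reward = reward.squeeze(-1)              # shape (4,), same result as Rearrange('... 1 -> ...')
```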
@@ -177,6 +179,7 @@ class RewardModel(pl.LightningModule):
         # compute the reward
         reward = self.pred_reward(last_token_embeds)
+        reward = reward.squeeze(-1)
         return reward
@@ -184,7 +187,7 @@ class RewardModel(pl.LightningModule):
         # The forward pass runs the model twice, so the parameters of one pass
         # must be frozen during backpropagation; otherwise the gradient is computed
         # twice, which raises an error under the DeepSpeed framework.
         # Error message: Gradient computed twice for this partition.
         with torch.enable_grad():
             prefer_reward = self.single_forward(x_p, prompt_mask=m_p)
         with torch.no_grad():
...
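A self-contained sketch of the pattern this hunk relies on, assuming `single_forward` returns one scalar reward per sequence; the second call and the names `x_a`, `m_a`, and `alter_reward` are assumptions, since the diff is truncated at this point:

```python
import torch

def pairwise_rewards(model, x_p, m_p, x_a, m_a):
    # Score the preferred sequence with gradients enabled...
    with torch.enable_grad():
        prefer_reward = model.single_forward(x_p, prompt_mask=m_p)
    # ...and the alternative sequence under no_grad, so the shared parameters
    # receive gradients from only one of the two passes. Otherwise DeepSpeed's
    # partitioned optimizer fails with
    # "Gradient computed twice for this partition".
    with torch.no_grad():
        alter_reward = model.single_forward(x_a, prompt_mask=m_a)
    return prefer_reward, alter_reward
```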