diff --git a/src/dataset.py b/src/dataset.py
index e65c312c3f2b3ba12533194d55f7e4446f1396ce..326a6e770fa6418e2ddb82004ceb6d7777662c80 100644
--- a/src/dataset.py
+++ b/src/dataset.py
@@ -283,7 +283,7 @@ class RMDataset(Dataset):
         preferred_idx = self.tokenizer.tokenizer.encode(preferred)
         alternate_idx = self.tokenizer.tokenizer.encode(alternate)
 
-        prompt_mask = [self.padding_mask_id] * len(prompt_idx)
+        prompt_mask = [self.prompt_mask_id] * len(prompt_idx)
         preferred_mask = [self.response_mask_id] * len(preferred_idx)
         alternate_mask = [self.response_mask_id] * len(alternate_idx)
 
diff --git a/src/rlhf/reward.py b/src/rlhf/reward.py
index b9797d1a3e0d9b89ff01ebb0cae10eb206bb0de4..b0a3824dbefa3140deb3aa4ffe43fa0978b657fa 100644
--- a/src/rlhf/reward.py
+++ b/src/rlhf/reward.py
@@ -66,9 +66,9 @@ class RewardModel(pl.LightningModule):
         dim = self.args.n_embd
 
         # Distinguish the prompt from the response in the input; trained as model parameters, initialized to all zeros
-        self.prompt_embed = nn.Parameter(torch.zeros(1, 1, dim)).to()
-        self.response_embed = nn.Parameter(torch.zeros(1, 1, dim))
-        self.padding_embed = nn.Parameter(torch.zeros(1, 1, dim), requires_grad=False)
+        self.prompt_embed = nn.Parameter(torch.zeros(dim))
+        self.response_embed = nn.Parameter(torch.zeros(dim))
+        self.padding_embed = nn.Parameter(torch.zeros(dim), requires_grad=False)
 
         # reward score computation
         self.pred_reward = nn.Sequential(
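The second hunk flattens the prompt/response/padding embeddings from shape (1, 1, dim) to (dim) and drops the stray `.to()` call on `prompt_embed`. Below is a minimal sketch of why the flat shape is convenient: the three vectors stack into a (3, dim) lookup table that can be indexed directly with per-token mask ids. The id values 0/1/2, the batch shapes, and all variable names here are illustrative assumptions, not taken from the repository.

```python
# Sketch only: per-token segment embeddings selected by mask id (assumed ids 0/1/2).
import torch
import torch.nn as nn

dim = 8
prompt_mask_id, response_mask_id, padding_mask_id = 0, 1, 2  # hypothetical ids

prompt_embed = nn.Parameter(torch.zeros(dim))
response_embed = nn.Parameter(torch.zeros(dim))
padding_embed = nn.Parameter(torch.zeros(dim), requires_grad=False)

# (3, dim) table ordered by mask id
segment_table = torch.stack([prompt_embed, response_embed, padding_embed])

# fake batch: 2 sequences of length 5, already-embedded tokens plus per-token mask ids
token_embeds = torch.randn(2, 5, dim)
mask_ids = torch.tensor([[0, 0, 1, 1, 2],
                         [0, 1, 1, 2, 2]])

# broadcasted lookup: (2, 5) ids -> (2, 5, dim) segment embeddings added to the tokens
segment_embeds = segment_table[mask_ids]
hidden = token_embeds + segment_embeds
print(hidden.shape)  # torch.Size([2, 5, 8])
```

With the old (1, 1, dim) shape the same lookup would yield a (2, 5, 1, 1, dim) tensor and need an extra reshape; the flat vectors index and broadcast directly.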