未验证 提交 efa8aded 编写于 作者: A Arash Bakhtiari 提交者: GitHub

Fix the residual add mp scaling for GPTNeoX (#2310)

上级 a691ec60
......@@ -321,10 +321,10 @@ __global__ void gptj_residual_add(float* input,
if (attnbias) {
float4 attn_bias = attnbias_cast[offset % intermediate_size];
- data.x += attn_bias.x * mp_scale;
- data.y += attn_bias.y * mp_scale;
- data.z += attn_bias.z * mp_scale;
- data.w += attn_bias.w * mp_scale;
+ data.x += attn_bias.x;
+ data.y += attn_bias.y;
+ data.z += attn_bias.z;
+ data.w += attn_bias.w;
}
data.x = out.x + res_vec.x + (data.x + bias_data.x) * mp_scale;
data.y = out.y + res_vec.y + (data.y + bias_data.y) * mp_scale;
......@@ -383,10 +383,10 @@ __global__ void gptj_residual_add(__half* input,
__half2* attnbias_half = reinterpret_cast<__half2*>(&attn_bias_vec);
float2 attn_low_bias = __half22float2(attnbias_half[0]);
float2 attn_high_bias = __half22float2(attnbias_half[1]);
- low_data.x += attn_low_bias.x * mp_scale;
- low_data.y += attn_low_bias.y * mp_scale;
- high_data.x += attn_high_bias.x * mp_scale;
- high_data.y += attn_high_bias.y * mp_scale;
+ low_data.x += attn_low_bias.x;
+ low_data.y += attn_low_bias.y;
+ high_data.x += attn_high_bias.x;
+ high_data.y += attn_high_bias.y;
}
low_data.x = low_res.x + low_out.x + (low_data.x + low_bias.x) * mp_scale;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册