Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
DeepSpeed
提交
79692af1
D
DeepSpeed
项目概览
Greenplum
/
DeepSpeed
上一次同步 大约 1 年
通知
10
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeed
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
79692af1
编写于
9月 27, 2022
作者:
A
Arash Bakhtiari
提交者:
GitHub
9月 27, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Extend residual_add kernel tests to cover pre_attn_norm (#2354)
Co-authored-by:
N
Jeff Rasley
<
jerasley@microsoft.com
>
上级
b450da4f
变更
1
显示空白变更内容
内联
并排
Showing
1 changed file
with
58 additions
and
38 deletions
+58
-38
tests/unit/ops/transformer/inference/test_residual_add.py
tests/unit/ops/transformer/inference/test_residual_add.py
+58
-38
未找到文件。
tests/unit/ops/transformer/inference/test_residual_add.py
浏览文件 @
79692af1
...
...
@@ -23,37 +23,60 @@ def inference_module():
return
InferenceBuilder
().
load
()
def res_add_bias_ref(hidden_state,
                     residual,
                     attn_output,
                     attn_bias,
                     final_bias,
                     mp_size=1,
                     pre_attn_norm=True):
    """Reference residual-add for the mlp-after-attention path.

    Computes the expected result of the fused residual-add kernel so the
    test can compare the CUDA output against plain arithmetic.

    Args:
        hidden_state: MLP output to accumulate into (tensor or scalar;
            `+=` mutates tensors in place).
        residual: residual connection input.
        attn_output: attention block output.
        attn_bias: attention output bias.
        final_bias: final (MLP) bias.
        mp_size: model-parallel world size; sharded terms are divided by it.
        pre_attn_norm: when True, all non-hidden terms are summed first and
            scaled by ``mp_size``; when False, only residual and final bias
            are added, unscaled.

    Returns:
        The accumulated hidden state.
    """
    if pre_attn_norm:
        # With a pre-attention norm every contribution other than
        # hidden_state comes from a model-parallel shard, so the whole
        # sum is scaled by mp_size.
        hidden_state += (residual + final_bias + attn_output + attn_bias) / mp_size
    else:
        hidden_state += residual + final_bias
    return hidden_state
def res_add_bias_ref_gptj(hidden_state,
                          residual,
                          attn_output,
                          attn_bias,
                          final_bias,
                          add_attn_bias,
                          mp_size):
    """Reference residual-add for the GPT-J path (mlp_after_attn == False).

    Args:
        hidden_state: MLP output to accumulate into (tensor or scalar;
            `+=` mutates tensors in place).
        residual: residual connection input.
        attn_output: attention block output (added unscaled).
        attn_bias: attention output bias, added only when ``add_attn_bias``
            is True, scaled by ``mp_size``.
        final_bias: final (MLP) bias.
        add_attn_bias: whether the attention bias is folded in here or was
            already applied by the kernel under test.
        mp_size: model-parallel world size used to scale sharded terms.

    Returns:
        The accumulated hidden state.
    """
    # residual and final bias are replicated across model-parallel ranks,
    # so their sum is scaled; attn_output is already per-rank.
    hidden_state += attn_output + (residual + final_bias) / mp_size
    if add_attn_bias:
        hidden_state += attn_bias / mp_size
    return hidden_state
def run_residual_add_reference(hidden_state,
                               residual,
                               attn_output,
                               attn_bias,
                               final_bias,
                               mlp_after_attn,
                               add_attn_bias,
                               mp_size,
                               pre_attn_norm):
    """Dispatch to the reference implementation matching the kernel variant.

    ``mlp_after_attn`` selects between the standard path
    (:func:`res_add_bias_ref`, which honors ``pre_attn_norm``) and the
    GPT-J path (:func:`res_add_bias_ref_gptj`, which honors
    ``add_attn_bias``). Each path ignores the other path's flag, mirroring
    the two distinct CUDA kernels under test.
    """
    if mlp_after_attn:
        return res_add_bias_ref(hidden_state, residual, attn_output,
                                attn_bias, final_bias, mp_size,
                                pre_attn_norm)
    else:
        return res_add_bias_ref_gptj(hidden_state, residual, attn_output,
                                     attn_bias, final_bias, add_attn_bias,
                                     mp_size)
@
pytest
.
mark
.
inference
@
pytest
.
mark
.
parametrize
(
"batch"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"sequence"
,
[
1
,
128
,
255
])
...
...
@@ -62,7 +85,7 @@ def run_residual_add_reference(hidden_state,
@
pytest
.
mark
.
parametrize
(
"mlp_after_attn"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"add_bias"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"mp_size"
,
[
1
,
2
])
# @pytest.mark.parametrize("preln", [True]) # TODO: add support for preln
@
pytest
.
mark
.
parametrize
(
"pre_attn_norm"
,
[
True
,
False
])
def
test_residual_add
(
inference_module
,
batch
,
sequence
,
...
...
@@ -70,38 +93,35 @@ def test_residual_add(inference_module,
dtype
,
mlp_after_attn
,
add_bias
,
mp_size
):
preln
=
True
mp_size
,
pre_attn_norm
):
ds_out
=
torch
.
randn
((
batch
,
sequence
,
hidden_dim
),
dtype
=
dtype
,
device
=
'cuda'
)
residual
=
torch
.
randn
((
batch
,
sequence
,
hidden_dim
),
dtype
=
dtype
,
device
=
'cuda'
)
attention_output
=
torch
.
randn
((
batch
,
sequence
,
hidden_dim
),
dtype
=
dtype
,
device
=
'cuda'
)
attn_output
=
torch
.
randn
((
batch
,
sequence
,
hidden_dim
),
dtype
=
dtype
,
device
=
'cuda'
)
final_bias
=
torch
.
randn
((
hidden_dim
),
dtype
=
dtype
,
device
=
'cuda'
)
att
ention_output
_bias
=
torch
.
randn
((
hidden_dim
),
dtype
=
dtype
,
device
=
'cuda'
)
att
n
_bias
=
torch
.
randn
((
hidden_dim
),
dtype
=
dtype
,
device
=
'cuda'
)
ref_out
=
ds_out
.
clone
()
ref_out
=
run_residual_add_reference
(
ref_out
,
residual
,
att
entio
n_output
,
att
ention_output
_bias
,
attn_output
,
att
n
_bias
,
final_bias
,
mlp_after_attn
,
add_bias
,
mp_size
)
mp_size
,
pre_attn_norm
)
res_add_args
=
[
ds_out
,
residual
,
att
entio
n_output
,
att
ention_output
_bias
,
attn_output
,
att
n
_bias
,
final_bias
,
mp_size
,
mlp_after_attn
,
add_bias
,
pre
ln
pre
_attn_norm
]
if
dtype
==
torch
.
float16
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录