Unverified commit cf638be9
Authored on Sep 14, 2022 by Jeff Rasley; committed via GitHub on Sep 14, 2022
only override forward if using cuda-graph (#2291)
Parent: 95d11517
Showing 3 changed files with 16 additions and 50 deletions (+16 -50):
.github/workflows/nv-inference.yml      +0  -2
deepspeed/inference/engine.py           +9  -37
tests/unit/inference/test_inference.py  +7  -11
.github/workflows/nv-inference.yml

```diff
@@ -40,8 +40,6 @@ jobs:
       run: |
         git clone https://github.com/huggingface/transformers
         cd transformers
-        # if needed switch to the last known good SHA until transformers@master is fixed
-        git checkout v4.21.2
         git rev-parse --short HEAD
         pip uninstall --yes transformers
         pip install .
```
deepspeed/inference/engine.py

```diff
@@ -162,10 +162,7 @@ class InferenceEngine(Module):
             torch.cuda.set_rng_state(_rng_state.cpu())
 
         if self.mp_world_size > 1:
-            self.model_orig_fwd = self.module.forward
-            self.module.forward = self.forward
-        else:
-            self.module.register_forward_pre_hook(self._pre_forward_hook)
+            assert not self.enable_cuda_graph, "Cuda graph is not supported for model parallelism"
 
     def _get_model_config_generate(self, config):
         self.config = getattr(self.module, 'config', None) if config is None else config
```
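For context: the deleted lines implemented a forward monkey-patch, stashing the module's original `forward` in `model_orig_fwd` and substituting the engine's own so that every call into the wrapped model was routed through the engine, while the non-MP path instead registered `_pre_forward_hook` to move inputs onto the GPU. After this commit neither happens for model parallelism, and cuda-graph mode is asserted off there. A minimal sketch of the override pattern, as a toy class rather than DeepSpeed's actual implementation:

```python
import torch

class TinyEngine:
    """Toy illustration of the forward-override pattern (hypothetical class)."""

    def __init__(self, module, enable_cuda_graph=False):
        self.module = module
        self.enable_cuda_graph = enable_cuda_graph
        if self.enable_cuda_graph:
            # Stash the original bound method, then interpose our own forward;
            # per the commit title, the override now happens only for cuda-graph.
            self.model_orig_fwd = self.module.forward
            self.module.forward = self.forward

    def forward(self, *inputs, **kwargs):
        # Graph capture/replay would be dispatched here; fall through otherwise.
        return self.model_orig_fwd(*inputs, **kwargs)

model = torch.nn.Linear(4, 4)
engine = TinyEngine(model, enable_cuda_graph=True)
out = model(torch.randn(2, 4))  # nn.Module.__call__ resolves to the engine's forward
```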
```diff
@@ -475,14 +472,6 @@ class InferenceEngine(Module):
         elif self.dtype == torch.float:
             self.module.float()
 
-    def _pre_forward_hook(self, module, *inputs, **kwargs):
-        for input in inputs:
-            if torch.is_tensor(input):
-                input = input.to(torch.cuda.current_device())
-        for k in kwargs:
-            if torch.is_tensor(kwargs[k]):
-                kwargs[k] = kwargs[k].to(torch.cuda.current_device())
-
     def _create_cuda_graph(self, *inputs, **kwargs):
         # warmup to create the workspace and cublas handle
         cuda_stream = torch.cuda.Stream()
```
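`_create_cuda_graph`, whose warmup begins in the context lines above, records the module's forward pass into a CUDA graph that `_graph_replay` later re-executes. A minimal sketch of that capture/replay pattern using the public `torch.cuda.CUDAGraph` API (torch >= 1.10); this is a simplified stand-in, not DeepSpeed's implementation:

```python
import torch

model = torch.nn.Linear(16, 16).cuda().eval()
static_input = torch.randn(8, 16, device="cuda")

# Warm up on a side stream first, so workspaces and the cuBLAS handle exist
# before capture (the same reason the engine creates cuda_stream above).
side_stream = torch.cuda.Stream()
side_stream.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(side_stream), torch.no_grad():
    for _ in range(3):
        model(static_input)
torch.cuda.current_stream().wait_stream(side_stream)

# Capture one forward pass; tensor addresses are baked into the graph.
graph = torch.cuda.CUDAGraph()
with torch.no_grad(), torch.cuda.graph(graph):
    static_output = model(static_input)

# "Replay": copy fresh data into the captured input buffer, rerun the graph,
# and read the result out of the captured output buffer.
static_input.copy_(torch.randn(8, 16, device="cuda"))
graph.replay()
result = static_output.clone()
```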
```diff
@@ -519,23 +508,6 @@ class InferenceEngine(Module):
             *inputs: Variable length input list
             **kwargs: variable length keyword arguments
         """
-        if self.mp_world_size > 1:
-            if self.mpu is None:
-                for input in inputs:
-                    if torch.is_tensor(input):
-                        input = input.to(torch.cuda.current_device())
-                        if not input.is_contiguous():
-                            input = input.contiguous()
-                        dist.broadcast(input, 0)
-                for k in kwargs:
-                    if torch.is_tensor(kwargs[k]):
-                        kwargs[k] = kwargs[k].to(torch.cuda.current_device())
-                        if not kwargs[k].is_contiguous():
-                            kwargs[k] = kwargs[k].contiguous()
-                        dist.broadcast(kwargs[k], 0)
-
-            outputs = self.model_orig_fwd(*inputs, **kwargs)
-        else:
-            if self.enable_cuda_graph:
-                if self.cuda_graph_created:
-                    outputs = self._graph_replay(*inputs, **kwargs)
+        if self.enable_cuda_graph:
+            if self.cuda_graph_created:
+                outputs = self._graph_replay(*inputs, **kwargs)
@@ -544,5 +516,5 @@ class InferenceEngine(Module):
                 outputs = self._graph_replay(*inputs, **kwargs)
         else:
             outputs = self.module(*inputs, **kwargs)
-        #outputs = self.module(*inputs, **kwargs)
         return outputs
```
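The deleted model-parallel branch moved each tensor argument to the current CUDA device, forced it contiguous, and broadcast it from rank 0 so that all ranks ran the forward pass on identical inputs. A standalone sketch of that pattern as a hypothetical helper (assumes the launcher has already called `dist.init_process_group` and pinned one GPU per rank):

```python
import torch
import torch.distributed as dist

def broadcast_inputs(*inputs):
    """Replicate rank 0's tensor arguments to every rank (illustrative only)."""
    synced = []
    for t in inputs:
        if torch.is_tensor(t):
            t = t.to(torch.cuda.current_device())
            if not t.is_contiguous():
                t = t.contiguous()  # NCCL collectives require contiguous buffers
            dist.broadcast(t, src=0)
        synced.append(t)
    return synced
```

Collecting results into a new list sidesteps a quirk of the original loop: rebinding `input` inside `for input in inputs` never propagates the moved tensor back to the caller's argument tuple.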
tests/unit/inference/test_inference.py

```diff
@@ -292,13 +292,13 @@ class TestModelTask(DistributedTest):
 @pytest.mark.seq_inference
 @pytest.mark.parametrize("model_w_task",
-                         [("gpt2",
+                         [("EleutherAI/gpt-neo-1.3B",
                            "text-generation"),
                           ("EleutherAI/gpt-neox-20b",
                            "text-generation"),
                           ("bigscience/bloom-3b",
                            "text-generation")],
-                         ids=["gpt2",
+                         ids=["gpt-neo",
                               "gpt-neox",
                               "bloom"])
 class TestMPSize(DistributedTest):
@@ -308,7 +308,6 @@ class TestMPSize(DistributedTest):
         self,
         model_w_task,
         dtype,
-        enable_cuda_graph,
         query,
         inf_kwargs,
         assert_fn,
@@ -325,14 +324,11 @@ class TestMPSize(DistributedTest):
         pipe = pipeline(task, model=model, device=-1, framework="pt")
         bs_output = pipe(query, **inf_kwargs)
 
         pipe.model = deepspeed.init_inference(pipe.model,
                                               mp_size=self.world_size,
                                               dtype=dtype,
                                               replace_method="auto",
-                                              replace_with_kernel_inject=True,
-                                              enable_cuda_graph=enable_cuda_graph,
-                                              )
+                                              replace_with_kernel_inject=True)
         # Switch device to GPU so that input tensors are not on CPU
         pipe.device = torch.device(f"cuda:{local_rank}")
         ds_output = pipe(query, **inf_kwargs)
```
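End to end, the test builds a CPU pipeline for a baseline output, swaps the model for a DeepSpeed inference engine, moves the pipeline to the GPU, and compares outputs. A condensed single-GPU sketch of that flow (model name, query, and generation kwargs are illustrative):

```python
import torch
import deepspeed
from transformers import pipeline

local_rank = 0  # assumption: single process; the test derives this from the launcher

pipe = pipeline("text-generation",
                model="EleutherAI/gpt-neo-1.3B",
                device=-1,  # start on CPU for the baseline run
                framework="pt")
bs_output = pipe("DeepSpeed is", do_sample=False, max_length=32)

pipe.model = deepspeed.init_inference(pipe.model,
                                      mp_size=1,
                                      dtype=torch.float16,
                                      replace_method="auto",
                                      replace_with_kernel_inject=True)
# Switch device to GPU so that input tensors are not created on CPU
pipe.device = torch.device(f"cuda:{local_rank}")
ds_output = pipe("DeepSpeed is", do_sample=False, max_length=32)
```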