Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
DeepSpeed
提交
e59ba12d
D
DeepSpeed
项目概览
Greenplum
/
DeepSpeed
上一次同步 大约 1 年
通知
10
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeed
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
e59ba12d
编写于
1月 20, 2021
作者:
S
Shaden Smith
提交者:
GitHub
1月 20, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
make test_pipe more stable (#683)
上级
7b0bee0b
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
40 addition
and
22 deletion
+40
-22
tests/unit/test_pipe.py
tests/unit/test_pipe.py
+40
-22
未找到文件。
tests/unit/test_pipe.py
浏览文件 @
e59ba12d
import
os
import
copy
import
torch
import
torch.nn
as
nn
...
...
@@ -13,8 +14,7 @@ import deepspeed.runtime.utils as ds_utils
from
deepspeed.runtime.pipe.topology
import
PipeDataParallelTopology
,
PipeModelDataParallelTopology
PipeTopo
=
PipeDataParallelTopology
import
deepspeed.runtime.pipe.module
as
PipelineModule
from
deepspeed.runtime.pipe.module
import
LayerSpec
from
deepspeed.runtime.pipe.module
import
PipelineModule
,
LayerSpec
from
common
import
distributed_test
...
...
@@ -74,7 +74,13 @@ class AlexNet(nn.Module):
return
self
.
loss_fn
(
x
,
y
)
class
AlexNetPipe
(
PipelineModule
.
PipelineModule
):
class
AlexNetPipe
(
AlexNet
):
def
to_layers
(
self
):
layers
=
[
*
self
.
features
,
lambda
x
:
x
.
view
(
x
.
size
(
0
),
-
1
),
self
.
classifier
]
return
layers
class
AlexNetPipeSpec
(
PipelineModule
):
def
__init__
(
self
,
num_classes
=
10
,
**
kwargs
):
self
.
num_classes
=
num_classes
specs
=
[
...
...
@@ -135,6 +141,9 @@ def train_cifar(model, args, num_steps=400, average_dp_losses=True, fp16=True, s
with
torch
.
random
.
fork_rng
(
devices
=
[
torch
.
cuda
.
current_device
()]):
ds_utils
.
set_random_seed
(
seed
)
# disable dropout
model
.
eval
()
trainset
=
cifar_trainset
(
fp16
=
fp16
)
args
.
local_rank
=
dist
.
get_rank
()
...
...
@@ -148,7 +157,7 @@ def train_cifar(model, args, num_steps=400, average_dp_losses=True, fp16=True, s
for
step
in
range
(
num_steps
):
loss
=
engine
.
train_batch
()
losses
.
append
(
loss
.
item
())
if
step
%
50
==
0
:
if
step
%
50
==
0
and
dist
.
get_rank
()
==
0
:
print
(
f
'STEP=
{
step
}
LOSS=
{
loss
.
item
()
}
'
)
if
average_dp_losses
:
...
...
@@ -160,18 +169,16 @@ def train_cifar(model, args, num_steps=400, average_dp_losses=True, fp16=True, s
return
losses
@
pytest
.
mark
.
parametrize
(
'
base_topo,test_
topo'
,
@
pytest
.
mark
.
parametrize
(
'topo'
,
[
(
PipeTopo
(
num_pp
=
1
,
num_dp
=
4
),
PipeTopo
(
num_pp
=
2
,
num_dp
=
2
)),
(
PipeTopo
(
num_pp
=
1
,
num_dp
=
4
),
PipeTopo
(
num_pp
=
4
,
num_dp
=
1
)),
PipeTopo
(
num_pp
=
1
,
num_dp
=
4
),
PipeTopo
(
num_pp
=
2
,
num_dp
=
2
),
PipeTopo
(
num_pp
=
4
,
num_dp
=
1
),
])
def
test_pipe_cifar10
_seedlayers
(
base_topo
,
test_
topo
,
tmpdir
):
def
test_pipe_cifar10
(
topo
,
tmpdir
):
config_dict
=
{
"train_batch_size"
:
16
,
"train_micro_batch_size_per_gpu"
:
4
,
...
...
@@ -199,21 +206,32 @@ def test_pipe_cifar10_seedlayers(base_topo, test_topo, tmpdir):
}
args
=
args_from_dict
(
tmpdir
,
config_dict
)
# Allocate model for consistent initial weights.
init_net
=
AlexNetPipe
()
@
distributed_test
(
world_size
=
4
)
def
_helper
(
base_topo
,
test_
topo
,
tmpdir
,
steps
=
500
):
def
_helper
(
topo
,
tmpdir
,
steps
=
500
):
assert
steps
>=
100
base_model
=
AlexNetPipe
(
num_classes
=
10
,
topology
=
base_topo
,
seed_layers
=
config_dict
[
'pipeline'
][
'seed_layers'
])
base_net
=
copy
.
deepcopy
(
init_net
)
base_model
=
PipelineModule
(
layers
=
base_net
.
to_layers
(),
num_stages
=
1
,
loss_fn
=
nn
.
CrossEntropyLoss
())
# Train with just data parallelism
base_losses
=
train_cifar
(
base_model
,
args
,
num_steps
=
steps
,
fp16
=
config_dict
[
'fp16'
][
'enabled'
])
test_model
=
AlexNetPipe
(
num_classes
=
10
,
topology
=
test_topo
,
seed_layers
=
config_dict
[
'pipeline'
][
'seed_layers'
])
test_net
=
copy
.
deepcopy
(
init_net
)
test_model
=
PipelineModule
(
layers
=
test_net
.
to_layers
(),
topology
=
topo
,
loss_fn
=
nn
.
CrossEntropyLoss
())
#test_model = AlexNetPipe(num_classes=10,
# topology=test_topo,
# seed_layers=config_dict['pipeline']['seed_layers'])
test_losses
=
train_cifar
(
test_model
,
args
,
num_steps
=
steps
,
...
...
@@ -246,4 +264,4 @@ def test_pipe_cifar10_seedlayers(base_topo, test_topo, tmpdir):
test_avg
=
sum
(
test
)
/
len
(
test
)
assert
rel_diff
(
base_avg
,
test_avg
)
<
0.03
_helper
(
base_topo
,
test_
topo
,
tmpdir
)
_helper
(
topo
,
tmpdir
)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录