PaddlePaddle / Paddle, commit 610c6442 (unverified)
Authored Apr 10, 2019 by chengduo; committed via GitHub on Apr 10, 2019
Make test_parallel_executor_seresnet.py Faster (#16701)
* slimming test_parallel_executor_seresnet.py
Parent: 112f1614
Showing 2 changed files with 120 additions and 132 deletions.
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py (+2, -1)
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py (+118, -131)
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py

@@ -29,7 +29,8 @@ __all__ = ['TestParallelExecutorBase']

 class TestParallelExecutorBase(unittest.TestCase):
-    def check_network_convergence(self,
+    @classmethod
+    def check_network_convergence(cls,
                                   method,
                                   use_cuda=True,
                                   memory_opt=True,
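The key change here is that check_network_convergence becomes a @classmethod, which is what lets the second file below invoke it at module scope, before any TestCase instance exists. A minimal sketch of the pattern, with illustrative names (not Paddle code):

import unittest


class SuiteBase(unittest.TestCase):
    @classmethod
    def run_convergence(cls, iters):
        # Callable as SuiteBase.run_convergence(...), no instance required,
        # so module-level setup code can reuse the same helper as the tests.
        return [1.0 / (i + 1) for i in range(iters)]


# Runs once, at import time -- the trick the commit relies on.
_baseline = SuiteBase.run_convergence(iters=2)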
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py

@@ -29,7 +29,7 @@ import unittest
 import math
 import numpy as np
 from functools import partial
-
+os.environ['CPU_NUM'] = str(4)
 # FIXME(zcd): If the neural net has dropout_op, the output of ParallelExecutor
 # and Executor is different. Because, for ParallelExecutor, the dropout_op of
 # the neural net will be copied N copies(N is the number of device). This will
@@ -113,7 +113,6 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
     return fluid.layers.elementwise_add(x=short, y=scale, act='relu')

-batch_size = 12
 img_shape = [3, 224, 224]
@@ -181,43 +180,84 @@ def optimizer(learning_rate=0.01):
     return optimizer


-class TestResnet(TestParallelExecutorBase):
-    @classmethod
-    def setUpClass(cls):
-        os.environ['CPU_NUM'] = str(4)
-        global remove_dropout
-        global remove_bn
-        remove_dropout = False
-        remove_bn = False
+def _batch_size():
+    return 12
+
+
+def _iter(use_cuda):
+    if use_cuda:
+        return 10
+    return 2
+
+
+gpu_img, gpu_label = init_data(
+    batch_size=_batch_size(), img_shape=img_shape, label_range=999)
+cpu_img, cpu_label = init_data(
+    batch_size=_batch_size(), img_shape=img_shape, label_range=999)
+feed_dict_gpu = {"image": gpu_img, "label": gpu_label}
+feed_dict_cpu = {"image": cpu_img, "label": cpu_label}
+model = SE_ResNeXt50Small
+
+
+def _feed_dict(use_cuda):
+    if use_cuda:
+        return feed_dict_gpu
+    return feed_dict_cpu

-    def _compare_reduce_and_allreduce(self,
-                                      model,
-                                      use_cuda,
-                                      iter=20,
-                                      delta2=1e-5):
+
+def _get_result_of_origin_model(use_cuda):
+    global remove_bn
+    global remove_dropout
+    remove_bn = True
+    remove_dropout = True
+    first_loss, last_loss = TestParallelExecutorBase.check_network_convergence(
+        model,
+        feed_dict=_feed_dict(use_cuda),
+        iter=_iter(use_cuda),
+        batch_size=_batch_size(),
+        use_cuda=use_cuda,
+        use_reduce=False,
+        optimizer=optimizer)
+
+    return first_loss, last_loss
+
+
+origin_cpu_first_loss, origin_cpu_last_loss = _get_result_of_origin_model(False)
+if core.is_compiled_with_cuda():
+    origin_gpu_first_loss, origin_gpu_last_loss = _get_result_of_origin_model(
+        True)
+
+
+def _get_origin_result(use_cuda):
+    if use_cuda:
+        assert core.is_compiled_with_cuda(), "Doesn't compiled with CUDA."
+        return origin_gpu_first_loss, origin_gpu_last_loss
+    return origin_cpu_first_loss, origin_cpu_last_loss
+
+
+class TestResnet(TestParallelExecutorBase):
+    def _compare_reduce_and_allreduce(self, use_cuda, delta2=1e-5):
         if use_cuda and not core.is_compiled_with_cuda():
             return

         global remove_bn
+        global remove_dropout
         remove_bn = True
+        remove_dropout = True

-        img, label = init_data(
-            batch_size=batch_size, img_shape=img_shape, label_range=999)
         all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
             model,
-            feed_dict={"image": img,
-                       "label": label},
-            iter=iter,
-            batch_size=batch_size,
+            feed_dict=_feed_dict(use_cuda),
+            iter=_iter(use_cuda),
+            batch_size=_batch_size(),
             use_cuda=use_cuda,
             use_reduce=False,
             optimizer=optimizer)
         reduce_first_loss, reduce_last_loss = self.check_network_convergence(
             model,
-            feed_dict={"image": img,
-                       "label": label},
-            iter=iter,
-            batch_size=batch_size,
+            feed_dict=_feed_dict(use_cuda),
+            iter=_iter(use_cuda),
+            batch_size=_batch_size(),
             use_cuda=use_cuda,
             use_reduce=True,
             optimizer=optimizer)
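Two changes above account for most of the saved time: the baseline ("origin") losses are now computed once at module import and handed out by _get_origin_result, instead of being retrained inside every test, and the iteration count drops from a flat iter=20 to _iter(use_cuda), i.e. 10 iterations on GPU and 2 on CPU. The commit caches eagerly at import; a comparable memoized sketch with illustrative names, using functools.lru_cache rather than the commit's module-level globals:

import functools


@functools.lru_cache(maxsize=None)
def baseline_losses(use_gpu):
    # Stand-in for the expensive training run; memoization makes
    # repeated test methods pay the cost only once per device type.
    iters = 10 if use_gpu else 2
    return tuple(1.0 / (i + 1) for i in range(iters))


def compare_against_baseline(use_gpu):
    losses = baseline_losses(use_gpu)  # cached after the first call
    assert losses[0] >= losses[-1]  # the toy losses are non-increasing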
@@ -232,10 +272,9 @@ class TestResnet(TestParallelExecutorBase):

         all_reduce_first_loss_seq, all_reduce_last_loss_seq = self.check_network_convergence(
             model,
-            feed_dict={"image": img,
-                       "label": label},
-            iter=iter,
-            batch_size=batch_size,
+            feed_dict=_feed_dict(use_cuda),
+            iter=_iter(use_cuda),
+            batch_size=_batch_size(),
             use_cuda=use_cuda,
             use_reduce=False,
             optimizer=optimizer,
@@ -243,10 +282,9 @@ class TestResnet(TestParallelExecutorBase):

         reduce_first_loss_seq, reduce_last_loss_seq = self.check_network_convergence(
             model,
-            feed_dict={"image": img,
-                       "label": label},
-            iter=iter,
-            batch_size=batch_size,
+            feed_dict=_feed_dict(use_cuda),
+            iter=_iter(use_cuda),
+            batch_size=_batch_size(),
             use_cuda=use_cuda,
             use_reduce=True,
             optimizer=optimizer,
@@ -267,37 +305,28 @@ class TestResnet(TestParallelExecutorBase):
         for loss in zip(all_reduce_last_loss_seq, reduce_last_loss_seq):
             self.assertAlmostEquals(loss[0], loss[1], delta=delta2)

-    def _check_resnet_convergence(self,
-                                  model,
-                                  check_func_1,
-                                  check_func_2,
-                                  use_cuda,
-                                  iter=20,
-                                  delta2=1e-5,
-                                  compare_seperately=True):
+    def _compare_result_with_origin_model(self,
+                                          get_origin_result,
+                                          check_func_2,
+                                          use_cuda,
+                                          delta2=1e-5,
+                                          compare_seperately=True,
+                                          rm_drop_out=False,
+                                          rm_bn=False):
         if use_cuda and not core.is_compiled_with_cuda():
             return

-        global remove_dropout
         global remove_bn
-        remove_dropout = True
-        remove_bn = True
+        global remove_dropout
+        remove_bn = rm_bn or use_cuda
+        remove_dropout = rm_drop_out

-        img, label = init_data(
-            batch_size=batch_size, img_shape=img_shape, label_range=999)
-        func_1_first_loss, func_1_last_loss = check_func_1(
-            model,
-            feed_dict={"image": img,
-                       "label": label},
-            iter=iter,
-            batch_size=batch_size,
-            use_cuda=use_cuda)
+        func_1_first_loss, func_1_last_loss = get_origin_result(use_cuda)
         func_2_first_loss, func_2_last_loss = check_func_2(
             model,
-            feed_dict={"image": img,
-                       "label": label},
-            iter=iter,
-            batch_size=batch_size,
+            feed_dict=_feed_dict(use_cuda),
+            iter=_iter(use_cuda),
+            batch_size=_batch_size(),
             use_cuda=use_cuda)

         if compare_seperately:
@@ -311,97 +340,55 @@ class TestResnet(TestParallelExecutorBase):
             self.assertAlmostEquals(
                 np.mean(func_1_last_loss), func_2_last_loss[0], delta=delta2)

-    def _compare_with_fused_all_reduce(self,
-                                       model,
-                                       use_cuda,
-                                       iter=20,
-                                       delta2=1e-5):
-        if use_cuda and not core.is_compiled_with_cuda():
-            return
-
-        global remove_bn
-        remove_bn = True
-
-        img, label = init_data(
-            batch_size=batch_size, img_shape=img_shape, label_range=999)
-        all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
-            model,
-            feed_dict={"image": img,
-                       "label": label},
-            iter=iter,
-            batch_size=batch_size,
-            use_cuda=use_cuda,
-            fuse_all_reduce_ops=False,
-            optimizer=optimizer)
-        reduce_first_loss, reduce_last_loss = self.check_network_convergence(
-            model,
-            feed_dict={"image": img,
-                       "label": label},
-            iter=iter,
-            batch_size=batch_size,
-            use_cuda=use_cuda,
-            fuse_all_reduce_ops=True,
-            optimizer=optimizer)
-
-        for loss in zip(all_reduce_first_loss, reduce_first_loss):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
-        for loss in zip(all_reduce_last_loss, reduce_last_loss):
-            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
-
     def test_seresnext_with_reduce(self):
-        self._compare_reduce_and_allreduce(
-            model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-2)
-        self._compare_reduce_and_allreduce(
-            model=SE_ResNeXt50Small, use_cuda=False, iter=5)
-
-    def test_seresnext_with_fused_all_reduce(self):
-        self._compare_with_fused_all_reduce(
-            model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-3)
-        self._compare_with_fused_all_reduce(
-            model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3)
+        self._compare_reduce_and_allreduce(use_cuda=False, delta2=1e-3)
+        self._compare_reduce_and_allreduce(use_cuda=True, delta2=1e-2)

     def test_seresnext_with_learning_rate_decay(self):
-        check_func_1 = partial(
-            self.check_network_convergence,
-            optimizer=optimizer,
-            use_parallel_executor=True)
+        # NOTE(zcd): This test is compare the result of use parallel_executor and executor,
+        # and the result of drop_out op and batch_norm op in this two executor
+        # have diff, so the two ops should be removed from the model.
+        check_func_1 = _get_origin_result
         check_func_2 = partial(
             self.check_network_convergence,
             optimizer=optimizer,
             use_parallel_executor=False)
-        self._check_resnet_convergence(
-            SE_ResNeXt50Small,
-            check_func_1,
-            check_func_2,
-            use_cuda=True,
-            compare_seperately=False)
-        self._check_resnet_convergence(
-            SE_ResNeXt50Small,
+        self._compare_result_with_origin_model(
             check_func_1,
             check_func_2,
             use_cuda=False,
+            rm_drop_out=True,
+            rm_bn=True,
             compare_seperately=False,
-            iter=2,
             delta2=1e-3)
+        self._compare_result_with_origin_model(
+            check_func_1,
+            check_func_2,
+            use_cuda=True,
+            rm_drop_out=True,
+            rm_bn=True,
+            compare_seperately=False)

-    def test_seresnext_with_fused_optimizer_ops(self):
-        check_func_1 = partial(
-            self.check_network_convergence, fuse_all_optimizer_ops=False)
-        check_func_2 = partial(
-            self.check_network_convergence, fuse_all_optimizer_ops=True)
-        # TODO(zcd): this test failed random, I will fix it in next PR.
-        # self._check_resnet_convergence(
-        #     SE_ResNeXt50Small,
-        #     check_func_1,
-        #     check_func_2,
-        #     use_cuda=True,
-        #     delta2=1e-3)
-        self._check_resnet_convergence(
-            SE_ResNeXt50Small,
-            check_func_1,
-            check_func_2,
-            use_cuda=False,
-            iter=2,
-            delta2=1e-3)
+    def test_seresnext_with_fused_all_reduce(self):
+        # NOTE(zcd): In order to make the program faster,
+        # this unit test remove drop_out and batch_norm.
+        check_func_1 = _get_origin_result
+        check_func_2 = partial(
+            self.check_network_convergence,
+            optimizer=optimizer,
+            fuse_all_reduce_ops=True)
+        self._compare_result_with_origin_model(
+            check_func_1,
+            check_func_2,
+            use_cuda=False,
+            rm_drop_out=True,
+            rm_bn=True)
+        self._compare_result_with_origin_model(
+            check_func_1,
+            check_func_2,
+            use_cuda=True,
+            rm_drop_out=True,
+            rm_bn=True,
+            delta2=1e-3)
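A recurring idiom in these tests is functools.partial, which pins executor options onto check_network_convergence so one comparison helper can drive several configurations. A self-contained toy illustration (the function below is hypothetical, not the Paddle API):

from functools import partial


def converge(model, use_cuda=False, use_parallel_executor=True):
    # Toy stand-in that just reports its configuration.
    return (model, use_cuda, use_parallel_executor)


# Freeze one option and leave the rest to the call site, as the tests do
# with check_func_2 = partial(self.check_network_convergence, ...).
check_func_2 = partial(converge, use_parallel_executor=False)
print(check_func_2("SE_ResNeXt50Small", use_cuda=True))
# ('SE_ResNeXt50Small', True, False)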