Commit bb20dcfc
Authored Dec 29, 2020 by liuyuhui; committed via GitHub on Dec 29, 2020.
[Kunlun] bug fix of PR2: Support MultiDevicePass and BKCL in parallel executor (#29961)
Parent: 6a0102b0

Showing 18 changed files with 51 additions and 56 deletions (+51, −56).

Across these files, the commit replaces the Python-side DeviceType enum defined in parallel_executor_test_base.py with the C++-backed core.DeviceType, and renames DeviceType.GPU to DeviceType.CUDA at every call site so that CUDA devices and Kunlun (XPU) devices are distinguished consistently.
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py                                 +7  −12
python/paddle/fluid/tests/unittests/seresnext_net.py                                               +4  −4
python/paddle/fluid/tests/unittests/seresnext_test_base.py                                         +1  −1
python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py                                   +4  −4
python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py                              +3  −3
python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py                                    +5  −5
python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py                          +3  −3
python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py                                        +1  −1
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py                                +3  −3
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py                         +2  −2
python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py                          +1  −1
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py                                +9  −9
python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py                                   +2  −2
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py                   +1  −1
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py   +1  −1
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py            +1  −1
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py            +1  −1
python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py                          +2  −2
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py

@@ -28,19 +28,14 @@ import sys
 from feed_data_reader import FeedDataReader
 
 __all__ = ['TestParallelExecutorBase']
 
 
-class DeviceType:
-    CPU = 1
-    GPU = 2
-    XPU = 3
+DeviceType = core.DeviceType
 
 
 class TestParallelExecutorBase(unittest.TestCase):
     @classmethod
     def check_network_convergence(cls,
                                   method,
-                                  use_device=DeviceType.GPU,
+                                  use_device=DeviceType.CUDA,
                                   iter=5,
                                   batch_size=None,
                                   feed_dict=None,
@@ -81,7 +76,7 @@ class TestParallelExecutorBase(unittest.TestCase):
             main, method, optimizer)
 
         place = fluid.CUDAPlace(
-            0) if use_device == DeviceType.GPU else fluid.XPUPlace(
+            0) if use_device == DeviceType.CUDA else fluid.XPUPlace(
                 0) if use_device == DeviceType.XPU else fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(startup)
@@ -102,7 +97,7 @@ class TestParallelExecutorBase(unittest.TestCase):
         if batch_size is not None:
             batch_size *= fluid.core.get_cuda_device_count(
-            ) if use_device == DeviceType.GPU else fluid.core.get_xpu_device_count(
+            ) if use_device == DeviceType.CUDA else fluid.core.get_xpu_device_count(
             ) if use_device == DeviceType.XPU else int(
                 os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
@@ -132,7 +127,7 @@ class TestParallelExecutorBase(unittest.TestCase):
     @classmethod
     def check_pass_conflict(cls,
                             method,
-                            use_device=DeviceType.GPU,
+                            use_device=DeviceType.CUDA,
                             feed_dict=None,
                             get_data_from_feeder=None,
                             use_reduce=False,
@@ -153,7 +148,7 @@ class TestParallelExecutorBase(unittest.TestCase):
             main, method, optimizer)
 
         place = fluid.CUDAPlace(
-            0) if use_device == DeviceType.GPU else fluid.XPUPlace(
+            0) if use_device == DeviceType.CUDA else fluid.XPUPlace(
                 0) if use_device == DeviceType.XPU else fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(startup)
@@ -191,7 +186,7 @@ class TestParallelExecutorBase(unittest.TestCase):
         build_strategy.enable_inplace = enable_inplace
         build_strategy.enable_sequential_execution = enable_sequential_execution
 
-        if use_device == DeviceType.GPU and core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and core.is_compiled_with_cuda():
            build_strategy.remove_unnecessary_lock = True
         if use_device == DeviceType.XPU and core.is_compiled_with_xpu():
            build_strategy.fuse_elewise_add_act_ops = False
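After this change the helper derives both the execution place and the per-device batch scaling from core.DeviceType. Below is a minimal sketch of that selection logic, assuming a Paddle build from around this commit; the helper names place_for and device_count are hypothetical, introduced here only to make the chained conditionals above easier to read.

    import multiprocessing
    import os

    import paddle.fluid as fluid
    import paddle.fluid.core as core

    # The tests now alias the C++-backed enum directly; its members
    # are CPU, CUDA, and XPU (Kunlun).
    DeviceType = core.DeviceType

    def place_for(use_device):
        # Mirrors the chained conditional in check_network_convergence.
        if use_device == DeviceType.CUDA:
            return fluid.CUDAPlace(0)
        if use_device == DeviceType.XPU:
            return fluid.XPUPlace(0)
        return fluid.CPUPlace()

    def device_count(use_device):
        # Mirrors the batch-size scaling: one mini-batch per visible device.
        if use_device == DeviceType.CUDA:
            return fluid.core.get_cuda_device_count()
        if use_device == DeviceType.XPU:
            return fluid.core.get_xpu_device_count()
        return int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

Reusing the core enum avoids the Python-side integer constants drifting out of sync with the device types the executor actually understands.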
python/paddle/fluid/tests/unittests/seresnext_net.py

@@ -171,20 +171,20 @@ model = SE_ResNeXt50Small
 def batch_size(use_device):
-    if use_device == DeviceType.GPU:
+    if use_device == DeviceType.CUDA:
         # Paddle uses 8GB P4 GPU for unittest so we decreased the batch size.
         return 8
     return 12
 
 
 def iter(use_device):
-    if use_device == DeviceType.GPU:
+    if use_device == DeviceType.CUDA:
         return 10
     return 1
 
 
 gpu_img, gpu_label = init_data(
-    batch_size=batch_size(use_device=DeviceType.GPU),
+    batch_size=batch_size(use_device=DeviceType.CUDA),
     img_shape=img_shape,
     label_range=999)
 cpu_img, cpu_label = init_data(
@@ -196,6 +196,6 @@ feed_dict_cpu = {"image": cpu_img, "label": cpu_label}
 def feed_dict(use_device):
-    if use_device == DeviceType.GPU:
+    if use_device == DeviceType.CUDA:
         return feed_dict_gpu
     return feed_dict_cpu
python/paddle/fluid/tests/unittests/seresnext_test_base.py

@@ -26,7 +26,7 @@ class TestResnetBase(TestParallelExecutorBase):
                                           use_device,
                                           delta2=1e-5,
                                           compare_seperately=True):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         func_1_first_loss, func_1_last_loss = self.check_network_convergence(
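This early-return guard recurs in nearly every file the commit touches: a device-specific case is silently skipped when the installed Paddle binary was not compiled with support for that device. A hedged sketch of the pattern, with the function name should_skip being hypothetical:

    import paddle.fluid.core as core

    DeviceType = core.DeviceType

    def should_skip(use_device):
        # Skip device-specific test cases when the current build lacks
        # support for the requested device.
        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
            return True
        if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
            return True
        return False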
python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py

@@ -35,7 +35,7 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
                                           get_data_from_feeder=None,
                                           optimizer=None,
                                           fuse_all_optimizer_ops=False):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         feed_dict_data = None
@@ -82,12 +82,12 @@ class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):
             fuse_all_optimizer_ops=True)
 
     def test_simple_fc_with_fuse_all_reduce(self):
-        self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.GPU)
+        self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CUDA)
         self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CPU)
 
     def test_batchnorm_fc_with_fuse_all_reduce(self):
         self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
-                                                DeviceType.GPU)
+                                                DeviceType.CUDA)
         self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
                                                 DeviceType.CPU)
@@ -126,7 +126,7 @@ class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase):
     def test_simple_bow_net_with_fuse_all_reduce(self):
         model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
-        self._decorate_compare_fused_all_reduce(model, DeviceType.GPU)
+        self._decorate_compare_fused_all_reduce(model, DeviceType.CUDA)
         self._decorate_compare_fused_all_reduce(model, DeviceType.CPU)
python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py

@@ -26,7 +26,7 @@ class TestMNIST(TestParallelExecutorBase):
         os.environ['CPU_NUM'] = str(4)
 
     def _compare_fuse_elewise_add_act_ops(self, model, use_device):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         img, label = init_data()
@@ -66,12 +66,12 @@ class TestMNIST(TestParallelExecutorBase):
             self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
 
     def test_simple_fc_with_fuse_op(self):
-        self._compare_fuse_elewise_add_act_ops(simple_fc_net, DeviceType.GPU)
+        self._compare_fuse_elewise_add_act_ops(simple_fc_net, DeviceType.CUDA)
         self._compare_fuse_elewise_add_act_ops(simple_fc_net, DeviceType.CPU)
 
     def test_batchnorm_fc_with_fuse_op(self):
         self._compare_fuse_elewise_add_act_ops(fc_with_batchnorm,
-                                               DeviceType.GPU)
+                                               DeviceType.CUDA)
         self._compare_fuse_elewise_add_act_ops(fc_with_batchnorm,
                                                DeviceType.CPU)
python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py

@@ -38,7 +38,7 @@ class TestFuseOptimizationOps(TestParallelExecutorBase):
                                               feed_dict=None,
                                               get_data_from_feeder=None,
                                               optimizer=fluid.optimizer.Adam):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
@@ -76,7 +76,7 @@ class TestFuseAdamOps(TestFuseOptimizationOps):
     def test_batchnorm_fc_with_fuse_op(self):
         self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, DeviceType.GPU, optimizer=self.optimizer)
+            fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer)
         self._decorate_compare_fused_optimizer_ops(
             fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer)
@@ -121,7 +121,7 @@ class TestSpareFuseAdamOps(TestFuseOptimizationOps):
     def test_simple_bow_net_with_fuse_op(self):
         model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
         self._decorate_compare_fused_optimizer_ops(
-            model, DeviceType.GPU, optimizer=self.optimizer)
+            model, DeviceType.CUDA, optimizer=self.optimizer)
         self._decorate_compare_fused_optimizer_ops(
             model, DeviceType.CPU, optimizer=self.optimizer)
@@ -144,7 +144,7 @@ class TestPassConflictBase(TestFuseAdamOps):
                                               feed_dict=None,
                                               get_data_from_feeder=None,
                                               optimizer=fluid.optimizer.Adam):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         self.check_pass_conflict(
@@ -165,7 +165,7 @@ class TestFuseAdamOpsPassConflict(TestPassConflictBase):
         self._decorate_compare_fused_optimizer_ops(
             fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer)
         self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, DeviceType.GPU, optimizer=self.optimizer)
+            fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer)
 
 
 class TestFuseSGDOpsPassConflict(TestFuseAdamOpsPassConflict):
python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py

@@ -73,7 +73,7 @@ class TestMNIST(TestParallelExecutorBase):
         return img, label
 
     def _compare(self, model, use_device, random_data=True, only_forward=False):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         img, label = self._init_data(random_data)
@@ -108,11 +108,11 @@ class TestMNIST(TestParallelExecutorBase):
             self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
 
     def test_simple_depthwise_with_fuse_op(self):
-        self._compare(simple_depthwise_net, DeviceType.GPU)
+        self._compare(simple_depthwise_net, DeviceType.CUDA)
         self._compare(simple_depthwise_net, DeviceType.CPU)
 
     def test_simple_depthwise_with_fuse_op_only_forward(self):
-        self._compare(simple_depthwise_net, DeviceType.GPU, only_forward=True)
+        self._compare(simple_depthwise_net, DeviceType.CUDA, only_forward=True)
         self._compare(simple_depthwise_net, DeviceType.CPU, only_forward=True)
python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py

@@ -58,7 +58,7 @@ class TestIrInplace(TestParallelExecutorBase):
             fc_with_batchnorm,
             feed_dict={"image": img,
                        "label": label},
-            use_device=DeviceType.GPU,
+            use_device=DeviceType.CUDA,
             use_ir_memory_optimize=ir_memory_optimize,
             enable_inplace=enable_inplace)
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py

@@ -61,7 +61,7 @@ class TestMNIST(TestParallelExecutorBase):
         return img, label
 
     def _compare_ir_memory_optimize(self, model, use_device):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         img, label = self._dummy_data()
@@ -84,11 +84,11 @@ class TestMNIST(TestParallelExecutorBase):
     def test_simple_fc_net(self):
         self._compare_ir_memory_optimize(simple_fc_net, DeviceType.CPU)
-        self._compare_ir_memory_optimize(simple_fc_net, DeviceType.GPU)
+        self._compare_ir_memory_optimize(simple_fc_net, DeviceType.CUDA)
 
     def test_fc_with_reshape_net(self):
         self._compare_ir_memory_optimize(fc_with_inplace_net, DeviceType.CPU)
-        self._compare_ir_memory_optimize(fc_with_inplace_net, DeviceType.GPU)
+        self._compare_ir_memory_optimize(fc_with_inplace_net, DeviceType.CUDA)
 
 
 if __name__ == '__main__':
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py

@@ -35,14 +35,14 @@ class TestTransformerWithIR(TestParallelExecutorBase):
             # check python transpiler
             self.check_network_convergence(
                 transformer,
-                use_device=DeviceType.GPU,
+                use_device=DeviceType.CUDA,
                 feed_data_reader=get_feed_data_reader(),
                 use_ir_memory_optimize=False,
                 iter=2)
             # check IR memory optimize
             self.check_network_convergence(
                 transformer,
-                use_device=DeviceType.GPU,
+                use_device=DeviceType.CUDA,
                 feed_data_reader=get_feed_data_reader(),
                 use_ir_memory_optimize=True,
                 iter=2)
python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py

@@ -84,7 +84,7 @@ class TestResnet(TestParallelExecutorBase):
     def test_model(self):
         if core.is_compiled_with_cuda():
-            self.check_model(DeviceType.GPU)
+            self.check_model(DeviceType.CUDA)
 
 
 if __name__ == '__main__':
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py

@@ -81,7 +81,7 @@ class TestMNIST(TestParallelExecutorBase):
                                       use_device,
                                       delta1=1e-6,
                                       delta2=1e-4):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
@@ -110,7 +110,7 @@ class TestMNIST(TestParallelExecutorBase):
     # simple_fc
     def check_simple_fc_convergence(self, use_device, use_reduce=False):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
@@ -127,7 +127,7 @@ class TestMNIST(TestParallelExecutorBase):
     def test_simple_fc(self):
         # use_device
-        self.check_simple_fc_convergence(DeviceType.GPU)
+        self.check_simple_fc_convergence(DeviceType.CUDA)
         self.check_simple_fc_convergence(DeviceType.CPU)
         self.check_simple_fc_convergence(DeviceType.XPU)
@@ -135,13 +135,13 @@ class TestMNIST(TestParallelExecutorBase):
         # use_device, use_reduce
         # NOTE: the computation result of nccl_reduce is non-deterministic,
         # related issue: https://github.com/NVIDIA/nccl/issues/157
-        self._compare_reduce_and_allreduce(simple_fc_net, DeviceType.GPU, 1e-5,
+        self._compare_reduce_and_allreduce(simple_fc_net, DeviceType.CUDA, 1e-5,
                                            1e-2)
         self._compare_reduce_and_allreduce(simple_fc_net, DeviceType.CPU, 1e-5,
                                            1e-2)
 
     def check_simple_fc_parallel_accuracy(self, use_device):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         img, label = self._init_data()
@@ -167,11 +167,11 @@ class TestMNIST(TestParallelExecutorBase):
             np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
 
     def test_simple_fc_parallel_accuracy(self):
-        self.check_simple_fc_parallel_accuracy(DeviceType.GPU)
+        self.check_simple_fc_parallel_accuracy(DeviceType.CUDA)
         self.check_simple_fc_parallel_accuracy(DeviceType.CPU)
 
     def check_batchnorm_fc_convergence(self, use_device, use_fast_executor):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
             return
@@ -185,7 +185,7 @@ class TestMNIST(TestParallelExecutorBase):
             use_fast_executor=use_fast_executor)
 
     def test_batchnorm_fc(self):
-        for use_device in (DeviceType.CPU, DeviceType.GPU):
+        for use_device in (DeviceType.CPU, DeviceType.CUDA):
             for use_fast_executor in (False, True):
                 self.check_batchnorm_fc_convergence(use_device,
                                                     use_fast_executor)
@@ -193,7 +193,7 @@ class TestMNIST(TestParallelExecutorBase):
     def test_batchnorm_fc_with_new_strategy(self):
         # NOTE: the computation result of nccl_reduce is non-deterministic,
         # related issue: https://github.com/NVIDIA/nccl/issues/157
-        self._compare_reduce_and_allreduce(fc_with_batchnorm, DeviceType.GPU,
+        self._compare_reduce_and_allreduce(fc_with_batchnorm, DeviceType.CUDA,
                                            1e-5, 1e-2)
         self._compare_reduce_and_allreduce(fc_with_batchnorm, DeviceType.CPU,
                                            1e-5, 1e-2)
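These MNIST tests run the same convergence check over several devices, relying on the compiled-with guards above to skip whatever the build does not support. A minimal, self-contained sketch of that sweep pattern, with the method name run_on_all_devices being hypothetical:

    import unittest

    import paddle.fluid.core as core

    DeviceType = core.DeviceType

    class DeviceSweepExample(unittest.TestCase):
        def run_on_all_devices(self, check):
            # Sweep CPU, CUDA, and XPU; skip a device when the current
            # build lacks support for it, exactly as the guards above do.
            for use_device in (DeviceType.CPU, DeviceType.CUDA, DeviceType.XPU):
                if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
                    continue
                if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
                    continue
                check(use_device)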
python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py

@@ -32,7 +32,7 @@ class TestMNIST(TestParallelExecutorBase):
     # simple_fc
     def check_simple_fc_convergence(self, use_device, use_reduce=False):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         img, label = init_data()
@@ -73,7 +73,7 @@ class TestMNIST(TestParallelExecutorBase):
             np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
 
     def test_simple_fc_parallel_accuracy(self):
-        self.check_simple_fc_parallel_accuracy(DeviceType.GPU)
+        self.check_simple_fc_parallel_accuracy(DeviceType.CUDA)
 
 
 if __name__ == '__main__':
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py

@@ -30,7 +30,7 @@ class TestResnetGPU(TestResnetBase):
             optimizer=seresnext_net.optimizer,
             use_parallel_executor=False)
         self._compare_result_with_origin_model(
-            check_func, use_device=DeviceType.GPU, compare_seperately=False)
+            check_func, use_device=DeviceType.CUDA, compare_seperately=False)
 
 
 if __name__ == '__main__':
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py

@@ -32,7 +32,7 @@ class TestResnetWithFuseAllReduceGPU(TestResnetBase):
             optimizer=seresnext_net.optimizer,
             fuse_all_reduce_ops=True)
         self._compare_result_with_origin_model(
-            check_func, use_device=DeviceType.GPU, delta2=1e-2)
+            check_func, use_device=DeviceType.CUDA, delta2=1e-2)
 
 
 if __name__ == '__main__':
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py

@@ -21,7 +21,7 @@ import paddle.fluid.core as core
 class TestResnetWithReduceBase(TestParallelExecutorBase):
     def _compare_reduce_and_allreduce(self, use_device, delta2=1e-5):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py

@@ -20,7 +20,7 @@ from test_parallel_executor_seresnext_with_reduce_cpu import TestResnetWithReduc
 class TestResnetWithReduceGPU(TestResnetWithReduceBase):
     def test_seresnext_with_reduce(self):
         self._compare_reduce_and_allreduce(
-            use_device=DeviceType.GPU, delta2=1e-2)
+            use_device=DeviceType.CUDA, delta2=1e-2)
 
 
 if __name__ == '__main__':
python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py

@@ -191,11 +191,11 @@ class TestTransformer(TestParallelExecutorBase):
         if core.is_compiled_with_cuda():
             self.check_network_convergence(
                 transformer,
-                use_device=DeviceType.GPU,
+                use_device=DeviceType.CUDA,
                 feed_data_reader=get_feed_data_reader())
             self.check_network_convergence(
                 transformer,
-                use_device=DeviceType.GPU,
+                use_device=DeviceType.CUDA,
                 enable_sequential_execution=True,
                 feed_data_reader=get_feed_data_reader())
             self.check_network_convergence(