Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
bb20dcfc
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
bb20dcfc
编写于
12月 29, 2020
作者:
L
liuyuhui
提交者:
GitHub
12月 29, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[Kunlun] bug fix of PR2: Support MultiDevicePass and BKCL in parallel executor (#29961)
上级
6a0102b0
变更
18
显示空白变更内容
内联
并排
Showing
18 changed file
with
51 addition
and
56 deletion
+51
-56
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
...ddle/fluid/tests/unittests/parallel_executor_test_base.py
+7
-12
python/paddle/fluid/tests/unittests/seresnext_net.py
python/paddle/fluid/tests/unittests/seresnext_net.py
+4
-4
python/paddle/fluid/tests/unittests/seresnext_test_base.py
python/paddle/fluid/tests/unittests/seresnext_test_base.py
+1
-1
python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
...paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
+4
-4
python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
...e/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
+3
-3
python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
.../paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
+5
-5
python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
...uid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
+3
-3
python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py
python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py
+1
-1
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
...dle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
+3
-3
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
...id/tests/unittests/test_ir_memory_optimize_transformer.py
+2
-2
python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py
...uid/tests/unittests/test_mix_precision_all_reduce_fuse.py
+1
-1
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
...dle/fluid/tests/unittests/test_parallel_executor_mnist.py
+9
-9
python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
...paddle/fluid/tests/unittests/test_parallel_executor_pg.py
+2
-2
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py
...ts/unittests/test_parallel_executor_seresnext_base_gpu.py
+1
-1
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py
...t_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py
+1
-1
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py
...tests/test_parallel_executor_seresnext_with_reduce_cpu.py
+1
-1
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py
...tests/test_parallel_executor_seresnext_with_reduce_gpu.py
+1
-1
python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
...uid/tests/unittests/test_parallel_executor_transformer.py
+2
-2
未找到文件。
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
浏览文件 @
bb20dcfc
...
...
@@ -28,19 +28,14 @@ import sys
from
feed_data_reader
import
FeedDataReader
__all__
=
[
'TestParallelExecutorBase'
]
class
DeviceType
:
CPU
=
1
GPU
=
2
XPU
=
3
DeviceType
=
core
.
DeviceType
class
TestParallelExecutorBase
(
unittest
.
TestCase
):
@
classmethod
def
check_network_convergence
(
cls
,
method
,
use_device
=
DeviceType
.
GPU
,
use_device
=
DeviceType
.
CUDA
,
iter
=
5
,
batch_size
=
None
,
feed_dict
=
None
,
...
...
@@ -81,7 +76,7 @@ class TestParallelExecutorBase(unittest.TestCase):
main
,
method
,
optimizer
)
place
=
fluid
.
CUDAPlace
(
0
)
if
use_device
==
DeviceType
.
GPU
else
fluid
.
XPUPlace
(
0
)
if
use_device
==
DeviceType
.
CUDA
else
fluid
.
XPUPlace
(
0
)
if
use_device
==
DeviceType
.
XPU
else
fluid
.
CPUPlace
()
exe
=
fluid
.
Executor
(
place
)
exe
.
run
(
startup
)
...
...
@@ -102,7 +97,7 @@ class TestParallelExecutorBase(unittest.TestCase):
if
batch_size
is
not
None
:
batch_size
*=
fluid
.
core
.
get_cuda_device_count
(
)
if
use_device
==
DeviceType
.
GPU
else
fluid
.
core
.
get_xpu_device_count
(
)
if
use_device
==
DeviceType
.
CUDA
else
fluid
.
core
.
get_xpu_device_count
(
)
if
use_device
==
DeviceType
.
XPU
else
int
(
os
.
environ
.
get
(
'CPU_NUM'
,
multiprocessing
.
cpu_count
()))
...
...
@@ -132,7 +127,7 @@ class TestParallelExecutorBase(unittest.TestCase):
@
classmethod
def
check_pass_conflict
(
cls
,
method
,
use_device
=
DeviceType
.
GPU
,
use_device
=
DeviceType
.
CUDA
,
feed_dict
=
None
,
get_data_from_feeder
=
None
,
use_reduce
=
False
,
...
...
@@ -153,7 +148,7 @@ class TestParallelExecutorBase(unittest.TestCase):
main
,
method
,
optimizer
)
place
=
fluid
.
CUDAPlace
(
0
)
if
use_device
==
DeviceType
.
GPU
else
fluid
.
XPUPlace
(
0
)
if
use_device
==
DeviceType
.
CUDA
else
fluid
.
XPUPlace
(
0
)
if
use_device
==
DeviceType
.
XPU
else
fluid
.
CPUPlace
()
exe
=
fluid
.
Executor
(
place
)
exe
.
run
(
startup
)
...
...
@@ -191,7 +186,7 @@ class TestParallelExecutorBase(unittest.TestCase):
build_strategy
.
enable_inplace
=
enable_inplace
build_strategy
.
enable_sequential_execution
=
enable_sequential_execution
if
use_device
==
DeviceType
.
GPU
and
core
.
is_compiled_with_cuda
():
if
use_device
==
DeviceType
.
CUDA
and
core
.
is_compiled_with_cuda
():
build_strategy
.
remove_unnecessary_lock
=
True
if
use_device
==
DeviceType
.
XPU
and
core
.
is_compiled_with_xpu
():
build_strategy
.
fuse_elewise_add_act_ops
=
False
...
...
python/paddle/fluid/tests/unittests/seresnext_net.py
浏览文件 @
bb20dcfc
...
...
@@ -171,20 +171,20 @@ model = SE_ResNeXt50Small
def
batch_size
(
use_device
):
if
use_device
==
DeviceType
.
GPU
:
if
use_device
==
DeviceType
.
CUDA
:
# Paddle uses 8GB P4 GPU for unittest so we decreased the batch size.
return
8
return
12
def
iter
(
use_device
):
if
use_device
==
DeviceType
.
GPU
:
if
use_device
==
DeviceType
.
CUDA
:
return
10
return
1
gpu_img
,
gpu_label
=
init_data
(
batch_size
=
batch_size
(
use_device
=
DeviceType
.
GPU
),
batch_size
=
batch_size
(
use_device
=
DeviceType
.
CUDA
),
img_shape
=
img_shape
,
label_range
=
999
)
cpu_img
,
cpu_label
=
init_data
(
...
...
@@ -196,6 +196,6 @@ feed_dict_cpu = {"image": cpu_img, "label": cpu_label}
def
feed_dict
(
use_device
):
if
use_device
==
DeviceType
.
GPU
:
if
use_device
==
DeviceType
.
CUDA
:
return
feed_dict_gpu
return
feed_dict_cpu
python/paddle/fluid/tests/unittests/seresnext_test_base.py
浏览文件 @
bb20dcfc
...
...
@@ -26,7 +26,7 @@ class TestResnetBase(TestParallelExecutorBase):
use_device
,
delta2
=
1e-5
,
compare_seperately
=
True
):
if
use_device
==
DeviceType
.
GPU
and
not
core
.
is_compiled_with_cuda
():
if
use_device
==
DeviceType
.
CUDA
and
not
core
.
is_compiled_with_cuda
():
return
func_1_first_loss
,
func_1_last_loss
=
self
.
check_network_convergence
(
...
...
python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
浏览文件 @
bb20dcfc
...
...
@@ -35,7 +35,7 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
get_data_from_feeder
=
None
,
optimizer
=
None
,
fuse_all_optimizer_ops
=
False
):
if
use_device
==
DeviceType
.
GPU
and
not
core
.
is_compiled_with_cuda
():
if
use_device
==
DeviceType
.
CUDA
and
not
core
.
is_compiled_with_cuda
():
return
feed_dict_data
=
None
...
...
@@ -82,12 +82,12 @@ class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):
fuse_all_optimizer_ops
=
True
)
def
test_simple_fc_with_fuse_all_reduce
(
self
):
self
.
_decorate_compare_fused_all_reduce
(
simple_fc_net
,
DeviceType
.
GPU
)
self
.
_decorate_compare_fused_all_reduce
(
simple_fc_net
,
DeviceType
.
CUDA
)
self
.
_decorate_compare_fused_all_reduce
(
simple_fc_net
,
DeviceType
.
CPU
)
def
test_batchnorm_fc_with_fuse_all_reduce
(
self
):
self
.
_decorate_compare_fused_all_reduce
(
fc_with_batchnorm
,
DeviceType
.
GPU
)
DeviceType
.
CUDA
)
self
.
_decorate_compare_fused_all_reduce
(
fc_with_batchnorm
,
DeviceType
.
CPU
)
...
...
@@ -126,7 +126,7 @@ class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase):
def
test_simple_bow_net_with_fuse_all_reduce
(
self
):
model
=
partial
(
bow_net
,
dict_dim
=
self
.
word_dict_len
,
is_sparse
=
True
)
self
.
_decorate_compare_fused_all_reduce
(
model
,
DeviceType
.
GPU
)
self
.
_decorate_compare_fused_all_reduce
(
model
,
DeviceType
.
CUDA
)
self
.
_decorate_compare_fused_all_reduce
(
model
,
DeviceType
.
CPU
)
...
...
python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
浏览文件 @
bb20dcfc
...
...
@@ -26,7 +26,7 @@ class TestMNIST(TestParallelExecutorBase):
os
.
environ
[
'CPU_NUM'
]
=
str
(
4
)
def
_compare_fuse_elewise_add_act_ops
(
self
,
model
,
use_device
):
if
use_device
==
DeviceType
.
GPU
and
not
core
.
is_compiled_with_cuda
():
if
use_device
==
DeviceType
.
CUDA
and
not
core
.
is_compiled_with_cuda
():
return
img
,
label
=
init_data
()
...
...
@@ -66,12 +66,12 @@ class TestMNIST(TestParallelExecutorBase):
self
.
assertAlmostEquals
(
loss
[
0
],
loss
[
1
],
delta
=
1e-6
)
def
test_simple_fc_with_fuse_op
(
self
):
self
.
_compare_fuse_elewise_add_act_ops
(
simple_fc_net
,
DeviceType
.
GPU
)
self
.
_compare_fuse_elewise_add_act_ops
(
simple_fc_net
,
DeviceType
.
CUDA
)
self
.
_compare_fuse_elewise_add_act_ops
(
simple_fc_net
,
DeviceType
.
CPU
)
def
test_batchnorm_fc_with_fuse_op
(
self
):
self
.
_compare_fuse_elewise_add_act_ops
(
fc_with_batchnorm
,
DeviceType
.
GPU
)
DeviceType
.
CUDA
)
self
.
_compare_fuse_elewise_add_act_ops
(
fc_with_batchnorm
,
DeviceType
.
CPU
)
...
...
python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
浏览文件 @
bb20dcfc
...
...
@@ -38,7 +38,7 @@ class TestFuseOptimizationOps(TestParallelExecutorBase):
feed_dict
=
None
,
get_data_from_feeder
=
None
,
optimizer
=
fluid
.
optimizer
.
Adam
):
if
use_device
==
DeviceType
.
GPU
and
not
core
.
is_compiled_with_cuda
():
if
use_device
==
DeviceType
.
CUDA
and
not
core
.
is_compiled_with_cuda
():
return
not_fuse_op_first_loss
,
not_fuse_op_last_loss
=
self
.
check_network_convergence
(
...
...
@@ -76,7 +76,7 @@ class TestFuseAdamOps(TestFuseOptimizationOps):
def
test_batchnorm_fc_with_fuse_op
(
self
):
self
.
_decorate_compare_fused_optimizer_ops
(
fc_with_batchnorm
,
DeviceType
.
GPU
,
optimizer
=
self
.
optimizer
)
fc_with_batchnorm
,
DeviceType
.
CUDA
,
optimizer
=
self
.
optimizer
)
self
.
_decorate_compare_fused_optimizer_ops
(
fc_with_batchnorm
,
DeviceType
.
CPU
,
optimizer
=
self
.
optimizer
)
...
...
@@ -121,7 +121,7 @@ class TestSpareFuseAdamOps(TestFuseOptimizationOps):
def
test_simple_bow_net_with_fuse_op
(
self
):
model
=
partial
(
bow_net
,
dict_dim
=
self
.
word_dict_len
,
is_sparse
=
True
)
self
.
_decorate_compare_fused_optimizer_ops
(
model
,
DeviceType
.
GPU
,
optimizer
=
self
.
optimizer
)
model
,
DeviceType
.
CUDA
,
optimizer
=
self
.
optimizer
)
self
.
_decorate_compare_fused_optimizer_ops
(
model
,
DeviceType
.
CPU
,
optimizer
=
self
.
optimizer
)
...
...
@@ -144,7 +144,7 @@ class TestPassConflictBase(TestFuseAdamOps):
feed_dict
=
None
,
get_data_from_feeder
=
None
,
optimizer
=
fluid
.
optimizer
.
Adam
):
if
use_device
==
DeviceType
.
GPU
and
not
core
.
is_compiled_with_cuda
():
if
use_device
==
DeviceType
.
CUDA
and
not
core
.
is_compiled_with_cuda
():
return
self
.
check_pass_conflict
(
...
...
@@ -165,7 +165,7 @@ class TestFuseAdamOpsPassConflict(TestPassConflictBase):
self
.
_decorate_compare_fused_optimizer_ops
(
fc_with_batchnorm
,
DeviceType
.
CPU
,
optimizer
=
self
.
optimizer
)
self
.
_decorate_compare_fused_optimizer_ops
(
fc_with_batchnorm
,
DeviceType
.
GPU
,
optimizer
=
self
.
optimizer
)
fc_with_batchnorm
,
DeviceType
.
CUDA
,
optimizer
=
self
.
optimizer
)
class
TestFuseSGDOpsPassConflict
(
TestFuseAdamOpsPassConflict
):
...
...
python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
浏览文件 @
bb20dcfc
...
...
@@ -73,7 +73,7 @@ class TestMNIST(TestParallelExecutorBase):
return
img
,
label
def
_compare
(
self
,
model
,
use_device
,
random_data
=
True
,
only_forward
=
False
):
if
use_device
==
DeviceType
.
GPU
and
not
core
.
is_compiled_with_cuda
():
if
use_device
==
DeviceType
.
CUDA
and
not
core
.
is_compiled_with_cuda
():
return
img
,
label
=
self
.
_init_data
(
random_data
)
...
...
@@ -108,11 +108,11 @@ class TestMNIST(TestParallelExecutorBase):
self
.
assertAlmostEquals
(
loss
[
0
],
loss
[
1
],
delta
=
1e-6
)
def
test_simple_depthwise_with_fuse_op
(
self
):
self
.
_compare
(
simple_depthwise_net
,
DeviceType
.
GPU
)
self
.
_compare
(
simple_depthwise_net
,
DeviceType
.
CUDA
)
self
.
_compare
(
simple_depthwise_net
,
DeviceType
.
CPU
)
def
test_simple_depthwise_with_fuse_op_only_forward
(
self
):
self
.
_compare
(
simple_depthwise_net
,
DeviceType
.
GPU
,
only_forward
=
True
)
self
.
_compare
(
simple_depthwise_net
,
DeviceType
.
CUDA
,
only_forward
=
True
)
self
.
_compare
(
simple_depthwise_net
,
DeviceType
.
CPU
,
only_forward
=
True
)
...
...
python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py
浏览文件 @
bb20dcfc
...
...
@@ -58,7 +58,7 @@ class TestIrInplace(TestParallelExecutorBase):
fc_with_batchnorm
,
feed_dict
=
{
"image"
:
img
,
"label"
:
label
},
use_device
=
DeviceType
.
GPU
,
use_device
=
DeviceType
.
CUDA
,
use_ir_memory_optimize
=
ir_memory_optimize
,
enable_inplace
=
enable_inplace
)
...
...
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
浏览文件 @
bb20dcfc
...
...
@@ -61,7 +61,7 @@ class TestMNIST(TestParallelExecutorBase):
return
img
,
label
def
_compare_ir_memory_optimize
(
self
,
model
,
use_device
):
if
use_device
==
DeviceType
.
GPU
and
not
core
.
is_compiled_with_cuda
():
if
use_device
==
DeviceType
.
CUDA
and
not
core
.
is_compiled_with_cuda
():
return
img
,
label
=
self
.
_dummy_data
()
...
...
@@ -84,11 +84,11 @@ class TestMNIST(TestParallelExecutorBase):
def
test_simple_fc_net
(
self
):
self
.
_compare_ir_memory_optimize
(
simple_fc_net
,
DeviceType
.
CPU
)
self
.
_compare_ir_memory_optimize
(
simple_fc_net
,
DeviceType
.
GPU
)
self
.
_compare_ir_memory_optimize
(
simple_fc_net
,
DeviceType
.
CUDA
)
def
test_fc_with_reshape_net
(
self
):
self
.
_compare_ir_memory_optimize
(
fc_with_inplace_net
,
DeviceType
.
CPU
)
self
.
_compare_ir_memory_optimize
(
fc_with_inplace_net
,
DeviceType
.
GPU
)
self
.
_compare_ir_memory_optimize
(
fc_with_inplace_net
,
DeviceType
.
CUDA
)
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
浏览文件 @
bb20dcfc
...
...
@@ -35,14 +35,14 @@ class TestTransformerWithIR(TestParallelExecutorBase):
# check python transpiler
self
.
check_network_convergence
(
transformer
,
use_device
=
DeviceType
.
GPU
,
use_device
=
DeviceType
.
CUDA
,
feed_data_reader
=
get_feed_data_reader
(),
use_ir_memory_optimize
=
False
,
iter
=
2
)
# check IR memory optimize
self
.
check_network_convergence
(
transformer
,
use_device
=
DeviceType
.
GPU
,
use_device
=
DeviceType
.
CUDA
,
feed_data_reader
=
get_feed_data_reader
(),
use_ir_memory_optimize
=
True
,
iter
=
2
)
...
...
python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py
浏览文件 @
bb20dcfc
...
...
@@ -84,7 +84,7 @@ class TestResnet(TestParallelExecutorBase):
def
test_model
(
self
):
if
core
.
is_compiled_with_cuda
():
self
.
check_model
(
DeviceType
.
GPU
)
self
.
check_model
(
DeviceType
.
CUDA
)
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
浏览文件 @
bb20dcfc
...
...
@@ -81,7 +81,7 @@ class TestMNIST(TestParallelExecutorBase):
use_device
,
delta1
=
1e-6
,
delta2
=
1e-4
):
if
use_device
==
DeviceType
.
GPU
and
not
core
.
is_compiled_with_cuda
():
if
use_device
==
DeviceType
.
CUDA
and
not
core
.
is_compiled_with_cuda
():
return
if
use_device
==
DeviceType
.
XPU
and
not
core
.
is_compiled_with_xpu
():
...
...
@@ -110,7 +110,7 @@ class TestMNIST(TestParallelExecutorBase):
# simple_fc
def
check_simple_fc_convergence
(
self
,
use_device
,
use_reduce
=
False
):
if
use_device
==
DeviceType
.
GPU
and
not
core
.
is_compiled_with_cuda
():
if
use_device
==
DeviceType
.
CUDA
and
not
core
.
is_compiled_with_cuda
():
return
if
use_device
==
DeviceType
.
XPU
and
not
core
.
is_compiled_with_xpu
():
...
...
@@ -127,7 +127,7 @@ class TestMNIST(TestParallelExecutorBase):
def
test_simple_fc
(
self
):
# use_device
self
.
check_simple_fc_convergence
(
DeviceType
.
GPU
)
self
.
check_simple_fc_convergence
(
DeviceType
.
CUDA
)
self
.
check_simple_fc_convergence
(
DeviceType
.
CPU
)
self
.
check_simple_fc_convergence
(
DeviceType
.
XPU
)
...
...
@@ -135,13 +135,13 @@ class TestMNIST(TestParallelExecutorBase):
# use_device, use_reduce
# NOTE: the computation result of nccl_reduce is non-deterministic,
# related issue: https://github.com/NVIDIA/nccl/issues/157
self
.
_compare_reduce_and_allreduce
(
simple_fc_net
,
DeviceType
.
GPU
,
1e-5
,
self
.
_compare_reduce_and_allreduce
(
simple_fc_net
,
DeviceType
.
CUDA
,
1e-5
,
1e-2
)
self
.
_compare_reduce_and_allreduce
(
simple_fc_net
,
DeviceType
.
CPU
,
1e-5
,
1e-2
)
def
check_simple_fc_parallel_accuracy
(
self
,
use_device
):
if
use_device
==
DeviceType
.
GPU
and
not
core
.
is_compiled_with_cuda
():
if
use_device
==
DeviceType
.
CUDA
and
not
core
.
is_compiled_with_cuda
():
return
img
,
label
=
self
.
_init_data
()
...
...
@@ -167,11 +167,11 @@ class TestMNIST(TestParallelExecutorBase):
np
.
mean
(
parallel_last_loss
),
single_last_loss
,
delta
=
1e-6
)
def
test_simple_fc_parallel_accuracy
(
self
):
self
.
check_simple_fc_parallel_accuracy
(
DeviceType
.
GPU
)
self
.
check_simple_fc_parallel_accuracy
(
DeviceType
.
CUDA
)
self
.
check_simple_fc_parallel_accuracy
(
DeviceType
.
CPU
)
def
check_batchnorm_fc_convergence
(
self
,
use_device
,
use_fast_executor
):
if
use_device
==
DeviceType
.
GPU
and
not
core
.
is_compiled_with_cuda
():
if
use_device
==
DeviceType
.
CUDA
and
not
core
.
is_compiled_with_cuda
():
return
if
use_device
==
DeviceType
.
XPU
and
not
core
.
is_compiled_with_xpu
():
return
...
...
@@ -185,7 +185,7 @@ class TestMNIST(TestParallelExecutorBase):
use_fast_executor
=
use_fast_executor
)
def
test_batchnorm_fc
(
self
):
for
use_device
in
(
DeviceType
.
CPU
,
DeviceType
.
GPU
):
for
use_device
in
(
DeviceType
.
CPU
,
DeviceType
.
CUDA
):
for
use_fast_executor
in
(
False
,
True
):
self
.
check_batchnorm_fc_convergence
(
use_device
,
use_fast_executor
)
...
...
@@ -193,7 +193,7 @@ class TestMNIST(TestParallelExecutorBase):
def
test_batchnorm_fc_with_new_strategy
(
self
):
# NOTE: the computation result of nccl_reduce is non-deterministic,
# related issue: https://github.com/NVIDIA/nccl/issues/157
self
.
_compare_reduce_and_allreduce
(
fc_with_batchnorm
,
DeviceType
.
GPU
,
self
.
_compare_reduce_and_allreduce
(
fc_with_batchnorm
,
DeviceType
.
CUDA
,
1e-5
,
1e-2
)
self
.
_compare_reduce_and_allreduce
(
fc_with_batchnorm
,
DeviceType
.
CPU
,
1e-5
,
1e-2
)
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
浏览文件 @
bb20dcfc
...
...
@@ -32,7 +32,7 @@ class TestMNIST(TestParallelExecutorBase):
# simple_fc
def
check_simple_fc_convergence
(
self
,
use_device
,
use_reduce
=
False
):
if
use_device
==
DeviceType
.
GPU
and
not
core
.
is_compiled_with_cuda
():
if
use_device
==
DeviceType
.
CUDA
and
not
core
.
is_compiled_with_cuda
():
return
img
,
label
=
init_data
()
...
...
@@ -73,7 +73,7 @@ class TestMNIST(TestParallelExecutorBase):
np
.
mean
(
parallel_last_loss
),
single_last_loss
,
delta
=
1e-6
)
def
test_simple_fc_parallel_accuracy
(
self
):
self
.
check_simple_fc_parallel_accuracy
(
DeviceType
.
GPU
)
self
.
check_simple_fc_parallel_accuracy
(
DeviceType
.
CUDA
)
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py
浏览文件 @
bb20dcfc
...
...
@@ -30,7 +30,7 @@ class TestResnetGPU(TestResnetBase):
optimizer
=
seresnext_net
.
optimizer
,
use_parallel_executor
=
False
)
self
.
_compare_result_with_origin_model
(
check_func
,
use_device
=
DeviceType
.
GPU
,
compare_seperately
=
False
)
check_func
,
use_device
=
DeviceType
.
CUDA
,
compare_seperately
=
False
)
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py
浏览文件 @
bb20dcfc
...
...
@@ -32,7 +32,7 @@ class TestResnetWithFuseAllReduceGPU(TestResnetBase):
optimizer
=
seresnext_net
.
optimizer
,
fuse_all_reduce_ops
=
True
)
self
.
_compare_result_with_origin_model
(
check_func
,
use_device
=
DeviceType
.
GPU
,
delta2
=
1e-2
)
check_func
,
use_device
=
DeviceType
.
CUDA
,
delta2
=
1e-2
)
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py
浏览文件 @
bb20dcfc
...
...
@@ -21,7 +21,7 @@ import paddle.fluid.core as core
class
TestResnetWithReduceBase
(
TestParallelExecutorBase
):
def
_compare_reduce_and_allreduce
(
self
,
use_device
,
delta2
=
1e-5
):
if
use_device
==
DeviceType
.
GPU
and
not
core
.
is_compiled_with_cuda
():
if
use_device
==
DeviceType
.
CUDA
and
not
core
.
is_compiled_with_cuda
():
return
all_reduce_first_loss
,
all_reduce_last_loss
=
self
.
check_network_convergence
(
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py
浏览文件 @
bb20dcfc
...
...
@@ -20,7 +20,7 @@ from test_parallel_executor_seresnext_with_reduce_cpu import TestResnetWithReduc
class
TestResnetWithReduceGPU
(
TestResnetWithReduceBase
):
def
test_seresnext_with_reduce
(
self
):
self
.
_compare_reduce_and_allreduce
(
use_device
=
DeviceType
.
GPU
,
delta2
=
1e-2
)
use_device
=
DeviceType
.
CUDA
,
delta2
=
1e-2
)
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
浏览文件 @
bb20dcfc
...
...
@@ -191,11 +191,11 @@ class TestTransformer(TestParallelExecutorBase):
if
core
.
is_compiled_with_cuda
():
self
.
check_network_convergence
(
transformer
,
use_device
=
DeviceType
.
GPU
,
use_device
=
DeviceType
.
CUDA
,
feed_data_reader
=
get_feed_data_reader
())
self
.
check_network_convergence
(
transformer
,
use_device
=
DeviceType
.
GPU
,
use_device
=
DeviceType
.
CUDA
,
enable_sequential_execution
=
True
,
feed_data_reader
=
get_feed_data_reader
())
self
.
check_network_convergence
(
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录