PaddlePaddle / Paddle
Commit bb20dcfc (unverified)

[Kunlun] bug fix of PR2: Support MultiDevicePass and BKCL in parallel executor (#29961)

Authored on Dec 29, 2020 by liuyuhui; committed via GitHub on Dec 29, 2020
Parent commit: 6a0102b0

Showing 18 changed files with 51 additions and 56 deletions (+51, -56)
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py  +7  -12
python/paddle/fluid/tests/unittests/seresnext_net.py  +4  -4
python/paddle/fluid/tests/unittests/seresnext_test_base.py  +1  -1
python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py  +4  -4
python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py  +3  -3
python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py  +5  -5
python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py  +3  -3
python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py  +1  -1
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py  +3  -3
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py  +2  -2
python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py  +1  -1
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py  +9  -9
python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py  +2  -2
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py  +1  -1
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py  +1  -1
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py  +1  -1
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py  +1  -1
python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py  +2  -2
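The change driving every hunk below is in parallel_executor_test_base.py: the Python-side DeviceType class (CPU/GPU/XPU) is replaced by core.DeviceType, whose CUDA member takes over the role of the old GPU value, so the tests now pass DeviceType.CUDA wherever they previously passed DeviceType.GPU. Below is a minimal sketch of the resulting device-to-place mapping; it is not part of the commit, it assumes a Paddle build of this era (paddle.fluid available, core.DeviceType exposing CPU/CUDA/XPU), and the helper name place_for is hypothetical.

# Sketch of the device-selection chain used in parallel_executor_test_base.py
# after this commit. place_for is an illustrative helper, not a Paddle API.
import paddle.fluid as fluid
import paddle.fluid.core as core

DeviceType = core.DeviceType  # replaces the old Python-side DeviceType class


def place_for(use_device):
    # Map the enum member to an execution place, mirroring
    # "fluid.CUDAPlace(0) if use_device == DeviceType.CUDA else ..." in the diff.
    if use_device == DeviceType.CUDA:
        return fluid.CUDAPlace(0)
    if use_device == DeviceType.XPU:
        return fluid.XPUPlace(0)
    return fluid.CPUPlace()


# e.g. exe = fluid.Executor(place_for(DeviceType.CUDA))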
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py

@@ -28,19 +28,14 @@ import sys
 from feed_data_reader import FeedDataReader
 
 __all__ = ['TestParallelExecutorBase']
 
-class DeviceType:
-    CPU = 1
-    GPU = 2
-    XPU = 3
+DeviceType = core.DeviceType
 
 class TestParallelExecutorBase(unittest.TestCase):
     @classmethod
     def check_network_convergence(cls,
                                   method,
-                                  use_device=DeviceType.GPU,
+                                  use_device=DeviceType.CUDA,
                                   iter=5,
                                   batch_size=None,
                                   feed_dict=None,
@@ -81,7 +76,7 @@ class TestParallelExecutorBase(unittest.TestCase):
             main, method, optimizer)
 
         place = fluid.CUDAPlace(
-            0) if use_device == DeviceType.GPU else fluid.XPUPlace(
+            0) if use_device == DeviceType.CUDA else fluid.XPUPlace(
                 0) if use_device == DeviceType.XPU else fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(startup)
@@ -102,7 +97,7 @@ class TestParallelExecutorBase(unittest.TestCase):
         if batch_size is not None:
             batch_size *= fluid.core.get_cuda_device_count(
-            ) if use_device == DeviceType.GPU else fluid.core.get_xpu_device_count(
+            ) if use_device == DeviceType.CUDA else fluid.core.get_xpu_device_count(
             ) if use_device == DeviceType.XPU else int(
                 os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
@@ -132,7 +127,7 @@ class TestParallelExecutorBase(unittest.TestCase):
     @classmethod
     def check_pass_conflict(cls,
                             method,
-                            use_device=DeviceType.GPU,
+                            use_device=DeviceType.CUDA,
                             feed_dict=None,
                             get_data_from_feeder=None,
                             use_reduce=False,
@@ -153,7 +148,7 @@ class TestParallelExecutorBase(unittest.TestCase):
             main, method, optimizer)
 
         place = fluid.CUDAPlace(
-            0) if use_device == DeviceType.GPU else fluid.XPUPlace(
+            0) if use_device == DeviceType.CUDA else fluid.XPUPlace(
                 0) if use_device == DeviceType.XPU else fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(startup)
@@ -191,7 +186,7 @@ class TestParallelExecutorBase(unittest.TestCase):
         build_strategy.enable_inplace = enable_inplace
         build_strategy.enable_sequential_execution = enable_sequential_execution
 
-        if use_device == DeviceType.GPU and core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and core.is_compiled_with_cuda():
             build_strategy.remove_unnecessary_lock = True
         if use_device == DeviceType.XPU and core.is_compiled_with_xpu():
             build_strategy.fuse_elewise_add_act_ops = False
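For orientation between files: a minimal sketch, not part of the commit, of how a test derived from TestParallelExecutorBase exercises the renamed enum after the change above. The imports and keyword names follow the surrounding test files (assuming simple_nets.py is unchanged); the class and test names are hypothetical.

# Hypothetical derived test; mirrors the guard-and-call pattern used in the
# unittests changed by this commit.
import unittest

import paddle.fluid.core as core
from parallel_executor_test_base import TestParallelExecutorBase, DeviceType
from simple_nets import simple_fc_net, init_data


class TestDeviceTypeSketch(TestParallelExecutorBase):
    def test_simple_fc_on_each_device(self):
        for use_device in (DeviceType.CPU, DeviceType.CUDA, DeviceType.XPU):
            # Skip devices the current build cannot run, mirroring the guards
            # added throughout this commit.
            if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
                continue
            if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
                continue
            img, label = init_data()
            self.check_network_convergence(
                simple_fc_net,
                feed_dict={"image": img, "label": label},
                use_device=use_device,
                iter=2)


if __name__ == '__main__':
    unittest.main()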
python/paddle/fluid/tests/unittests/seresnext_net.py

@@ -171,20 +171,20 @@ model = SE_ResNeXt50Small
 def batch_size(use_device):
-    if use_device == DeviceType.GPU:
+    if use_device == DeviceType.CUDA:
         # Paddle uses 8GB P4 GPU for unittest so we decreased the batch size.
         return 8
     return 12
 
 def iter(use_device):
-    if use_device == DeviceType.GPU:
+    if use_device == DeviceType.CUDA:
         return 10
     return 1
 
 gpu_img, gpu_label = init_data(
-    batch_size=batch_size(use_device=DeviceType.GPU),
+    batch_size=batch_size(use_device=DeviceType.CUDA),
     img_shape=img_shape,
     label_range=999)
 cpu_img, cpu_label = init_data(
@@ -196,6 +196,6 @@ feed_dict_cpu = {"image": cpu_img, "label": cpu_label}
 def feed_dict(use_device):
-    if use_device == DeviceType.GPU:
+    if use_device == DeviceType.CUDA:
         return feed_dict_gpu
     return feed_dict_cpu
python/paddle/fluid/tests/unittests/seresnext_test_base.py

@@ -26,7 +26,7 @@ class TestResnetBase(TestParallelExecutorBase):
                                           use_device,
                                           delta2=1e-5,
                                           compare_seperately=True):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         func_1_first_loss, func_1_last_loss = self.check_network_convergence(
python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py

@@ -35,7 +35,7 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
                                        get_data_from_feeder=None,
                                        optimizer=None,
                                        fuse_all_optimizer_ops=False):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         feed_dict_data = None
@@ -82,12 +82,12 @@ class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):
             fuse_all_optimizer_ops=True)
 
     def test_simple_fc_with_fuse_all_reduce(self):
-        self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.GPU)
+        self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CUDA)
         self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CPU)
 
     def test_batchnorm_fc_with_fuse_all_reduce(self):
         self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
-                                                DeviceType.GPU)
+                                                DeviceType.CUDA)
         self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
                                                 DeviceType.CPU)
@@ -126,7 +126,7 @@ class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase):
     def test_simple_bow_net_with_fuse_all_reduce(self):
         model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
-        self._decorate_compare_fused_all_reduce(model, DeviceType.GPU)
+        self._decorate_compare_fused_all_reduce(model, DeviceType.CUDA)
         self._decorate_compare_fused_all_reduce(model, DeviceType.CPU)
python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py

@@ -26,7 +26,7 @@ class TestMNIST(TestParallelExecutorBase):
         os.environ['CPU_NUM'] = str(4)
 
     def _compare_fuse_elewise_add_act_ops(self, model, use_device):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         img, label = init_data()
@@ -66,12 +66,12 @@ class TestMNIST(TestParallelExecutorBase):
             self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
 
     def test_simple_fc_with_fuse_op(self):
-        self._compare_fuse_elewise_add_act_ops(simple_fc_net, DeviceType.GPU)
+        self._compare_fuse_elewise_add_act_ops(simple_fc_net, DeviceType.CUDA)
         self._compare_fuse_elewise_add_act_ops(simple_fc_net, DeviceType.CPU)
 
     def test_batchnorm_fc_with_fuse_op(self):
         self._compare_fuse_elewise_add_act_ops(fc_with_batchnorm,
-                                               DeviceType.GPU)
+                                               DeviceType.CUDA)
         self._compare_fuse_elewise_add_act_ops(fc_with_batchnorm,
                                                DeviceType.CPU)
python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py

@@ -38,7 +38,7 @@ class TestFuseOptimizationOps(TestParallelExecutorBase):
                                               feed_dict=None,
                                               get_data_from_feeder=None,
                                               optimizer=fluid.optimizer.Adam):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
@@ -76,7 +76,7 @@ class TestFuseAdamOps(TestFuseOptimizationOps):
     def test_batchnorm_fc_with_fuse_op(self):
         self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, DeviceType.GPU, optimizer=self.optimizer)
+            fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer)
         self._decorate_compare_fused_optimizer_ops(
             fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer)
@@ -121,7 +121,7 @@ class TestSpareFuseAdamOps(TestFuseOptimizationOps):
     def test_simple_bow_net_with_fuse_op(self):
         model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
         self._decorate_compare_fused_optimizer_ops(
-            model, DeviceType.GPU, optimizer=self.optimizer)
+            model, DeviceType.CUDA, optimizer=self.optimizer)
         self._decorate_compare_fused_optimizer_ops(
             model, DeviceType.CPU, optimizer=self.optimizer)
@@ -144,7 +144,7 @@ class TestPassConflictBase(TestFuseAdamOps):
                                               feed_dict=None,
                                               get_data_from_feeder=None,
                                               optimizer=fluid.optimizer.Adam):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         self.check_pass_conflict(
@@ -165,7 +165,7 @@ class TestFuseAdamOpsPassConflict(TestPassConflictBase):
         self._decorate_compare_fused_optimizer_ops(
             fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer)
         self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, DeviceType.GPU, optimizer=self.optimizer)
+            fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer)
 
 class TestFuseSGDOpsPassConflict(TestFuseAdamOpsPassConflict):
python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py

@@ -73,7 +73,7 @@ class TestMNIST(TestParallelExecutorBase):
         return img, label
 
     def _compare(self, model, use_device, random_data=True, only_forward=False):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         img, label = self._init_data(random_data)
@@ -108,11 +108,11 @@ class TestMNIST(TestParallelExecutorBase):
             self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
 
     def test_simple_depthwise_with_fuse_op(self):
-        self._compare(simple_depthwise_net, DeviceType.GPU)
+        self._compare(simple_depthwise_net, DeviceType.CUDA)
         self._compare(simple_depthwise_net, DeviceType.CPU)
 
     def test_simple_depthwise_with_fuse_op_only_forward(self):
-        self._compare(simple_depthwise_net, DeviceType.GPU, only_forward=True)
+        self._compare(simple_depthwise_net, DeviceType.CUDA, only_forward=True)
         self._compare(simple_depthwise_net, DeviceType.CPU, only_forward=True)
python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py

@@ -58,7 +58,7 @@ class TestIrInplace(TestParallelExecutorBase):
             fc_with_batchnorm,
             feed_dict={"image": img,
                        "label": label},
-            use_device=DeviceType.GPU,
+            use_device=DeviceType.CUDA,
             use_ir_memory_optimize=ir_memory_optimize,
             enable_inplace=enable_inplace)
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py

@@ -61,7 +61,7 @@ class TestMNIST(TestParallelExecutorBase):
         return img, label
 
     def _compare_ir_memory_optimize(self, model, use_device):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         img, label = self._dummy_data()
@@ -84,11 +84,11 @@ class TestMNIST(TestParallelExecutorBase):
     def test_simple_fc_net(self):
         self._compare_ir_memory_optimize(simple_fc_net, DeviceType.CPU)
-        self._compare_ir_memory_optimize(simple_fc_net, DeviceType.GPU)
+        self._compare_ir_memory_optimize(simple_fc_net, DeviceType.CUDA)
 
     def test_fc_with_reshape_net(self):
         self._compare_ir_memory_optimize(fc_with_inplace_net, DeviceType.CPU)
-        self._compare_ir_memory_optimize(fc_with_inplace_net, DeviceType.GPU)
+        self._compare_ir_memory_optimize(fc_with_inplace_net, DeviceType.CUDA)
 
 if __name__ == '__main__':
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py

@@ -35,14 +35,14 @@ class TestTransformerWithIR(TestParallelExecutorBase):
             # check python transpiler
             self.check_network_convergence(
                 transformer,
-                use_device=DeviceType.GPU,
+                use_device=DeviceType.CUDA,
                 feed_data_reader=get_feed_data_reader(),
                 use_ir_memory_optimize=False,
                 iter=2)
             # check IR memory optimize
             self.check_network_convergence(
                 transformer,
-                use_device=DeviceType.GPU,
+                use_device=DeviceType.CUDA,
                 feed_data_reader=get_feed_data_reader(),
                 use_ir_memory_optimize=True,
                 iter=2)
python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py

@@ -84,7 +84,7 @@ class TestResnet(TestParallelExecutorBase):
     def test_model(self):
         if core.is_compiled_with_cuda():
-            self.check_model(DeviceType.GPU)
+            self.check_model(DeviceType.CUDA)
 
 if __name__ == '__main__':
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py

@@ -81,7 +81,7 @@ class TestMNIST(TestParallelExecutorBase):
                                       use_device,
                                       delta1=1e-6,
                                       delta2=1e-4):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
@@ -110,7 +110,7 @@ class TestMNIST(TestParallelExecutorBase):
     # simple_fc
     def check_simple_fc_convergence(self, use_device, use_reduce=False):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
@@ -127,7 +127,7 @@ class TestMNIST(TestParallelExecutorBase):
     def test_simple_fc(self):
         # use_device
-        self.check_simple_fc_convergence(DeviceType.GPU)
+        self.check_simple_fc_convergence(DeviceType.CUDA)
         self.check_simple_fc_convergence(DeviceType.CPU)
         self.check_simple_fc_convergence(DeviceType.XPU)
@@ -135,13 +135,13 @@ class TestMNIST(TestParallelExecutorBase):
         # use_device, use_reduce
         # NOTE: the computation result of nccl_reduce is non-deterministic,
         # related issue: https://github.com/NVIDIA/nccl/issues/157
-        self._compare_reduce_and_allreduce(simple_fc_net, DeviceType.GPU, 1e-5,
+        self._compare_reduce_and_allreduce(simple_fc_net, DeviceType.CUDA, 1e-5,
                                            1e-2)
         self._compare_reduce_and_allreduce(simple_fc_net, DeviceType.CPU, 1e-5,
                                            1e-2)
 
     def check_simple_fc_parallel_accuracy(self, use_device):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         img, label = self._init_data()
@@ -167,11 +167,11 @@ class TestMNIST(TestParallelExecutorBase):
             np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
 
     def test_simple_fc_parallel_accuracy(self):
-        self.check_simple_fc_parallel_accuracy(DeviceType.GPU)
+        self.check_simple_fc_parallel_accuracy(DeviceType.CUDA)
         self.check_simple_fc_parallel_accuracy(DeviceType.CPU)
 
     def check_batchnorm_fc_convergence(self, use_device, use_fast_executor):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
         if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
             return
@@ -185,7 +185,7 @@ class TestMNIST(TestParallelExecutorBase):
             use_fast_executor=use_fast_executor)
 
     def test_batchnorm_fc(self):
-        for use_device in (DeviceType.CPU, DeviceType.GPU):
+        for use_device in (DeviceType.CPU, DeviceType.CUDA):
             for use_fast_executor in (False, True):
                 self.check_batchnorm_fc_convergence(use_device,
                                                     use_fast_executor)
@@ -193,7 +193,7 @@ class TestMNIST(TestParallelExecutorBase):
     def test_batchnorm_fc_with_new_strategy(self):
         # NOTE: the computation result of nccl_reduce is non-deterministic,
         # related issue: https://github.com/NVIDIA/nccl/issues/157
-        self._compare_reduce_and_allreduce(fc_with_batchnorm, DeviceType.GPU,
+        self._compare_reduce_and_allreduce(fc_with_batchnorm, DeviceType.CUDA,
                                            1e-5, 1e-2)
         self._compare_reduce_and_allreduce(fc_with_batchnorm, DeviceType.CPU,
                                            1e-5, 1e-2)
python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py

@@ -32,7 +32,7 @@ class TestMNIST(TestParallelExecutorBase):
     # simple_fc
     def check_simple_fc_convergence(self, use_device, use_reduce=False):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         img, label = init_data()
@@ -73,7 +73,7 @@ class TestMNIST(TestParallelExecutorBase):
             np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
 
     def test_simple_fc_parallel_accuracy(self):
-        self.check_simple_fc_parallel_accuracy(DeviceType.GPU)
+        self.check_simple_fc_parallel_accuracy(DeviceType.CUDA)
 
 if __name__ == '__main__':
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py

@@ -30,7 +30,7 @@ class TestResnetGPU(TestResnetBase):
             optimizer=seresnext_net.optimizer,
             use_parallel_executor=False)
         self._compare_result_with_origin_model(
-            check_func, use_device=DeviceType.GPU, compare_seperately=False)
+            check_func, use_device=DeviceType.CUDA, compare_seperately=False)
 
 if __name__ == '__main__':
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py

@@ -32,7 +32,7 @@ class TestResnetWithFuseAllReduceGPU(TestResnetBase):
             optimizer=seresnext_net.optimizer,
             fuse_all_reduce_ops=True)
         self._compare_result_with_origin_model(
-            check_func, use_device=DeviceType.GPU, delta2=1e-2)
+            check_func, use_device=DeviceType.CUDA, delta2=1e-2)
 
 if __name__ == '__main__':
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py

@@ -21,7 +21,7 @@ import paddle.fluid.core as core
 class TestResnetWithReduceBase(TestParallelExecutorBase):
     def _compare_reduce_and_allreduce(self, use_device, delta2=1e-5):
-        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
         all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py

@@ -20,7 +20,7 @@ from test_parallel_executor_seresnext_with_reduce_cpu import TestResnetWithReduc
 class TestResnetWithReduceGPU(TestResnetWithReduceBase):
     def test_seresnext_with_reduce(self):
         self._compare_reduce_and_allreduce(
-            use_device=DeviceType.GPU, delta2=1e-2)
+            use_device=DeviceType.CUDA, delta2=1e-2)
 
 if __name__ == '__main__':
python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py

@@ -191,11 +191,11 @@ class TestTransformer(TestParallelExecutorBase):
         if core.is_compiled_with_cuda():
             self.check_network_convergence(
                 transformer,
-                use_device=DeviceType.GPU,
+                use_device=DeviceType.CUDA,
                 feed_data_reader=get_feed_data_reader())
             self.check_network_convergence(
                 transformer,
-                use_device=DeviceType.GPU,
+                use_device=DeviceType.CUDA,
                 enable_sequential_execution=True,
                 feed_data_reader=get_feed_data_reader())
             self.check_network_convergence(