Crayon鑫 / Paddle (forked from PaddlePaddle/Paddle)

Commit b6e7f8e9 (unverified), authored by xiongkun on Oct 21, 2021, committed via GitHub on Oct 21, 2021.
User specified backend (#35745)
Parent: 921c0917

Showing 20 changed files with 948 additions and 65 deletions (+948, -65).
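In short, this change lets users pick the collective backend explicitly instead of relying on compile flags alone, and wires a CPU-only gloo path through launch, spawn, and the imperative allreduce. A minimal spawn-side sketch of the new surface (grounded in the `test_cpuonly_spawn.py` added below; the `train` body here is a trivial stand-in):

```python
import paddle.distributed as dist

def train():
    # each worker initializes its parallel env; the backend was chosen by spawn
    dist.init_parallel_env()
    print("rank", dist.get_rank(), "of", dist.get_world_size())

if __name__ == '__main__':
    # backend defaults to 'auto' (nccl on CUDA builds, bkcl on XPU builds,
    # gloo on CPU-only builds); 'gloo' forces a CPU-only run.
    dist.spawn(train, backend='gloo', nprocs=2)
```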
Changed files:

- paddle/fluid/framework/fleet/gloo_wrapper.h (+18, -0)
- paddle/fluid/imperative/gloo_context.cc (+113, -2)
- paddle/fluid/imperative/gloo_context.h (+8, -0)
- python/paddle/distributed/fleet/launch.py (+45, -6)
- python/paddle/distributed/fleet/launch_utils.py (+56, -7)
- python/paddle/distributed/parallel.py (+6, -21)
- python/paddle/distributed/spawn.py (+75, -13)
- python/paddle/distributed/utils.py (+19, -3)
- python/paddle/fluid/tests/unittests/CMakeLists.txt (+18, -0)
- python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py (+1, -2)
- python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py (+1, -0)
- python/paddle/fluid/tests/unittests/test_cpuonly_launch.sh (+42, -0)
- python/paddle/fluid/tests/unittests/test_cpuonly_spawn.py (+72, -0)
- python/paddle/fluid/tests/unittests/test_dist_base.py (+170, -9)
- python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py (+65, -0)
- python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_gloo.py (+59, -0)
- python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height_gloo.py (+44, -0)
- python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer_gloo.py (+61, -0)
- python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables_gloo.py (+72, -0)
- python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py (+3, -2)
paddle/fluid/framework/fleet/gloo_wrapper.h

```diff
@@ -238,6 +238,24 @@ class GlooWrapper {
     return ret;
   }

+  // TODO(xiongkun03): support all gather array of
+  // numbers with different length
+  // can use AllgathervOptions, may be work in different
+  // occasion. Need some survey.
+  template <typename T>
+  void AllGatherVector(T *input_ptr, T *output_ptr,
+                       size_t element_num) {  // NOLINT
+    CHECK_EQ(is_initialized_, true);
+#ifdef PADDLE_WITH_GLOO
+    gloo::AllgatherOptions opts(context_);
+    opts.setInput(input_ptr, element_num);
+    opts.setOutput(output_ptr, element_num * size_);
+    gloo::allgather(opts);
+#else
+    LOG(WARNING) << "AllGather does nothing when WITH_GLOO=OFF";
+#endif
+  }
+
  protected:
   bool is_initialized_ = false;
 #ifdef PADDLE_WITH_GLOO
```
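For orientation, `AllGatherVector` assumes every rank contributes the same `element_num`, so the output buffer must hold `element_num * size_` entries. A pure-Python model of that contract (illustrative only, no gloo involved):

```python
import numpy as np

def all_gather_vector(rank_inputs):
    """Toy model of GlooWrapper::AllGatherVector: every rank contributes the
    same number of elements, and each rank ends up with the concatenation of
    all contributions in rank order."""
    element_num = len(rank_inputs[0])
    assert all(len(x) == element_num for x in rank_inputs)
    return np.concatenate([np.asarray(x) for x in rank_inputs])

# a "world" of 3 ranks, 2 elements each -> output of 2 * 3 elements
print(all_gather_vector([[1, 2], [3, 4], [5, 6]]))  # [1 2 3 4 5 6]
```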
paddle/fluid/imperative/gloo_context.cc

```diff
@@ -18,6 +18,7 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/split.h"
+#include "paddle/fluid/string/string_helper.h"

 namespace paddle {
 namespace framework {
@@ -67,8 +68,36 @@ void GLOOParallelContext::AllReduceByStream(const framework::Variable &src,
                                             framework::Variable *dst,
                                             int ring_id, bool use_calc_stream) {
   // AllReduce(src, dst, strategy_, ring_id, use_calc_stream);
-  auto src_tensor = src.Get<framework::LoDTensor>();
-  auto *dst_tensor = dst->GetMutable<framework::LoDTensor>();
+  if (src.IsType<framework::LoDTensor>()) {
+    if (!dst->IsType<framework::LoDTensor>()) {
+      dst->Clear();
+    }
+    AllReduce(src.Get<framework::LoDTensor>(),
+              dst->GetMutable<framework::LoDTensor>());
+  } else if (src.IsType<framework::SelectedRows>()) {
+    if (&src != dst) {
+      if (!dst->IsType<framework::SelectedRows>()) {
+        dst->Clear();
+      }
+      AllReduce(src.Get<framework::SelectedRows>(),
+                dst->GetMutable<framework::SelectedRows>());
+    } else {
+      // SelectedRows cannot be allreduce in-place
+      framework::Variable tmp_dst;
+      AllReduce(src.Get<framework::SelectedRows>(),
+                tmp_dst.GetMutable<framework::SelectedRows>());
+      *dst = std::move(tmp_dst);
+    }
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Unsupported variable type %s for imperative allreduce, only "
+        "LoDTensor and SelectedRows are supported.",
+        platform::demangle(framework::ToTypeName(src.Type()))));
+  }
+}
+
+void GLOOParallelContext::AllReduce(const framework::Tensor &src_tensor,
+                                    framework::Tensor *dst_tensor) {
   auto gloo_wrapper = framework::GlooWrapper::GetInstance();
   dst_tensor->Resize(src_tensor.dims());
   switch (src_tensor.type()) {
@@ -84,6 +113,88 @@ void GLOOParallelContext::AllReduceByStream(const framework::Variable &src,
   gloo_wrapper->Barrier();
 }

+#define GLOO_ALL_GATHER_CASE(type, T, gw)                         \
+  case type: {                                                    \
+    const auto *src_tensor_ptr = src_tensor.data<T>();            \
+    gw->AllGatherVector<T>(const_cast<T *>(src_tensor_ptr),       \
+                           reinterpret_cast<T *>(dst_tensor_ptr), \
+                           value_sendcount);                      \
+    break;                                                        \
+  }
+
+void GLOOParallelContext::AllReduce(const framework::SelectedRows &src,
+                                    framework::SelectedRows *dst) {
+  // auto ;
+  // int local_rank = strategy_.local_rank_;
+  int nranks = strategy_.nranks_;
+  VLOG(3) << "SelectedRows AllReduce start";
+  const auto &src_tensor = src.value();
+  const auto &place = src_tensor.place();
+  auto dtype = src_tensor.type();
+  // 1. Gather rows number from all workers. Here use ncclAllGather to do this,
+  // but we can use other ways to implement is in the future
+  const auto &src_rows = src.rows();
+  auto gloo_wrapper = framework::GlooWrapper::GetInstance();
+  size_t local_row_num = src_rows.size();
+  std::vector<size_t> rows_num_vector =
+      gloo_wrapper->AllGather<size_t>(local_row_num);
+  const auto *cpu_rows_num_ptr = rows_num_vector.data();
+  auto rows_num = std::accumulate(cpu_rows_num_ptr, cpu_rows_num_ptr + nranks,
+                                  static_cast<int64_t>(0));
+  dst->set_height(src.height());
+  VLOG(3) << "Gather rows: " << string::join_strings(rows_num_vector, ',')
+          << ", total rows number: " << rows_num
+          << ", height: " << src.height();
+  auto *dst_rows = dst->mutable_rows();
+  dst_rows->resize(rows_num);
+  auto *dst_rows_ptr = dst_rows->MutableData(place);
+  const int64_t *src_rows_ptr = src_rows.Data(place);
+  // VLOG(3) << "Selected Rows of src:" << string::join_strings(dst_rows, ',')
+
+  auto *dst_tensor = dst->mutable_value();
+  auto dims = src_tensor.dims();
+  dims[0] = rows_num;
+  auto feature_size = framework::product(dims) / dims[0];
+  dst_tensor->Resize(dims);
+  if (std::all_of(cpu_rows_num_ptr, cpu_rows_num_ptr + nranks,
+                  [&](size_t row) { return row == cpu_rows_num_ptr[0]; })) {
+    // During sparse communication, the number of each card is same.
+    // Because gloo wrapper utility class currently don't support
+    // broadcast, so we only deal the-same case.
+    VLOG(3) << "Use the gloo all reduce to sync. SRC:" << src_tensor;
+    // framework::SerializeToStream(VLOG(4), src);
+    VLOG(3) << "allgather replaces broadcast to speed up in sparse allreduce";
+    auto value_sendcount = cpu_rows_num_ptr[0] * feature_size;
+    auto *dst_tensor_ptr = dst_tensor->mutable_data(place, dtype);
+    gloo_wrapper->AllGatherVector<int64_t>(const_cast<int64_t *>(src_rows_ptr),
+                                           static_cast<int64_t *>(dst_rows_ptr),
+                                           rows_num_vector[0]);
+    switch (dtype) {
+      GLOO_ALL_GATHER_CASE(framework::proto::VarType::FP32, float,
+                           gloo_wrapper);
+      GLOO_ALL_GATHER_CASE(framework::proto::VarType::FP64, double,
+                           gloo_wrapper);
+      GLOO_ALL_GATHER_CASE(framework::proto::VarType::INT32, int, gloo_wrapper);
+      GLOO_ALL_GATHER_CASE(framework::proto::VarType::INT64, int64_t,
+                           gloo_wrapper);
+      default: {
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Invalid datatype for allreduce"));
+      }
+    }
+    VLOG(3) << "Selected Row DST:" << *dst_tensor;
+    VLOG(3) << "Selected Rows of DST:"
+            << string::join_strings(std::vector<int64_t>(*dst_rows), ',');
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "The number of each card is not the same, gloo only support the-same"
+        "batch division"));
+  }
+}
+
 paddle::platform::DeviceContext *GLOOParallelContext::GetDeviceContext(
     int ring_id) {
   // return the CPUDeviceContext
```
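The `SelectedRows` overload only supports the case where all ranks hold the same row count: it all-gathers the per-rank row counts, then the row indices, then the value rows, in place of a broadcast. A rough numpy model of the data movement (hypothetical helper name; the real code also resizes the dense value tensor to the gathered shape):

```python
import numpy as np

def sparse_allreduce_via_allgather(rank_rows, rank_values):
    """Toy model of GLOOParallelContext::AllReduce(SelectedRows): gather the
    per-rank row counts, require them to be equal, then all-gather row
    indices and value rows so every rank sees every (row, value) pair."""
    counts = [len(rows) for rows in rank_rows]   # AllGather<size_t>(local_row_num)
    assert len(set(counts)) == 1, "gloo path only supports equal row counts"
    dst_rows = np.concatenate(rank_rows)         # AllGatherVector<int64_t> on rows
    dst_values = np.concatenate(rank_values)     # GLOO_ALL_GATHER_CASE on values
    return dst_rows, dst_values

rows, values = sparse_allreduce_via_allgather(
    [np.array([0, 3]), np.array([1, 2])],        # rows held by rank 0 / rank 1
    [np.ones((2, 4)), 2 * np.ones((2, 4))])      # matching value rows
print(rows)          # [0 3 1 2]
print(values.shape)  # (4, 4)
```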
paddle/fluid/imperative/gloo_context.h

```diff
@@ -16,6 +16,9 @@
 #include <memory>
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/imperative/parallel_context.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -52,6 +55,11 @@ class GLOOParallelContext : public ParallelContext {
   void SynchronizeCompute() override;

+ private:
+  void AllReduce(const framework::Tensor &src, framework::Tensor *dst);
+  void AllReduce(const framework::SelectedRows &src,
+                 framework::SelectedRows *dst);
+
  private:
   std::unique_ptr<platform::CPUDeviceContext> device_;
 };
```
python/paddle/distributed/fleet/launch.py

```diff
@@ -103,7 +103,12 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
         type=str,
         default="log",
         help="The path for each process's log. Default --log_dir=log/")
+    base_group.add_argument(
+        "--backend",
+        type=str,
+        default="auto",
+        help="Specifize the backend, can be gloo|nccl|bkcl|auto. Default value is auto which perfers nccl or bkcl."
+    )
     base_group.add_argument(
         "--nproc_per_node",
         type=int,
@@ -230,8 +235,21 @@ def get_cluster_from_args(args, device_mode, devices_per_proc):
         devices_per_proc)


+def cpuonly_check(args):
+    if args.ips and len(args.ips.split(',')) > 1:
+        raise RuntimeError(
+            "CPUONLY launch only support single trainer, that is len(ips)=1, but got %s."
+            % args.ips)
+    if args.run_mode:
+        assert args.run_mode == 'cpuonly', "CPUONLY launch only support run mode is CPUONLY"
+    if args.servers:
+        raise RuntimeError("CPUONLY launch can't have --servers as arguments.")
+    return True
+
+
 def launch_collective(args):
     # parse arguments, used for cloud-single-machine and local
+    if args.backend == 'gloo': cpuonly_check(args)
     (device_mode, devices_per_proc) = launch_utils.get_device_proc_info(args)
     trainers_num = cloud_utils.get_trainers_num()
     logger.debug("parsed from args trainerss_num:{} mode:{} devices:{}".format(
@@ -265,6 +283,7 @@ def launch_collective(args):
     global_envs["PADDLE_WITH_GLOO"] = str(os.getenv("PADDLE_WITH_GLOO", "0"))
     global_envs["PADDLE_GLOO_RENDEZVOUS"] = "3"
     global_envs["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir
+    global_envs["PADDLE_DISTRI_BACKEND"] = args.backend

     procs = start_local_trainers(
         cluster,
@@ -349,9 +368,12 @@ def which_distributed_mode(args):
     if fluid.core.is_compiled_with_cuda():
         accelerators = fluid.core.get_cuda_device_count()
+        args.backend = 'nccl'
     elif fluid.core.is_compiled_with_npu():
+        args.backend = 'unknown'
         accelerators = fluid.core.get_npu_device_count()
     elif fluid.core.is_compiled_with_xpu():
+        args.backend = 'bkcl'
         accelerators = fluid.core.get_xpu_device_count()
     else:
         accelerators = 0
@@ -372,10 +394,14 @@ def which_distributed_mode(args):
     else:
         if not fluid.core.is_compiled_with_cuda(
         ) and not fluid.core.is_compiled_with_xpu():
-            logger.warning(
-                "Not found distinct arguments and not compiled with cuda or xpu. Default use ps mode"
-            )
-            return DistributeMode.PS
+            if args.servers:
+                logger.warning(
+                    "Not found distinct arguments and not compiled with cuda or xpu. \
+But found args.servers not empty, default use ps mode")
+                return DistributeMode.PS
+            else:
+                args.backend = "gloo"
+                return DistributeMode.COLLECTIVE
         else:
             logger.warning(
                 "Not found distinct arguments and compiled with cuda or xpu. Default use collective mode"
@@ -556,7 +582,20 @@ def launch():
     logger = get_logger()
     _print_arguments(args)

-    distribute_mode = which_distributed_mode(args)
+    if args.backend == 'auto':
+        distribute_mode = which_distributed_mode(
+            args)  # which_distributed_mode must modify args.backend
+        assert args.backend in ['gloo', 'nccl', 'bkcl', 'unknown']
+    else:
+        assert args.run_mode == 'collective' or args.run_mode == None, "When backend is not 'auto', run mode must be collective"
+        check_backend(args.backend)
+        distribute_mode = DistributeMode.COLLECTIVE
+
+    block_windows_and_macos(
+        args.backend)  # raise error when using gloo on windows or macos
+    if args.backend == 'gloo':
+        logger.warning("launch start with CPUONLY mode")
+
     if enable_elastic(args, distribute_mode):
         launch_elastic(args, distribute_mode)
```
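The new flag is exercised from the command line exactly as in `test_cpuonly_launch.sh` below; a hedged Python-side equivalent of that invocation (`train.py` stands in for any collective training script):

```python
import subprocess

# --backend and --nproc_per_node are the real launch flags; the shipped test
# runs parallel_dygraph_gradient_check.py in place of "train.py".
subprocess.check_call([
    "python", "-m", "paddle.distributed.launch",
    "--backend=gloo", "--nproc_per_node=4", "train.py",
])
```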
python/paddle/distributed/fleet/launch_utils.py

```diff
@@ -22,6 +22,7 @@ import subprocess
 import tempfile
 import shutil
 from contextlib import closing
+import multiprocessing
 import socket
 import warnings
 import six
@@ -30,6 +31,7 @@ import struct
 import paddle
 import paddle.fluid as fluid
 from distutils.util import strtobool
+import paddle.utils.cpp_extension.extension_utils as utils

 logger = logging.getLogger("root")
 logger.propagate = False
@@ -669,29 +671,31 @@ def get_xpus(xpus):
     return res_xpus


-def get_device_mode():
+def get_device_mode(backend):
     if fluid.core.is_compiled_with_npu() and \
             fluid.core.get_npu_device_count() > 0:
         print("launch train in ascend npu mode!")
         return DeviceMode.ASCEND_NPU

-    if fluid.core.is_compiled_with_cuda() and \
+    if backend == 'nccl' and \
             fluid.core.get_cuda_device_count() > 0:
         print("launch train in GPU mode!")
         return DeviceMode.GPU

-    if fluid.core.is_compiled_with_xpu() and fluid.core.get_xpu_device_count(
-    ) > 0:
+    if backend == 'bkcl' and fluid.core.get_xpu_device_count() > 0:
         print("launch train in XPU mode")
         return DeviceMode.XPU

-    print("launch train in CPU mode")
-    return DeviceMode.CPU
+    if backend == 'gloo':
+        print("launch train in CPU mode")
+        return DeviceMode.CPU
+
+    raise RuntimeError("Don't supported devices")


 def get_device_proc_info(args):
     # device_mode
-    device_mode = get_device_mode()
+    device_mode = get_device_mode(args.backend)

     # devices
     devices_per_proc = []
@@ -722,6 +726,9 @@ def get_device_proc_info(args):
         else:
             devices_per_proc = xpus
     elif device_mode == DeviceMode.CPU:
+        if hasattr(args, "paddle_cpuonly") and args.nproc_per_node is None:
+            #NOTE (xiongkun03) set it to cpu core number
+            args.nproc_per_node = multiprocessing.cpu_count()
         if args.nproc_per_node is None:
             devices_per_proc = [0]
         else:
@@ -1237,3 +1244,45 @@ class ParameterServerLauncher(object):
             tp.cmd = cmd

             self.procs["heter_worker"].append(tp)
+
+
+def check_backend(backend):
+    if backend not in ['nccl', 'gloo', 'bkcl', 'auto']:
+        raise ValueError(
+            "paddle.distributed initialize error, "
+            "backend argument can only be one of 'nccl', 'gloo', 'bkcl', 'auto', but got %s"
+            % backend)
+
+    if backend == 'nccl' and not fluid.core.is_compiled_with_cuda():
+        raise ValueError(
+            "paddle.distributed initialize error, "
+            "your paddle is not compiled with cuda but you assign 'nccl' as backend."
+        )
+
+    if backend == 'bkcl' and not fluid.core.is_compiled_with_xpu():
+        raise ValueError(
+            "paddle.distributed initialize error, "
+            "your paddle is not compiled with xpu but you assign 'bkcl' as backend."
+        )
+
+
+def block_windows_and_macos(backend):
+    if backend != 'gloo': return
+    if utils.OS_NAME.startswith('darwin'):  # MACOS , block
+        raise ValueError(
+            "You are going to using gloo on macos, but currently is not supported"
+        )
+    if utils.IS_WINDOWS:  # MACOS , block
+        raise ValueError(
+            "You are going to using gloo on windows, but currently is not supported"
+        )
+
+
+def get_backend_by_compile_flag():
+    if fluid.core.is_compiled_with_cuda():
+        return 'nccl'

+    if fluid.core.is_compiled_with_xpu():
+        return 'bkcl'
+
+    return 'gloo'
```
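A minimal sketch of how the three new helpers compose at launch time (assuming a CPU-only build, so the compile-flag fallback resolves to 'gloo'):

```python
from paddle.distributed.fleet.launch_utils import (
    block_windows_and_macos, check_backend, get_backend_by_compile_flag)

backend = get_backend_by_compile_flag()  # 'nccl' | 'bkcl' | 'gloo'
check_backend(backend)                   # ValueError on an unknown/unbuilt backend
block_windows_and_macos(backend)         # gloo is rejected on Windows and macOS
```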
python/paddle/distributed/parallel.py

```diff
@@ -26,6 +26,7 @@ from paddle import compat as cpt
 from paddle.fluid import core
 from paddle.fluid.framework import _set_expected_place
 from paddle.fluid.dygraph import parallel_helper
+from paddle.distributed.fleet.launch_utils import check_backend
 from paddle.fluid.dygraph.parallel import ParallelEnv
 from paddle.distributed.fleet.base.private_helper_function import wait_server_ready  # noqa: F401
@@ -55,25 +56,8 @@ def _start_kv_server(port, http_server_d, size):
     http_server.stop()


-def _check_backend(backend):
-    if backend not in ['nccl', 'gloo', 'bkcl', 'auto']:
-        raise ValueError(
-            "paddle.distributed initialize error, "
-            "backend argument can only be one of 'nccl', 'gloo', 'bkcl', 'auto', but got %s"
-            % backend)
-
-    if backend == 'nccl' and not core.is_compiled_with_cuda():
-        raise ValueError(
-            "paddle.distributed initialize error, "
-            "your paddle is not compiled with cuda but you assign 'nccl' as backend."
-        )
-
-    if backend == 'bkcl' and not core.is_compiled_with_xpu():
-        raise ValueError(
-            "paddle.distributed initialize error, "
-            "your paddle is not compiled with xpu but you assign 'bkcl' as backend."
-        )
-
+def _is_cpuonly(backend):
+    check_backend(backend)
     if backend in ['auto', 'nccl', 'bkcl'] and (core.is_compiled_with_cuda() or
                                                 core.is_compiled_with_xpu()):
         # passes 'auto' and can use cuda or xpu, use the default logics. so return False
@@ -82,7 +66,7 @@ def _check_backend(backend):
         return True


-def init_parallel_env(backend='auto'):
+def init_parallel_env():
     """
     Initialize parallel training environment in dynamic graph mode.
@@ -154,7 +138,8 @@ def init_parallel_env(backend='auto'):
         return
     # NOTE(xiongkun): support cpu gloo only, add this environment variable to
     # enable cpu only gloo prarllel training)
-    is_cpu_only = _check_backend(backend)
+    backend = os.environ.get('PADDLE_DISTRI_BACKEND', 'auto')
+    is_cpu_only = _is_cpuonly(backend)
     # 1. gpu xpu check, must be gpu or xpu,
     if not (is_cpu_only or core.is_compiled_with_cuda() or
             core.is_compiled_with_xpu()):
```
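With this change `init_parallel_env()` no longer takes a `backend` argument; the backend is read from the `PADDLE_DISTRI_BACKEND` environment variable that launch/spawn export for each worker. Setting it by hand, as in this sketch, only illustrates the mechanism (the surrounding trainer env vars must already be in place):

```python
import os
import paddle.distributed as dist

# launch/spawn normally export PADDLE_DISTRI_BACKEND for every worker;
# 'gloo' selects the CPU-only collective path.
os.environ.setdefault('PADDLE_DISTRI_BACKEND', 'gloo')
dist.init_parallel_env()
```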
python/paddle/distributed/spawn.py

```diff
@@ -24,8 +24,10 @@ import warnings
 from paddle.distributed.utils import _print_arguments
 from paddle.distributed.utils import _prepare_trainer_env
 from paddle.distributed.utils import get_host_name_ip
-from paddle.distributed.cloud_utils import get_cluster_and_pod
+from paddle.distributed.cloud_utils import get_cluster_and_pod, _get_trainers_num
+from paddle.distributed.fleet.launch import get_cluster_from_args
 from paddle.distributed.fleet.cloud_utils import use_paddlecloud
+from paddle.distributed.fleet.launch_utils import DeviceMode, check_backend, block_windows_and_macos
 from paddle.device import get_device

 # deprecated module import
@@ -71,7 +73,9 @@ def _py_supported_check():
 def _options_valid_check(options):
     # `print_config` keeped as a debug options, not show to users
-    supported_options = ['start_method', 'ips', 'gpus', 'xpus', 'print_config']
+    supported_options = [
+        'start_method', 'ips', 'gpus', 'xpus', 'print_config', 'backend'
+    ]
     deprecated_options = [
         'selected_devices', 'started_port', 'cluster_node_ips', 'node_ip',
         'use_paddlecloud'
@@ -95,6 +99,22 @@ def _get_default_nprocs():
         return core.get_cuda_device_count()
     elif 'xpu' in device:
         return core.get_xpu_device_count()
+    elif 'cpu' in device:
+        return multiprocessing.cpu_count()
     else:
         raise RuntimeError(
             "`paddle.distributed.spawn` does not support parallel training on device `{}` now."
             .format(device))
+
+
+def _get_default_backend():
+    device = get_device()
+    if 'gpu' in device:
+        return 'nccl'
+    elif 'xpu' in device:
+        return 'bkcl'
+    elif 'cpu' in device:
+        return 'gloo'
+    else:
+        raise RuntimeError(
+            "`paddle.distributed.spawn` does not support parallel training on device `{}` now."
+            .format(device))
@@ -112,6 +132,16 @@ def _get_node_ip(ips):
 def _get_subprocess_env_list(nprocs, options):
+    # NOTE (xiongkun03) Why put backend deduction  here ?
+    # Becase _get_subprocess_env_list is used by many testcases.
+    # So for campability, we put backend deduction here
+
+    # logic for handle backend option
+    if 'backend' not in options or options['backend'] == 'auto':
+        options['backend'] = _get_default_backend()
+    check_backend(options['backend'])
+    block_windows_and_macos(options['backend'])
+
     # contruct processes env list
     processes_env_list = []
@@ -133,7 +163,7 @@ def _get_subprocess_env_list(nprocs, options):
     # if we set FLAGS_selected_gpus or FLAGS_selected_xpus to be `0,1,2,3`, it may cause error
     # when using `ParallelEnv`
     # NOTE(chenweihang): use absolute gpu or xpu card id
-    if core.is_compiled_with_cuda():
+    if options['backend'] == 'nccl':
         args.selected_devices = options.get('gpus', None)
         if args.selected_devices is None:
             args.selected_devices = options.get('selected_devices', None)
@@ -168,7 +198,7 @@ def _get_subprocess_env_list(nprocs, options):
                     "CUDA_VISIBLE_DEVICES (%s)." %
                     (card_id, ",".join(env_devices_list)))

-    elif core.is_compiled_with_xpu():
+    elif options['backend'] == 'bkcl':
         args.selected_devices = options.get('xpus', None)
         if args.selected_devices is None:
             args.selected_devices = options.get('selected_devices', None)
@@ -202,6 +232,23 @@ def _get_subprocess_env_list(nprocs, options):
                     raise ValueError("The selected xpu card %s cannot found in "
                                      "XPU_VISIBLE_DEVICES (%s)." %
                                      (card_id, ",".join(env_devices_list)))
+    elif options['backend'] == 'gloo':
+        # TODO check gpu / xpu flag must not exist
+        warnings.warn(
+            "Your model will be trained under CPUONLY mode by using GLOO,"
+            "because CPUPlace is specified manually or your installed PaddlePaddle only support CPU Device."
+        )
+        args.paddle_cpuonly = True
+        args.selected_devices = None
+        args.ips = args.cluster_node_ips
+        assert options.get(
+            'use_paddlecloud',
+            None) is None, "CPUONLY spawn doesn't support use paddle cloud"
+        assert len(
+            args.cluster_node_ips.split(',')
+        ) <= 1, "CPUONLY spawn only support single trainer, that is len(ips)=1, but got %s."
+        assert _get_trainers_num() == 1, "CPUONLY spawn doesn't support multi-trainer"

     # set other inner args
     args.node_ip = options.get('node_ip', None)
@@ -215,11 +262,17 @@ def _get_subprocess_env_list(nprocs, options):
     args.use_paddlecloud = use_paddlecloud()

     # get cluster and pod config
-    cluster, pod = get_cluster_and_pod(args)
+    if options['backend'] == 'gloo':
+        devices_per_proc = [x for x in range(0, nprocs)]
+        cluster, pod = get_cluster_from_args(args, DeviceMode.CPU,
+                                             devices_per_proc)
+    else:
+        cluster, pod = get_cluster_and_pod(args)

     # prepare subprocess env list
     for trainer in pod.trainers:
-        processes_env_list.append(_prepare_trainer_env(cluster, trainer))
+        processes_env_list.append(
+            _prepare_trainer_env(cluster, trainer, options['backend']))

     # [Debug] print config
     args.print_config = options.get('print_config', False)
@@ -236,27 +289,35 @@ def _remove_risky_env():
     os.environ.pop("https_proxy", None)


-def _set_trainer_env(env_dict):
+def _set_trainer_env(env_dict, backend):
     # NOTE(chenweihang): [ Why need set FLAGS_selected_gpus or FLAGS_selected_xpus here? ]
     # When the child process starts, it will inherit the configuration of the
     # main process and set the FLAGS once, but the environment variable has
     # not been set at this time, which leads to the FLAGS_selected_gpus or FLAGS_selected_xpus
     # is keep same with mainprocess(usually empty), so manually update the flags here
-    if core.is_compiled_with_cuda():
+
+    # NOTE(xiongkun): why put backend here? because if gloo, we shouldn't set FLAGS_selectedXXX
+    #
+    if backend == 'nccl':
         set_flags({'FLAGS_selected_gpus': env_dict['FLAGS_selected_gpus']})
-    elif core.is_compiled_with_xpu():
+    elif backend == 'bkcl':
         set_flags({'FLAGS_selected_xpus': env_dict['FLAGS_selected_xpus']})
     else:
-        raise ValueError("PaddlePaddle should be compiled with XPU or CUDA.")
+        #NOTE(xiongkun) why not raise Error ?
+        # So far, we added support for CPU parallel, and will be applied when paddle is not
+        # compiled with cuda or xp. just do nothing.
+        pass

     for var_name in env_dict:
         os.environ[var_name] = env_dict[var_name]


-def _func_wrapper(func, args, error_queue, return_queue, env_dict):
+def _func_wrapper(func, args, error_queue, return_queue, env_dict, backend):
     try:
         # config subprocess environment variables
         _remove_risky_env()
-        _set_trainer_env(env_dict)
+        _set_trainer_env(env_dict, backend)

         # execute function
         result = func(*args)

         # record function return value
@@ -487,7 +548,8 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options):
         return_queue = mp.SimpleQueue()
         process = mp.Process(
             target=_func_wrapper,
-            args=(func, args, error_queue, return_queue, procs_env_list[i]))
+            args=(func, args, error_queue, return_queue, procs_env_list[i],
+                  options['backend']))
         process.daemon = daemon
         process.start()
         error_queues.append(error_queue)
```
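The spawn-side deduction mirrors `paddle.device.get_device()`: the device prefix picks the default backend when the user passes 'auto' or nothing. A standalone sketch of that mapping (hypothetical function name):

```python
def default_backend(device: str) -> str:
    """Mirror of the new _get_default_backend deduction: get_device() returns
    strings like 'cpu', 'gpu:0' or 'xpu:0', and the prefix selects the
    backend."""
    for key, backend in (('gpu', 'nccl'), ('xpu', 'bkcl'), ('cpu', 'gloo')):
        if key in device:
            return backend
    raise RuntimeError("unsupported device %r" % device)

assert default_backend('gpu:0') == 'nccl'
assert default_backend('cpu') == 'gloo'
```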
python/paddle/distributed/utils.py

```diff
@@ -25,6 +25,7 @@ import subprocess
 from contextlib import closing
 import socket
 from paddle.fluid import core
+from paddle.distributed.fleet.launch_utils import get_backend_by_compile_flag
 from distutils.util import strtobool

 from paddle.fluid.layer_helper import LayerHelper
@@ -613,8 +614,10 @@ def find_free_ports(num):
     return None


-def _prepare_trainer_env(cluster, trainer):
-    if core.is_compiled_with_xpu():
+def _prepare_trainer_env(cluster, trainer, backend=None):
+    if backend is None:
+        backend = get_backend_by_compile_flag()  # for compatibility
+    if backend == 'bkcl':
         proc_env = {
             "FLAGS_selected_xpus":
             "%s" % ",".join([str(g) for g in trainer.gpus]),
@@ -623,7 +626,7 @@ def _prepare_trainer_env(cluster, trainer):
             "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
             "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints())
         }
-    elif core.is_compiled_with_cuda():
+    elif backend == 'nccl':
         proc_env = {
             "FLAGS_selected_gpus":
             "%s" % ",".join([str(g) for g in trainer.gpus]),
@@ -632,6 +635,19 @@ def _prepare_trainer_env(cluster, trainer):
             "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
             "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints())
         }
+    elif backend == 'gloo':
+        # NOTE (xiongkun) default fall back into cpu only
+        proc_env = {
+            "PADDLE_TRAINER_ID": "%d" % trainer.rank,
+            "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint,
+            "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
+            "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
+            "PADDLE_DISTRI_BACKEND":
+            backend,  # only add here, other will be auto
+        }
+    else:
+        raise ValueError("backend must be one of 'gloo, nccl, bkcl'")

     return proc_env
```
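For reference, the environment a gloo worker receives from `_prepare_trainer_env` carries only endpoint bookkeeping plus the backend marker; the values below are illustrative, the keys are exactly those set for `backend='gloo'`:

```python
gloo_proc_env = {
    "PADDLE_TRAINER_ID": "0",
    "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:6170",
    "PADDLE_TRAINERS_NUM": "2",
    "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:6170,127.0.0.1:6171",
    "PADDLE_DISTRI_BACKEND": "gloo",  # read back by init_parallel_env()
}
```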
python/paddle/fluid/tests/unittests/CMakeLists.txt

```diff
@@ -200,8 +200,14 @@ endif()
 list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_hybrid_parallel)

+LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_transformer_gloo) # NOTE: @xiongkun03, cpu is too slow, fix it in next PR
+if (NOT WITH_GLOO)
+    LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_dataparallel_cpuonly)
+    LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_unused_variables_gloo)
+    LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding_over_height_gloo)
+    LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding_gloo)
+endif()

 if ((NOT WITH_GPU) AND (NOT WITH_ROCM))
@@ -491,6 +497,10 @@ if (APPLE OR WIN32)
     list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_dataset)
 endif()

+if (NOT WITH_GLOO)
+    LIST(REMOVE_ITEM TEST_OPS test_cpuonly_spawn)
+endif()
+
 if(NOT WITH_GPU OR WIN32 OR APPLE)
     list(REMOVE_ITEM TEST_OPS test_build_strategy_fusion_group_pass)
 endif()
@@ -654,6 +664,9 @@ if(WITH_DISTRIBUTE)
         endforeach(TEST_OP)
         # solve it later.
         bash_test_modules(test_fleet_launch_ps START_BASH test_fleet_launch_ps.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+        if (WITH_GLOO)
+            bash_test_modules(test_cpuonly_launch START_BASH test_cpuonly_launch.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+        endif()
         bash_test_modules(test_new_group START_BASH test_new_group.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}+20" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
     endif(NOT APPLE)
 endif()
@@ -1070,3 +1083,8 @@ set_tests_properties(test_inplace_addto_strategy PROPERTIES TIMEOUT 120)
 set_tests_properties(test_eigvals_op PROPERTIES TIMEOUT 400)
 set_tests_properties(test_tensordot PROPERTIES TIMEOUT 1000)
 set_tests_properties(test_tensordot PROPERTIES LABELS "RUN_TYPE=NIGHTLY")
+if (WITH_GLOO)
+    set_tests_properties(test_parallel_dygraph_unused_variables_gloo PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_parallel_dygraph_sparse_embedding_gloo PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_parallel_dygraph_sparse_embedding_over_height_gloo PROPERTIES TIMEOUT 120)
+endif()
```
python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py

```diff
@@ -66,8 +66,7 @@ class SimpleNet(fluid.Layer):
 class TestDistTraning(unittest.TestCase):
     def test_multiple_gpus(self):
-        backend = os.environ.get('PADDLE_DISTRI_BACKEND', 'auto')
-        dist.init_parallel_env(backend)
+        dist.init_parallel_env()
         self.trainer_id = dist.get_rank()

         model_a = SimpleNet(self.trainer_id)
```
python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py

```diff
@@ -324,6 +324,7 @@ class TestSeResNeXt(TestParallelDyGraphRunnerBase):
         bs = len(data)
         dy_x_data = np.array([x[0].reshape(3, 224, 224)
                               for x in data]).astype('float32')
+        dy_x_data = dy_x_data / 255.0
         y_data = np.array(
             [x[1] for x in data]).astype('int64').reshape(bs, 1)
         img = to_variable(dy_x_data)
         label = to_variable(y_data)
```
python/paddle/fluid/tests/unittests/test_cpuonly_launch.sh (new file, mode 100644)

```bash
#!/bin/bash

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

function test_launch_cpuonly(){
    python -m paddle.distributed.launch --nproc_per_node=4 --backend=gloo \
        parallel_dygraph_gradient_check.py 2>ut.elog
    if grep -q "ABORT" ut.elog; then
        echo "test cpu only failed"
        exit -1
    else
        if grep -q "CPUONLY" ut.elog; then
            echo "test_launch_cpuonly successfully"
        else
            echo "test_launch_cpuonly failed"
            exit -1
        fi
    fi
}
function test_launch_error_case1(){
    python -m paddle.distributed.launch --nproc_per_node=4 --backend=random_str \
        parallel_dygraph_gradient_check.py 2>ut.elog
    if grep -q "ValueError" ut.elog; then
        echo "test_launch_error_case1 successfully"
    else
        exit -1
    fi
}

test_launch_cpuonly
test_launch_error_case1
```
python/paddle/fluid/tests/unittests/test_cpuonly_spawn.py (new file, mode 100644)

```python
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest

import paddle
import paddle.nn as nn
import paddle.optimizer as opt
import paddle.distributed as dist


class LinearNet(nn.Layer):
    def __init__(self):
        super(LinearNet, self).__init__()
        self._linear1 = nn.Linear(10, 10)
        self._linear2 = nn.Linear(10, 1)

    def forward(self, x):
        return self._linear2(self._linear1(x))


def train(print_result=False):
    # 1. initialize parallel environment
    dist.init_parallel_env()

    # 2. create data parallel layer & optimizer
    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer)

    loss_fn = nn.MSELoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())

    # 3. run layer
    inputs = paddle.randn([10, 10], 'float32')
    outputs = dp_layer(inputs)
    labels = paddle.randn([10, 1], 'float32')
    loss = loss_fn(outputs, labels)

    if print_result is True:
        print("loss:", loss.numpy())

    loss.backward()
    print("Grad is", layer._linear1.weight.grad)
    adam.step()
    adam.clear_grad()


class TestSpawn(unittest.TestCase):
    def test_spawn(self):
        dist.spawn(train, backend='gloo', nprocs=4)

    def test_wrong_backend(self):
        try:
            dist.spawn(train, backend='something', nprocs=4)
        except ValueError as e:
            self.assertEqual(type(e), ValueError)


if __name__ == '__main__':
    unittest.main()
```
python/paddle/fluid/tests/unittests/test_dist_base.py

```diff
@@ -209,7 +209,11 @@ class TestDistRunnerBase(object):
         def get_data():
             origin_batch = next(reader_generator)
-            if args.update_method != "local" and args.use_reader_alloc:
+            if paddle.distributed.get_world_size(
+            ) == 1 and args.update_method == 'gloo':  # Gloo single mode
+                return origin_batch
+
+            elif args.update_method != "local" and args.use_reader_alloc:
                 new_batch = []
                 for offset, item in enumerate(origin_batch):
                     if offset % 2 == args.trainer_id:
@@ -506,7 +510,10 @@ class TestParallelDyGraphRunnerBase(object):
             "train_one_loop should be implemented by the child classes.")

     def _get_data(self, batch, args):
-        if args.update_method != "local":
+        if paddle.distributed.get_world_size(
+        ) == 1 and args.update_method == 'gloo':  # Gloo single mode
+            return batch
+        elif args.update_method != "local":
             new_batch = []
             for offset, item in enumerate(batch):
                 if offset % 2 == args.trainer_id:
@@ -518,14 +525,16 @@ class TestParallelDyGraphRunnerBase(object):
     def run_trainer(self, args):
         seed = 90
-        if fluid.core.is_compiled_with_cuda():
+        if args.update_method == 'gloo':
+            place = fluid.CPUPlace()
+        elif fluid.core.is_compiled_with_cuda():
             device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
             place = fluid.CUDAPlace(device_id)
         elif fluid.core.is_compiled_with_xpu():
             device_id = int(os.getenv("FLAGS_selected_xpus", "0"))
             place = fluid.XPUPlace(device_id)
         else:
-            assert ("Only support CUDAPlace or XPUPlace for now.")
+            assert ("Only support CUDAPlace or XPUPlace or CPU(Gloo) for now.")

         with fluid.dygraph.guard(place):
             fluid.default_startup_program().random_seed = seed
@@ -554,6 +563,16 @@ class TestParallelDyGraphRunnerBase(object):
                 model = dygraph.parallel.DataParallel(
                     model, strategy, find_unused_parameters=True)
                 print_to_err(type(self).__name__, "model built in dygraph")
+
+            elif args.update_method == "gloo":
+                paddle.distributed.init_parallel_env()
+                if not args.find_unused_parameters:
+                    model = dygraph.parallel.DataParallel(
+                        model, find_unused_parameters=False)
+                else:
+                    model = dygraph.parallel.DataParallel(
+                        model, find_unused_parameters=True)
+
             out_losses = []
             print_to_err(type(self).__name__, "begin to run dygraph training")
             for step_id, data in enumerate(train_reader()):
@@ -588,12 +607,12 @@ class TestParallelDyGraphRunnerBase(object):
         args.trainer_id = paddle.distributed.get_rank()

         # 3. init parallel env
-        if args.update_method == "nccl2":
+        if args.update_method in ["nccl2", "gloo"]:
             paddle.distributed.init_parallel_env()

         # 4. train model
         model, train_reader, opt = self.get_model()
-        if args.update_method == "nccl2":
+        if args.update_method in ["nccl2", "gloo"]:
             if args.find_unused_parameters:
                 model = paddle.DataParallel(model, find_unused_parameters=True)
             else:
@@ -668,7 +687,9 @@ def runtime_main(test_class):
         '--update_method',
         type=str,
         default="local",
-        choices=["pserver", "nccl2", "bkcl", "local", "nccl2_reduce_layer"])
+        choices=[
+            "pserver", "nccl2", "bkcl", "local", "nccl2_reduce_layer", "gloo"
+        ])
     parser.add_argument('--trainer_id', type=int, required=False, default=0)
     parser.add_argument('--trainers', type=int, required=False, default=1)
     parser.add_argument('--nccl_comm_num', type=int, required=False, default=1)
@@ -685,6 +706,7 @@ def runtime_main(test_class):
         '--current_endpoint', type=str, required=False, default="")
     parser.add_argument('--sync_mode', action='store_true')
     parser.add_argument('--use_cuda', action='store_true')
+    parser.add_argument('--use_cpu', action='store_true')
     parser.add_argument('--use_xpu', action='store_true')
     parser.add_argument('--use_dgc', action='store_true')
     parser.add_argument('--accumulate_gradient', action='store_true')
@@ -713,6 +735,9 @@ def runtime_main(test_class):
     args = parser.parse_args()

+    if args.update_method == 'gloo':
+        paddle.set_device("cpu")
+
     model = test_class()
     if args.role == "pserver" and args.update_method == "pserver":
         model.run_pserver(args)
@@ -770,6 +795,7 @@ class TestDistBase(unittest.TestCase):
         self._use_reader_alloc = True
         self._nccl2_mode = False
         self._bkcl_mode = False
+        self._gloo_mode = False  # now, support gloo backend
         self._pipeline_mode = False
         self._mp_mode = False
         # FIXME(typhoonzero): I added this stupid argument to enable
@@ -875,7 +901,7 @@ class TestDistBase(unittest.TestCase):
                    batch_size=DEFAULT_BATCH_SIZE,
                    batch_merge_repeat=1,
                    log_name="",
-                   devices="0"):
+                   devices="1"):

         cmd = self._python_interp
@@ -947,6 +973,21 @@ class TestDistBase(unittest.TestCase):
         return pickle.loads(local_out)

+    def _run_local_gloo(self,
+                        model,
+                        envs,
+                        check_error_log=False,
+                        batch_size=DEFAULT_BATCH_SIZE,
+                        batch_merge_repeat=1,
+                        log_name="",
+                        devices="0"):
+        saved_endpoints = self._ps_endpoints
+        self._ps_endpoints = self._ps_endpoints.split(',')[0]
+        result = self._run_cluster_gloo(model, envs, 'gloo', check_error_log,
+                                        log_name)
+        self._ps_endpoints = saved_endpoints
+        return result
+
     def _run_cluster(self, model, envs, check_error_log, log_name):
         # Run dist train to compare with local results
         ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(
@@ -1037,6 +1078,62 @@ class TestDistBase(unittest.TestCase):
         return pickle.loads(tr0_out), pickle.loads(tr1_out)

+    def _get_gloo_trainer_cmd(self, model, ep, update_method, trainer_id,
+                              trainer_num):
+        env = {}
+        tr_cmd = "%s -u"
+        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
+            tr_cmd += " -m coverage run --branch -p"
+
+        tr_cmd += " %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method %s --lr %f"
+        tr_cmd = tr_cmd % \
+                 (self._python_interp, model, self._ps_endpoints,
+                  trainer_id, ep, update_method, self._lr)
+
+        if self._use_reduce:
+            tr_cmd += " --use_reduce"
+        if self._use_reader_alloc:
+            tr_cmd += " --use_reader_alloc"
+        #assert self._use_reduce == False, "gloo not support _use_reduce"
+        #assert self._use_reader_alloc == False, "gloo not support _use_reduce"
+        if self._save_model:
+            tr_cmd += " --save_model"
+        self.__use_cuda = False
+        self.__use_xpu = False
+        assert self.__use_cuda == False, "gloo not support use cuda"
+        assert self.__use_xpu == False, "gloo not support use xpu"
+        tr_cmd += " --use_cpu"
+        env.update({
+            "PADDLE_TRAINERS_NUM": "{}".format(trainer_num),
+            "PADDLE_TRAINER_ID": "{}".format(trainer_id),
+            "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
+            "PADDLE_CURRENT_ENDPOINT": ep,
+            "PADDLE_CURRENT_ENDPOINT": ep,
+            "PADDLE_DISTRI_BACKEND": "gloo",
+            "GLOG_v": "2",
+        })
+
+        assert self._use_dgc == False, "gloo not support use dgc"
+        if self._accumulate_gradient:
+            tr_cmd += " --accumulate_gradient"
+        if self._find_unused_parameters:
+            tr_cmd += " --find_unused_parameters"
+        assert self._pipeline_mode == False, "gloo not support use pipeline"
+        if self._enable_backward_deps:  # build strategy, save it
+            tr_cmd += " --enable_backward_deps"
+        if self._fuse_all_reduce is not None:
+            tr_cmd += " --fuse_all_reduce {}".format(self._fuse_all_reduce)
+        assert self._use_fleet_api == False, "gloo not support use fleet api"
+        assert self._use_fleet_api_20 == False, "gloo not support use fleet api"
+        return tr_cmd, env
+
     def _get_nccl2_trainer_cmd(self, model, ep, update_method, trainer_id,
                                trainer_num):
         env = {}
@@ -1123,6 +1220,57 @@ class TestDistBase(unittest.TestCase):
         return tr_cmd, env

+    def _run_cluster_gloo(self, model, envs, update_method, check_error_log,
+                          log_name):
+        assert update_method == "gloo", "_run_cluster_gloo must have update_method: gloo, but get %s" % update_method
+        assert not self._use_hallreduce, "_run_cluster_gloo must have _use_hallreduce = false"
+
+        worker_endpoints = self._ps_endpoints.split(",")
+
+        trainer_num = len(worker_endpoints)
+
+        procs = []
+        pipes = []
+        for i in range(0, trainer_num):
+            tr_cmd, tr_env = self._get_gloo_trainer_cmd(
+                model, worker_endpoints[i], update_method, i, trainer_num)
+            tr_env.update(envs)
+            tr_env["GLOG_vmodule"] = 'gloo_context=4'
+            tr_env["GLOG_v"] = '3'
+            print("use_hallreduce:{} tr_cmd:{}, env: {}".format(
+                self._use_hallreduce, tr_cmd, tr_env))
+
+            tr_pipe = open(log_name + "_tr{}_err.log".format(i), "wb")
+
+            print_to_err(
+                type(self).__name__,
+                "going to start process {} with nccl2".format(i))
+            tr_proc = subprocess.Popen(
+                tr_cmd.strip().split(" "),
+                stdout=subprocess.PIPE,
+                stderr=tr_pipe,
+                env=tr_env)
+
+            procs.append(tr_proc)
+            pipes.append(tr_pipe)
+
+        outs = []
+        for i in range(0, trainer_num):
+            tr_out, tr_err = procs[i].communicate()
+            outs.append(tr_out)
+            pipes[i].close()
+            sys.stderr.write('trainer {} stderr: {}\n'.format(i, tr_err))
+
+        if trainer_num == 1:
+            if check_error_log: print("outs[0]:", outs[0])
+            return pickle.loads(outs[0])
+        else:
+            if check_error_log:
+                print("outs[0]:", outs[0])
+                print("outs[1]:", outs[1])
+            return pickle.loads(outs[0]), pickle.loads(outs[1])
+
     def _run_cluster_nccl2(self, model, envs, update_method, check_error_log,
                            log_name):
         if self._use_hallreduce:
@@ -1262,7 +1410,12 @@ class TestDistBase(unittest.TestCase):
         required_envs = self._get_required_envs(check_error_log, need_envs)

-        local_losses \
-            = self._run_local(model_file, required_envs,
-                              check_error_log, log_name=log_name)
+        if self._gloo_mode:
+            local_losses \
+                = self._run_local_gloo(model_file, required_envs,
+                                       check_error_log, log_name=log_name)
+        else:
+            local_losses \
+                = self._run_local(model_file, required_envs,
+                                  check_error_log, log_name=log_name)
@@ -1288,6 +1441,14 @@ class TestDistBase(unittest.TestCase):
                 update_method='bkcl',
                 check_error_log=check_error_log,
                 log_name=log_name)
+        elif self._gloo_mode:
+            # gloo mode, cpu only parallel train @xiongkun03
+            tr0_losses, tr1_losses = self._run_cluster_gloo(
+                model_file,
+                required_envs,
+                update_method='gloo',
+                check_error_log=check_error_log,
+                log_name=log_name)
         elif self._pipeline_mode:
             tr0_losses, tr1_losses = self._run_pipeline(
```
python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py

```diff
@@ -49,6 +49,51 @@ def get_gpus(selected_gpus):
     return selected_gpus


+def start_local_trainers_cpu(trainer_endpoints,
+                             training_script,
+                             training_script_args,
+                             log_dir=None):
+    current_env = copy.copy(os.environ.copy())
+    current_env.pop("http_proxy", None)
+    current_env.pop("https_proxy", None)
+
+    procs = []
+    n_rank = len(trainer_endpoints)
+    print(trainer_endpoints)
+    for rank_id, endpoint in enumerate(trainer_endpoints):
+        proc_env = {
+            "PADDLE_DISTRI_BACKEND": "gloo",
+            "PADDLE_TRAINER_ID": "%d" % rank_id,
+            "PADDLE_CURRENT_ENDPOINT": "%s" % endpoint,
+            "PADDLE_TRAINERS_NUM": "%d" % n_rank,
+            "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints)
+        }
+
+        current_env.update(proc_env)
+
+        print("trainer proc env:{}".format(current_env))
+
+        assert os.getenv('WITH_COVERAGE',
+                         'OFF') == 'OFF', "Gloo don't support WITH_COVERAGE."
+        cmd = "python -u " + training_script
+
+        print("start trainer proc:{} env:{}".format(cmd, proc_env))
+
+        fn = None
+
+        proc = subprocess.Popen(cmd.split(" "), env=current_env)
+
+        tp = TrainerProc()
+        tp.proc = proc
+        tp.rank = rank_id
+        tp.log_fn = fn
+        tp.cmd = cmd
+
+        procs.append(tp)
+
+    return procs
+
+
 def start_local_trainers(cluster,
                          pod,
                          training_script,
@@ -116,6 +161,26 @@ class TestMultipleGpus(unittest.TestCase):
             training_script=target_file_name,
             training_script_args=[])

         while True:
             alive = watch_local_trainers(procs, cluster.trainers_endpoints())

             if not alive:
                 print("Local procs complete, POD info:{}".format(pod))
                 break
             time.sleep(3)


+class TestMultipleWithGloo(unittest.TestCase):
+    def run_mnist_2cpu(self, target_file_name):
+
+        cluster, pod = get_cluster_from_args(
+            [0, 1])  #tmp use. for getting trainer_nranks()
+
+        procs = start_local_trainers_cpu(
+            cluster.trainers_endpoints(),
+            training_script=target_file_name,
+            training_script_args=[])
+
+        while True:
+            alive = watch_local_trainers(procs, cluster.trainers_nranks())
```
python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_gloo.py (new file, mode 100644)

```python
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import os
import sys
import unittest

import paddle.fluid as fluid
from test_dist_base import TestDistBase
from spawn_runner_base import TestDistSpawnRunner
from parallel_dygraph_sparse_embedding import TestSparseEmbedding
from parallel_dygraph_sparse_embedding_fp64 import TestSparseEmbeddingFP64

flag_name = os.path.splitext(__file__)[0]


class TestParallelDygraphSparseEmdedding_GLOO(TestDistBase):
    def _setup_config(self):
        self._sync_mode = False
        self._gloo_mode = True
        self._dygraph = True

    def test_sparse_embedding(self):
        self.check_with_place(
            "parallel_dygraph_sparse_embedding.py",
            delta=1e-5,
            check_error_log=True,
            log_name=flag_name)


class TestParallelDygraphSparseEmdeddingFP64_GLOO(TestDistBase):
    def _setup_config(self):
        self._sync_mode = False
        self._gloo_mode = True
        self._dygraph = True

    def test_sparse_embedding_fp64(self):
        self.check_with_place(
            "parallel_dygraph_sparse_embedding_fp64.py",
            delta=1e-5,
            check_error_log=True,
            log_name=flag_name)


if __name__ == "__main__":
    unittest.main()
```
python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height_gloo.py (new file, mode 100644)

```python
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import os
import sys
import unittest

import paddle.fluid as fluid
from test_dist_base import TestDistBase
from spawn_runner_base import TestDistSpawnRunner
from parallel_dygraph_sparse_embedding_over_height import TestSparseEmbeddingOverHeight

flag_name = os.path.splitext(__file__)[0]


class TestParallelDygraphSparseEmdeddingOverHeight_GLOO(TestDistBase):
    def _setup_config(self):
        self._sync_mode = False
        self._gloo_mode = True
        self._dygraph = True

    def test_sparse_embedding(self):
        self.check_with_place(
            "parallel_dygraph_sparse_embedding_over_height.py",
            delta=1e-7,
            check_error_log=True,
            log_name=flag_name)


if __name__ == "__main__":
    unittest.main()
```
python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer_gloo.py (new file, mode 100644)

```python
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import os
import sys
import unittest

import paddle.fluid as fluid
from test_dist_base import TestDistBase
from spawn_runner_base import TestDistSpawnRunner
from parallel_dygraph_transformer import TestTransformer

flag_name = os.path.splitext(__file__)[0]


class TestParallelDygraphTransformer_GLOO(TestDistBase):
    def _setup_config(self):
        self._sync_mode = False
        self._gloo_mode = True
        self._dygraph = True

    def test_transformer(self):
        self.check_with_place(
            "parallel_dygraph_transformer.py",
            delta=1e-5,
            check_error_log=True,
            log_name=flag_name)


class TestParallelDygraphTransformerAccGrad_GLOO(TestDistBase):
    def _setup_config(self):
        self._sync_mode = False
        self._gloo_mode = True
        self._dygraph = True
        self._accumulate_gradient = True
        self._find_unused_parameters = False

    def test_transformer(self):
        if fluid.core.is_compiled_with_cuda():
            self.check_with_place(
                "parallel_dygraph_transformer.py",
                delta=1e-5,
                check_error_log=True,
                log_name=flag_name)


if __name__ == "__main__":
    unittest.main()
```
python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables_gloo.py (new file, mode 100644)

```python
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import os
import sys
import unittest

import paddle.fluid as fluid
from test_dist_base import TestDistBase
from spawn_runner_base import TestDistSpawnRunner
from parallel_dygraph_unused_variables import TestSparseEmbeddingUnusedVars

flag_name = os.path.splitext(__file__)[0]


class TestParallelDygraphUnusedVar_GLOO(TestDistBase):
    def _setup_config(self):
        self._sync_mode = False
        self._gloo_mode = True
        self._dygraph = True

    def test_net(self):
        self.check_with_place(
            "parallel_dygraph_unused_variables.py",
            delta=1e-5,
            check_error_log=True,
            log_name=flag_name)


class TestParallelDygraphNoVar_GLOO(TestDistBase):
    def _setup_config(self):
        self._sync_mode = False
        self._gloo_mode = True
        self._dygraph = True

    def test_net(self):
        self.check_with_place(
            "parallel_dygraph_none_var.py",
            delta=1e-5,
            check_error_log=True,
            log_name=flag_name)


class TestParallelDygraphSharedUnusedVariables_GLOO(TestDistBase):
    def _setup_config(self):
        self._sync_mode = False
        self._gloo_mode = True
        self._dygraph = True

    def test_mnist(self):
        self.check_with_place(
            "parallel_dygraph_shared_unused_var.py",
            delta=1e-5,
            check_error_log=True,
            log_name=flag_name)


if __name__ == "__main__":
    unittest.main()
```
python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py

```diff
@@ -24,6 +24,7 @@ from paddle.distributed.spawn import _get_subprocess_env_list, _options_valid_ch
 from paddle.fluid import core
 from paddle.fluid.dygraph import parallel_helper
+import multiprocessing

 # NOTE(chenweihang): Coverage CI is currently not able to count python3
 # unittest, so the unittests here covers some cases that will only be
@@ -89,8 +90,8 @@ class TestSpawnAssistMethod(unittest.TestCase):
     def test_get_default_nprocs(self):
         paddle.set_device('cpu')
-        with self.assertRaises(RuntimeError):
-            nprocs = _get_default_nprocs()
+        nprocs = _get_default_nprocs()
+        self.assertEqual(nprocs, multiprocessing.cpu_count())

         paddle.set_device('gpu')
         nprocs = _get_default_nprocs()
```