Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
c0a82748
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
c0a82748
编写于
7月 11, 2019
作者:
G
gongweibao
提交者:
GitHub
7月 11, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Polish backwards optimizer dependency codes and use more default values. (#18255)
上级
d3003a16
变更
12
隐藏空白更改
内联
并排
Showing
12 changed file
with
152 addition
and
129 deletion
+152
-129
paddle/fluid/framework/details/build_strategy.cc
paddle/fluid/framework/details/build_strategy.cc
+9
-1
paddle/fluid/framework/details/build_strategy.h
paddle/fluid/framework/details/build_strategy.h
+1
-1
paddle/fluid/framework/details/nccl_op_handle.h
paddle/fluid/framework/details/nccl_op_handle.h
+2
-1
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+21
-4
paddle/fluid/platform/nccl_helper.h
paddle/fluid/platform/nccl_helper.h
+4
-0
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+2
-9
python/paddle/distributed/launch.py
python/paddle/distributed/launch.py
+1
-1
python/paddle/fluid/compiler.py
python/paddle/fluid/compiler.py
+2
-3
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+0
-1
python/paddle/fluid/tests/unittests/test_dist_base.py
python/paddle/fluid/tests/unittests/test_dist_base.py
+79
-102
python/paddle/fluid/tests/unittests/test_dist_mnist_nccl.py
python/paddle/fluid/tests/unittests/test_dist_mnist_nccl.py
+14
-0
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+17
-6
未找到文件。
paddle/fluid/framework/details/build_strategy.cc
浏览文件 @
c0a82748
...
...
@@ -204,7 +204,9 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
AppendPass
(
"all_reduce_deps_pass"
);
}
if
(
strategy_
.
enable_backward_optimizer_op_deps_
)
{
if
(
strategy_
.
num_trainers_
>
1
&&
!
strategy_
.
async_mode_
&&
!
strategy_
.
is_distribution_
&&
strategy_
.
enable_backward_optimizer_op_deps_
)
{
VLOG
(
1
)
<<
"Add backward_op_deps_pass"
;
AppendPass
(
"backward_optimizer_op_deps_pass"
);
}
...
...
@@ -351,6 +353,12 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
}
else
if
(
pass
->
Type
()
==
"mkldnn_placement_pass"
)
{
pass
->
Set
(
"mkldnn_enabled_op_types"
,
new
std
::
unordered_set
<
std
::
string
>
(
mkldnn_enabled_op_types_
));
}
else
if
(
pass
->
Type
()
==
"backward_optimizer_op_deps_pass"
)
{
if
(
!
use_cuda
)
{
VLOG
(
1
)
<<
"backward_optimizer_op_deps_pass is only supported on "
"GPU, skipped."
;
continue
;
}
}
VLOG
(
3
)
<<
"Start Apply Pass "
<<
pass
->
Type
();
graph
=
pass
->
Apply
(
graph
);
...
...
paddle/fluid/framework/details/build_strategy.h
浏览文件 @
c0a82748
...
...
@@ -72,7 +72,7 @@ struct BuildStrategy {
// Add dependency between backward ops and optimization ops, make sure that
// all the backward ops are finished before running the optimization ops.
// It might make the training speed of data parallelism faster.
bool
enable_backward_optimizer_op_deps_
{
fals
e
};
bool
enable_backward_optimizer_op_deps_
{
tru
e
};
// TODO(dev-paddle): enable_sequential_execution depends on
// kStaleProgramOpDescs, it is not appropriate, because kStaleProgramOpDescs
// will be removed in the near future.
...
...
paddle/fluid/framework/details/nccl_op_handle.h
浏览文件 @
c0a82748
...
...
@@ -59,7 +59,8 @@ class NCCLOpHandleBase : public OpHandleBase {
VLOG
(
10
)
<<
"SetRunEnv "
<<
" run_order:"
<<
run_order
<<
", use_hierarchical_allreduce:"
<<
use_hierarchical_allreduce
;
<<
", use_hierarchical_allreduce:"
<<
use_hierarchical_allreduce
<<
", nccl_ctx_:"
<<
nccl_ctxs_
;
if
(
nccl_ctxs_
==
nullptr
)
{
return
;
...
...
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
c0a82748
...
...
@@ -113,9 +113,12 @@ class ParallelExecutorPrivate {
auto
nccl_id_var
=
scope
->
FindVar
(
var_name
);
if
(
nccl_id_var
)
{
nccl_id
=
nccl_id_var
->
GetMutable
<
ncclUniqueId
>
();
VLOG
(
10
)
<<
"find nccl_id_var:"
<<
var_name
<<
", nccl_id:"
<<
nccl_id
;
}
else
{
nccl_id
=
new
ncclUniqueId
();
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclGetUniqueId
(
nccl_id
));
VLOG
(
10
)
<<
"can't find nccl_id_var:"
<<
var_name
<<
", nccl_id:"
<<
nccl_id
;
}
flat_nccl_ids
.
push_back
(
nccl_id
);
...
...
@@ -170,8 +173,7 @@ class ParallelExecutorPrivate {
}
}
void
InitOrGetNCCLCommunicator
(
framework
::
Scope
*
scope
,
const
BuildStrategy
&
bst
)
{
void
InitOrGetNCCLCommunicator
(
framework
::
Scope
*
scope
,
BuildStrategy
*
bst
)
{
const
std
::
string
var_name
=
"NCCLCommunicator"
;
auto
var
=
scope
->
FindVar
(
var_name
);
if
(
var
!=
nullptr
)
{
...
...
@@ -183,9 +185,24 @@ class ParallelExecutorPrivate {
return
;
}
if
(
bst
->
use_hierarchical_allreduce_
)
{
PADDLE_ENFORCE
(
bst
->
num_trainers_
>
1
,
"num_trainers:%llu < 1"
,
bst
->
num_trainers_
);
PADDLE_ENFORCE
(
bst
->
hierarchical_allreduce_inter_nranks_
>
1
,
"inter_nranks:%d < 1"
,
bst
->
hierarchical_allreduce_inter_nranks_
);
PADDLE_ENFORCE
(
(
bst
->
num_trainers_
%
bst
->
hierarchical_allreduce_inter_nranks_
==
0
),
"num_trainers:%llu mod inter_nranks:%d != 0"
,
bst
->
num_trainers_
,
bst
->
hierarchical_allreduce_inter_nranks_
);
bst
->
hierarchical_allreduce_exter_nranks_
=
bst
->
num_trainers_
/
bst
->
hierarchical_allreduce_inter_nranks_
;
}
VLOG
(
1
)
<<
"not find "
<<
var_name
<<
" in scope, so recreate it!"
;
nccl_ctxs_
=
scope
->
Var
(
var_name
)
->
GetMutable
<
platform
::
NCCLCommunicator
>
();
InitNCCLCtxs
(
scope
,
bst
);
InitNCCLCtxs
(
scope
,
*
bst
);
}
#endif
...
...
@@ -383,7 +400,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
if
(
member_
->
use_cuda_
&&
member_
->
nranks_
>
1
)
{
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
member_
->
InitOrGetNCCLCommunicator
(
scope
,
member_
->
build_strategy_
);
member_
->
InitOrGetNCCLCommunicator
(
scope
,
&
member_
->
build_strategy_
);
// Initialize device context's nccl comm, will be used by normal
// Operators like sync_batch_norm, and collective ops.
...
...
paddle/fluid/platform/nccl_helper.h
浏览文件 @
c0a82748
...
...
@@ -286,10 +286,14 @@ class NCCLCommunicator {
bool
NeedExterAllReduce
()
const
{
return
h_exter_ctxs_
.
size
()
>
0
;
}
NCCLContextMap
*
GetHierarchicalInterCtx
(
size_t
run_order
)
const
{
PADDLE_ENFORCE
(
h_inter_ctxs_
.
size
()
>
0
,
"must init hierarchical ctxs first!"
);
return
h_inter_ctxs_
[
run_order
%
h_inter_ctxs_
.
size
()].
get
();
}
NCCLContextMap
*
GetHierarchicalExterCtx
(
size_t
run_order
)
const
{
PADDLE_ENFORCE
(
h_exter_ctxs_
.
size
()
>
0
,
"must init hierarchical ctxs first!"
);
return
h_exter_ctxs_
[
run_order
%
h_exter_ctxs_
.
size
()].
get
();
}
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
c0a82748
...
...
@@ -1408,27 +1408,20 @@ All parameter, weight, gradient are variables in Paddle.
[](
BuildStrategy
&
self
,
int
nccl_comm_num
)
{
self
.
nccl_comm_num_
=
nccl_comm_num
;
})
.
def_property
(
"use_hierarchical_allreduce
_
"
,
.
def_property
(
"use_hierarchical_allreduce"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
use_hierarchical_allreduce_
;
},
[](
BuildStrategy
&
self
,
bool
use
)
{
self
.
use_hierarchical_allreduce_
=
use
;
})
.
def_property
(
"hierarchical_allreduce_inter_nranks
_
"
,
.
def_property
(
"hierarchical_allreduce_inter_nranks"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
hierarchical_allreduce_inter_nranks_
;
},
[](
BuildStrategy
&
self
,
int
nranks
)
{
self
.
hierarchical_allreduce_inter_nranks_
=
nranks
;
})
.
def_property
(
"hierarchical_allreduce_exter_nranks_"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
hierarchical_allreduce_exter_nranks_
;
},
[](
BuildStrategy
&
self
,
int
nranks
)
{
self
.
hierarchical_allreduce_exter_nranks_
=
nranks
;
})
.
def_property
(
"fuse_elewise_add_act_ops"
,
...
...
python/paddle/distributed/launch.py
浏览文件 @
c0a82748
...
...
@@ -154,7 +154,7 @@ def start_procs(args):
for
i
in
range
(
selected_gpus_num
):
if
trainers_endpoints
!=
""
:
trainers_endpoints
+=
","
trainers_endpoints
+=
"%s:
617%d"
%
(
ip
,
i
)
trainers_endpoints
+=
"%s:
%d"
%
(
ip
,
args
.
started_port
+
i
)
nranks
=
num_nodes
*
selected_gpus_num
...
...
python/paddle/fluid/compiler.py
浏览文件 @
c0a82748
...
...
@@ -288,9 +288,8 @@ class CompiledProgram(object):
if
self
.
_program
:
self
.
_build_strategy
.
nccl_comm_num
=
self
.
_program
.
_nccl_comm_num
self
.
_build_strategy
.
use_hierarchical_allreduce_
=
self
.
_program
.
_use_hierarchical_allreduce
self
.
_build_strategy
.
hierarchical_allreduce_inter_nranks_
=
self
.
_program
.
_hierarchical_allreduce_inter_nranks
self
.
_build_strategy
.
hierarchical_allreduce_exter_nranks_
=
self
.
_program
.
_hierarchical_allreduce_exter_nranks
self
.
_build_strategy
.
use_hierarchical_allreduce
=
self
.
_program
.
_use_hierarchical_allreduce
self
.
_build_strategy
.
hierarchical_allreduce_inter_nranks
=
self
.
_program
.
_hierarchical_allreduce_inter_nranks
if
self
.
_build_strategy
.
sync_batch_norm
:
self
.
_build_strategy
.
enable_sequential_execution
=
True
...
...
python/paddle/fluid/framework.py
浏览文件 @
c0a82748
...
...
@@ -2844,7 +2844,6 @@ class Program(object):
self
.
_nccl_comm_num
=
1
self
.
_use_hierarchical_allreduce
=
False
self
.
_hierarchical_allreduce_inter_nranks
=
0
self
.
_hierarchical_allreduce_exter_nranks
=
0
# @deprecated(the python memory optimize transpiler is deprecated)
# whether the program is optimized by memory_optimize_transpiler
...
...
python/paddle/fluid/tests/unittests/test_dist_base.py
浏览文件 @
c0a82748
...
...
@@ -129,6 +129,9 @@ class TestDistRunnerBase(object):
config
=
fluid
.
DistributeTranspilerConfig
()
config
.
mode
=
"nccl2"
config
.
nccl_comm_num
=
args
.
nccl_comm_num
if
args
.
use_hallreduce
:
config
.
use_hierarchical_allreduce
=
True
config
.
hierarchical_allreduce_inter_nranks
=
args
.
hallreduce_inter_nranks
my_print
(
type
(
self
).
__name__
,
"begin to run transpile on trainer with nccl2 mode"
)
...
...
@@ -198,15 +201,6 @@ class TestDistRunnerBase(object):
exec_strategy
=
exec_strategy
)
my_print
(
type
(
self
).
__name__
,
"program compiled with data parallel"
)
if
args
.
use_cuda
and
args
.
update_method
==
"nccl2"
:
# it just for test share_vars_from feature.
test_exe
=
fluid
.
ParallelExecutor
(
use_cuda
=
True
,
loss_name
=
avg_cost
.
name
,
build_strategy
=
build_stra
,
main_program
=
test_program
,
share_vars_from
=
binary
.
_executor
)
feed_var_list
=
[
var
for
var
in
trainer_prog
.
global_block
().
vars
.
values
()
if
var
.
is_data
...
...
@@ -327,8 +321,10 @@ def runtime_main(test_class):
parser
.
add_argument
(
'--trainer_id'
,
type
=
int
,
required
=
False
,
default
=
0
)
parser
.
add_argument
(
'--trainers'
,
type
=
int
,
required
=
False
,
default
=
1
)
parser
.
add_argument
(
'--nccl_comm_num'
,
type
=
int
,
required
=
False
,
default
=
1
)
parser
.
add_argument
(
'--enable_backward_deps'
,
action
=
'store_true'
)
parser
.
add_argument
(
'--use_hallreduce'
,
action
=
'store_true'
)
parser
.
add_argument
(
'--
enable_backward_deps'
,
type
=
bool
,
required
=
False
,
default
=
1
)
'--
hallreduce_inter_nranks'
,
type
=
int
,
required
=
False
,
default
=
2
)
parser
.
add_argument
(
'--current_endpoint'
,
type
=
str
,
required
=
False
,
default
=
""
)
parser
.
add_argument
(
'--sync_mode'
,
action
=
'store_true'
)
...
...
@@ -407,9 +403,10 @@ class TestDistBase(unittest.TestCase):
self
.
_use_dgc
=
False
self
.
_dygraph
=
False
self
.
_nccl_comm_num
=
1
self
.
_enable_backward_deps
=
False
self
.
_use_hallreduce
=
False
self
.
_setup_config
()
self
.
_after_setup_config
()
self
.
_enable_backward_deps
=
False
def
_find_free_port
(
self
):
def
__free_port
():
...
...
@@ -597,118 +594,97 @@ class TestDistBase(unittest.TestCase):
ps0
.
terminate
()
ps1
.
terminate
()
# print server log
'''
with open("/tmp/ps0_err.log", "rb") as fn:
sys.stderr.write("ps0 stderr: %s
\n
" % fn.read())
with open("/tmp/ps1_err.log", "rb") as fn:
sys.stderr.write("ps1 stderr: %s
\n
" % fn.read())
'''
# print log
'''
with open("/tmp/tr0_err.log", "rb") as fn:
sys.stderr.write('trainer 0 stderr: %s
\n
' % fn.read())
with open("/tmp/tr1_err.log", "rb") as fn:
sys.stderr.write('trainer 1 stderr: %s
\n
' % fn.read())
'''
return
pickle
.
loads
(
tr0_out
),
pickle
.
loads
(
tr1_out
)
def
_run_cluster_nccl2
(
self
,
model
,
envs
,
nccl2_reduce_layer
,
check_error_log
):
# NOTE: we reuse ps_endpoints as nccl2 worker endpoints
worker_endpoints
=
self
.
_ps_endpoints
.
split
(
","
)
w0_ep
,
w1_ep
=
worker_endpoints
if
nccl2_reduce_layer
:
update_method
=
"nccl2_reduce_layer"
else
:
update_method
=
"nccl2"
tr_cmd
=
"%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method %s --lr %f"
tr0_cmd
=
tr_cmd
%
\
(
self
.
_python_interp
,
model
,
self
.
_ps_endpoints
,
0
,
w0_ep
,
update_method
,
self
.
_lr
)
tr1_cmd
=
tr_cmd
%
\
def
_get_nccl2_trainer_cmd
(
self
,
model
,
ep
,
update_method
,
trainer_id
,
trainer_num
):
env
=
{}
tr_cmd
=
"%s -u %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method %s --lr %f"
tr_cmd
=
tr_cmd
%
\
(
self
.
_python_interp
,
model
,
self
.
_ps_endpoints
,
1
,
w1_
ep
,
update_method
,
self
.
_lr
)
trainer_id
,
ep
,
update_method
,
self
.
_lr
)
if
self
.
_mem_opt
:
tr0_cmd
+=
" --mem_opt"
tr1_cmd
+=
" --mem_opt"
tr_cmd
+=
" --mem_opt"
if
self
.
_use_reduce
:
tr0_cmd
+=
" --use_reduce"
tr1_cmd
+=
" --use_reduce"
tr_cmd
+=
" --use_reduce"
if
self
.
_use_reader_alloc
:
tr0_cmd
+=
" --use_reader_alloc"
tr1_cmd
+=
" --use_reader_alloc"
tr_cmd
+=
" --use_reader_alloc"
if
self
.
__use_cuda
:
tr0_cmd
+=
" --use_cuda"
tr1_cmd
+=
" --use_cuda"
env0
=
{
"CUDA_VISIBLE_DEVICES"
:
"0"
,
# for test nccl2 layer
"PADDLE_TRAINERS_NUM"
:
"2"
,
"PADDLE_TRAINER_ID"
:
"0"
}
env1
=
{
"CUDA_VISIBLE_DEVICES"
:
"1"
,
"PADDLE_TRAINERS_NUM"
:
"2"
,
"PADDLE_TRAINER_ID"
:
"1"
}
tr_cmd
+=
" --use_cuda"
env
.
update
({
"CUDA_VISIBLE_DEVICES"
:
"{}"
.
format
(
trainer_id
),
"PADDLE_TRAINERS_NUM"
:
"{}"
.
format
(
trainer_num
),
"PADDLE_TRAINER_ID"
:
"{}"
.
format
(
trainer_id
)
})
else
:
env0
=
{
'CPU_NUM'
:
'1'
}
env1
=
{
'CPU_NUM'
:
'1'
}
env
.
update
({
'CPU_NUM'
:
'1'
})
if
self
.
_use_dgc
:
tr0_cmd
+=
" --use_dgc"
tr1_cmd
+=
" --use_dgc"
tr_cmd
+=
" --use_dgc"
if
self
.
_mp_mode
:
env
=
{
"FLAGS_selected_gpus"
:
"{}"
.
format
(
trainer_id
)}
if
self
.
_nccl_comm_num
>
1
:
tr0_cmd
+=
" --nccl_comm_num {}"
.
format
(
self
.
_nccl_comm_num
)
tr1_cmd
+=
" --nccl_comm_num {}"
.
format
(
self
.
_nccl_comm_num
)
tr_cmd
+=
" --nccl_comm_num {}"
.
format
(
self
.
_nccl_comm_num
)
if
self
.
_mp_mode
:
env0
=
{
"FLAGS_selected_gpus"
:
"0"
}
env1
=
{
"FLAGS_selected_gpus"
:
"1"
}
if
self
.
_use_hallreduce
:
tr_cmd
+=
" --use_hallreduce --hallreduce_inter_nranks 2"
if
self
.
_enable_backward_deps
:
tr0_cmd
+=
" --enable_backward_deps 1"
tr1_cmd
+=
" --enable_backward_deps 1"
tr_cmd
+=
" --enable_backward_deps"
env0
.
update
(
envs
)
env1
.
update
(
envs
)
return
tr_cmd
,
env
print
(
"tr0_cmd:{}, env: {}"
.
format
(
tr0_cmd
,
env0
))
print
(
"tr1_cmd:{}, env: {}"
.
format
(
tr1_cmd
,
env1
))
tr0_pipe
=
open
(
"/tmp/tr0_err.log"
,
"wb"
)
tr1_pipe
=
open
(
"/tmp/tr1_err.log"
,
"wb"
)
def
_run_cluster_nccl2
(
self
,
model
,
envs
,
nccl2_reduce_layer
,
check_error_log
):
if
self
.
_use_hallreduce
:
self
.
_ps_endpoints
=
""
for
i
in
range
(
0
,
4
):
self
.
_ps_endpoints
+=
"127.0.0.1:%s,"
%
(
self
.
_find_free_port
())
self
.
_ps_endpoints
=
self
.
_ps_endpoints
[:
-
1
]
my_print
(
type
(
self
).
__name__
,
"going to start process 0 with nccl2"
)
tr0_proc
=
subprocess
.
Popen
(
tr0_cmd
.
strip
().
split
(
" "
),
stdout
=
subprocess
.
PIPE
,
stderr
=
tr0_pipe
,
env
=
env0
)
my_print
(
type
(
self
).
__name__
,
"going to start process 1 with nccl2"
)
tr1_proc
=
subprocess
.
Popen
(
tr1_cmd
.
strip
().
split
(
" "
),
stdout
=
subprocess
.
PIPE
,
stderr
=
tr1_pipe
,
env
=
env1
)
# NOTE: we reuse ps_endpoints as nccl2 worker endpoints
worker_endpoints
=
self
.
_ps_endpoints
.
split
(
","
)
if
nccl2_reduce_layer
:
update_method
=
"nccl2_reduce_layer"
else
:
update_method
=
"nccl2"
tr0_out
,
tr0_err
=
tr0_proc
.
communicate
()
tr1_out
,
tr1_err
=
tr1_proc
.
communicate
()
trainer_num
=
len
(
worker_endpoints
)
# close trainer file
tr0_pipe
.
close
()
tr1_pipe
.
close
()
procs
=
[]
pipes
=
[]
for
i
in
range
(
0
,
trainer_num
):
tr_cmd
,
tr_env
=
self
.
_get_nccl2_trainer_cmd
(
model
,
worker_endpoints
[
i
],
update_method
,
i
,
trainer_num
)
tr_env
.
update
(
envs
)
print
(
"use_hallreduce:{} tr_cmd:{}, env: {}"
.
format
(
self
.
_use_hallreduce
,
tr_cmd
,
tr_env
))
# print log
sys
.
stderr
.
write
(
'trainer 0 stderr: %s
\n
'
%
tr0_err
)
sys
.
stderr
.
write
(
'trainer 1 stderr: %s
\n
'
%
tr1_err
)
tr_pipe
=
open
(
"/tmp/tr{}_err.log"
.
format
(
i
),
"wb"
)
return
pickle
.
loads
(
tr0_out
),
pickle
.
loads
(
tr1_out
)
my_print
(
type
(
self
).
__name__
,
"going to start process {} with nccl2"
.
format
(
i
))
tr_proc
=
subprocess
.
Popen
(
tr_cmd
.
strip
().
split
(
" "
),
stdout
=
subprocess
.
PIPE
,
stderr
=
tr_pipe
,
env
=
tr_env
)
procs
.
append
(
tr_proc
)
pipes
.
append
(
tr_pipe
)
outs
=
[]
for
i
in
range
(
0
,
trainer_num
):
tr_out
,
tr_err
=
procs
[
i
].
communicate
()
outs
.
append
(
tr_out
)
pipes
[
i
].
close
()
sys
.
stderr
.
write
(
'trainer {} stderr: {}
\n
'
.
format
(
i
,
tr_err
))
return
pickle
.
loads
(
outs
[
0
]),
pickle
.
loads
(
outs
[
1
])
def
check_with_place
(
self
,
model_file
,
...
...
@@ -724,13 +700,14 @@ class TestDistBase(unittest.TestCase):
"FLAGS_rpc_deadline"
:
"30000"
,
# 5sec to fail fast
"FLAGS_cudnn_deterministic"
:
"1"
,
"http_proxy"
:
""
,
"NCCL_P2P_DISABLE"
:
"1"
"NCCL_P2P_DISABLE"
:
"1"
,
"NCCL_SHM_DISABLE"
:
"1"
}
required_envs
.
update
(
need_envs
)
if
check_error_log
:
required_envs
[
"GLOG_v"
]
=
"
3
"
required_envs
[
"GLOG_v"
]
=
"
10
"
required_envs
[
"GLOG_logtostderr"
]
=
"1"
local_losses
\
...
...
python/paddle/fluid/tests/unittests/test_dist_mnist_nccl.py
浏览文件 @
c0a82748
...
...
@@ -72,5 +72,19 @@ class TestDistMnistNCCL2BackWardDeps(TestDistBase):
self
.
check_with_place
(
"dist_mnist.py"
,
delta
=
1e-5
)
class
TestDistMnistNCCL2HAllreduce
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
True
self
.
_use_reduce
=
False
self
.
_use_reader_alloc
=
False
self
.
_nccl2_mode
=
True
self
.
_use_hallreduce
=
True
def
test_dist_train
(
self
):
import
paddle.fluid
as
fluid
if
fluid
.
core
.
is_compiled_with_cuda
():
self
.
check_with_place
(
"dist_mnist.py"
,
delta
=
1e-5
)
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
c0a82748
...
...
@@ -172,8 +172,6 @@ class DistributeTranspilerConfig(object):
use_hierarchical_allreduce
=
False
#Nccl ranks in a node when use hierarchical allreduce, it's setted to gpu cards' number in most cases.
hierarchical_allreduce_inter_nranks
=
0
#Nccl ranks bewteen nodes when use hierarchical allreduce, it's setted to nodes number.
hierarchical_allreduce_exter_nranks
=
0
# if mode is collective
# supported modes: sgd, local_sgd
...
...
@@ -428,10 +426,23 @@ class DistributeTranspiler(object):
self
.
origin_program
.
_trainers_endpoints
=
trainers
.
split
(
","
)
self
.
origin_program
.
_nccl_comm_num
=
self
.
config
.
nccl_comm_num
self
.
origin_program
.
_use_hierarchical_allreduce
=
self
.
config
.
use_hierarchical_allreduce
self
.
origin_program
.
_hierarchical_allreduce_inter_nranks
=
\
int
(
self
.
config
.
hierarchical_allreduce_inter_nranks
)
self
.
origin_program
.
_hierarchical_allreduce_exter_nranks
=
\
int
(
self
.
config
.
hierarchical_allreduce_exter_nranks
)
# check use_hierarchical_allreduce options
if
self
.
config
.
use_hierarchical_allreduce
:
trainers_num
=
len
(
self
.
origin_program
.
_trainers_endpoints
)
# selected automaticly
if
self
.
config
.
hierarchical_allreduce_inter_nranks
<=
1
:
self
.
config
.
hierarchical_allreduce_inter_nranks
=
fluid
.
core
.
get_cuda_device_count
(
)
assert
trainers_num
>
self
.
config
.
hierarchical_allreduce_inter_nranks
,
\
"trainers_num:{} < hierarchical_allreduce_inter_nranks:{}"
.
format
(
trainers_num
,
self
.
config
.
hierarchical_allreduce_inter_nranks
)
assert
trainers_num
%
self
.
config
.
hierarchical_allreduce_inter_nranks
==
0
,
\
"trainers_num:{} mod hierarchical_allreduce_inter_nranks:{} != 0"
.
format
(
trainers_num
,
self
.
config
.
hierarchical_allreduce_inter_nranks
)
self
.
origin_program
.
_hierarchical_allreduce_inter_nranks
=
\
int
(
self
.
config
.
hierarchical_allreduce_inter_nranks
)
self
.
_transpile_nccl2
(
trainer_id
,
trainers
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录