Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
266cdf7d
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
266cdf7d
编写于
4月 09, 2019
作者:
G
gongweibao
提交者:
GitHub
4月 09, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix dgc bug. (#16709)
上级
7e560558
变更
3
显示空白变更内容
内联
并排
Showing
3 changed file
with
14 addition
and
4 deletion
+14
-4
paddle/fluid/framework/details/all_reduce_op_handle.cc
paddle/fluid/framework/details/all_reduce_op_handle.cc
+5
-1
python/paddle/fluid/optimizer.py
python/paddle/fluid/optimizer.py
+1
-1
python/paddle/fluid/parallel_executor.py
python/paddle/fluid/parallel_executor.py
+8
-2
未找到文件。
paddle/fluid/framework/details/all_reduce_op_handle.cc
浏览文件 @
266cdf7d
...
@@ -53,6 +53,10 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
...
@@ -53,6 +53,10 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
this
->
SetDeviceContext
(
p
,
nccl_ctxs_
->
DevCtx
(
p
));
this
->
SetDeviceContext
(
p
,
nccl_ctxs_
->
DevCtx
(
p
));
}
}
}
}
// TODO(gongwb) :polish them!
if
(
is_encoded
)
{
VLOG
(
1
)
<<
"Use dgc allreduce mode"
;
}
}
}
#else
#else
AllReduceOpHandle
::
AllReduceOpHandle
(
ir
::
Node
*
node
,
AllReduceOpHandle
::
AllReduceOpHandle
(
ir
::
Node
*
node
,
...
@@ -86,7 +90,7 @@ void AllReduceOpHandle::RunImplEncoded() {
...
@@ -86,7 +90,7 @@ void AllReduceOpHandle::RunImplEncoded() {
paddle
::
framework
::
GradOriginalVarName
(
in_var_handles
[
i
]
->
name
());
paddle
::
framework
::
GradOriginalVarName
(
in_var_handles
[
i
]
->
name
());
auto
encode_var_name
=
original_name
+
g_dgc_encoded
;
auto
encode_var_name
=
original_name
+
g_dgc_encoded
;
auto
*
in_var
=
local_scope
->
FindVar
(
encode_var_name
);
auto
*
in_var
=
local_scope
->
FindVar
(
encode_var_name
);
PADDLE_ENFORCE_NOT_NULL
(
in_var
);
PADDLE_ENFORCE_NOT_NULL
(
in_var
,
"%s should not be null"
,
encode_var_name
);
auto
&
in
=
in_var
->
Get
<
LoDTensor
>
();
auto
&
in
=
in_var
->
Get
<
LoDTensor
>
();
ins
.
emplace_back
(
&
in
);
ins
.
emplace_back
(
&
in
);
...
...
python/paddle/fluid/optimizer.py
浏览文件 @
266cdf7d
...
@@ -752,7 +752,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
...
@@ -752,7 +752,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
force_cpu
=
True
)
force_cpu
=
True
)
for
param_var
,
grad_var
in
param_and_grads
:
for
param_var
,
grad_var
in
param_and_grads
:
var_numel
=
reduce
(
lambda
x
,
y
:
x
*
y
,
param_var
.
shape
)
var_numel
=
abs
(
reduce
(
lambda
x
,
y
:
x
*
y
,
param_var
.
shape
)
)
if
var_numel
<
16384
or
\
if
var_numel
<
16384
or
\
param_var
.
type
==
core
.
VarDesc
.
VarType
.
SELECTED_ROWS
or
\
param_var
.
type
==
core
.
VarDesc
.
VarType
.
SELECTED_ROWS
or
\
grad_var
.
type
==
core
.
VarDesc
.
VarType
.
SELECTED_ROWS
or
\
grad_var
.
type
==
core
.
VarDesc
.
VarType
.
SELECTED_ROWS
or
\
...
...
python/paddle/fluid/parallel_executor.py
浏览文件 @
266cdf7d
...
@@ -104,10 +104,11 @@ class ParallelExecutor(object):
...
@@ -104,10 +104,11 @@ class ParallelExecutor(object):
self
.
_scope
=
scope
if
scope
is
not
None
else
executor
.
global_scope
()
self
.
_scope
=
scope
if
scope
is
not
None
else
executor
.
global_scope
()
if
main_program
is
not
None
and
main_program
.
_enable_dgc
:
if
main_program
is
not
None
and
main_program
.
_enable_dgc
:
assert
num_trainers
>
1
,
"dgc is not useful for single trainer training."
assert
build_strategy
.
reduce_strategy
==
BuildStrategy
.
ReduceStrategy
.
AllReduce
assert
build_strategy
.
reduce_strategy
==
BuildStrategy
.
ReduceStrategy
.
AllReduce
assert
num_trainers
*
len
(
assert
num_trainers
*
len
(
self
.
_places
)
>
1
,
"dgc is not useful for single card training"
self
.
_places
)
>
1
,
"dgc is not useful for single card training
.
"
assert
use_cuda
assert
use_cuda
,
"dgc only used when cuda is used."
main_program
=
main_program
if
main_program
is
not
None
\
main_program
=
main_program
if
main_program
is
not
None
\
else
framework
.
default_main_program
()
else
framework
.
default_main_program
()
...
@@ -123,6 +124,11 @@ class ParallelExecutor(object):
...
@@ -123,6 +124,11 @@ class ParallelExecutor(object):
exec_strategy
=
exec_strategy
,
exec_strategy
=
exec_strategy
,
share_vars_from
=
share_vars_from
.
_compiled_program
share_vars_from
=
share_vars_from
.
_compiled_program
if
share_vars_from
else
None
)
if
share_vars_from
else
None
)
# FIXME(gongwb): I will move dgc from dist mode to allreduce mode in next pr.
if
main_program
.
_enable_dgc
:
self
.
_compiled_program
.
_build_strategy
.
is_distribution
=
True
self
.
_place
=
core
.
CUDAPlace
(
0
)
if
use_cuda
else
core
.
CPUPlace
()
self
.
_place
=
core
.
CUDAPlace
(
0
)
if
use_cuda
else
core
.
CPUPlace
()
self
.
_exe
=
executor
.
Executor
(
self
.
_place
)
self
.
_exe
=
executor
.
Executor
(
self
.
_place
)
self
.
_compiled_program
.
_compile
(
place
=
self
.
_place
,
scope
=
self
.
_scope
)
self
.
_compiled_program
.
_compile
(
place
=
self
.
_place
,
scope
=
self
.
_scope
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录