BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)
Commit 3232618a (unverified), authored Jul 05, 2019 by gongweibao, committed via GitHub on Jul 05, 2019.

cherry-pick: Make fuse_all_reduce_op_pass support mix_precision test=develop test=release (#18490)

Parent: 24107006

Showing 8 changed files with 332 additions and 152 deletions (+332 -152).
paddle/fluid/framework/details/multi_devices_helper.h (+3, -3)
paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc (+170, -113)
paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc (+9, -2)
paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc (+38, -31)
paddle/fluid/operators/alloc_continuous_space_op.cc (+5, -0)
paddle/fluid/operators/optimizers/sgd_op.cc (+12, -1)
paddle/fluid/operators/optimizers/sgd_op.cu (+4, -2)
python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py (+91, -0)
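Taken together, the changes make gradient fusion dtype-aware: parameters and gradients are grouped by (layer prefix, data type) instead of layer prefix alone, so float16 and float32 gradients never land in the same fused buffer. A minimal standalone sketch of that keying idea follows; the DType enum and the sample parameter names are illustrative, not Paddle types:

#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

enum class DType { FP16, FP32 };

int main() {
  // (param, dtype) pairs as the pass would see them after the backward pass.
  std::vector<std::pair<std::string, DType>> params = {
      {"conv1.w", DType::FP16}, {"conv1.b", DType::FP16},
      {"fc.w", DType::FP32},    {"fc.b", DType::FP32}};

  // Key each parameter by (layer prefix, dtype), mirroring what
  // SetGroupAccordingToLayers does after this commit.
  std::map<std::pair<std::string, DType>, std::vector<std::string>> groups;
  for (auto &p : params) {
    auto pos = p.first.find_first_of(".");
    std::string prefix =
        pos == std::string::npos ? p.first : p.first.substr(0, pos);
    groups[{prefix, p.second}].push_back(p.first);
  }

  // Each group gets its own fused buffer; fp16 and fp32 never mix.
  for (auto &g : groups) {
    std::cout << g.first.first << " -> " << g.second.size() << " vars\n";
  }
}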
paddle/fluid/framework/details/multi_devices_helper.h

@@ -58,15 +58,15 @@ constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@";
 typedef std::string FusedOptType;
 constexpr char kFusedOptType[] = "fused_opt_type";
-typedef std::string FusedGrads;
+typedef std::vector<std::string> FusedGrads;
 constexpr char kFusedGrads[] = "fused_gradients";
 typedef std::vector<std::pair<std::string, std::string>> ParamsAndGrads;
 constexpr char kParamsAndGrads[] = "params_grads";
 typedef std::vector<std::vector<std::pair<std::string, std::string>>>
-    GroupGradsAndParams;
-constexpr char kGroupGradsAndParams[] = "group_grads_params";
+    GroupParamsAndGrads;
+constexpr char kGroupParamsAndGrads[] = "group_params_grads";
 }  // namespace details
 }  // namespace framework
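One consequence worth calling out: because each dtype group now produces its own fused gradient variable, FusedGrads changes from a single std::string to a std::vector<std::string>, and consumers that can only handle one fused gradient must check the size before taking the front element, exactly as fuse_optimizer_op_pass does further down. A hedged sketch of that consumer-side contract; the sample variable name is illustrative:

#include <cassert>
#include <iostream>
#include <string>
#include <vector>

// With mixed precision there is one fused gradient per dtype group. A
// consumer that still supports only a single group checks first.
std::string SingleFusedGrad(const std::vector<std::string> &fused_grads) {
  assert(fused_grads.size() == 1 && "multiple dtype groups not supported");
  return fused_grads.front();
}

int main() {
  // Illustrative name built like kFusedVarNamePrefix + "@GRAD@" + grad name.
  std::vector<std::string> fused_grads = {"@FUSEDVAR@@GRAD@conv1.w@GRAD"};
  std::cout << SingleFusedGrad(fused_grads) << "\n";
}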
paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc

@@ -14,6 +14,7 @@
 #include "paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.h"
 #include <algorithm>
+#include <map>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
@@ -52,18 +53,13 @@ static constexpr double kMB = 1048576.0;
 void SetFuseParameterGroupsSize(int group_size) {
   FLAGS_fuse_parameter_groups_size = group_size;
 }
 int GetFuseParameterGroupsSize() { return FLAGS_fuse_parameter_groups_size; }
 void SetFuseParameterMemorySize(double memory_size) {
   FLAGS_fuse_parameter_memory_size = memory_size;
 }
 double GetFuseParameterMemorySize() {
   return FLAGS_fuse_parameter_memory_size;
 }
-static framework::proto::VarType::Type kDefaultDtype =
-    framework::proto::VarType::Type::VarType_Type_BOOL;
 class AllocContinuousSpaceForGradPass : public ir::Pass {
  protected:
   void ApplyImpl(ir::Graph *graph) const {
@@ -73,19 +69,16 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
     auto &local_scopes = Get<const std::vector<Scope *>>(details::kLocalScopes);
     ResetAttribute<details::ParamsAndGrads>(details::kParamsAndGrads, &result);
-    ResetAttribute<details::GroupGradsAndParams>(details::kGroupGradsAndParams,
+    ResetAttribute<details::GroupParamsAndGrads>(details::kGroupParamsAndGrads,
                                                  &result);
-    // NOTE: The operator nodes should be in topology order.
-    std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result);
     auto &params_grads =
         result.Get<details::ParamsAndGrads>(details::kParamsAndGrads);
-    for (auto &node : topo_nodes) {
-      RecordParamsAndGrads(node, &params_grads);
-    }
+    RecordParamsAndGrads(result, &params_grads);
-    if (params_grads.size() == 0) {
-      LOG(INFO) << "Doesn't find gradients";
+    auto num_params_grads = params_grads.size();
+    VLOG(10) << "The number of params and grads is:" << num_params_grads;
+    if (num_params_grads == 0) {
       return;
     }
@@ -101,24 +94,43 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
       }
     }
-    auto &group_grads_params = result.Get<details::GroupGradsAndParams>(
-        details::kGroupGradsAndParams);
-    // Note: the order of params_grads may be changed by SetGroupGradsAndParams.
-    SetGroupGradsAndParams(var_name2node, params_grads, &group_grads_params);
+    auto &group_params_grads = result.Get<details::GroupParamsAndGrads>(
+        details::kGroupParamsAndGrads);
+    // Note: the order of params_grads may be changed by SetGroupParamsAndGrads.
+    SetGroupParamsAndGrads(var_name2node, params_grads, &group_params_grads);
     params_grads.clear();
-    for (auto &group_p_g : group_grads_params) {
-      params_grads.insert(params_grads.begin(), group_p_g.begin(),
-                          group_p_g.end());
-    }
-    for (auto &p_g : params_grads) {
-      std::swap(p_g.first, p_g.second);
-    }
+    params_grads.reserve(num_params_grads);
+    for (auto &group_p_g : group_params_grads) {
+      params_grads.insert(params_grads.end(), group_p_g.begin(),
+                          group_p_g.end());
+    }
+    PADDLE_ENFORCE_EQ(
+        num_params_grads, params_grads.size(),
+        "The number of params_grads is not consistent with before.");
-    // Set Gradients as Persistable to prevent this var becoming reusable.
-    auto dtype = kDefaultDtype;
-    for (auto &p_g : params_grads) {
+    if (IsUnifiedDtype(params_grads, var_name2node)) {
+      SetGradientPersistable(params_grads, var_name2node, var_name2node_set);
+      AllocContinuousAddressSpace(places, local_scopes, var_name2node,
+                                  params_grads, &result);
+    } else {
+      // Set Gradients as Persistable to prevent this var becoming reusable.
+      for (auto &sub_param_grad : group_params_grads) {
+        SetGradientPersistable(params_grads, var_name2node, var_name2node_set);
+        PADDLE_ENFORCE(IsUnifiedDtype(sub_param_grad, var_name2node),
+                       "The data type of the same group is not consistent.");
+        AllocContinuousAddressSpace(places, local_scopes, var_name2node,
+                                    sub_param_grad, &result);
+      }
+    }
+  }
+
+  void SetGradientPersistable(
+      const std::vector<std::pair<std::string, std::string>> &sub_param_grad,
+      const std::unordered_map<std::string, Node *> &var_name2node,
+      const std::unordered_map<std::string, std::unordered_set<ir::Node *>>
+          &var_name2node_set) const {
+    for (auto &p_g : sub_param_grad) {
       // Get gradient var
       auto iter = var_name2node.find(p_g.second);
       PADDLE_ENFORCE(iter != var_name2node.end(), "%s is not found.",
@@ -132,32 +144,45 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
       }
       PADDLE_ENFORCE(IsSupportedVarType(iter->second->Var()->GetType()));
+    }
+  }
-      // Get Dtype
-      auto ele_dtype = iter->second->Var()->GetDataType();
-      if (dtype == kDefaultDtype) {
-        dtype = ele_dtype;
-        PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype,
-                          "The data type should not be bool.");
+  bool IsUnifiedDtype(
+      const details::ParamsAndGrads &params_grads,
+      const std::unordered_map<std::string, Node *> &var_name2node) const {
+    auto dtype =
+        this->GetDtypeOfVar(var_name2node, params_grads.front().second);
+    for (auto p_g : params_grads) {
+      auto next_dtype = this->GetDtypeOfVar(var_name2node, p_g.second);
+      if (next_dtype != dtype) {
+        return false;
       }
-      PADDLE_ENFORCE_EQ(ele_dtype, dtype,
-                        "The data type of input is not consistent.");
     }
+    return true;
+  }
+
+  void AllocContinuousAddressSpace(
+      const std::vector<platform::Place> &places,
+      const std::vector<Scope *> &local_scopes,
+      const std::unordered_map<std::string, Node *> &var_name2node,
+      const details::ParamsAndGrads &params_grads, Graph *result) const {
     // Create a FusedVarsSet to avoid duplicating names for fused_var in other
     // pass.
-    if (!result.Has(details::kFusedVars)) {
-      result.Set(details::kFusedVars, new details::FusedVars);
+    if (!result->Has(details::kFusedVars)) {
+      result->Set(details::kFusedVars, new details::FusedVars);
     }
-    // the kFusedGrads is used be fuse_optimizer_op_pass.
-    result.Set(details::kFusedGrads, new details::FusedGrads);
+    if (!result->Has(details::kFusedGrads)) {
+      result->Set(details::kFusedGrads, new details::FusedGrads);
+    }
     // the fused_var_name should be unique, so it appends
     // params_grads.begin()->second.
     auto fused_var_name = std::string(details::kFusedVarNamePrefix) + "@GRAD@" +
                           params_grads.begin()->second;
-    result.Get<details::FusedGrads>(details::kFusedGrads) = fused_var_name;
-    auto &fused_var_set = result.Get<details::FusedVars>(details::kFusedVars);
+    result->Get<details::FusedGrads>(details::kFusedGrads)
+        .emplace_back(fused_var_name);
+    auto &fused_var_set = result->Get<details::FusedVars>(details::kFusedVars);
    PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0,
                      "%s is duplicate in FusedVars.", fused_var_name);
    fused_var_set.insert(fused_var_name);
@@ -175,109 +200,126 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
     graph->Set(attr_name, new AttrType);
   }
-  void SetGroupGradsAndParams(
+  void SetGroupParamsAndGrads(
       const std::unordered_map<std::string, ir::Node *> &var_nodes,
       const details::ParamsAndGrads &params_grads,
-      details::GroupGradsAndParams *group_grads_params) const {
-    SetGroupAccordingToLayers(var_nodes, params_grads, group_grads_params);
-    SetGroupAccordingToMemorySize(var_nodes, group_grads_params);
+      details::GroupParamsAndGrads *group_params_grads) const {
+    SetGroupAccordingToLayers(var_nodes, params_grads, group_params_grads);
+    SetGroupAccordingToMemorySize(var_nodes, group_params_grads);
   }
   void SetGroupAccordingToLayers(
       const std::unordered_map<std::string, ir::Node *> &var_nodes,
       const details::ParamsAndGrads &params_grads,
-      details::GroupGradsAndParams *group_grads_params) const {
-    std::unordered_map<std::string, std::vector<int>> layer_params;
+      details::GroupParamsAndGrads *group_params_grads) const {
+    using var_dtype = std::pair<std::string, proto::VarType::Type>;
+    std::map<var_dtype, size_t> var_idx;
     for (size_t i = 0; i < params_grads.size(); ++i) {
       auto pos = params_grads[i].first.find_first_of(".");
+      auto dtype = GetDtypeOfVar(var_nodes, params_grads[i].second);
+      var_dtype var_key;
       if (pos == std::string::npos) {
-        layer_params[params_grads[i].first].emplace_back(i);
+        var_key = std::make_pair(params_grads[i].first, dtype);
       } else {
-        layer_params[params_grads[i].first.substr(0, pos)].emplace_back(i);
+        var_key = std::make_pair(params_grads[i].first.substr(0, pos), dtype);
       }
-    }
-    group_grads_params->reserve(layer_params.size());
-    for (size_t i = 0; i < params_grads.size(); ++i) {
-      auto pos = params_grads[i].first.find_first_of(".");
-      std::string key = params_grads[i].first;
-      if (pos != std::string::npos) {
-        key = params_grads[i].first.substr(0, pos);
-      }
-      auto iter = layer_params.find(key);
-      if (iter == layer_params.end()) continue;
-      group_grads_params->emplace_back();
-      auto &local_group_grads_params = group_grads_params->back();
-      for (auto &idx : iter->second) {
-        local_group_grads_params.emplace_back(
-            std::make_pair(params_grads[idx].second, params_grads[idx].first));
-      }
-      layer_params.erase(iter);
+      size_t idx = 0;
+      auto var_idx_iter = var_idx.find(var_key);
+      if (var_idx_iter != var_idx.end()) {
+        idx = var_idx_iter->second;
+      } else {
+        group_params_grads->emplace_back();
+        idx = group_params_grads->size() - 1;
+        var_idx[var_key] = idx;
+      }
+      auto &local_group_params_grads = group_params_grads->at(idx);
+      local_group_params_grads.emplace_back(
+          std::make_pair(params_grads[i].first, params_grads[i].second));
     }
-    VLOG(10) << "SetGroupAccordingToLayers: ";
-    PrintGroupInfo(var_nodes, group_grads_params);
+    if (VLOG_IS_ON(10)) {
+      VLOG(10) << "SetGroupAccordingToLayers: ";
+      PrintGroupInfo(var_nodes, group_params_grads);
+    }
   }
   void PrintGroupInfo(
       const std::unordered_map<std::string, ir::Node *> &var_nodes,
-      details::GroupGradsAndParams *group_grads_params) const {
-    for (size_t i = 0; i < group_grads_params->size(); ++i) {
+      details::GroupParamsAndGrads *group_params_grads) const {
+    for (size_t i = 0; i < group_params_grads->size(); ++i) {
       VLOG(10) << "group " << i;
       std::stringstream out;
       size_t gps_size = 0;
-      for (auto &g_p : group_grads_params->at(i)) {
-        auto iter = var_nodes.find(g_p.second);
-        PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", g_p.second);
+      for (auto &p_g : group_params_grads->at(i)) {
+        auto iter = var_nodes.find(p_g.first);
+        PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", p_g.first);
         auto shape = iter->second->Var()->GetShape();
        size_t size = framework::SizeOfType(iter->second->Var()->GetDataType());
        std::for_each(shape.begin(), shape.end(),
                      [&size](const int64_t &n) { size *= n; });
        gps_size += size;
-        out << string::Sprintf("(%s(%d), %s)", g_p.second, size, g_p.first);
+        out << string::Sprintf("(%s(%d), %s)", p_g.first, size, p_g.second);
       }
+      auto dtype = this->GetDtypeOfVar(var_nodes,
+                                       group_params_grads->at(i).front().first);
       VLOG(10) << out.str()
-               << ", group size:" << group_grads_params->at(i).size()
+               << ", group size:" << group_params_grads->at(i).size()
                << ", group memory size:"
-               << static_cast<double>(gps_size) / kMB << "(MB)";
+               << static_cast<double>(gps_size) / kMB << "(MB)"
+               << ", dtype:" << dtype;
     }
   }
   void SetGroupAccordingToMemorySize(
       const std::unordered_map<std::string, ir::Node *> &var_nodes,
-      details::GroupGradsAndParams *group_grads_params) const {
+      details::GroupParamsAndGrads *group_params_grads) const {
     const double group_memory_size = GetFuseParameterMemorySize();
     if (group_memory_size <= 0.0) {
       return;
     }
-    details::GroupGradsAndParams local_group_grads_params;
+    details::GroupParamsAndGrads local_group_params_grads;
     size_t j = 0;
-    while (j < group_grads_params->size()) {
-      local_group_grads_params.emplace_back();
-      auto &group_p_g = local_group_grads_params.back();
+    while (j < group_params_grads->size()) {
+      local_group_params_grads.emplace_back();
+      auto &group_p_g = local_group_params_grads.back();
+      auto &grad_name = group_params_grads->at(j).front().second;
+      auto var_type = GetDtypeOfVar(var_nodes, grad_name);
       size_t local_group_memory_size = 0;
-      while (j < group_grads_params->size()) {
+      while (j < group_params_grads->size()) {
        std::for_each(
-            group_grads_params->at(j).begin(), group_grads_params->at(j).end(),
+            group_params_grads->at(j).begin(), group_params_grads->at(j).end(),
            [&local_group_memory_size,
-             &var_nodes](const std::pair<std::string, std::string> &g_p) {
-              auto iter = var_nodes.find(g_p.second);
+             &var_nodes](const std::pair<std::string, std::string> &p_g) {
+              auto iter = var_nodes.find(p_g.second);
              PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.",
-                             g_p.second);
-              auto shape = iter->second->Var()->GetShape();
+                             p_g.second);
              size_t size =
                  framework::SizeOfType(iter->second->Var()->GetDataType());
+              auto shape = iter->second->Var()->GetShape();
              std::for_each(shape.begin(), shape.end(),
                            [&size](const int64_t &n) { size *= n; });
              local_group_memory_size += size;
            });
-        group_p_g.insert(group_p_g.end(), group_grads_params->at(j).begin(),
-                         group_grads_params->at(j).end());
+        group_p_g.insert(group_p_g.end(), group_params_grads->at(j).begin(),
+                         group_params_grads->at(j).end());
        ++j;
+        if (j >= group_params_grads->size()) {
+          break;
+        }
        if (GetFuseParameterGroupsSize() > 1 &&
            group_p_g.size() >
                static_cast<size_t>(GetFuseParameterGroupsSize())) {
@@ -288,49 +330,64 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
             group_memory_size) {
           break;
         }
+        auto next_var_type =
+            GetDtypeOfVar(var_nodes, group_params_grads->at(j).front().second);
+        if (next_var_type != var_type) {
+          break;
+        }
       }
     }
-    std::swap(*group_grads_params, local_group_grads_params);
-    VLOG(10) << string::Sprintf(
-        "SetGroupAccordingToMemorySize(memory_size: %f):", group_memory_size);
-    PrintGroupInfo(var_nodes, group_grads_params);
+    std::swap(*group_params_grads, local_group_params_grads);
+    if (VLOG_IS_ON(10)) {
+      VLOG(10) << string::Sprintf(
+          "SetGroupAccordingToMemorySize(memory_size: %f):",
+          group_memory_size);
+      PrintGroupInfo(var_nodes, group_params_grads);
+    }
   }
+  proto::VarType::Type GetDtypeOfVar(
+      const std::unordered_map<std::string, Node *> &var_nodes,
+      const std::string &name) const {
+    auto grad_iter = var_nodes.find(name);
+    PADDLE_ENFORCE(grad_iter != var_nodes.end());
+    PADDLE_ENFORCE_NOT_NULL(grad_iter->second->Var());
+    return grad_iter->second->Var()->GetDataType();
+  }
  private:
   bool IsSupportedVarType(const proto::VarType::Type &type) const {
     // Current only support LOD_TENSOR.
     return type == proto::VarType::LOD_TENSOR;
   }
-  void RecordParamsAndGrads(ir::Node *node,
+  void RecordParamsAndGrads(const ir::Graph &graph,
                             details::ParamsAndGrads *params_grads) const {
-    try {
-      bool is_bk_op =
-          static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
-                                OpProtoAndCheckerMaker::OpRoleAttrName())) &
-                            static_cast<int>(OpRole::kBackward));
-      if (!is_bk_op) return;
-      // Currently, we assume that once gradient is generated, it can be
-      // broadcast, and each gradient is only broadcast once.
-      auto backward_vars =
-          boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
-              OpProtoAndCheckerMaker::OpRoleVarAttrName()));
-      PADDLE_ENFORCE_EQ(backward_vars.size() % 2, static_cast<size_t>(0));
-      for (size_t i = 0; i < backward_vars.size(); i += 2) {
-        VLOG(10) << "Trainable parameter: " << backward_vars[i]
-                 << ", gradient: " << backward_vars[i + 1];
-        params_grads->emplace_back(std::make_pair(
-            backward_vars[i] /*param*/, backward_vars[i + 1] /*grad*/));
+    std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(graph);
+    for (auto &node : topo_nodes) {
+      try {
+        bool is_bk_op =
+            static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
+                                  OpProtoAndCheckerMaker::OpRoleAttrName())) &
+                              static_cast<int>(OpRole::kBackward));
+        if (!is_bk_op) continue;
+        // Currently, we assume that once gradient is generated, it can be
+        // broadcast, and each gradient is only broadcast once.
+        auto backward_vars =
+            boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
+                OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+        PADDLE_ENFORCE_EQ(backward_vars.size() % 2, static_cast<size_t>(0));
+        for (size_t i = 0; i < backward_vars.size(); i += 2) {
+          VLOG(10) << "Trainable parameter: " << backward_vars[i]
+                   << ", gradient: " << backward_vars[i + 1];
+          params_grads->emplace_back(std::make_pair(
+              backward_vars[i] /*param*/, backward_vars[i + 1] /*grad*/));
+        }
+      } catch (boost::bad_get e) {
       }
-    } catch (boost::bad_get e) {
     }
   }
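The control flow this pass gains is worth restating: if every gradient in the graph shares one dtype, everything is fused into a single continuous space as before; otherwise one continuous space is allocated per dtype-pure group. A minimal sketch of that branch, with stand-in types (DType and AllocBuffer are illustrative, not the pass's real API):

#include <vector>

enum class DType { FP16, FP32 };
using Group = std::vector<DType>;

// True when every gradient shares one dtype (mirrors IsUnifiedDtype).
bool IsUnifiedDtype(const std::vector<DType> &grads) {
  for (auto d : grads)
    if (d != grads.front()) return false;
  return true;
}

// Stand-in for AllocContinuousAddressSpace: one buffer per call.
int AllocBuffer() {
  static int next_id = 0;
  return next_id++;
}

int main() {
  std::vector<Group> groups = {{DType::FP16, DType::FP16}, {DType::FP32}};

  std::vector<DType> all;
  for (const auto &g : groups) all.insert(all.end(), g.begin(), g.end());

  if (IsUnifiedDtype(all)) {
    AllocBuffer();  // single fused space for everything
  } else {
    for (size_t i = 0; i < groups.size(); ++i) {
      AllocBuffer();  // one continuous space per dtype-pure group
    }
  }
}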
paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc

@@ -101,10 +101,17 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
                    "this pass.");
     }
     auto &fused_grad = result.Get<details::FusedGrads>(details::kFusedGrads);
+    PADDLE_ENFORCE_NE(fused_grad.size(), 0,
+                      "The fused gradient should not be empty.");
+    PADDLE_ENFORCE_EQ(fused_grad.size(), 1,
+                      "Because the dtype of those gradients "
+                      "is not unified, so the number of fused gradients is "
+                      "more than one, but it is not supported currently.");
     auto &fused_vars = result.Get<details::FusedVars>(details::kFusedVars);
-    auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad);
+    auto iter =
+        std::find(fused_vars.begin(), fused_vars.end(), fused_grad.front());
     PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad.");
-    fused_vars_name[kGrad] = fused_grad;
+    fused_vars_name[kGrad] = fused_grad.front();
     // Sort the parameters and auxiliary variables according
     // to parameters' name to make variables' name correspond correctly.
paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc

@@ -30,7 +30,6 @@ class FuseAllReduceOpPass : public ir::Pass {
  protected:
   void ApplyImpl(ir::Graph *graph) const override {
     ir::Graph &result = *graph;
     auto &places = Get<const std::vector<platform::Place>>(details::kPlaces);
     auto &local_scopes = Get<const std::vector<Scope *>>(details::kLocalScopes);
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)

@@ -38,38 +37,17 @@ class FuseAllReduceOpPass : public ir::Pass {
         &Get<platform::NCCLCommunicator>(details::kNCCLCtxs);
 #endif
-    std::unordered_set<std::string> grads;
     auto &params_grads =
         result.Get<details::ParamsAndGrads>(details::kParamsAndGrads);
     size_t num_of_all_reduce = params_grads.size();
+    std::unordered_set<std::string> grads;
     grads.reserve(num_of_all_reduce);
     for (auto p_g : params_grads) {
       grads.insert(p_g.second);
     }
-    size_t num_place = places.size();
-    std::unordered_map<std::string, ir::Node *> all_reduce_ops;
-    all_reduce_ops.reserve(grads.size());
-    for (auto &node : result.Nodes()) {
-      if (node->IsOp()) {
-        PADDLE_ENFORCE(node->IsWrappedBy<details::OpHandleBase>());
-        auto *all_reduce_op_handle = dynamic_cast<details::AllReduceOpHandle *>(
-            &node->Wrapper<details::OpHandleBase>());
-        if (all_reduce_op_handle) {
-          auto inputs = details::DynamicCast<details::VarHandle>(
-              all_reduce_op_handle->Inputs());
-          PADDLE_ENFORCE_EQ(inputs.size(), num_place);
-          // The inputs' name should be the same.
-          auto &grad_name = inputs[0]->name();
-          for (size_t i = 1; i < inputs.size(); ++i) {
-            PADDLE_ENFORCE_EQ(inputs[i]->name(), grad_name,
-                              "The input name should be the same.");
-          }
-          PADDLE_ENFORCE_NE(grads.count(grad_name), static_cast<size_t>(0));
-          all_reduce_ops.emplace(grad_name, node);
-        }
-      }
-    }
+    std::unordered_map<std::string, Node *> all_reduce_ops =
+        GetAllReduceOps(result, places, grads);
     VLOG(10) << "Find all_reduce_ops: " << all_reduce_ops.size();
     if (all_reduce_ops.size() == 0) {

@@ -82,16 +60,16 @@ class FuseAllReduceOpPass : public ir::Pass {
                       "it is not supported currently.");
     VLOG(10) << "Insert fused_all_reduce";
-    auto &group_grads_params =
-        graph->Get<details::GroupGradsAndParams>(details::kGroupGradsAndParams);
+    auto &group_params_grads =
+        graph->Get<details::GroupParamsAndGrads>(details::kGroupParamsAndGrads);
-    for (auto &group_g_p : group_grads_params) {
-      size_t group_size = group_g_p.size();
+    for (auto &group_p_g : group_params_grads) {
+      size_t group_size = group_p_g.size();
       PADDLE_ENFORCE_GT(group_size, static_cast<size_t>(0));
       std::vector<ir::Node *> group_all_reduce_ops;
       group_all_reduce_ops.reserve(group_size);
-      for (auto &g_p : group_g_p) {
-        group_all_reduce_ops.emplace_back(all_reduce_ops.at(g_p.first));
+      for (auto &p_g : group_p_g) {
+        group_all_reduce_ops.emplace_back(all_reduce_ops.at(p_g.second));
       }
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
       InsertFusedAllReduce(places, local_scopes, group_size,

@@ -103,6 +81,35 @@ class FuseAllReduceOpPass : public ir::Pass {
     }
   }
+  std::unordered_map<std::string, Node *> GetAllReduceOps(
+      const Graph &result, const std::vector<platform::Place> &places,
+      const std::unordered_set<std::string> &grads) const {
+    size_t num_place = places.size();
+    std::unordered_map<std::string, Node *> all_reduce_ops;
+    all_reduce_ops.reserve(grads.size());
+    for (auto &node : result.Nodes()) {
+      if (node->IsOp()) {
+        PADDLE_ENFORCE(node->IsWrappedBy<details::OpHandleBase>());
+        auto *all_reduce_op_handle = dynamic_cast<details::AllReduceOpHandle *>(
+            &node->Wrapper<details::OpHandleBase>());
+        if (all_reduce_op_handle) {
+          auto inputs = details::DynamicCast<details::VarHandle>(
+              all_reduce_op_handle->Inputs());
+          PADDLE_ENFORCE_EQ(inputs.size(), num_place);
+          // The inputs' name should be the same.
+          auto &grad_name = inputs[0]->name();
+          for (size_t i = 1; i < inputs.size(); ++i) {
+            PADDLE_ENFORCE_EQ(inputs[i]->name(), grad_name,
+                              "The input name should be the same.");
+          }
+          PADDLE_ENFORCE_NE(grads.count(grad_name), static_cast<size_t>(0));
+          all_reduce_ops.emplace(grad_name, node);
+        }
+      }
+    }
+    return all_reduce_ops;
+  }
   void InsertFusedAllReduce(const std::vector<platform::Place> &places,
                             const std::vector<Scope *> &local_scopes,
                             const size_t num_of_all_reduce,
paddle/fluid/operators/alloc_continuous_space_op.cc

@@ -227,8 +227,11 @@ REGISTER_OPERATOR(alloc_continuous_space,
                   paddle::operators::AllocContinuousSpaceOp,
                   paddle::operators::AllocContinuousSpaceOpMaker);
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CPU_KERNEL(
     alloc_continuous_space,
+    ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext,
+                                    plat::float16>,
     ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext, int>,
     ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext, float>,
     ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext,

@@ -237,6 +240,8 @@ REGISTER_OP_CPU_KERNEL(
 #ifdef PADDLE_WITH_CUDA
 REGISTER_OP_CUDA_KERNEL(
     alloc_continuous_space,
+    ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext,
+                                    plat::float16>,
     ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext, int>,
     ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext, float>,
     ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext,
paddle/fluid/operators/optimizers/sgd_op.cc

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/optimizers/sgd_op.h"
+#include <string>
 namespace paddle {
 namespace operators {

@@ -46,6 +46,17 @@ class SGDOp : public framework::OperatorWithKernel {
     auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param"));
     return framework::OpKernelType(data_type, ctx.device_context());
   }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string &var_name, const framework::Tensor &tensor,
+      const framework::OpKernelType &expected_kernel_type) const {
+    if (var_name == "LearningRate") {
+      return framework::OpKernelType(tensor.type(), tensor.place(),
+                                     tensor.layout());
+    }
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place(), tensor.layout());
+  }
 };
 class SGDOpInferVarType : public framework::VarTypeInference {
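The new GetKernelTypeForVar override is what lets an fp16 SGD kernel consume an fp32 LearningRate: the learning-rate tensor keeps its own type instead of being transformed to the kernel's expected type. A standalone sketch of that dispatch rule, assuming simplified stand-in Tensor/KernelType structs (not Paddle's real types):

#include <iostream>
#include <string>

enum class DType { FP16, FP32 };

struct Tensor { DType type; };
struct KernelType { DType data_type; };

// Mirrors the override: the learning rate keeps its own type; every
// other input follows the kernel's expected type.
KernelType KernelTypeForVar(const std::string &var_name, const Tensor &t,
                            const KernelType &expected) {
  if (var_name == "LearningRate") return {t.type};
  return {expected.data_type};
}

int main() {
  Tensor lr{DType::FP32};
  KernelType expected{DType::FP16};  // picked from the fp16 Param
  std::cout << (KernelTypeForVar("LearningRate", lr, expected).data_type ==
                DType::FP32)  // prints 1: LR is not cast down to fp16
            << "\n";
}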
paddle/fluid/operators/optimizers/sgd_op.cu

@@ -46,7 +46,7 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows,
       // Atomic Operation to avoid concurrent write error.
       paddle::platform::CudaAtomicAdd(
           tensor_out_ptr + index,
-          -1.0 * learning_rate[0] * selected_rows_ptr[index]);
+          -static_cast<T>(1.0) * learning_rate[0] * selected_rows_ptr[index]);
     }
   }
 }

@@ -122,5 +122,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
-    sgd, ops::SGDOpCUDAKernel<float>, ops::SGDOpCUDAKernel<double>);
+    sgd, ops::SGDOpCUDAKernel<float>, ops::SGDOpCUDAKernel<double>,
+    ops::SGDOpCUDAKernel<plat::float16>);
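The one-character-looking kernel fix deserves a note: -1.0 is a double literal, so the old expression promoted the update to double arithmetic, which is wasteful for float and, for half-precision element types in device code, may not even compile. Casting the constant keeps the whole expression in T. A host-side sketch of the same point (ScaledStep is illustrative, not the kernel itself):

#include <iostream>

// "-1.0 * lr * grad" promotes the product to double; casting the
// constant keeps the arithmetic in T for any element type.
template <typename T>
T ScaledStep(T lr, T grad) {
  return -static_cast<T>(1.0) * lr * grad;
}

int main() {
  std::cout << ScaledStep(0.1f, 2.0f) << "\n";  // -0.2, computed in float
}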
python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py (new file, mode 100644)

# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import paddle.fluid.core as core
import math
import os
import sys
import unittest

import numpy as np
import paddle
import paddle.fluid as fluid
from simple_nets import init_data
from parallel_executor_test_base import TestParallelExecutorBase

batch_size = 12
img_shape = [1, 28, 28]


def loss_net(hidden, label):
    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
    loss = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_loss = fluid.layers.mean(loss)
    return avg_loss


def conv_net(use_feed):
    # The first conv block runs in float16 and the rest in float32, so the
    # graph contains gradients of both dtypes.
    img = fluid.layers.data(name='image', shape=img_shape, dtype='float16')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    conv_pool_1 = fluid.nets.simple_img_conv_pool(
        input=img,
        filter_size=5,
        num_filters=20,
        pool_size=2,
        pool_stride=2,
        act="relu")
    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
    conv_pool_1 = fluid.layers.cast(conv_pool_1, np.float32)
    conv_pool_2 = fluid.nets.simple_img_conv_pool(
        input=conv_pool_1,
        filter_size=5,
        num_filters=50,
        pool_size=2,
        pool_stride=2,
        act="relu")
    hidden = fluid.layers.cast(conv_pool_2, np.float32)
    return loss_net(hidden, label)


def _optimizer(learning_rate=1e-6):
    optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
    return optimizer


class TestResnet(TestParallelExecutorBase):
    def check_model(self, use_cuda):
        img, label = init_data(
            batch_size=batch_size, img_shape=img_shape, label_range=9)
        # float16 ndarrays are fed as their raw uint16 bit pattern.
        img = np.float16(img).view(np.uint16)
        feed_dict = {"image": img, "label": label}

        TestParallelExecutorBase.check_network_convergence(
            conv_net,
            feed_dict=feed_dict,
            iter=10,
            use_cuda=use_cuda,
            fuse_all_reduce_ops=True,
            optimizer=_optimizer)

    def test_model(self):
        if core.is_compiled_with_cuda():
            self.check_model(True)


if __name__ == '__main__':
    unittest.main()