Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
74538573
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
74538573
编写于
7月 04, 2019
作者:
C
chengduo
提交者:
gongweibao
7月 04, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Make fuse_all_reduce_op_pass support mix_precision (#17652)
上级
55baeced
变更
8
隐藏空白更改
内联
并排
Showing
8 changed file
with
332 addition
and
152 deletion
+332
-152
paddle/fluid/framework/details/multi_devices_helper.h
paddle/fluid/framework/details/multi_devices_helper.h
+3
-3
paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc
...luid/framework/ir/alloc_continuous_space_for_grad_pass.cc
+170
-113
paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
...work/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
+9
-2
paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
...rk/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
+38
-31
paddle/fluid/operators/alloc_continuous_space_op.cc
paddle/fluid/operators/alloc_continuous_space_op.cc
+5
-0
paddle/fluid/operators/optimizers/sgd_op.cc
paddle/fluid/operators/optimizers/sgd_op.cc
+12
-1
paddle/fluid/operators/optimizers/sgd_op.cu
paddle/fluid/operators/optimizers/sgd_op.cu
+4
-2
python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py
...uid/tests/unittests/test_mix_precision_all_reduce_fuse.py
+91
-0
未找到文件。
paddle/fluid/framework/details/multi_devices_helper.h
浏览文件 @
74538573
...
...
@@ -58,15 +58,15 @@ constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@";
typedef
std
::
string
FusedOptType
;
constexpr
char
kFusedOptType
[]
=
"fused_opt_type"
;
typedef
std
::
string
FusedGrads
;
typedef
std
::
vector
<
std
::
string
>
FusedGrads
;
constexpr
char
kFusedGrads
[]
=
"fused_gradients"
;
typedef
std
::
vector
<
std
::
pair
<
std
::
string
,
std
::
string
>>
ParamsAndGrads
;
constexpr
char
kParamsAndGrads
[]
=
"params_grads"
;
typedef
std
::
vector
<
std
::
vector
<
std
::
pair
<
std
::
string
,
std
::
string
>>>
Group
GradsAndParam
s
;
constexpr
char
kGroup
GradsAndParams
[]
=
"group_grads_param
s"
;
Group
ParamsAndGrad
s
;
constexpr
char
kGroup
ParamsAndGrads
[]
=
"group_params_grad
s"
;
}
// namespace details
}
// namespace framework
...
...
paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc
浏览文件 @
74538573
...
...
@@ -14,6 +14,7 @@
#include "paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.h"
#include <algorithm>
#include <map>
#include <string>
#include <unordered_map>
#include <unordered_set>
...
...
@@ -52,18 +53,13 @@ static constexpr double kMB = 1048576.0;
void
SetFuseParameterGroupsSize
(
int
group_size
)
{
FLAGS_fuse_parameter_groups_size
=
group_size
;
}
int
GetFuseParameterGroupsSize
()
{
return
FLAGS_fuse_parameter_groups_size
;
}
void
SetFuseParameterMemorySize
(
double
memory_size
)
{
FLAGS_fuse_parameter_memory_size
=
memory_size
;
}
double
GetFuseParameterMemorySize
()
{
return
FLAGS_fuse_parameter_memory_size
;
}
static
framework
::
proto
::
VarType
::
Type
kDefaultDtype
=
framework
::
proto
::
VarType
::
Type
::
VarType_Type_BOOL
;
class
AllocContinuousSpaceForGradPass
:
public
ir
::
Pass
{
protected:
void
ApplyImpl
(
ir
::
Graph
*
graph
)
const
{
...
...
@@ -73,19 +69,16 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
auto
&
local_scopes
=
Get
<
const
std
::
vector
<
Scope
*>>
(
details
::
kLocalScopes
);
ResetAttribute
<
details
::
ParamsAndGrads
>
(
details
::
kParamsAndGrads
,
&
result
);
ResetAttribute
<
details
::
Group
GradsAndParams
>
(
details
::
kGroupGradsAndParam
s
,
ResetAttribute
<
details
::
Group
ParamsAndGrads
>
(
details
::
kGroupParamsAndGrad
s
,
&
result
);
// NOTE: The operator nodes should be in topology order.
std
::
vector
<
ir
::
Node
*>
topo_nodes
=
ir
::
TopologySortOperations
(
result
);
auto
&
params_grads
=
result
.
Get
<
details
::
ParamsAndGrads
>
(
details
::
kParamsAndGrads
);
for
(
auto
&
node
:
topo_nodes
)
{
RecordParamsAndGrads
(
node
,
&
params_grads
);
}
RecordParamsAndGrads
(
result
,
&
params_grads
);
if
(
params_grads
.
size
()
==
0
)
{
LOG
(
INFO
)
<<
"Doesn't find gradients"
;
auto
num_params_grads
=
params_grads
.
size
();
VLOG
(
10
)
<<
"The number of params and grads is:"
<<
num_params_grads
;
if
(
num_params_grads
==
0
)
{
return
;
}
...
...
@@ -101,24 +94,43 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
}
}
auto
&
group_grads_params
=
result
.
Get
<
details
::
GroupGradsAndParams
>
(
details
::
kGroupGradsAndParams
);
// Note: the order of params_grads may be changed by SetGroupGradsAndParams.
SetGroupGradsAndParams
(
var_name2node
,
params_grads
,
&
group_grads_params
);
auto
&
group_params_grads
=
result
.
Get
<
details
::
GroupParamsAndGrads
>
(
details
::
kGroupParamsAndGrads
);
// Note: the order of params_grads may be changed by SetGroupParamsAndGrads.
SetGroupParamsAndGrads
(
var_name2node
,
params_grads
,
&
group_params_grads
);
params_grads
.
clear
();
for
(
auto
&
group_p_g
:
group_grads_params
)
{
params_grads
.
insert
(
params_grads
.
begin
(),
group_p_g
.
begin
(),
params_grads
.
reserve
(
num_params_grads
);
for
(
auto
&
group_p_g
:
group_params_grads
)
{
params_grads
.
insert
(
params_grads
.
end
(),
group_p_g
.
begin
(),
group_p_g
.
end
());
}
for
(
auto
&
p_g
:
params_grads
)
{
std
::
swap
(
p_g
.
first
,
p_g
.
second
);
PADDLE_ENFORCE_EQ
(
num_params_grads
,
params_grads
.
size
(),
"The number of params_grads is not consistent with before."
);
if
(
IsUnifiedDtype
(
params_grads
,
var_name2node
))
{
SetGradientPersistable
(
params_grads
,
var_name2node
,
var_name2node_set
);
AllocContinuousAddressSpace
(
places
,
local_scopes
,
var_name2node
,
params_grads
,
&
result
);
}
else
{
// Set Gradients as Persistable to prevent this var becoming reusable.
for
(
auto
&
sub_param_grad
:
group_params_grads
)
{
SetGradientPersistable
(
params_grads
,
var_name2node
,
var_name2node_set
);
PADDLE_ENFORCE
(
IsUnifiedDtype
(
sub_param_grad
,
var_name2node
),
"The data type of the same group is not consistent."
);
AllocContinuousAddressSpace
(
places
,
local_scopes
,
var_name2node
,
sub_param_grad
,
&
result
);
}
}
}
// Set Gradients as Persistable to prevent this var becoming reusable.
auto
dtype
=
kDefaultDtype
;
for
(
auto
&
p_g
:
params_grads
)
{
void
SetGradientPersistable
(
const
std
::
vector
<
std
::
pair
<
std
::
string
,
std
::
string
>>
&
sub_param_grad
,
const
std
::
unordered_map
<
std
::
string
,
Node
*>
&
var_name2node
,
const
std
::
unordered_map
<
std
::
string
,
std
::
unordered_set
<
ir
::
Node
*>>
&
var_name2node_set
)
const
{
for
(
auto
&
p_g
:
sub_param_grad
)
{
// Get gradient var
auto
iter
=
var_name2node
.
find
(
p_g
.
second
);
PADDLE_ENFORCE
(
iter
!=
var_name2node
.
end
(),
"%s is not found."
,
...
...
@@ -132,32 +144,45 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
}
PADDLE_ENFORCE
(
IsSupportedVarType
(
iter
->
second
->
Var
()
->
GetType
()));
}
}
// Get Dtype
auto
ele_dtype
=
iter
->
second
->
Var
()
->
GetDataType
();
if
(
dtype
==
kDefaultDtype
)
{
dtype
=
ele_dtype
;
PADDLE_ENFORCE_NE
(
ele_dtype
,
kDefaultDtype
,
"The data type should not be bool."
);
bool
IsUnifiedDtype
(
const
details
::
ParamsAndGrads
&
params_grads
,
const
std
::
unordered_map
<
std
::
string
,
Node
*>
&
var_name2node
)
const
{
auto
dtype
=
this
->
GetDtypeOfVar
(
var_name2node
,
params_grads
.
front
().
second
);
for
(
auto
p_g
:
params_grads
)
{
auto
next_dtype
=
this
->
GetDtypeOfVar
(
var_name2node
,
p_g
.
second
);
if
(
next_dtype
!=
dtype
)
{
return
false
;
}
PADDLE_ENFORCE_EQ
(
ele_dtype
,
dtype
,
"The data type of input is not consistent."
);
}
return
true
;
}
void
AllocContinuousAddressSpace
(
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
unordered_map
<
std
::
string
,
Node
*>
&
var_name2node
,
const
details
::
ParamsAndGrads
&
params_grads
,
Graph
*
result
)
const
{
// Create a FusedVarsSet to avoid duplicating names for fused_var in other
// pass.
if
(
!
result
.
Has
(
details
::
kFusedVars
))
{
result
.
Set
(
details
::
kFusedVars
,
new
details
::
FusedVars
);
if
(
!
result
->
Has
(
details
::
kFusedVars
))
{
result
->
Set
(
details
::
kFusedVars
,
new
details
::
FusedVars
);
}
// the kFusedGrads is used be fuse_optimizer_op_pass.
result
.
Set
(
details
::
kFusedGrads
,
new
details
::
FusedGrads
);
if
(
!
result
->
Has
(
details
::
kFusedGrads
))
{
result
->
Set
(
details
::
kFusedGrads
,
new
details
::
FusedGrads
);
}
// the fused_var_name should be unique, so it appends
// params_grads.begin()->second.
auto
fused_var_name
=
std
::
string
(
details
::
kFusedVarNamePrefix
)
+
"@GRAD@"
+
params_grads
.
begin
()
->
second
;
result
.
Get
<
details
::
FusedGrads
>
(
details
::
kFusedGrads
)
=
fused_var_name
;
auto
&
fused_var_set
=
result
.
Get
<
details
::
FusedVars
>
(
details
::
kFusedVars
);
result
->
Get
<
details
::
FusedGrads
>
(
details
::
kFusedGrads
)
.
emplace_back
(
fused_var_name
);
auto
&
fused_var_set
=
result
->
Get
<
details
::
FusedVars
>
(
details
::
kFusedVars
);
PADDLE_ENFORCE_EQ
(
fused_var_set
.
count
(
fused_var_name
),
0
,
"%s is duplicate in FusedVars."
,
fused_var_name
);
fused_var_set
.
insert
(
fused_var_name
);
...
...
@@ -175,109 +200,126 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
graph
->
Set
(
attr_name
,
new
AttrType
);
}
void
SetGroup
GradsAndParam
s
(
void
SetGroup
ParamsAndGrad
s
(
const
std
::
unordered_map
<
std
::
string
,
ir
::
Node
*>
&
var_nodes
,
const
details
::
ParamsAndGrads
&
params_grads
,
details
::
Group
GradsAndParams
*
group_grads_param
s
)
const
{
SetGroupAccordingToLayers
(
var_nodes
,
params_grads
,
group_
grads_param
s
);
SetGroupAccordingToMemorySize
(
var_nodes
,
group_
grads_param
s
);
details
::
Group
ParamsAndGrads
*
group_params_grad
s
)
const
{
SetGroupAccordingToLayers
(
var_nodes
,
params_grads
,
group_
params_grad
s
);
SetGroupAccordingToMemorySize
(
var_nodes
,
group_
params_grad
s
);
}
void
SetGroupAccordingToLayers
(
const
std
::
unordered_map
<
std
::
string
,
ir
::
Node
*>
&
var_nodes
,
const
details
::
ParamsAndGrads
&
params_grads
,
details
::
GroupGradsAndParams
*
group_grads_params
)
const
{
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
int
>>
layer_params
;
details
::
GroupParamsAndGrads
*
group_params_grads
)
const
{
using
var_dtype
=
std
::
pair
<
std
::
string
,
proto
::
VarType
::
Type
>
;
std
::
map
<
var_dtype
,
size_t
>
var_idx
;
for
(
size_t
i
=
0
;
i
<
params_grads
.
size
();
++
i
)
{
auto
pos
=
params_grads
[
i
].
first
.
find_first_of
(
"."
);
auto
dtype
=
GetDtypeOfVar
(
var_nodes
,
params_grads
[
i
].
second
);
var_dtype
var_key
;
if
(
pos
==
std
::
string
::
npos
)
{
layer_params
[
params_grads
[
i
].
first
].
emplace_back
(
i
);
var_key
=
std
::
make_pair
(
params_grads
[
i
].
first
,
dtype
);
}
else
{
layer_params
[
params_grads
[
i
].
first
.
substr
(
0
,
pos
)].
emplace_back
(
i
);
var_key
=
std
::
make_pair
(
params_grads
[
i
].
first
.
substr
(
0
,
pos
),
dtype
);
}
}
group_grads_params
->
reserve
(
layer_params
.
size
());
for
(
size_t
i
=
0
;
i
<
params_grads
.
size
();
++
i
)
{
auto
pos
=
params_grads
[
i
].
first
.
find_first_of
(
"."
);
std
::
string
key
=
params_grads
[
i
].
first
;
if
(
pos
!=
std
::
string
::
npos
)
{
key
=
params_grads
[
i
].
first
.
substr
(
0
,
pos
);
}
auto
iter
=
layer_params
.
find
(
key
);
if
(
iter
==
layer_params
.
end
())
continue
;
group_grads_params
->
emplace_back
();
auto
&
local_group_grads_params
=
group_grads_params
->
back
();
for
(
auto
&
idx
:
iter
->
second
)
{
local_group_grads_params
.
emplace_back
(
std
::
make_pair
(
params_grads
[
idx
].
second
,
params_grads
[
idx
].
first
));
size_t
idx
=
0
;
auto
var_idx_iter
=
var_idx
.
find
(
var_key
);
if
(
var_idx_iter
!=
var_idx
.
end
())
{
idx
=
var_idx_iter
->
second
;
}
else
{
group_params_grads
->
emplace_back
();
idx
=
group_params_grads
->
size
()
-
1
;
var_idx
[
var_key
]
=
idx
;
}
layer_params
.
erase
(
iter
);
auto
&
local_group_params_grads
=
group_params_grads
->
at
(
idx
);
local_group_params_grads
.
emplace_back
(
std
::
make_pair
(
params_grads
[
i
].
first
,
params_grads
[
i
].
second
));
}
VLOG
(
10
)
<<
"SetGroupAccordingToLayers: "
;
if
(
VLOG_IS_ON
(
10
))
{
PrintGroupInfo
(
var_nodes
,
group_grads_params
);
VLOG
(
10
)
<<
"SetGroupAccordingToLayers: "
;
PrintGroupInfo
(
var_nodes
,
group_params_grads
);
}
}
void
PrintGroupInfo
(
const
std
::
unordered_map
<
std
::
string
,
ir
::
Node
*>
&
var_nodes
,
details
::
Group
GradsAndParams
*
group_grads_param
s
)
const
{
for
(
size_t
i
=
0
;
i
<
group_
grads_param
s
->
size
();
++
i
)
{
details
::
Group
ParamsAndGrads
*
group_params_grad
s
)
const
{
for
(
size_t
i
=
0
;
i
<
group_
params_grad
s
->
size
();
++
i
)
{
VLOG
(
10
)
<<
"group "
<<
i
;
std
::
stringstream
out
;
size_t
gps_size
=
0
;
for
(
auto
&
g_p
:
group_grads_param
s
->
at
(
i
))
{
auto
iter
=
var_nodes
.
find
(
g_p
.
second
);
PADDLE_ENFORCE
(
iter
!=
var_nodes
.
end
(),
"%s is not found."
,
g_p
.
second
);
for
(
auto
&
p_g
:
group_params_grad
s
->
at
(
i
))
{
auto
iter
=
var_nodes
.
find
(
p_g
.
first
);
PADDLE_ENFORCE
(
iter
!=
var_nodes
.
end
(),
"%s is not found."
,
p_g
.
first
);
auto
shape
=
iter
->
second
->
Var
()
->
GetShape
();
size_t
size
=
framework
::
SizeOfType
(
iter
->
second
->
Var
()
->
GetDataType
());
std
::
for_each
(
shape
.
begin
(),
shape
.
end
(),
[
&
size
](
const
int64_t
&
n
)
{
size
*=
n
;
});
gps_size
+=
size
;
out
<<
string
::
Sprintf
(
"(%s(%d), %s)"
,
g_p
.
second
,
size
,
g_p
.
first
);
out
<<
string
::
Sprintf
(
"(%s(%d), %s)"
,
p_g
.
first
,
size
,
p_g
.
second
);
}
auto
dtype
=
this
->
GetDtypeOfVar
(
var_nodes
,
group_params_grads
->
at
(
i
).
front
().
first
);
VLOG
(
10
)
<<
out
.
str
()
<<
", group size:"
<<
group_
grads_param
s
->
at
(
i
).
size
()
<<
", group size:"
<<
group_
params_grad
s
->
at
(
i
).
size
()
<<
", group memory size:"
<<
static_cast
<
double
>
(
gps_size
)
/
kMB
<<
"(MB)"
;
<<
"(MB)"
<<
", dtype:"
<<
dtype
;
}
}
void
SetGroupAccordingToMemorySize
(
const
std
::
unordered_map
<
std
::
string
,
ir
::
Node
*>
&
var_nodes
,
details
::
Group
GradsAndParams
*
group_grads_param
s
)
const
{
details
::
Group
ParamsAndGrads
*
group_params_grad
s
)
const
{
const
double
group_memory_size
=
GetFuseParameterMemorySize
();
if
(
group_memory_size
<=
0.0
)
{
return
;
}
details
::
GroupGradsAndParams
local_group_grads_params
;
details
::
GroupParamsAndGrads
local_group_params_grads
;
size_t
j
=
0
;
while
(
j
<
group_grads_params
->
size
())
{
local_group_grads_params
.
emplace_back
();
auto
&
group_p_g
=
local_group_grads_params
.
back
();
while
(
j
<
group_params_grads
->
size
())
{
local_group_params_grads
.
emplace_back
();
auto
&
group_p_g
=
local_group_params_grads
.
back
();
auto
&
grad_name
=
group_params_grads
->
at
(
j
).
front
().
second
;
auto
var_type
=
GetDtypeOfVar
(
var_nodes
,
grad_name
);
size_t
local_group_memory_size
=
0
;
while
(
j
<
group_
grads_param
s
->
size
())
{
while
(
j
<
group_
params_grad
s
->
size
())
{
std
::
for_each
(
group_
grads_params
->
at
(
j
).
begin
(),
group_grads_param
s
->
at
(
j
).
end
(),
group_
params_grads
->
at
(
j
).
begin
(),
group_params_grad
s
->
at
(
j
).
end
(),
[
&
local_group_memory_size
,
&
var_nodes
](
const
std
::
pair
<
std
::
string
,
std
::
string
>
&
g_p
)
{
auto
iter
=
var_nodes
.
find
(
g_p
.
second
);
&
var_nodes
](
const
std
::
pair
<
std
::
string
,
std
::
string
>
&
p_g
)
{
auto
iter
=
var_nodes
.
find
(
p_g
.
second
);
PADDLE_ENFORCE
(
iter
!=
var_nodes
.
end
(),
"%s is not found."
,
g_p
.
second
);
auto
shape
=
iter
->
second
->
Var
()
->
GetShape
();
p_g
.
second
);
size_t
size
=
framework
::
SizeOfType
(
iter
->
second
->
Var
()
->
GetDataType
());
auto
shape
=
iter
->
second
->
Var
()
->
GetShape
();
std
::
for_each
(
shape
.
begin
(),
shape
.
end
(),
[
&
size
](
const
int64_t
&
n
)
{
size
*=
n
;
});
local_group_memory_size
+=
size
;
});
group_p_g
.
insert
(
group_p_g
.
end
(),
group_grads_params
->
at
(
j
).
begin
(),
group_grads_params
->
at
(
j
).
end
());
group_p_g
.
insert
(
group_p_g
.
end
(),
group_params_grads
->
at
(
j
).
begin
(),
group_params_grads
->
at
(
j
).
end
());
++
j
;
if
(
j
>=
group_params_grads
->
size
())
{
break
;
}
if
(
GetFuseParameterGroupsSize
()
>
1
&&
group_p_g
.
size
()
>
static_cast
<
size_t
>
(
GetFuseParameterGroupsSize
()))
{
...
...
@@ -288,49 +330,64 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
group_memory_size
)
{
break
;
}
auto
next_var_type
=
GetDtypeOfVar
(
var_nodes
,
group_params_grads
->
at
(
j
).
front
().
second
);
if
(
next_var_type
!=
var_type
)
{
break
;
}
}
}
std
::
swap
(
*
group_grads_params
,
local_group_grads_params
);
VLOG
(
10
)
<<
string
::
Sprintf
(
"SetGroupAccordingToMemorySize(memory_size: %f):"
,
group_memory_size
);
std
::
swap
(
*
group_params_grads
,
local_group_params_grads
);
if
(
VLOG_IS_ON
(
10
))
{
PrintGroupInfo
(
var_nodes
,
group_grads_params
);
VLOG
(
10
)
<<
string
::
Sprintf
(
"SetGroupAccordingToMemorySize(memory_size: %f):"
,
group_memory_size
);
PrintGroupInfo
(
var_nodes
,
group_params_grads
);
}
}
proto
::
VarType
::
Type
GetDtypeOfVar
(
const
std
::
unordered_map
<
std
::
string
,
Node
*>
&
var_nodes
,
const
std
::
string
&
name
)
const
{
auto
grad_iter
=
var_nodes
.
find
(
name
);
PADDLE_ENFORCE
(
grad_iter
!=
var_nodes
.
end
());
PADDLE_ENFORCE_NOT_NULL
(
grad_iter
->
second
->
Var
());
return
grad_iter
->
second
->
Var
()
->
GetDataType
();
}
private:
bool
IsSupportedVarType
(
const
proto
::
VarType
::
Type
&
type
)
const
{
// Current only support LOD_TENSOR.
return
type
==
proto
::
VarType
::
LOD_TENSOR
;
}
void
RecordParamsAndGrads
(
ir
::
Node
*
node
,
void
RecordParamsAndGrads
(
const
ir
::
Graph
&
graph
,
details
::
ParamsAndGrads
*
params_grads
)
const
{
try
{
bool
is_bk_op
=
static_cast
<
bool
>
(
boost
::
get
<
int
>
(
node
->
Op
()
->
GetAttr
(
OpProtoAndCheckerMaker
::
OpRoleAttrName
()))
&
static_cast
<
int
>
(
OpRole
::
kBackward
));
if
(
!
is_bk_op
)
return
;
// Currently, we assume that once gradient is generated, it can be
// broadcast, and each gradient is only broadcast once.
auto
backward_vars
=
boost
::
get
<
std
::
vector
<
std
::
string
>>
(
node
->
Op
()
->
GetNullableAttr
(
OpProtoAndCheckerMaker
::
OpRoleVarAttrName
()));
PADDLE_ENFORCE_EQ
(
backward_vars
.
size
()
%
2
,
static_cast
<
size_t
>
(
0
));
for
(
size_t
i
=
0
;
i
<
backward_vars
.
size
();
i
+=
2
)
{
VLOG
(
10
)
<<
"Trainable parameter: "
<<
backward_vars
[
i
]
<<
", gradient: "
<<
backward_vars
[
i
+
1
];
params_grads
->
emplace_back
(
std
::
make_pair
(
backward_vars
[
i
]
/*param*/
,
backward_vars
[
i
+
1
]
/*grad*/
));
std
::
vector
<
ir
::
Node
*>
topo_nodes
=
ir
::
TopologySortOperations
(
graph
);
for
(
auto
&
node
:
topo_nodes
)
{
try
{
bool
is_bk_op
=
static_cast
<
bool
>
(
boost
::
get
<
int
>
(
node
->
Op
()
->
GetAttr
(
OpProtoAndCheckerMaker
::
OpRoleAttrName
()))
&
static_cast
<
int
>
(
OpRole
::
kBackward
));
if
(
!
is_bk_op
)
continue
;
// Currently, we assume that once gradient is generated, it can be
// broadcast, and each gradient is only broadcast once.
auto
backward_vars
=
boost
::
get
<
std
::
vector
<
std
::
string
>>
(
node
->
Op
()
->
GetNullableAttr
(
OpProtoAndCheckerMaker
::
OpRoleVarAttrName
()));
PADDLE_ENFORCE_EQ
(
backward_vars
.
size
()
%
2
,
static_cast
<
size_t
>
(
0
));
for
(
size_t
i
=
0
;
i
<
backward_vars
.
size
();
i
+=
2
)
{
VLOG
(
10
)
<<
"Trainable parameter: "
<<
backward_vars
[
i
]
<<
", gradient: "
<<
backward_vars
[
i
+
1
];
params_grads
->
emplace_back
(
std
::
make_pair
(
backward_vars
[
i
]
/*param*/
,
backward_vars
[
i
+
1
]
/*grad*/
));
}
}
catch
(
boost
::
bad_get
e
)
{
}
}
catch
(
boost
::
bad_get
e
)
{
}
}
...
...
paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
浏览文件 @
74538573
...
...
@@ -101,10 +101,17 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
"this pass."
);
}
auto
&
fused_grad
=
result
.
Get
<
details
::
FusedGrads
>
(
details
::
kFusedGrads
);
PADDLE_ENFORCE_NE
(
fused_grad
.
size
(),
0
,
"The fused gradient should not be empty."
);
PADDLE_ENFORCE_EQ
(
fused_grad
.
size
(),
1
,
"Because the dtype of those gradients "
"is not unified, so the number of fused gradients is "
"more than one, but it is not supported currently."
);
auto
&
fused_vars
=
result
.
Get
<
details
::
FusedVars
>
(
details
::
kFusedVars
);
auto
iter
=
std
::
find
(
fused_vars
.
begin
(),
fused_vars
.
end
(),
fused_grad
);
auto
iter
=
std
::
find
(
fused_vars
.
begin
(),
fused_vars
.
end
(),
fused_grad
.
front
());
PADDLE_ENFORCE
(
iter
!=
fused_vars
.
end
(),
"Not find the fused_grad."
);
fused_vars_name
[
kGrad
]
=
fused_grad
;
fused_vars_name
[
kGrad
]
=
fused_grad
.
front
()
;
// Sort the parameters and auxiliary variables according
// to parameters' name to make variables' name correspond correctly.
...
...
paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
浏览文件 @
74538573
...
...
@@ -30,7 +30,6 @@ class FuseAllReduceOpPass : public ir::Pass {
protected:
void
ApplyImpl
(
ir
::
Graph
*
graph
)
const
override
{
ir
::
Graph
&
result
=
*
graph
;
auto
&
places
=
Get
<
const
std
::
vector
<
platform
::
Place
>>
(
details
::
kPlaces
);
auto
&
local_scopes
=
Get
<
const
std
::
vector
<
Scope
*>>
(
details
::
kLocalScopes
);
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
...
...
@@ -38,38 +37,17 @@ class FuseAllReduceOpPass : public ir::Pass {
&
Get
<
platform
::
NCCLCommunicator
>
(
details
::
kNCCLCtxs
);
#endif
std
::
unordered_set
<
std
::
string
>
grads
;
auto
&
params_grads
=
result
.
Get
<
details
::
ParamsAndGrads
>
(
details
::
kParamsAndGrads
);
size_t
num_of_all_reduce
=
params_grads
.
size
();
std
::
unordered_set
<
std
::
string
>
grads
;
grads
.
reserve
(
num_of_all_reduce
);
for
(
auto
p_g
:
params_grads
)
{
grads
.
insert
(
p_g
.
second
);
}
size_t
num_place
=
places
.
size
();
std
::
unordered_map
<
std
::
string
,
ir
::
Node
*>
all_reduce_ops
;
all_reduce_ops
.
reserve
(
grads
.
size
());
for
(
auto
&
node
:
result
.
Nodes
())
{
if
(
node
->
IsOp
())
{
PADDLE_ENFORCE
(
node
->
IsWrappedBy
<
details
::
OpHandleBase
>
());
auto
*
all_reduce_op_handle
=
dynamic_cast
<
details
::
AllReduceOpHandle
*>
(
&
node
->
Wrapper
<
details
::
OpHandleBase
>
());
if
(
all_reduce_op_handle
)
{
auto
inputs
=
details
::
DynamicCast
<
details
::
VarHandle
>
(
all_reduce_op_handle
->
Inputs
());
PADDLE_ENFORCE_EQ
(
inputs
.
size
(),
num_place
);
// The inputs' name should be the same.
auto
&
grad_name
=
inputs
[
0
]
->
name
();
for
(
size_t
i
=
1
;
i
<
inputs
.
size
();
++
i
)
{
PADDLE_ENFORCE_EQ
(
inputs
[
i
]
->
name
(),
grad_name
,
"The input name should be the same."
);
}
PADDLE_ENFORCE_NE
(
grads
.
count
(
grad_name
),
static_cast
<
size_t
>
(
0
));
all_reduce_ops
.
emplace
(
grad_name
,
node
);
}
}
}
std
::
unordered_map
<
std
::
string
,
Node
*>
all_reduce_ops
=
GetAllReduceOps
(
result
,
places
,
grads
);
VLOG
(
10
)
<<
"Find all_reduce_ops: "
<<
all_reduce_ops
.
size
();
if
(
all_reduce_ops
.
size
()
==
0
)
{
...
...
@@ -82,16 +60,16 @@ class FuseAllReduceOpPass : public ir::Pass {
"it is not supported currently."
);
VLOG
(
10
)
<<
"Insert fused_all_reduce"
;
auto
&
group_
grads_param
s
=
graph
->
Get
<
details
::
Group
GradsAndParams
>
(
details
::
kGroupGradsAndParam
s
);
auto
&
group_
params_grad
s
=
graph
->
Get
<
details
::
Group
ParamsAndGrads
>
(
details
::
kGroupParamsAndGrad
s
);
for
(
auto
&
group_
g_p
:
group_grads_param
s
)
{
size_t
group_size
=
group_
g_p
.
size
();
for
(
auto
&
group_
p_g
:
group_params_grad
s
)
{
size_t
group_size
=
group_
p_g
.
size
();
PADDLE_ENFORCE_GT
(
group_size
,
static_cast
<
size_t
>
(
0
));
std
::
vector
<
ir
::
Node
*>
group_all_reduce_ops
;
group_all_reduce_ops
.
reserve
(
group_size
);
for
(
auto
&
g_p
:
group_g_p
)
{
group_all_reduce_ops
.
emplace_back
(
all_reduce_ops
.
at
(
g_p
.
first
));
for
(
auto
&
p_g
:
group_p_g
)
{
group_all_reduce_ops
.
emplace_back
(
all_reduce_ops
.
at
(
p_g
.
second
));
}
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
InsertFusedAllReduce
(
places
,
local_scopes
,
group_size
,
...
...
@@ -103,6 +81,35 @@ class FuseAllReduceOpPass : public ir::Pass {
}
}
std
::
unordered_map
<
std
::
string
,
Node
*>
GetAllReduceOps
(
const
Graph
&
result
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
unordered_set
<
std
::
string
>
&
grads
)
const
{
size_t
num_place
=
places
.
size
();
std
::
unordered_map
<
std
::
string
,
Node
*>
all_reduce_ops
;
all_reduce_ops
.
reserve
(
grads
.
size
());
for
(
auto
&
node
:
result
.
Nodes
())
{
if
(
node
->
IsOp
())
{
PADDLE_ENFORCE
(
node
->
IsWrappedBy
<
details
::
OpHandleBase
>
());
auto
*
all_reduce_op_handle
=
dynamic_cast
<
details
::
AllReduceOpHandle
*>
(
&
node
->
Wrapper
<
details
::
OpHandleBase
>
());
if
(
all_reduce_op_handle
)
{
auto
inputs
=
details
::
DynamicCast
<
details
::
VarHandle
>
(
all_reduce_op_handle
->
Inputs
());
PADDLE_ENFORCE_EQ
(
inputs
.
size
(),
num_place
);
// The inputs' name should be the same.
auto
&
grad_name
=
inputs
[
0
]
->
name
();
for
(
size_t
i
=
1
;
i
<
inputs
.
size
();
++
i
)
{
PADDLE_ENFORCE_EQ
(
inputs
[
i
]
->
name
(),
grad_name
,
"The input name should be the same."
);
}
PADDLE_ENFORCE_NE
(
grads
.
count
(
grad_name
),
static_cast
<
size_t
>
(
0
));
all_reduce_ops
.
emplace
(
grad_name
,
node
);
}
}
}
return
all_reduce_ops
;
}
void
InsertFusedAllReduce
(
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
size_t
num_of_all_reduce
,
...
...
paddle/fluid/operators/alloc_continuous_space_op.cc
浏览文件 @
74538573
...
...
@@ -227,8 +227,11 @@ REGISTER_OPERATOR(alloc_continuous_space,
paddle
::
operators
::
AllocContinuousSpaceOp
,
paddle
::
operators
::
AllocContinuousSpaceOpMaker
);
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CPU_KERNEL
(
alloc_continuous_space
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CPUDeviceContext
,
plat
::
float16
>
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CPUDeviceContext
,
int
>
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CPUDeviceContext
,
...
...
@@ -237,6 +240,8 @@ REGISTER_OP_CPU_KERNEL(
#ifdef PADDLE_WITH_CUDA
REGISTER_OP_CUDA_KERNEL
(
alloc_continuous_space
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
...
...
paddle/fluid/operators/optimizers/sgd_op.cc
浏览文件 @
74538573
...
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/optimizers/sgd_op.h"
#include <string>
namespace
paddle
{
namespace
operators
{
...
...
@@ -46,6 +46,17 @@ class SGDOp : public framework::OperatorWithKernel {
auto
data_type
=
framework
::
GetDataTypeOfVar
(
ctx
.
InputVar
(
"Param"
));
return
framework
::
OpKernelType
(
data_type
,
ctx
.
device_context
());
}
framework
::
OpKernelType
GetKernelTypeForVar
(
const
std
::
string
&
var_name
,
const
framework
::
Tensor
&
tensor
,
const
framework
::
OpKernelType
&
expected_kernel_type
)
const
{
if
(
var_name
==
"LearningRate"
)
{
return
framework
::
OpKernelType
(
tensor
.
type
(),
tensor
.
place
(),
tensor
.
layout
());
}
return
framework
::
OpKernelType
(
expected_kernel_type
.
data_type_
,
tensor
.
place
(),
tensor
.
layout
());
}
};
class
SGDOpInferVarType
:
public
framework
::
VarTypeInference
{
...
...
paddle/fluid/operators/optimizers/sgd_op.cu
浏览文件 @
74538573
...
...
@@ -46,7 +46,7 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows,
// Atomic Operation to avoid concurrent write error.
paddle
::
platform
::
CudaAtomicAdd
(
tensor_out_ptr
+
index
,
-
1.0
*
learning_rate
[
0
]
*
selected_rows_ptr
[
index
]);
-
static_cast
<
T
>
(
1.0
)
*
learning_rate
[
0
]
*
selected_rows_ptr
[
index
]);
}
}
}
...
...
@@ -122,5 +122,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
sgd
,
ops
::
SGDOpCUDAKernel
<
float
>
,
ops
::
SGDOpCUDAKernel
<
double
>
);
ops
::
SGDOpCUDAKernel
<
double
>
,
ops
::
SGDOpCUDAKernel
<
plat
::
float16
>
);
python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py
0 → 100644
浏览文件 @
74538573
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
paddle.fluid.core
as
core
import
math
import
os
import
sys
import
unittest
import
numpy
as
np
import
paddle
import
paddle.fluid
as
fluid
from
simple_nets
import
init_data
from
parallel_executor_test_base
import
TestParallelExecutorBase
batch_size
=
12
img_shape
=
[
1
,
28
,
28
]
def
loss_net
(
hidden
,
label
):
prediction
=
fluid
.
layers
.
fc
(
input
=
hidden
,
size
=
10
,
act
=
'softmax'
)
loss
=
fluid
.
layers
.
cross_entropy
(
input
=
prediction
,
label
=
label
)
avg_loss
=
fluid
.
layers
.
mean
(
loss
)
return
avg_loss
def
conv_net
(
use_feed
):
img
=
fluid
.
layers
.
data
(
name
=
'image'
,
shape
=
img_shape
,
dtype
=
'float16'
)
label
=
fluid
.
layers
.
data
(
name
=
'label'
,
shape
=
[
1
],
dtype
=
'int64'
)
conv_pool_1
=
fluid
.
nets
.
simple_img_conv_pool
(
input
=
img
,
filter_size
=
5
,
num_filters
=
20
,
pool_size
=
2
,
pool_stride
=
2
,
act
=
"relu"
)
conv_pool_1
=
fluid
.
layers
.
batch_norm
(
conv_pool_1
)
conv_pool_1
=
fluid
.
layers
.
cast
(
conv_pool_1
,
np
.
float32
)
conv_pool_2
=
fluid
.
nets
.
simple_img_conv_pool
(
input
=
conv_pool_1
,
filter_size
=
5
,
num_filters
=
50
,
pool_size
=
2
,
pool_stride
=
2
,
act
=
"relu"
)
hidden
=
fluid
.
layers
.
cast
(
conv_pool_2
,
np
.
float32
)
return
loss_net
(
hidden
,
label
)
def
_optimizer
(
learning_rate
=
1e-6
):
optimizer
=
fluid
.
optimizer
.
SGD
(
learning_rate
=
learning_rate
)
return
optimizer
class
TestResnet
(
TestParallelExecutorBase
):
def
check_model
(
self
,
use_cuda
):
img
,
label
=
init_data
(
batch_size
=
batch_size
,
img_shape
=
img_shape
,
label_range
=
9
)
img
=
np
.
float16
(
img
).
view
(
np
.
uint16
)
feed_dict
=
{
"image"
:
img
,
"label"
:
label
}
TestParallelExecutorBase
.
check_network_convergence
(
conv_net
,
feed_dict
=
feed_dict
,
iter
=
10
,
use_cuda
=
use_cuda
,
fuse_all_reduce_ops
=
True
,
optimizer
=
_optimizer
)
def
test_model
(
self
):
if
core
.
is_compiled_with_cuda
():
self
.
check_model
(
True
)
if
__name__
==
'__main__'
:
unittest
.
main
()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录