Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
74538573
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
74538573
编写于
7月 04, 2019
作者:
C
chengduo
提交者:
gongweibao
7月 04, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Make fuse_all_reduce_op_pass support mix_precision (#17652)
上级
55baeced
变更
8
隐藏空白更改
内联
并排
Showing
8 changed file
with
332 addition
and
152 deletion
+332
-152
paddle/fluid/framework/details/multi_devices_helper.h
paddle/fluid/framework/details/multi_devices_helper.h
+3
-3
paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc
...luid/framework/ir/alloc_continuous_space_for_grad_pass.cc
+170
-113
paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
...work/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
+9
-2
paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
...rk/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
+38
-31
paddle/fluid/operators/alloc_continuous_space_op.cc
paddle/fluid/operators/alloc_continuous_space_op.cc
+5
-0
paddle/fluid/operators/optimizers/sgd_op.cc
paddle/fluid/operators/optimizers/sgd_op.cc
+12
-1
paddle/fluid/operators/optimizers/sgd_op.cu
paddle/fluid/operators/optimizers/sgd_op.cu
+4
-2
python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py
...uid/tests/unittests/test_mix_precision_all_reduce_fuse.py
+91
-0
未找到文件。
paddle/fluid/framework/details/multi_devices_helper.h
浏览文件 @
74538573
...
@@ -58,15 +58,15 @@ constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@";
...
@@ -58,15 +58,15 @@ constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@";
typedef
std
::
string
FusedOptType
;
typedef
std
::
string
FusedOptType
;
constexpr
char
kFusedOptType
[]
=
"fused_opt_type"
;
constexpr
char
kFusedOptType
[]
=
"fused_opt_type"
;
typedef
std
::
string
FusedGrads
;
typedef
std
::
vector
<
std
::
string
>
FusedGrads
;
constexpr
char
kFusedGrads
[]
=
"fused_gradients"
;
constexpr
char
kFusedGrads
[]
=
"fused_gradients"
;
typedef
std
::
vector
<
std
::
pair
<
std
::
string
,
std
::
string
>>
ParamsAndGrads
;
typedef
std
::
vector
<
std
::
pair
<
std
::
string
,
std
::
string
>>
ParamsAndGrads
;
constexpr
char
kParamsAndGrads
[]
=
"params_grads"
;
constexpr
char
kParamsAndGrads
[]
=
"params_grads"
;
typedef
std
::
vector
<
std
::
vector
<
std
::
pair
<
std
::
string
,
std
::
string
>>>
typedef
std
::
vector
<
std
::
vector
<
std
::
pair
<
std
::
string
,
std
::
string
>>>
Group
GradsAndParam
s
;
Group
ParamsAndGrad
s
;
constexpr
char
kGroup
GradsAndParams
[]
=
"group_grads_param
s"
;
constexpr
char
kGroup
ParamsAndGrads
[]
=
"group_params_grad
s"
;
}
// namespace details
}
// namespace details
}
// namespace framework
}
// namespace framework
...
...
paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc
浏览文件 @
74538573
...
@@ -14,6 +14,7 @@
...
@@ -14,6 +14,7 @@
#include "paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.h"
#include "paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.h"
#include <algorithm>
#include <algorithm>
#include <map>
#include <string>
#include <string>
#include <unordered_map>
#include <unordered_map>
#include <unordered_set>
#include <unordered_set>
...
@@ -52,18 +53,13 @@ static constexpr double kMB = 1048576.0;
...
@@ -52,18 +53,13 @@ static constexpr double kMB = 1048576.0;
void
SetFuseParameterGroupsSize
(
int
group_size
)
{
void
SetFuseParameterGroupsSize
(
int
group_size
)
{
FLAGS_fuse_parameter_groups_size
=
group_size
;
FLAGS_fuse_parameter_groups_size
=
group_size
;
}
}
int
GetFuseParameterGroupsSize
()
{
return
FLAGS_fuse_parameter_groups_size
;
}
int
GetFuseParameterGroupsSize
()
{
return
FLAGS_fuse_parameter_groups_size
;
}
void
SetFuseParameterMemorySize
(
double
memory_size
)
{
void
SetFuseParameterMemorySize
(
double
memory_size
)
{
FLAGS_fuse_parameter_memory_size
=
memory_size
;
FLAGS_fuse_parameter_memory_size
=
memory_size
;
}
}
double
GetFuseParameterMemorySize
()
{
return
FLAGS_fuse_parameter_memory_size
;
}
double
GetFuseParameterMemorySize
()
{
return
FLAGS_fuse_parameter_memory_size
;
}
static
framework
::
proto
::
VarType
::
Type
kDefaultDtype
=
framework
::
proto
::
VarType
::
Type
::
VarType_Type_BOOL
;
class
AllocContinuousSpaceForGradPass
:
public
ir
::
Pass
{
class
AllocContinuousSpaceForGradPass
:
public
ir
::
Pass
{
protected:
protected:
void
ApplyImpl
(
ir
::
Graph
*
graph
)
const
{
void
ApplyImpl
(
ir
::
Graph
*
graph
)
const
{
...
@@ -73,19 +69,16 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
...
@@ -73,19 +69,16 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
auto
&
local_scopes
=
Get
<
const
std
::
vector
<
Scope
*>>
(
details
::
kLocalScopes
);
auto
&
local_scopes
=
Get
<
const
std
::
vector
<
Scope
*>>
(
details
::
kLocalScopes
);
ResetAttribute
<
details
::
ParamsAndGrads
>
(
details
::
kParamsAndGrads
,
&
result
);
ResetAttribute
<
details
::
ParamsAndGrads
>
(
details
::
kParamsAndGrads
,
&
result
);
ResetAttribute
<
details
::
Group
GradsAndParams
>
(
details
::
kGroupGradsAndParam
s
,
ResetAttribute
<
details
::
Group
ParamsAndGrads
>
(
details
::
kGroupParamsAndGrad
s
,
&
result
);
&
result
);
// NOTE: The operator nodes should be in topology order.
std
::
vector
<
ir
::
Node
*>
topo_nodes
=
ir
::
TopologySortOperations
(
result
);
auto
&
params_grads
=
auto
&
params_grads
=
result
.
Get
<
details
::
ParamsAndGrads
>
(
details
::
kParamsAndGrads
);
result
.
Get
<
details
::
ParamsAndGrads
>
(
details
::
kParamsAndGrads
);
for
(
auto
&
node
:
topo_nodes
)
{
RecordParamsAndGrads
(
result
,
&
params_grads
);
RecordParamsAndGrads
(
node
,
&
params_grads
);
}
if
(
params_grads
.
size
()
==
0
)
{
auto
num_params_grads
=
params_grads
.
size
();
LOG
(
INFO
)
<<
"Doesn't find gradients"
;
VLOG
(
10
)
<<
"The number of params and grads is:"
<<
num_params_grads
;
if
(
num_params_grads
==
0
)
{
return
;
return
;
}
}
...
@@ -101,24 +94,43 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
...
@@ -101,24 +94,43 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
}
}
}
}
auto
&
group_grads_params
=
auto
&
group_params_grads
=
result
.
Get
<
details
::
GroupGradsAndParams
>
(
details
::
kGroupGradsAndParams
);
result
.
Get
<
details
::
GroupParamsAndGrads
>
(
details
::
kGroupParamsAndGrads
);
// Note: the order of params_grads may be changed by SetGroupParamsAndGrads.
// Note: the order of params_grads may be changed by SetGroupGradsAndParams.
SetGroupParamsAndGrads
(
var_name2node
,
params_grads
,
&
group_params_grads
);
SetGroupGradsAndParams
(
var_name2node
,
params_grads
,
&
group_grads_params
);
params_grads
.
clear
();
params_grads
.
clear
();
for
(
auto
&
group_p_g
:
group_grads_params
)
{
params_grads
.
reserve
(
num_params_grads
);
params_grads
.
insert
(
params_grads
.
begin
(),
group_p_g
.
begin
(),
for
(
auto
&
group_p_g
:
group_params_grads
)
{
params_grads
.
insert
(
params_grads
.
end
(),
group_p_g
.
begin
(),
group_p_g
.
end
());
group_p_g
.
end
());
}
}
for
(
auto
&
p_g
:
params_grads
)
{
PADDLE_ENFORCE_EQ
(
std
::
swap
(
p_g
.
first
,
p_g
.
second
);
num_params_grads
,
params_grads
.
size
(),
"The number of params_grads is not consistent with before."
);
if
(
IsUnifiedDtype
(
params_grads
,
var_name2node
))
{
SetGradientPersistable
(
params_grads
,
var_name2node
,
var_name2node_set
);
AllocContinuousAddressSpace
(
places
,
local_scopes
,
var_name2node
,
params_grads
,
&
result
);
}
else
{
// Set Gradients as Persistable to prevent this var becoming reusable.
for
(
auto
&
sub_param_grad
:
group_params_grads
)
{
SetGradientPersistable
(
params_grads
,
var_name2node
,
var_name2node_set
);
PADDLE_ENFORCE
(
IsUnifiedDtype
(
sub_param_grad
,
var_name2node
),
"The data type of the same group is not consistent."
);
AllocContinuousAddressSpace
(
places
,
local_scopes
,
var_name2node
,
sub_param_grad
,
&
result
);
}
}
}
}
// Set Gradients as Persistable to prevent this var becoming reusable.
void
SetGradientPersistable
(
auto
dtype
=
kDefaultDtype
;
const
std
::
vector
<
std
::
pair
<
std
::
string
,
std
::
string
>>
&
sub_param_grad
,
for
(
auto
&
p_g
:
params_grads
)
{
const
std
::
unordered_map
<
std
::
string
,
Node
*>
&
var_name2node
,
const
std
::
unordered_map
<
std
::
string
,
std
::
unordered_set
<
ir
::
Node
*>>
&
var_name2node_set
)
const
{
for
(
auto
&
p_g
:
sub_param_grad
)
{
// Get gradient var
// Get gradient var
auto
iter
=
var_name2node
.
find
(
p_g
.
second
);
auto
iter
=
var_name2node
.
find
(
p_g
.
second
);
PADDLE_ENFORCE
(
iter
!=
var_name2node
.
end
(),
"%s is not found."
,
PADDLE_ENFORCE
(
iter
!=
var_name2node
.
end
(),
"%s is not found."
,
...
@@ -132,32 +144,45 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
...
@@ -132,32 +144,45 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
}
}
PADDLE_ENFORCE
(
IsSupportedVarType
(
iter
->
second
->
Var
()
->
GetType
()));
PADDLE_ENFORCE
(
IsSupportedVarType
(
iter
->
second
->
Var
()
->
GetType
()));
}
}
// Get Dtype
bool
IsUnifiedDtype
(
auto
ele_dtype
=
iter
->
second
->
Var
()
->
GetDataType
();
const
details
::
ParamsAndGrads
&
params_grads
,
if
(
dtype
==
kDefaultDtype
)
{
const
std
::
unordered_map
<
std
::
string
,
Node
*>
&
var_name2node
)
const
{
dtype
=
ele_dtype
;
auto
dtype
=
PADDLE_ENFORCE_NE
(
ele_dtype
,
kDefaultDtype
,
this
->
GetDtypeOfVar
(
var_name2node
,
params_grads
.
front
().
second
);
"The data type should not be bool."
);
for
(
auto
p_g
:
params_grads
)
{
auto
next_dtype
=
this
->
GetDtypeOfVar
(
var_name2node
,
p_g
.
second
);
if
(
next_dtype
!=
dtype
)
{
return
false
;
}
}
PADDLE_ENFORCE_EQ
(
ele_dtype
,
dtype
,
"The data type of input is not consistent."
);
}
}
return
true
;
}
void
AllocContinuousAddressSpace
(
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
unordered_map
<
std
::
string
,
Node
*>
&
var_name2node
,
const
details
::
ParamsAndGrads
&
params_grads
,
Graph
*
result
)
const
{
// Create a FusedVarsSet to avoid duplicating names for fused_var in other
// Create a FusedVarsSet to avoid duplicating names for fused_var in other
// pass.
// pass.
if
(
!
result
.
Has
(
details
::
kFusedVars
))
{
if
(
!
result
->
Has
(
details
::
kFusedVars
))
{
result
.
Set
(
details
::
kFusedVars
,
new
details
::
FusedVars
);
result
->
Set
(
details
::
kFusedVars
,
new
details
::
FusedVars
);
}
}
// the kFusedGrads is used be fuse_optimizer_op_pass.
// the kFusedGrads is used be fuse_optimizer_op_pass.
result
.
Set
(
details
::
kFusedGrads
,
new
details
::
FusedGrads
);
if
(
!
result
->
Has
(
details
::
kFusedGrads
))
{
result
->
Set
(
details
::
kFusedGrads
,
new
details
::
FusedGrads
);
}
// the fused_var_name should be unique, so it appends
// the fused_var_name should be unique, so it appends
// params_grads.begin()->second.
// params_grads.begin()->second.
auto
fused_var_name
=
std
::
string
(
details
::
kFusedVarNamePrefix
)
+
"@GRAD@"
+
auto
fused_var_name
=
std
::
string
(
details
::
kFusedVarNamePrefix
)
+
"@GRAD@"
+
params_grads
.
begin
()
->
second
;
params_grads
.
begin
()
->
second
;
result
.
Get
<
details
::
FusedGrads
>
(
details
::
kFusedGrads
)
=
fused_var_name
;
result
->
Get
<
details
::
FusedGrads
>
(
details
::
kFusedGrads
)
auto
&
fused_var_set
=
result
.
Get
<
details
::
FusedVars
>
(
details
::
kFusedVars
);
.
emplace_back
(
fused_var_name
);
auto
&
fused_var_set
=
result
->
Get
<
details
::
FusedVars
>
(
details
::
kFusedVars
);
PADDLE_ENFORCE_EQ
(
fused_var_set
.
count
(
fused_var_name
),
0
,
PADDLE_ENFORCE_EQ
(
fused_var_set
.
count
(
fused_var_name
),
0
,
"%s is duplicate in FusedVars."
,
fused_var_name
);
"%s is duplicate in FusedVars."
,
fused_var_name
);
fused_var_set
.
insert
(
fused_var_name
);
fused_var_set
.
insert
(
fused_var_name
);
...
@@ -175,109 +200,126 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
...
@@ -175,109 +200,126 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
graph
->
Set
(
attr_name
,
new
AttrType
);
graph
->
Set
(
attr_name
,
new
AttrType
);
}
}
void
SetGroup
GradsAndParam
s
(
void
SetGroup
ParamsAndGrad
s
(
const
std
::
unordered_map
<
std
::
string
,
ir
::
Node
*>
&
var_nodes
,
const
std
::
unordered_map
<
std
::
string
,
ir
::
Node
*>
&
var_nodes
,
const
details
::
ParamsAndGrads
&
params_grads
,
const
details
::
ParamsAndGrads
&
params_grads
,
details
::
Group
GradsAndParams
*
group_grads_param
s
)
const
{
details
::
Group
ParamsAndGrads
*
group_params_grad
s
)
const
{
SetGroupAccordingToLayers
(
var_nodes
,
params_grads
,
group_
grads_param
s
);
SetGroupAccordingToLayers
(
var_nodes
,
params_grads
,
group_
params_grad
s
);
SetGroupAccordingToMemorySize
(
var_nodes
,
group_
grads_param
s
);
SetGroupAccordingToMemorySize
(
var_nodes
,
group_
params_grad
s
);
}
}
void
SetGroupAccordingToLayers
(
void
SetGroupAccordingToLayers
(
const
std
::
unordered_map
<
std
::
string
,
ir
::
Node
*>
&
var_nodes
,
const
std
::
unordered_map
<
std
::
string
,
ir
::
Node
*>
&
var_nodes
,
const
details
::
ParamsAndGrads
&
params_grads
,
const
details
::
ParamsAndGrads
&
params_grads
,
details
::
GroupGradsAndParams
*
group_grads_params
)
const
{
details
::
GroupParamsAndGrads
*
group_params_grads
)
const
{
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
int
>>
layer_params
;
using
var_dtype
=
std
::
pair
<
std
::
string
,
proto
::
VarType
::
Type
>
;
std
::
map
<
var_dtype
,
size_t
>
var_idx
;
for
(
size_t
i
=
0
;
i
<
params_grads
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
params_grads
.
size
();
++
i
)
{
auto
pos
=
params_grads
[
i
].
first
.
find_first_of
(
"."
);
auto
pos
=
params_grads
[
i
].
first
.
find_first_of
(
"."
);
auto
dtype
=
GetDtypeOfVar
(
var_nodes
,
params_grads
[
i
].
second
);
var_dtype
var_key
;
if
(
pos
==
std
::
string
::
npos
)
{
if
(
pos
==
std
::
string
::
npos
)
{
layer_params
[
params_grads
[
i
].
first
].
emplace_back
(
i
);
var_key
=
std
::
make_pair
(
params_grads
[
i
].
first
,
dtype
);
}
else
{
}
else
{
layer_params
[
params_grads
[
i
].
first
.
substr
(
0
,
pos
)].
emplace_back
(
i
);
var_key
=
std
::
make_pair
(
params_grads
[
i
].
first
.
substr
(
0
,
pos
),
dtype
);
}
}
}
group_grads_params
->
reserve
(
layer_params
.
size
());
size_t
idx
=
0
;
for
(
size_t
i
=
0
;
i
<
params_grads
.
size
();
++
i
)
{
auto
var_idx_iter
=
var_idx
.
find
(
var_key
);
auto
pos
=
params_grads
[
i
].
first
.
find_first_of
(
"."
);
if
(
var_idx_iter
!=
var_idx
.
end
())
{
std
::
string
key
=
params_grads
[
i
].
first
;
idx
=
var_idx_iter
->
second
;
if
(
pos
!=
std
::
string
::
npos
)
{
}
else
{
key
=
params_grads
[
i
].
first
.
substr
(
0
,
pos
);
group_params_grads
->
emplace_back
();
}
idx
=
group_params_grads
->
size
()
-
1
;
auto
iter
=
layer_params
.
find
(
key
);
var_idx
[
var_key
]
=
idx
;
if
(
iter
==
layer_params
.
end
())
continue
;
group_grads_params
->
emplace_back
();
auto
&
local_group_grads_params
=
group_grads_params
->
back
();
for
(
auto
&
idx
:
iter
->
second
)
{
local_group_grads_params
.
emplace_back
(
std
::
make_pair
(
params_grads
[
idx
].
second
,
params_grads
[
idx
].
first
));
}
}
layer_params
.
erase
(
iter
);
auto
&
local_group_params_grads
=
group_params_grads
->
at
(
idx
);
local_group_params_grads
.
emplace_back
(
std
::
make_pair
(
params_grads
[
i
].
first
,
params_grads
[
i
].
second
));
}
}
VLOG
(
10
)
<<
"SetGroupAccordingToLayers: "
;
if
(
VLOG_IS_ON
(
10
))
{
if
(
VLOG_IS_ON
(
10
))
{
PrintGroupInfo
(
var_nodes
,
group_grads_params
);
VLOG
(
10
)
<<
"SetGroupAccordingToLayers: "
;
PrintGroupInfo
(
var_nodes
,
group_params_grads
);
}
}
}
}
void
PrintGroupInfo
(
void
PrintGroupInfo
(
const
std
::
unordered_map
<
std
::
string
,
ir
::
Node
*>
&
var_nodes
,
const
std
::
unordered_map
<
std
::
string
,
ir
::
Node
*>
&
var_nodes
,
details
::
Group
GradsAndParams
*
group_grads_param
s
)
const
{
details
::
Group
ParamsAndGrads
*
group_params_grad
s
)
const
{
for
(
size_t
i
=
0
;
i
<
group_
grads_param
s
->
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
group_
params_grad
s
->
size
();
++
i
)
{
VLOG
(
10
)
<<
"group "
<<
i
;
VLOG
(
10
)
<<
"group "
<<
i
;
std
::
stringstream
out
;
std
::
stringstream
out
;
size_t
gps_size
=
0
;
size_t
gps_size
=
0
;
for
(
auto
&
g_p
:
group_grads_param
s
->
at
(
i
))
{
for
(
auto
&
p_g
:
group_params_grad
s
->
at
(
i
))
{
auto
iter
=
var_nodes
.
find
(
g_p
.
second
);
auto
iter
=
var_nodes
.
find
(
p_g
.
first
);
PADDLE_ENFORCE
(
iter
!=
var_nodes
.
end
(),
"%s is not found."
,
g_p
.
second
);
PADDLE_ENFORCE
(
iter
!=
var_nodes
.
end
(),
"%s is not found."
,
p_g
.
first
);
auto
shape
=
iter
->
second
->
Var
()
->
GetShape
();
auto
shape
=
iter
->
second
->
Var
()
->
GetShape
();
size_t
size
=
framework
::
SizeOfType
(
iter
->
second
->
Var
()
->
GetDataType
());
size_t
size
=
framework
::
SizeOfType
(
iter
->
second
->
Var
()
->
GetDataType
());
std
::
for_each
(
shape
.
begin
(),
shape
.
end
(),
std
::
for_each
(
shape
.
begin
(),
shape
.
end
(),
[
&
size
](
const
int64_t
&
n
)
{
size
*=
n
;
});
[
&
size
](
const
int64_t
&
n
)
{
size
*=
n
;
});
gps_size
+=
size
;
gps_size
+=
size
;
out
<<
string
::
Sprintf
(
"(%s(%d), %s)"
,
g_p
.
second
,
size
,
g_p
.
first
);
out
<<
string
::
Sprintf
(
"(%s(%d), %s)"
,
p_g
.
first
,
size
,
p_g
.
second
);
}
}
auto
dtype
=
this
->
GetDtypeOfVar
(
var_nodes
,
group_params_grads
->
at
(
i
).
front
().
first
);
VLOG
(
10
)
<<
out
.
str
()
VLOG
(
10
)
<<
out
.
str
()
<<
", group size:"
<<
group_
grads_param
s
->
at
(
i
).
size
()
<<
", group size:"
<<
group_
params_grad
s
->
at
(
i
).
size
()
<<
", group memory size:"
<<
static_cast
<
double
>
(
gps_size
)
/
kMB
<<
", group memory size:"
<<
static_cast
<
double
>
(
gps_size
)
/
kMB
<<
"(MB)"
;
<<
"(MB)"
<<
", dtype:"
<<
dtype
;
}
}
}
}
void
SetGroupAccordingToMemorySize
(
void
SetGroupAccordingToMemorySize
(
const
std
::
unordered_map
<
std
::
string
,
ir
::
Node
*>
&
var_nodes
,
const
std
::
unordered_map
<
std
::
string
,
ir
::
Node
*>
&
var_nodes
,
details
::
Group
GradsAndParams
*
group_grads_param
s
)
const
{
details
::
Group
ParamsAndGrads
*
group_params_grad
s
)
const
{
const
double
group_memory_size
=
GetFuseParameterMemorySize
();
const
double
group_memory_size
=
GetFuseParameterMemorySize
();
if
(
group_memory_size
<=
0.0
)
{
if
(
group_memory_size
<=
0.0
)
{
return
;
return
;
}
}
details
::
GroupGradsAndParams
local_group_grads_params
;
details
::
GroupParamsAndGrads
local_group_params_grads
;
size_t
j
=
0
;
size_t
j
=
0
;
while
(
j
<
group_grads_params
->
size
())
{
while
(
j
<
group_params_grads
->
size
())
{
local_group_grads_params
.
emplace_back
();
local_group_params_grads
.
emplace_back
();
auto
&
group_p_g
=
local_group_grads_params
.
back
();
auto
&
group_p_g
=
local_group_params_grads
.
back
();
auto
&
grad_name
=
group_params_grads
->
at
(
j
).
front
().
second
;
auto
var_type
=
GetDtypeOfVar
(
var_nodes
,
grad_name
);
size_t
local_group_memory_size
=
0
;
size_t
local_group_memory_size
=
0
;
while
(
j
<
group_
grads_param
s
->
size
())
{
while
(
j
<
group_
params_grad
s
->
size
())
{
std
::
for_each
(
std
::
for_each
(
group_
grads_params
->
at
(
j
).
begin
(),
group_grads_param
s
->
at
(
j
).
end
(),
group_
params_grads
->
at
(
j
).
begin
(),
group_params_grad
s
->
at
(
j
).
end
(),
[
&
local_group_memory_size
,
[
&
local_group_memory_size
,
&
var_nodes
](
const
std
::
pair
<
std
::
string
,
std
::
string
>
&
g_p
)
{
&
var_nodes
](
const
std
::
pair
<
std
::
string
,
std
::
string
>
&
p_g
)
{
auto
iter
=
var_nodes
.
find
(
g_p
.
second
);
auto
iter
=
var_nodes
.
find
(
p_g
.
second
);
PADDLE_ENFORCE
(
iter
!=
var_nodes
.
end
(),
"%s is not found."
,
PADDLE_ENFORCE
(
iter
!=
var_nodes
.
end
(),
"%s is not found."
,
g_p
.
second
);
p_g
.
second
);
auto
shape
=
iter
->
second
->
Var
()
->
GetShape
();
size_t
size
=
size_t
size
=
framework
::
SizeOfType
(
iter
->
second
->
Var
()
->
GetDataType
());
framework
::
SizeOfType
(
iter
->
second
->
Var
()
->
GetDataType
());
auto
shape
=
iter
->
second
->
Var
()
->
GetShape
();
std
::
for_each
(
shape
.
begin
(),
shape
.
end
(),
std
::
for_each
(
shape
.
begin
(),
shape
.
end
(),
[
&
size
](
const
int64_t
&
n
)
{
size
*=
n
;
});
[
&
size
](
const
int64_t
&
n
)
{
size
*=
n
;
});
local_group_memory_size
+=
size
;
local_group_memory_size
+=
size
;
});
});
group_p_g
.
insert
(
group_p_g
.
end
(),
group_grads_params
->
at
(
j
).
begin
(),
group_grads_params
->
at
(
j
).
end
());
group_p_g
.
insert
(
group_p_g
.
end
(),
group_params_grads
->
at
(
j
).
begin
(),
group_params_grads
->
at
(
j
).
end
());
++
j
;
++
j
;
if
(
j
>=
group_params_grads
->
size
())
{
break
;
}
if
(
GetFuseParameterGroupsSize
()
>
1
&&
if
(
GetFuseParameterGroupsSize
()
>
1
&&
group_p_g
.
size
()
>
group_p_g
.
size
()
>
static_cast
<
size_t
>
(
GetFuseParameterGroupsSize
()))
{
static_cast
<
size_t
>
(
GetFuseParameterGroupsSize
()))
{
...
@@ -288,49 +330,64 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
...
@@ -288,49 +330,64 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
group_memory_size
)
{
group_memory_size
)
{
break
;
break
;
}
}
auto
next_var_type
=
GetDtypeOfVar
(
var_nodes
,
group_params_grads
->
at
(
j
).
front
().
second
);
if
(
next_var_type
!=
var_type
)
{
break
;
}
}
}
}
}
std
::
swap
(
*
group_grads_params
,
local_group_grads_params
);
std
::
swap
(
*
group_params_grads
,
local_group_params_grads
);
VLOG
(
10
)
<<
string
::
Sprintf
(
"SetGroupAccordingToMemorySize(memory_size: %f):"
,
group_memory_size
);
if
(
VLOG_IS_ON
(
10
))
{
if
(
VLOG_IS_ON
(
10
))
{
PrintGroupInfo
(
var_nodes
,
group_grads_params
);
VLOG
(
10
)
<<
string
::
Sprintf
(
"SetGroupAccordingToMemorySize(memory_size: %f):"
,
group_memory_size
);
PrintGroupInfo
(
var_nodes
,
group_params_grads
);
}
}
}
}
proto
::
VarType
::
Type
GetDtypeOfVar
(
const
std
::
unordered_map
<
std
::
string
,
Node
*>
&
var_nodes
,
const
std
::
string
&
name
)
const
{
auto
grad_iter
=
var_nodes
.
find
(
name
);
PADDLE_ENFORCE
(
grad_iter
!=
var_nodes
.
end
());
PADDLE_ENFORCE_NOT_NULL
(
grad_iter
->
second
->
Var
());
return
grad_iter
->
second
->
Var
()
->
GetDataType
();
}
private:
private:
bool
IsSupportedVarType
(
const
proto
::
VarType
::
Type
&
type
)
const
{
bool
IsSupportedVarType
(
const
proto
::
VarType
::
Type
&
type
)
const
{
// Current only support LOD_TENSOR.
// Current only support LOD_TENSOR.
return
type
==
proto
::
VarType
::
LOD_TENSOR
;
return
type
==
proto
::
VarType
::
LOD_TENSOR
;
}
}
void
RecordParamsAndGrads
(
ir
::
Node
*
node
,
void
RecordParamsAndGrads
(
const
ir
::
Graph
&
graph
,
details
::
ParamsAndGrads
*
params_grads
)
const
{
details
::
ParamsAndGrads
*
params_grads
)
const
{
try
{
std
::
vector
<
ir
::
Node
*>
topo_nodes
=
ir
::
TopologySortOperations
(
graph
);
bool
is_bk_op
=
for
(
auto
&
node
:
topo_nodes
)
{
static_cast
<
bool
>
(
boost
::
get
<
int
>
(
node
->
Op
()
->
GetAttr
(
try
{
OpProtoAndCheckerMaker
::
OpRoleAttrName
()))
&
bool
is_bk_op
=
static_cast
<
int
>
(
OpRole
::
kBackward
));
static_cast
<
bool
>
(
boost
::
get
<
int
>
(
node
->
Op
()
->
GetAttr
(
if
(
!
is_bk_op
)
return
;
OpProtoAndCheckerMaker
::
OpRoleAttrName
()))
&
static_cast
<
int
>
(
OpRole
::
kBackward
));
// Currently, we assume that once gradient is generated, it can be
if
(
!
is_bk_op
)
continue
;
// broadcast, and each gradient is only broadcast once.
// Currently, we assume that once gradient is generated, it can be
auto
backward_vars
=
// broadcast, and each gradient is only broadcast once.
boost
::
get
<
std
::
vector
<
std
::
string
>>
(
node
->
Op
()
->
GetNullableAttr
(
auto
backward_vars
=
OpProtoAndCheckerMaker
::
OpRoleVarAttrName
()));
boost
::
get
<
std
::
vector
<
std
::
string
>>
(
node
->
Op
()
->
GetNullableAttr
(
PADDLE_ENFORCE_EQ
(
backward_vars
.
size
()
%
2
,
static_cast
<
size_t
>
(
0
));
OpProtoAndCheckerMaker
::
OpRoleVarAttrName
()));
PADDLE_ENFORCE_EQ
(
backward_vars
.
size
()
%
2
,
static_cast
<
size_t
>
(
0
));
for
(
size_t
i
=
0
;
i
<
backward_vars
.
size
();
i
+=
2
)
{
for
(
size_t
i
=
0
;
i
<
backward_vars
.
size
();
i
+=
2
)
{
VLOG
(
10
)
<<
"Trainable parameter: "
<<
backward_vars
[
i
]
VLOG
(
10
)
<<
"Trainable parameter: "
<<
backward_vars
[
i
]
<<
", gradient: "
<<
backward_vars
[
i
+
1
];
<<
", gradient: "
<<
backward_vars
[
i
+
1
];
params_grads
->
emplace_back
(
std
::
make_pair
(
params_grads
->
emplace_back
(
std
::
make_pair
(
backward_vars
[
i
]
/*param*/
,
backward_vars
[
i
+
1
]
/*grad*/
));
backward_vars
[
i
]
/*param*/
,
backward_vars
[
i
+
1
]
/*grad*/
));
}
}
catch
(
boost
::
bad_get
e
)
{
}
}
}
catch
(
boost
::
bad_get
e
)
{
}
}
}
}
...
...
paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
浏览文件 @
74538573
...
@@ -101,10 +101,17 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
...
@@ -101,10 +101,17 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
"this pass."
);
"this pass."
);
}
}
auto
&
fused_grad
=
result
.
Get
<
details
::
FusedGrads
>
(
details
::
kFusedGrads
);
auto
&
fused_grad
=
result
.
Get
<
details
::
FusedGrads
>
(
details
::
kFusedGrads
);
PADDLE_ENFORCE_NE
(
fused_grad
.
size
(),
0
,
"The fused gradient should not be empty."
);
PADDLE_ENFORCE_EQ
(
fused_grad
.
size
(),
1
,
"Because the dtype of those gradients "
"is not unified, so the number of fused gradients is "
"more than one, but it is not supported currently."
);
auto
&
fused_vars
=
result
.
Get
<
details
::
FusedVars
>
(
details
::
kFusedVars
);
auto
&
fused_vars
=
result
.
Get
<
details
::
FusedVars
>
(
details
::
kFusedVars
);
auto
iter
=
std
::
find
(
fused_vars
.
begin
(),
fused_vars
.
end
(),
fused_grad
);
auto
iter
=
std
::
find
(
fused_vars
.
begin
(),
fused_vars
.
end
(),
fused_grad
.
front
());
PADDLE_ENFORCE
(
iter
!=
fused_vars
.
end
(),
"Not find the fused_grad."
);
PADDLE_ENFORCE
(
iter
!=
fused_vars
.
end
(),
"Not find the fused_grad."
);
fused_vars_name
[
kGrad
]
=
fused_grad
;
fused_vars_name
[
kGrad
]
=
fused_grad
.
front
()
;
// Sort the parameters and auxiliary variables according
// Sort the parameters and auxiliary variables according
// to parameters' name to make variables' name correspond correctly.
// to parameters' name to make variables' name correspond correctly.
...
...
paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
浏览文件 @
74538573
...
@@ -30,7 +30,6 @@ class FuseAllReduceOpPass : public ir::Pass {
...
@@ -30,7 +30,6 @@ class FuseAllReduceOpPass : public ir::Pass {
protected:
protected:
void
ApplyImpl
(
ir
::
Graph
*
graph
)
const
override
{
void
ApplyImpl
(
ir
::
Graph
*
graph
)
const
override
{
ir
::
Graph
&
result
=
*
graph
;
ir
::
Graph
&
result
=
*
graph
;
auto
&
places
=
Get
<
const
std
::
vector
<
platform
::
Place
>>
(
details
::
kPlaces
);
auto
&
places
=
Get
<
const
std
::
vector
<
platform
::
Place
>>
(
details
::
kPlaces
);
auto
&
local_scopes
=
Get
<
const
std
::
vector
<
Scope
*>>
(
details
::
kLocalScopes
);
auto
&
local_scopes
=
Get
<
const
std
::
vector
<
Scope
*>>
(
details
::
kLocalScopes
);
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
...
@@ -38,38 +37,17 @@ class FuseAllReduceOpPass : public ir::Pass {
...
@@ -38,38 +37,17 @@ class FuseAllReduceOpPass : public ir::Pass {
&
Get
<
platform
::
NCCLCommunicator
>
(
details
::
kNCCLCtxs
);
&
Get
<
platform
::
NCCLCommunicator
>
(
details
::
kNCCLCtxs
);
#endif
#endif
std
::
unordered_set
<
std
::
string
>
grads
;
auto
&
params_grads
=
auto
&
params_grads
=
result
.
Get
<
details
::
ParamsAndGrads
>
(
details
::
kParamsAndGrads
);
result
.
Get
<
details
::
ParamsAndGrads
>
(
details
::
kParamsAndGrads
);
size_t
num_of_all_reduce
=
params_grads
.
size
();
size_t
num_of_all_reduce
=
params_grads
.
size
();
std
::
unordered_set
<
std
::
string
>
grads
;
grads
.
reserve
(
num_of_all_reduce
);
grads
.
reserve
(
num_of_all_reduce
);
for
(
auto
p_g
:
params_grads
)
{
for
(
auto
p_g
:
params_grads
)
{
grads
.
insert
(
p_g
.
second
);
grads
.
insert
(
p_g
.
second
);
}
}
size_t
num_place
=
places
.
size
();
std
::
unordered_map
<
std
::
string
,
Node
*>
all_reduce_ops
=
std
::
unordered_map
<
std
::
string
,
ir
::
Node
*>
all_reduce_ops
;
GetAllReduceOps
(
result
,
places
,
grads
);
all_reduce_ops
.
reserve
(
grads
.
size
());
for
(
auto
&
node
:
result
.
Nodes
())
{
if
(
node
->
IsOp
())
{
PADDLE_ENFORCE
(
node
->
IsWrappedBy
<
details
::
OpHandleBase
>
());
auto
*
all_reduce_op_handle
=
dynamic_cast
<
details
::
AllReduceOpHandle
*>
(
&
node
->
Wrapper
<
details
::
OpHandleBase
>
());
if
(
all_reduce_op_handle
)
{
auto
inputs
=
details
::
DynamicCast
<
details
::
VarHandle
>
(
all_reduce_op_handle
->
Inputs
());
PADDLE_ENFORCE_EQ
(
inputs
.
size
(),
num_place
);
// The inputs' name should be the same.
auto
&
grad_name
=
inputs
[
0
]
->
name
();
for
(
size_t
i
=
1
;
i
<
inputs
.
size
();
++
i
)
{
PADDLE_ENFORCE_EQ
(
inputs
[
i
]
->
name
(),
grad_name
,
"The input name should be the same."
);
}
PADDLE_ENFORCE_NE
(
grads
.
count
(
grad_name
),
static_cast
<
size_t
>
(
0
));
all_reduce_ops
.
emplace
(
grad_name
,
node
);
}
}
}
VLOG
(
10
)
<<
"Find all_reduce_ops: "
<<
all_reduce_ops
.
size
();
VLOG
(
10
)
<<
"Find all_reduce_ops: "
<<
all_reduce_ops
.
size
();
if
(
all_reduce_ops
.
size
()
==
0
)
{
if
(
all_reduce_ops
.
size
()
==
0
)
{
...
@@ -82,16 +60,16 @@ class FuseAllReduceOpPass : public ir::Pass {
...
@@ -82,16 +60,16 @@ class FuseAllReduceOpPass : public ir::Pass {
"it is not supported currently."
);
"it is not supported currently."
);
VLOG
(
10
)
<<
"Insert fused_all_reduce"
;
VLOG
(
10
)
<<
"Insert fused_all_reduce"
;
auto
&
group_
grads_param
s
=
auto
&
group_
params_grad
s
=
graph
->
Get
<
details
::
Group
GradsAndParams
>
(
details
::
kGroupGradsAndParam
s
);
graph
->
Get
<
details
::
Group
ParamsAndGrads
>
(
details
::
kGroupParamsAndGrad
s
);
for
(
auto
&
group_
g_p
:
group_grads_param
s
)
{
for
(
auto
&
group_
p_g
:
group_params_grad
s
)
{
size_t
group_size
=
group_
g_p
.
size
();
size_t
group_size
=
group_
p_g
.
size
();
PADDLE_ENFORCE_GT
(
group_size
,
static_cast
<
size_t
>
(
0
));
PADDLE_ENFORCE_GT
(
group_size
,
static_cast
<
size_t
>
(
0
));
std
::
vector
<
ir
::
Node
*>
group_all_reduce_ops
;
std
::
vector
<
ir
::
Node
*>
group_all_reduce_ops
;
group_all_reduce_ops
.
reserve
(
group_size
);
group_all_reduce_ops
.
reserve
(
group_size
);
for
(
auto
&
g_p
:
group_g_p
)
{
for
(
auto
&
p_g
:
group_p_g
)
{
group_all_reduce_ops
.
emplace_back
(
all_reduce_ops
.
at
(
g_p
.
first
));
group_all_reduce_ops
.
emplace_back
(
all_reduce_ops
.
at
(
p_g
.
second
));
}
}
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
InsertFusedAllReduce
(
places
,
local_scopes
,
group_size
,
InsertFusedAllReduce
(
places
,
local_scopes
,
group_size
,
...
@@ -103,6 +81,35 @@ class FuseAllReduceOpPass : public ir::Pass {
...
@@ -103,6 +81,35 @@ class FuseAllReduceOpPass : public ir::Pass {
}
}
}
}
// Scans every node of `result` and returns a map from gradient name to the
// graph node of the all-reduce op that consumes that gradient.
//
// Only op nodes whose OpHandleBase wrapper is an AllReduceOpHandle are
// collected. For each of them the following is enforced:
//   * the number of VarHandle inputs equals places.size();
//   * all inputs carry the same name (taken as the gradient name);
//   * that name is contained in `grads`.
//
// NOTE(review): emplace() keeps the first node seen for a given gradient
// name; a second all-reduce op on the same name would be ignored silently.
std::unordered_map<std::string, Node *> GetAllReduceOps(
    const Graph &result, const std::vector<platform::Place> &places,
    const std::unordered_set<std::string> &grads) const {
  const size_t num_place = places.size();

  std::unordered_map<std::string, Node *> all_reduce_ops;
  all_reduce_ops.reserve(grads.size());

  for (auto &op_node : result.Nodes()) {
    // Variable nodes are of no interest here.
    if (!op_node->IsOp()) {
      continue;
    }
    PADDLE_ENFORCE(op_node->IsWrappedBy<details::OpHandleBase>());

    // Skip op handles that are not all-reduce handles.
    auto *handle = dynamic_cast<details::AllReduceOpHandle *>(
        &op_node->Wrapper<details::OpHandleBase>());
    if (handle == nullptr) {
      continue;
    }

    auto inputs = details::DynamicCast<details::VarHandle>(handle->Inputs());
    PADDLE_ENFORCE_EQ(inputs.size(), num_place);

    // Every input of one all-reduce op must refer to the same gradient.
    auto &grad_name = inputs[0]->name();
    for (size_t idx = 1; idx < inputs.size(); ++idx) {
      PADDLE_ENFORCE_EQ(inputs[idx]->name(), grad_name,
                        "The input name should be the same.");
    }

    // The gradient must be one of the gradients the caller knows about.
    PADDLE_ENFORCE_NE(grads.count(grad_name), static_cast<size_t>(0));
    all_reduce_ops.emplace(grad_name, op_node);
  }

  return all_reduce_ops;
}
void
InsertFusedAllReduce
(
const
std
::
vector
<
platform
::
Place
>
&
places
,
void
InsertFusedAllReduce
(
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
size_t
num_of_all_reduce
,
const
size_t
num_of_all_reduce
,
...
...
paddle/fluid/operators/alloc_continuous_space_op.cc
浏览文件 @
74538573
...
@@ -227,8 +227,11 @@ REGISTER_OPERATOR(alloc_continuous_space,
...
@@ -227,8 +227,11 @@ REGISTER_OPERATOR(alloc_continuous_space,
paddle
::
operators
::
AllocContinuousSpaceOp
,
paddle
::
operators
::
AllocContinuousSpaceOp
,
paddle
::
operators
::
AllocContinuousSpaceOpMaker
);
paddle
::
operators
::
AllocContinuousSpaceOpMaker
);
namespace
ops
=
paddle
::
operators
;
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CPU_KERNEL
(
REGISTER_OP_CPU_KERNEL
(
alloc_continuous_space
,
alloc_continuous_space
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CPUDeviceContext
,
plat
::
float16
>
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CPUDeviceContext
,
int
>
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CPUDeviceContext
,
int
>
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CPUDeviceContext
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CPUDeviceContext
,
...
@@ -237,6 +240,8 @@ REGISTER_OP_CPU_KERNEL(
...
@@ -237,6 +240,8 @@ REGISTER_OP_CPU_KERNEL(
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
REGISTER_OP_CUDA_KERNEL
(
REGISTER_OP_CUDA_KERNEL
(
alloc_continuous_space
,
alloc_continuous_space
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
ops
::
AllocContinuousSpaceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
...
...
paddle/fluid/operators/optimizers/sgd_op.cc
浏览文件 @
74538573
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/operators/optimizers/sgd_op.h"
#include "paddle/fluid/operators/optimizers/sgd_op.h"
#include <string>
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -46,6 +46,17 @@ class SGDOp : public framework::OperatorWithKernel {
...
@@ -46,6 +46,17 @@ class SGDOp : public framework::OperatorWithKernel {
auto
data_type
=
framework
::
GetDataTypeOfVar
(
ctx
.
InputVar
(
"Param"
));
auto
data_type
=
framework
::
GetDataTypeOfVar
(
ctx
.
InputVar
(
"Param"
));
return
framework
::
OpKernelType
(
data_type
,
ctx
.
device_context
());
return
framework
::
OpKernelType
(
data_type
,
ctx
.
device_context
());
}
}
// Returns the kernel type to use for the input variable named `var_name`.
//
// The place and layout are always taken from `tensor`. The data type is the
// tensor's own type for "LearningRate" and the expected kernel's data type
// for every other input.
// NOTE(review): presumably this keeps the learning rate in its own precision
// when the parameters use a different one (mixed precision) -- confirm.
framework::OpKernelType GetKernelTypeForVar(
    const std::string &var_name, const framework::Tensor &tensor,
    const framework::OpKernelType &expected_kernel_type) const {
  if (var_name != "LearningRate") {
    return framework::OpKernelType(expected_kernel_type.data_type_,
                                   tensor.place(), tensor.layout());
  }
  return framework::OpKernelType(tensor.type(), tensor.place(),
                                 tensor.layout());
}
};
};
class
SGDOpInferVarType
:
public
framework
::
VarTypeInference
{
class
SGDOpInferVarType
:
public
framework
::
VarTypeInference
{
...
...
paddle/fluid/operators/optimizers/sgd_op.cu
浏览文件 @
74538573
...
@@ -46,7 +46,7 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows,
...
@@ -46,7 +46,7 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows,
// Atomic Operation to avoid concurrent write error.
// Atomic Operation to avoid concurrent write error.
paddle
::
platform
::
CudaAtomicAdd
(
paddle
::
platform
::
CudaAtomicAdd
(
tensor_out_ptr
+
index
,
tensor_out_ptr
+
index
,
-
1.0
*
learning_rate
[
0
]
*
selected_rows_ptr
[
index
]);
-
static_cast
<
T
>
(
1.0
)
*
learning_rate
[
0
]
*
selected_rows_ptr
[
index
]);
}
}
}
}
}
}
...
@@ -122,5 +122,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
...
@@ -122,5 +122,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
}
// namespace paddle
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
sgd
,
ops
::
SGDOpCUDAKernel
<
float
>
,
REGISTER_OP_CUDA_KERNEL
(
sgd
,
ops
::
SGDOpCUDAKernel
<
float
>
,
ops
::
SGDOpCUDAKernel
<
double
>
);
ops
::
SGDOpCUDAKernel
<
double
>
,
ops
::
SGDOpCUDAKernel
<
plat
::
float16
>
);
python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py
0 → 100644
浏览文件 @
74538573
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
paddle.fluid.core
as
core
import
math
import
os
import
sys
import
unittest
import
numpy
as
np
import
paddle
import
paddle.fluid
as
fluid
from
simple_nets
import
init_data
from
parallel_executor_test_base
import
TestParallelExecutorBase
# Number of samples requested from init_data() for each feed.
batch_size = 12
# Shape of one input image; used both for the 'image' data layer and for
# init_data().
img_shape = [1, 28, 28]
def loss_net(hidden, label):
    """Append a softmax classifier to `hidden` and return the mean loss.

    Args:
        hidden: Variable fed into the final fully connected layer.
        label: Variable holding the labels passed to cross_entropy.

    Returns:
        The variable produced by taking the mean of the cross-entropy loss.
    """
    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
    loss = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_loss = fluid.layers.mean(loss)
    return avg_loss
def conv_net(use_feed):
    """Build a two-stage conv/pool network that mixes float16 and float32.

    The image input is declared as float16. The output of the first
    conv-pool stage goes through batch_norm and is then cast to float32, so
    the second stage receives float32 input.

    Args:
        use_feed: Not read by this function.

    Returns:
        The mean loss variable produced by loss_net().
    """
    img = fluid.layers.data(name='image', shape=img_shape, dtype='float16')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    conv_pool_1 = fluid.nets.simple_img_conv_pool(
        input=img,
        filter_size=5,
        num_filters=20,
        pool_size=2,
        pool_stride=2,
        act="relu")
    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
    # Switch precision between the two stages.
    conv_pool_1 = fluid.layers.cast(conv_pool_1, np.float32)

    conv_pool_2 = fluid.nets.simple_img_conv_pool(
        input=conv_pool_1,
        filter_size=5,
        num_filters=50,
        pool_size=2,
        pool_stride=2,
        act="relu")
    hidden = fluid.layers.cast(conv_pool_2, np.float32)

    return loss_net(hidden, label)
def _optimizer(learning_rate=1e-6):
    """Create the SGD optimizer used by the test.

    Args:
        learning_rate: Learning rate forwarded to fluid.optimizer.SGD.

    Returns:
        The constructed SGD optimizer.
    """
    optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
    return optimizer
class TestResnet(TestParallelExecutorBase):
    """Runs conv_net through check_network_convergence with
    fuse_all_reduce_ops enabled."""

    def check_model(self, use_cuda):
        """Feed generated data to conv_net for 10 iterations.

        Args:
            use_cuda: Forwarded to check_network_convergence.
        """
        img, label = init_data(
            batch_size=batch_size, img_shape=img_shape, label_range=9)
        # Convert to float16, then reinterpret the same bytes as uint16.
        # NOTE(review): presumably the feed path expects float16 data as raw
        # uint16 values -- confirm.
        img = np.float16(img).view(np.uint16)
        feed_dict = {"image": img, "label": label}

        TestParallelExecutorBase.check_network_convergence(
            conv_net,
            feed_dict=feed_dict,
            iter=10,
            use_cuda=use_cuda,
            fuse_all_reduce_ops=True,
            optimizer=_optimizer)

    def test_model(self):
        """Run the check only when Paddle was compiled with CUDA."""
        if core.is_compiled_with_cuda():
            self.check_model(True)
# Run the unit tests when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录