机器未来 / Paddle (forked from PaddlePaddle / Paddle)
Commit 6ce49eea
Authored Nov 29, 2019 by WangXi; committed by gongweibao, Nov 29, 2019
Fix dgc accuracy by mv regularization to local, test=release/1.6 (#21390)
Parent: 06545fcf

Showing 8 changed files with 179 additions and 20 deletions (+179 -20)
paddle/fluid/operators/dgc_op.cc                               +14   -0
paddle/fluid/operators/dgc_op.h                                +43  -11
paddle/fluid/operators/optimizers/dgc_momentum_op.cc           +12   -1
paddle/fluid/operators/optimizers/dgc_momentum_op.h            +20   -0
python/paddle/fluid/optimizer.py                               +56   -1
python/paddle/fluid/tests/unittests/test_dgc_momentum_op.py    +10   -3
python/paddle/fluid/tests/unittests/test_dgc_op.py              +9   -0
python/paddle/fluid/tests/unittests/test_dgc_optimizer.py      +15   -4
paddle/fluid/operators/dgc_op.cc
@@ -29,6 +29,9 @@ class DGCOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasInput("V"),
                    "Input(V) of DGCop should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Grad"),
                    "Input(Grad) of DGCop should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
+                      platform::errors::NotFound(
+                          "Input(Param) of DGCop is not found."));
     PADDLE_ENFORCE(ctx->HasInput("current_step"),
                    "Input(current_step) of DGCop should not be null.");
     PADDLE_ENFORCE_EQ(ctx->HasInput("nranks"), true,
@@ -66,6 +69,7 @@ class DGCOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("U", "(Tensor) U velocity tensor of DGC");
     AddInput("V", "(Tensor) V velocity tensor of DGC");
     AddInput("Grad", "(Tensor) Input gradient");
+    AddInput("Param", "(Tensor) Input parameter");
     AddInput("current_step", "(Tensor) Current step.");
     AddInput("nranks", "(Tensor) nranks.");
@@ -99,6 +103,16 @@ class DGCOpMaker : public framework::OpProtoAndCheckerMaker {
              "(float, 0.0)"
              "The period when begin k_select.");
 
+    AddAttr<float>("regular_coeff",
+                   "(float, 0.0)"
+                   "The coeff of regularization, weight decay parameter")
+        .SetDefault(0.0);
+
+    AddAttr<int>("regular_type",
+                 "(int, 0)"
+                 "The type of regularization, {0:None, 1:L1Decay, 2:L2Decay")
+        .SetDefault(0);
+
     AddComment(R"DOC(
     Original paper is https://arxiv.org/abs/1712.01887
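The dgc op now also takes the parameter tensor plus two regularization attributes. For orientation, a short illustrative summary of what the new attributes mean (not framework code; the real encoding is _get_dgc_regularization_param in python/paddle/fluid/optimizer.py further down this diff):

# Illustrative summary of the new dgc op attributes.
# regular_coeff is the weight-decay coefficient; regular_type selects:
REGULAR_TYPES = {
    0: 'none',     # grad_out = nranks * grad
    1: 'L1Decay',  # grad_out = nranks * grad + regular_coeff * sign(param)
    2: 'L2Decay',  # grad_out = nranks * grad + regular_coeff * param
}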
paddle/fluid/operators/dgc_op.h
@@ -43,6 +43,8 @@ class DGCOpKernel : public framework::OpKernel<T> {
     auto v = ctx.Input<framework::Tensor>("V");
     auto g = ctx.Input<framework::Tensor>("Grad");
+    auto grad_out = ctx.Output<framework::Tensor>("Grad_out");
+
     // attrs
     float m = ctx.Attr<float>("m");
     bool use_nesterov = ctx.Attr<bool>("use_nesterov");
@@ -55,6 +57,39 @@ class DGCOpKernel : public framework::OpKernel<T> {
     const int nranks = static_cast<const int>(*nranks_tensor->data<float>());
     PADDLE_ENFORCE_GT(nranks, 1, "DGC is not useful when num_trainers <= 1");
 
+    // regularization
+    auto p = ctx.Input<framework::Tensor>("Param");
+    float regular_coeff = ctx.Attr<float>("regular_coeff");
+    int regular_type = ctx.Attr<int>("regular_type");
+
+    auto p_e = framework::EigenVector<T>::Flatten(*p);
+    auto g_e = framework::EigenVector<T>::Flatten(*g);
+    auto grad_out_e = framework::EigenVector<T>::Flatten(*grad_out);
+
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    auto& eigen_ctx = *dev_ctx.eigen_device();
+
+    // NOTE. In paddle, loss has divided by nranks. Because dgc_op is before
+    // allreduce, so local regular_coeff need div nranks too. But now we
+    // multi grad with nranks in dgc_op, in that case regular_coeff don't
+    // need to /nranks, can prevent precision loss. For coeff often equal
+    // with 1e-4, if nranks=32, coeff/nranks will be 3.125e-6, the numerical
+    // accuracy of coeff/nranks will be too low.
+    PADDLE_ENFORCE_EQ(regular_type >= 0 && regular_type <= 2, true,
+                      platform::errors::InvalidArgument(
+                          "DGC only support one of None|L1Decay|L2Decay "
+                          "Regularization for now."));
+    if (regular_type == 0) {
+      grad_out_e.device(eigen_ctx) = (1.0 * nranks) * g_e;
+    } else if (regular_type == 1) {
+      // L1Decay. grad = grad + coeff * sign(param)
+      grad_out_e.device(eigen_ctx) =
+          (1.0 * nranks) * g_e + regular_coeff * p_e.sign();
+    } else if (regular_type == 2) {
+      // L2Decay. grad = grad + coeff * param
+      grad_out_e.device(eigen_ctx) = (1.0 * nranks) * g_e + regular_coeff * p_e;
+    }
+
     // current step
     auto current_step_tensor = ctx.Input<framework::Tensor>("current_step");
     const float* current_step = current_step_tensor->data<float>();
@@ -91,19 +126,17 @@ class DGCOpKernel : public framework::OpKernel<T> {
     // FIXME(gongwb): use cublas.
     auto u_out_e = framework::EigenVector<T>::Flatten(*u_out);
     auto u_e = framework::EigenVector<T>::Flatten(*u);
-    auto g_e = framework::EigenVector<T>::Flatten(*g);
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto& eigen_ctx = *dev_ctx.eigen_device();
 
-    if (static_cast<int>(*current_step) ==
-        static_cast<int>(rampup_begin_step)) {
-      // calc local momentum from global momentum
-      u_out_e.device(eigen_ctx) = (1.0 / nranks) * u_e;
-    }
+    // calc local momentum from global momentum
+    // NOTE. If grad not multi nranks, need add below code.
+    // if (static_cast<int>(*current_step) ==
+    //     static_cast<int>(rampup_begin_step)) {
+    //   u_out_e.device(eigen_ctx) = (1.0 / nranks) * u_e;
+    // }
 
     if (use_nesterov) {
       // u = m * (u + g)
-      u_out_e.device(eigen_ctx) = m * (u_e + g_e);
+      u_out_e.device(eigen_ctx) = m * (u_e + grad_out_e);
       // v = u + v + g
       ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(
@@ -113,7 +146,7 @@ class DGCOpKernel : public framework::OpKernel<T> {
           ctx, g, v, 0, AddFunctor<T>(), v_out);
     } else {
       // u = m * u + g
-      u_out_e.device(eigen_ctx) = m * u_e + g_e;
+      u_out_e.device(eigen_ctx) = m * u_e + grad_out_e;
       // v = u + v
       ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(
@@ -138,7 +171,6 @@ class DGCOpKernel : public framework::OpKernel<T> {
       LOG(FATAL) << "v_out numel:" << v_out->numel();
     }
 
-    auto grad_out = ctx.Output<framework::Tensor>("Grad_out");
     math::SetConstant<DeviceContext, T> tset;
     tset(dev_ctx, grad_out, static_cast<T>(0));
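In short, the kernel now builds a locally regularized gradient (scaled up by nranks) and feeds it into the momentum accumulators before top-k selection. Below is a minimal NumPy sketch of that per-worker update, assuming flattened float32 arrays and omitting the ramp-up/top-k/communication steps; names are chosen for illustration and this is not the Paddle kernel itself.

import numpy as np

def dgc_local_update(u, v, grad, param, m, nranks,
                     regular_coeff=0.0, regular_type=0, use_nesterov=False):
    # regular_type: 0 = none, 1 = L1Decay, 2 = L2Decay (matches the op attrs).
    grad_out = nranks * grad                        # loss was divided by nranks
    if regular_type == 1:
        grad_out = grad_out + regular_coeff * np.sign(param)
    elif regular_type == 2:
        grad_out = grad_out + regular_coeff * param

    if use_nesterov:
        u_out = m * (u + grad_out)                  # u = m * (u + g)
        v_out = u_out + v + grad_out                # v = u + v + g (sketch uses the regularized grad)
    else:
        u_out = m * u + grad_out                    # u = m * u + g
        v_out = u_out + v                           # v = u + v
    return u_out, v_out, grad_out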
paddle/fluid/operators/optimizers/dgc_momentum_op.cc
@@ -27,13 +27,20 @@ class DGCMomentumOp : public MomentumOp {
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE_EQ(ctx->HasInput("current_step"), true,
                       "current_step should be set.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("nranks"), true,
+                      platform::errors::NotFound(
+                          "Input(nranks) of DGCMomentumOp is not found."));
+
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("Grad_out"), true,
+                      platform::errors::NotFound(
+                          "Output(Grad_out) of DGCMomentumOp is not found."));
     return MomentumOp::InferShape(ctx);
   }
 
   framework::OpKernelType GetKernelTypeForVar(
       const std::string& var_name, const framework::Tensor& tensor,
       const framework::OpKernelType& expected_kernel_type) const override {
-    if (var_name == "current_step") {
+    if (var_name == "current_step" || var_name == "nranks") {
       VLOG(10) << "var_name:" << var_name << " need not to transform";
       return expected_kernel_type;
     }
@@ -47,6 +54,10 @@ class DGCMomentumOpMaker : public MomentumOpMaker {
  public:
  void Make() override {
     AddInput("current_step", "(Tensor) Current step.");
+    AddInput("nranks", "(Tensor) The number of trainers.");
+
+    AddOutput("Grad_out", "(Tensor) Output grad gradient");
+
     AddAttr<float>("rampup_begin_step",
                    "(float, -1.0)"
                    "The period when begin DGC.")
paddle/fluid/operators/optimizers/dgc_momentum_op.h
@@ -38,6 +38,26 @@ class DGCMomentumKernel : public framework::OpKernel<T> {
     auto current_step_tensor = context.Input<framework::Tensor>("current_step");
     auto* current_step = current_step_tensor->data<T>();
 
+    // nranks
+    auto nranks_tensor = context.Input<framework::Tensor>("nranks");
+    const int nranks = static_cast<const int>(*nranks_tensor->data<float>());
+    PADDLE_ENFORCE_GT(
+        nranks, 1,
+        platform::errors::InvalidArgument(
+            "DGC is not useful when num_trainers <= 1, but now nranks=%d",
+            nranks));
+
+    const framework::Tensor* g = context.Input<framework::Tensor>("Grad");
+    framework::Tensor* g_out = context.Output<framework::Tensor>("Grad_out");
+    auto g_e = framework::EigenVector<T>::Flatten(*g);
+    auto g_out_e = framework::EigenVector<T>::Flatten(*g_out);
+
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    auto& eigen_ctx = *dev_ctx.eigen_device();
+
+    // NOTE. In dgc_op we multi grad with nranks, so we need /nranks here.
+    g_out_e.device(eigen_ctx) = (1.0 / nranks) * g_e;
+
     VLOG(10) << "current_step:" << *current_step
              << ", rampup_begin_step:" << rampup_begin_step;
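The dgc_momentum kernel therefore divides the incoming gradient back by nranks before the momentum/SGD update, undoing the scaling applied in dgc_op. The ordering is a numeric choice: with regular_coeff = 1e-4 and nranks = 32, dividing the coefficient locally would leave 3.125e-6, which the NOTE in dgc_op.h flags as too small to carry accurately. A small illustrative check (not framework code):

import numpy as np

coeff, nranks = np.float32(1e-4), 32
param = np.random.random(1000).astype(np.float32)
grad = np.random.random(1000).astype(np.float32)   # already divided by nranks

# Option A: divide the coefficient locally (the value the NOTE warns about).
a = grad + (coeff / nranks) * param

# Option B (what dgc_op + dgc_momentum do): scale the gradient up by nranks,
# add coeff * param at full magnitude, then divide back by nranks here.
b = (nranks * grad + coeff * param) / nranks

print(coeff / nranks)            # 3.125e-06
print(np.abs(a - b).max())       # the two orderings agree up to rounding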
python/paddle/fluid/optimizer.py
@@ -966,6 +966,22 @@ class DGCMomentumOptimizer(Optimizer):
             self._clip_norm = local_grad_clip_norm / (num_trainers *
                                                       num_trainers)
 
+        self._get_dgc_regularization_param()
+
+    def _get_dgc_regularization_param(self):
+        self.regular_coeff = 0.0
+        self.regular_type = 0
+
+        if self.regularization is not None:
+            self.regular_coeff = self.regularization._regularization_coeff
+            from .regularizer import L1Decay, L2Decay
+            if isinstance(self.regularization, L1Decay):
+                self.regular_type = 1
+            elif isinstance(self.regularization, L2Decay):
+                self.regular_type = 2
+            else:
+                assert False, 'regularization must be None|L1Decay|L2Deacy'
+
     def _is_use_dgc(self, param_var, grad_var):
         var_numel = abs(reduce(lambda x, y: x * y, param_var.shape))
         if var_numel < 16384 or \
@@ -997,7 +1013,11 @@ class DGCMomentumOptimizer(Optimizer):
             type = "momentum"
         else:
             type = "dgc_momentum"
-            inputs.update({"current_step": self._global_step_var})
+            inputs.update({
+                "current_step": self._global_step_var,
+                "nranks": self._nranks_var
+            })
+            outputs.update({'Grad_out': param_and_grad[1]})
             attrs.update({"rampup_begin_step": float(self._rampup_begin_step)})
 
         # create the dgc momentum optimize op
@@ -1160,12 +1180,14 @@ class DGCMomentumOptimizer(Optimizer):
                        encoded_var, gather_var):
         block = framework.default_main_program().global_block()
         op_maker = core.op_proto_and_checker_maker
         dgc_op = block.append_op(
             type="dgc",
             inputs={
                 "U": u_var,
                 "V": v_var,
                 "Grad": clip_var,
+                "Param": param_var,
                 "current_step": self._global_step_var,
                 "nranks": self._nranks_var,
             },
@@ -1183,6 +1205,8 @@ class DGCMomentumOptimizer(Optimizer):
                 "use_nesterov": self._use_nesterov,
                 "rampup_begin_step": float(self._rampup_begin_step),
                 "rampup_step": float(self._rampup_step),
+                "regular_coeff": float(self.regular_coeff),
+                "regular_type": int(self.regular_type),
             },
             stop_gradient=True)
@@ -1191,6 +1215,37 @@ class DGCMomentumOptimizer(Optimizer):
         dgc_op._set_attr(op_maker.kOpRoleVarAttrName(),
                          [param_var.name, grad_var.name])
 
+    def apply_gradients(self, params_grads):
+        params_grads = sorted(params_grads, key=lambda x: x[0].name)
+        params_grads, table_param_and_grad, table_optimize_op = \
+            self._process_distribute_lookuptable(params_grads)
+
+        not_dgc_params_grads = []
+        dgc_params_grads = []
+        for param, grad in params_grads:
+            if not self._is_use_dgc(param, grad):
+                not_dgc_params_grads.append((param, grad))
+            else:
+                dgc_params_grads.append((param, grad))
+
+        # DGC clip and regularization in local
+        not_dgc_params_grads = append_gradient_clip_ops(not_dgc_params_grads)
+
+        # Add regularization if any
+        not_dgc_params_grads = append_regularization_ops(not_dgc_params_grads,
+                                                         self.regularization)
+
+        params_grads = not_dgc_params_grads + dgc_params_grads
+        params_grads = sorted(params_grads, key=lambda x: x[0].name)
+
+        optimize_ops = self._create_optimization_pass(params_grads)
+        if table_optimize_op is not None:
+            optimize_ops.append(table_optimize_op)
+            params_grads.append(table_param_and_grad)
+
+        return optimize_ops
+
 
 class LarsMomentumOptimizer(Optimizer):
     """
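From the user's side nothing changes: the regularizer is still passed to the optimizer, and after this patch the DGC path pushes it into the dgc op (applied locally, before allreduce) while non-DGC parameters keep the ordinary append_regularization_ops path in apply_gradients. A hedged construction sketch follows; the argument values mirror test_dgc_optimizer.py below, and the minimize() call is assumed to run under a multi-trainer launch.

import paddle.fluid as fluid
import paddle.fluid.regularizer as regularizer

optimizer = fluid.optimizer.DGCMomentumOptimizer(
    learning_rate=0.01,
    momentum=0.2,
    rampup_begin_step=0,
    regularization=regularizer.L2Decay(1e-4))  # now applied inside the dgc op
# optimizer.minimize(loss) is then called as usual; parameters with fewer than
# 16384 elements (see _is_use_dgc) keep the plain momentum path, so their
# clipping and regularization are appended by apply_gradients as before.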
python/paddle/fluid/tests/unittests/test_dgc_momentum_op.py
@@ -34,16 +34,19 @@ class TestDGCMomentumOp1(unittest.TestCase):
         self.op_type = "dgc_momentum"
         self.dtype = np.float32
 
+        nranks_val = 2
         param = np.random.random((123, 321)).astype(self.dtype)
         grad = np.random.random((123, 321)).astype(self.dtype)
         velocity = np.zeros((123, 321)).astype(self.dtype)
         learning_rate = np.array([0.001]).astype(self.dtype)
         current_step = np.full((1), step).astype("float32")
+        nranks = np.full((1), nranks_val).astype("float32")
 
         mu = 0.0001
         use_nesterov = False
         rampup_begin_step = 10.0
 
+        # get tensor
         self.param_name, self.param_tensor = self.get_tensor('Param', param)
         self.grad_name, self.grad_tensor = self.get_tensor('Grad', grad)
         self.velocity_name, self.velocity_tensor = self.get_tensor('Velocity',
@@ -52,6 +55,8 @@ class TestDGCMomentumOp1(unittest.TestCase):
             'LearningRate', learning_rate)
         self.current_step_name, self.current_step_tensor = self.get_tensor(
             'current_step', current_step, core.CPUPlace())
+        self.nranks_name, self.nranks_tensor = self.get_tensor(
+            'nranks', nranks, core.CPUPlace())
 
         self.kwargs = {
             # inputs
@@ -60,6 +65,7 @@ class TestDGCMomentumOp1(unittest.TestCase):
             'Velocity': self.velocity_name,
             'LearningRate': self.learning_rate_name,
             'current_step': self.current_step_name,
+            'nranks': self.nranks_name,
 
             # attrs
             'mu': mu,
@@ -68,17 +74,18 @@ class TestDGCMomentumOp1(unittest.TestCase):
             # outputs
             'ParamOut': self.param_name,
-            'VelocityOut': self.velocity_name
+            'VelocityOut': self.velocity_name,
+            'Grad_out': self.grad_name,
         }
 
-        velocity_out = mu * velocity + grad
+        velocity_out = mu * velocity + grad / nranks
         if use_nesterov:
             param_out = param - grad * learning_rate - \
                 velocity_out * mu * learning_rate
         else:
             param_out = param - learning_rate * velocity_out
 
-        sgd_out = param - learning_rate * grad
+        sgd_out = param - learning_rate * grad / nranks
 
         self.outputs = {
             'ParamOut': param_out,
python/paddle/fluid/tests/unittests/test_dgc_op.py
@@ -44,6 +44,9 @@ class TestDGCOp(unittest.TestCase):
         self.grad_name = "Grad"
         self.grad = np.random.random(size).astype("float32")
 
+        self.param_name = "Param"
+        self.param = np.random.random(size).astype("float32")
+
         self.current_step_name = "current_step"
         self.current_step = np.full((1), 0.0).astype("float32")
@@ -66,6 +69,9 @@ class TestDGCOp(unittest.TestCase):
         self.grad_tensor = self.scope.var(self.grad_name).get_tensor()
         self.grad_tensor.set(self.grad, place)
 
+        self.param_tensor = self.scope.var(self.param_name).get_tensor()
+        self.param_tensor.set(self.param, place)
+
         self.current_step_tensor = self.scope.var(
             self.current_step_name).get_tensor()
         self.current_step_tensor.set(self.current_step, core.CPUPlace())
@@ -96,6 +102,7 @@ class TestDGCOp(unittest.TestCase):
             'U': self.u_name,
             'V': self.v_name,
             'Grad': self.grad_name,
+            'Param': self.param_name,
             'current_step': self.current_step_name,
             'nranks': self.nranks_name,
@@ -113,6 +120,8 @@ class TestDGCOp(unittest.TestCase):
             'use_nesterov': True,
             'rampup_begin_step': float(0.0),
             'rampup_step': float(10.0),
+            'regular_coeff': float(1e-4),
+            'regular_type': int(2),
         }
 
         dgc_op = Operator('dgc', **kwargs)
python/paddle/fluid/tests/unittests/test_dgc_optimizer.py
@@ -18,6 +18,7 @@ import unittest
 import paddle.fluid.framework as framework
 import paddle.fluid.optimizer as optimizer
+import paddle.fluid.regularizer as regularizer
 import paddle.compat as cpt
 from paddle.fluid.backward import append_backward
 from paddle.fluid.transpiler.details import program_to_code
@@ -31,7 +32,10 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
     def get_velocity_str(self):
         return self._u_velocity_acc_str
 
-    def check_dgc_momentum_optimizer(self, dims=[5, 10, 8], name="momentum"):
+    def check_dgc_momentum_optimizer(self,
+                                     dims=[5, 10, 8],
+                                     name="momentum",
+                                     regularization=None):
         init_program = framework.Program()
         program = framework.Program()
         block = program.global_block()
@@ -58,8 +62,12 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
         learning_rate = 0.01
         dgc_momentum_optimizer = self.MockDGCMomentum(
-            learning_rate=learning_rate, momentum=0.2, rampup_begin_step=0)
+            learning_rate=learning_rate,
+            momentum=0.2,
+            rampup_begin_step=0,
+            regularization=regularization)
         mean_out = block.create_var(
             dtype="float32", shape=[1], lod_level=0, name="mean.out")
         block.append_op(
@@ -96,12 +104,15 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
             program_to_code(program, fout=f)
 
     def test_momentum_without_dgc(self):
-        self.check_dgc_momentum_optimizer()
+        self.check_dgc_momentum_optimizer(
+            regularization=regularizer.L1Decay(1e-4))
 
     def test_momentum_with_dgc(self):
         # 16 * 1024 = 16384, use dgc momentum
         self.check_dgc_momentum_optimizer(
-            dims=[16, 1024, 8], name="dgc_momentum")
+            dims=[16, 1024, 8],
+            name="dgc_momentum",
+            regularization=regularizer.L2Decay(1e-4))
 
 
 if __name__ == '__main__':