Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
8ac7687e
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
8ac7687e
编写于
11月 25, 2019
作者:
W
WangXi
提交者:
gongweibao
11月 25, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix dgc accuracy by mv regularization to local (#21278)
上级
b9f8ae84
变更
8
隐藏空白更改
内联
并排
Showing
8 changed file
with
179 addition
and
20 deletion
+179
-20
paddle/fluid/operators/dgc_op.cc
paddle/fluid/operators/dgc_op.cc
+14
-0
paddle/fluid/operators/dgc_op.h
paddle/fluid/operators/dgc_op.h
+43
-11
paddle/fluid/operators/optimizers/dgc_momentum_op.cc
paddle/fluid/operators/optimizers/dgc_momentum_op.cc
+12
-1
paddle/fluid/operators/optimizers/dgc_momentum_op.h
paddle/fluid/operators/optimizers/dgc_momentum_op.h
+20
-0
python/paddle/fluid/optimizer.py
python/paddle/fluid/optimizer.py
+56
-1
python/paddle/fluid/tests/unittests/test_dgc_momentum_op.py
python/paddle/fluid/tests/unittests/test_dgc_momentum_op.py
+10
-3
python/paddle/fluid/tests/unittests/test_dgc_op.py
python/paddle/fluid/tests/unittests/test_dgc_op.py
+9
-0
python/paddle/fluid/tests/unittests/test_dgc_optimizer.py
python/paddle/fluid/tests/unittests/test_dgc_optimizer.py
+15
-4
未找到文件。
paddle/fluid/operators/dgc_op.cc
浏览文件 @
8ac7687e
...
...
@@ -29,6 +29,9 @@ class DGCOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"V"
),
"Input(V) of DGCop should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Grad"
),
"Input(Grad) of DGCop should not be null."
);
PADDLE_ENFORCE_EQ
(
ctx
->
HasInput
(
"Param"
),
true
,
platform
::
errors
::
NotFound
(
"Input(Param) of DGCop is not found."
));
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"current_step"
),
"Input(current_step) of DGCop should not be null."
);
PADDLE_ENFORCE_EQ
(
ctx
->
HasInput
(
"nranks"
),
true
,
...
...
@@ -66,6 +69,7 @@ class DGCOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput
(
"U"
,
"(Tensor) U velocity tensor of DGC"
);
AddInput
(
"V"
,
"(Tensor) V velocity tensor of DGC"
);
AddInput
(
"Grad"
,
"(Tensor) Input gradient"
);
AddInput
(
"Param"
,
"(Tensor) Input parameter"
);
AddInput
(
"current_step"
,
"(Tensor) Current step."
);
AddInput
(
"nranks"
,
"(Tensor) nranks."
);
...
...
@@ -99,6 +103,16 @@ class DGCOpMaker : public framework::OpProtoAndCheckerMaker {
"(float, 0.0)"
"The period when begin k_select."
);
AddAttr
<
float
>
(
"regular_coeff"
,
"(float, 0.0)"
"The coeff of regularization, weight decay parameter"
)
.
SetDefault
(
0.0
);
AddAttr
<
int
>
(
"regular_type"
,
"(int, 0)"
"The type of regularization, {0:None, 1:L1Decay, 2:L2Decay"
)
.
SetDefault
(
0
);
AddComment
(
R"DOC(
Original paper is https://arxiv.org/abs/1712.01887
...
...
paddle/fluid/operators/dgc_op.h
浏览文件 @
8ac7687e
...
...
@@ -43,6 +43,8 @@ class DGCOpKernel : public framework::OpKernel<T> {
auto
v
=
ctx
.
Input
<
framework
::
Tensor
>
(
"V"
);
auto
g
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Grad"
);
auto
grad_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"Grad_out"
);
// attrs
float
m
=
ctx
.
Attr
<
float
>
(
"m"
);
bool
use_nesterov
=
ctx
.
Attr
<
bool
>
(
"use_nesterov"
);
...
...
@@ -55,6 +57,39 @@ class DGCOpKernel : public framework::OpKernel<T> {
const
int
nranks
=
static_cast
<
const
int
>
(
*
nranks_tensor
->
data
<
float
>
());
PADDLE_ENFORCE_GT
(
nranks
,
1
,
"DGC is not useful when num_trainers <= 1"
);
// regularization
auto
p
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Param"
);
float
regular_coeff
=
ctx
.
Attr
<
float
>
(
"regular_coeff"
);
int
regular_type
=
ctx
.
Attr
<
int
>
(
"regular_type"
);
auto
p_e
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
p
);
auto
g_e
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
g
);
auto
grad_out_e
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
grad_out
);
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
auto
&
eigen_ctx
=
*
dev_ctx
.
eigen_device
();
// NOTE. In paddle, loss has divided by nranks. Because dgc_op is before
// allreduce, so local regular_coeff need div nranks too. But now we
// multi grad with nranks in dgc_op, in that case regular_coeff don't
// need to /nranks, can prevent precision loss. For coeff often equal
// with 1e-4, if nranks=32, coeff/nranks will be 3.125e-6, the numerical
// accuracy of coeff/nranks will be too low.
PADDLE_ENFORCE_EQ
(
regular_type
>=
0
&&
regular_type
<=
2
,
true
,
platform
::
errors
::
InvalidArgument
(
"DGC only support one of None|L1Decay|L2Decay "
"Regularization for now."
));
if
(
regular_type
==
0
)
{
grad_out_e
.
device
(
eigen_ctx
)
=
(
1.0
*
nranks
)
*
g_e
;
}
else
if
(
regular_type
==
1
)
{
// L1Decay. grad = grad + coeff * sign(param)
grad_out_e
.
device
(
eigen_ctx
)
=
(
1.0
*
nranks
)
*
g_e
+
regular_coeff
*
p_e
.
sign
();
}
else
if
(
regular_type
==
2
)
{
// L2Decay. grad = grad + coeff * param
grad_out_e
.
device
(
eigen_ctx
)
=
(
1.0
*
nranks
)
*
g_e
+
regular_coeff
*
p_e
;
}
// current step
auto
current_step_tensor
=
ctx
.
Input
<
framework
::
Tensor
>
(
"current_step"
);
const
float
*
current_step
=
current_step_tensor
->
data
<
float
>
();
...
...
@@ -91,19 +126,17 @@ class DGCOpKernel : public framework::OpKernel<T> {
// FIXME(gongwb): use cublas.
auto
u_out_e
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
u_out
);
auto
u_e
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
u
);
auto
g_e
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
g
);
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
auto
&
eigen_ctx
=
*
dev_ctx
.
eigen_device
();
if
(
static_cast
<
int
>
(
*
current_step
)
==
static_cast
<
int
>
(
rampup_begin_step
))
{
// calc local momentum from global momentum
u_out_e
.
device
(
eigen_ctx
)
=
(
1.0
/
nranks
)
*
u_e
;
}
// calc local momentum from global momentum
// NOTE. If grad not multi nranks, need add below code.
// if (static_cast<int>(*current_step) ==
// static_cast<int>(rampup_begin_step)) {
// u_out_e.device(eigen_ctx) = (1.0 / nranks) * u_e;
// }
if
(
use_nesterov
)
{
// u = m * (u + g)
u_out_e
.
device
(
eigen_ctx
)
=
m
*
(
u_e
+
g_e
);
u_out_e
.
device
(
eigen_ctx
)
=
m
*
(
u_e
+
g
rad_out
_e
);
// v = u + v + g
ElementwiseComputeEx
<
AddFunctor
<
T
>
,
DeviceContext
,
T
>
(
...
...
@@ -113,7 +146,7 @@ class DGCOpKernel : public framework::OpKernel<T> {
ctx
,
g
,
v
,
0
,
AddFunctor
<
T
>
(),
v_out
);
}
else
{
// u = m * u + g
u_out_e
.
device
(
eigen_ctx
)
=
m
*
u_e
+
g_e
;
u_out_e
.
device
(
eigen_ctx
)
=
m
*
u_e
+
g
rad_out
_e
;
// v = u + v
ElementwiseComputeEx
<
AddFunctor
<
T
>
,
DeviceContext
,
T
>
(
...
...
@@ -138,7 +171,6 @@ class DGCOpKernel : public framework::OpKernel<T> {
LOG
(
FATAL
)
<<
"v_out numel:"
<<
v_out
->
numel
();
}
auto
grad_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"Grad_out"
);
math
::
SetConstant
<
DeviceContext
,
T
>
tset
;
tset
(
dev_ctx
,
grad_out
,
static_cast
<
T
>
(
0
));
}
...
...
paddle/fluid/operators/optimizers/dgc_momentum_op.cc
浏览文件 @
8ac7687e
...
...
@@ -27,13 +27,20 @@ class DGCMomentumOp : public MomentumOp {
// Checks that the DGC-specific variables are wired into the op, then
// delegates to MomentumOp::InferShape for the remaining checks.
//
// ctx: shape-inference context of the op being validated.
// Each check raises through PADDLE_ENFORCE_EQ when the named variable is
// absent.
void InferShape(framework::InferShapeContext *ctx) const override {
  // All three checks use the typed NotFound error so a missing variable is
  // reported the same way regardless of which one it is.
  PADDLE_ENFORCE_EQ(ctx->HasInput("current_step"), true,
                    platform::errors::NotFound(
                        "Input(current_step) of DGCMomentumOp is not found."));
  PADDLE_ENFORCE_EQ(ctx->HasInput("nranks"), true,
                    platform::errors::NotFound(
                        "Input(nranks) of DGCMomentumOp is not found."));
  PADDLE_ENFORCE_EQ(ctx->HasOutput("Grad_out"), true,
                    platform::errors::NotFound(
                        "Output(Grad_out) of DGCMomentumOp is not found."));

  return MomentumOp::InferShape(ctx);
}
framework
::
OpKernelType
GetKernelTypeForVar
(
const
std
::
string
&
var_name
,
const
framework
::
Tensor
&
tensor
,
const
framework
::
OpKernelType
&
expected_kernel_type
)
const
override
{
if
(
var_name
==
"current_step"
)
{
if
(
var_name
==
"current_step"
||
var_name
==
"nranks"
)
{
VLOG
(
10
)
<<
"var_name:"
<<
var_name
<<
" need not to transform"
;
return
expected_kernel_type
;
}
...
...
@@ -47,6 +54,10 @@ class DGCMomentumOpMaker : public MomentumOpMaker {
public:
void
Make
()
override
{
AddInput
(
"current_step"
,
"(Tensor) Current step."
);
AddInput
(
"nranks"
,
"(Tensor) The number of trainers."
);
AddOutput
(
"Grad_out"
,
"(Tensor) Output grad gradient"
);
AddAttr
<
float
>
(
"rampup_begin_step"
,
"(float, -1.0)"
"The period when begin DGC."
)
...
...
paddle/fluid/operators/optimizers/dgc_momentum_op.h
浏览文件 @
8ac7687e
...
...
@@ -38,6 +38,26 @@ class DGCMomentumKernel : public framework::OpKernel<T> {
auto
current_step_tensor
=
context
.
Input
<
framework
::
Tensor
>
(
"current_step"
);
auto
*
current_step
=
current_step_tensor
->
data
<
T
>
();
// nranks
auto
nranks_tensor
=
context
.
Input
<
framework
::
Tensor
>
(
"nranks"
);
const
int
nranks
=
static_cast
<
const
int
>
(
*
nranks_tensor
->
data
<
float
>
());
PADDLE_ENFORCE_GT
(
nranks
,
1
,
platform
::
errors
::
InvalidArgument
(
"DGC is not useful when num_trainers <= 1, but now nranks=%d"
,
nranks
));
const
framework
::
Tensor
*
g
=
context
.
Input
<
framework
::
Tensor
>
(
"Grad"
);
framework
::
Tensor
*
g_out
=
context
.
Output
<
framework
::
Tensor
>
(
"Grad_out"
);
auto
g_e
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
g
);
auto
g_out_e
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
g_out
);
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
auto
&
eigen_ctx
=
*
dev_ctx
.
eigen_device
();
// NOTE. In dgc_op we multi grad with nranks, so we need /nranks here.
g_out_e
.
device
(
eigen_ctx
)
=
(
1.0
/
nranks
)
*
g_e
;
VLOG
(
10
)
<<
"current_step:"
<<
*
current_step
<<
", rampup_begin_step:"
<<
rampup_begin_step
;
...
...
python/paddle/fluid/optimizer.py
浏览文件 @
8ac7687e
...
...
@@ -966,6 +966,22 @@ class DGCMomentumOptimizer(Optimizer):
self
.
_clip_norm
=
local_grad_clip_norm
/
(
num_trainers
*
num_trainers
)
self
.
_get_dgc_regularization_param
()
def
_get_dgc_regularization_param
(
self
):
self
.
regular_coeff
=
0.0
self
.
regular_type
=
0
if
self
.
regularization
is
not
None
:
self
.
regular_coeff
=
self
.
regularization
.
_regularization_coeff
from
.regularizer
import
L1Decay
,
L2Decay
if
isinstance
(
self
.
regularization
,
L1Decay
):
self
.
regular_type
=
1
elif
isinstance
(
self
.
regularization
,
L2Decay
):
self
.
regular_type
=
2
else
:
assert
False
,
'regularization must be None|L1Decay|L2Deacy'
def
_is_use_dgc
(
self
,
param_var
,
grad_var
):
var_numel
=
abs
(
reduce
(
lambda
x
,
y
:
x
*
y
,
param_var
.
shape
))
if
var_numel
<
16384
or
\
...
...
@@ -997,7 +1013,11 @@ class DGCMomentumOptimizer(Optimizer):
type
=
"momentum"
else
:
type
=
"dgc_momentum"
inputs
.
update
({
"current_step"
:
self
.
_global_step_var
})
inputs
.
update
({
"current_step"
:
self
.
_global_step_var
,
"nranks"
:
self
.
_nranks_var
})
outputs
.
update
({
'Grad_out'
:
param_and_grad
[
1
]})
attrs
.
update
({
"rampup_begin_step"
:
float
(
self
.
_rampup_begin_step
)})
# create the dgc momentum optimize op
...
...
@@ -1160,12 +1180,14 @@ class DGCMomentumOptimizer(Optimizer):
encoded_var
,
gather_var
):
block
=
framework
.
default_main_program
().
global_block
()
op_maker
=
core
.
op_proto_and_checker_maker
dgc_op
=
block
.
append_op
(
type
=
"dgc"
,
inputs
=
{
"U"
:
u_var
,
"V"
:
v_var
,
"Grad"
:
clip_var
,
"Param"
:
param_var
,
"current_step"
:
self
.
_global_step_var
,
"nranks"
:
self
.
_nranks_var
,
},
...
...
@@ -1183,6 +1205,8 @@ class DGCMomentumOptimizer(Optimizer):
"use_nesterov"
:
self
.
_use_nesterov
,
"rampup_begin_step"
:
float
(
self
.
_rampup_begin_step
),
"rampup_step"
:
float
(
self
.
_rampup_step
),
"regular_coeff"
:
float
(
self
.
regular_coeff
),
"regular_type"
:
int
(
self
.
regular_type
),
},
stop_gradient
=
True
)
...
...
@@ -1191,6 +1215,37 @@ class DGCMomentumOptimizer(Optimizer):
dgc_op
.
_set_attr
(
op_maker
.
kOpRoleVarAttrName
(),
[
param_var
.
name
,
grad_var
.
name
])
def apply_gradients(self, params_grads):
    """Append clip, regularization and optimization ops for ``params_grads``.

    The (param, grad) pairs are sorted by parameter name, passed through
    ``self._process_distribute_lookuptable`` and then split with
    ``self._is_use_dgc``. Only the pairs that do NOT use DGC receive the
    gradient-clip and regularization ops appended here. The two groups are
    then merged, re-sorted by parameter name and handed to
    ``self._create_optimization_pass``.

    Args:
        params_grads: iterable of (param, grad) pairs.

    Returns:
        The optimize ops from ``self._create_optimization_pass``, with the
        lookup-table optimize op appended when there is one.
    """
    by_param_name = lambda pair: pair[0].name

    params_grads = sorted(params_grads, key=by_param_name)
    params_grads, table_param_and_grad, table_optimize_op = \
        self._process_distribute_lookuptable(params_grads)

    plain_pairs = []
    dgc_pairs = []
    for param, grad in params_grads:
        bucket = dgc_pairs if self._is_use_dgc(param, grad) else plain_pairs
        bucket.append((param, grad))

    # The original comment reads "DGC clip and regularization in local", so
    # only the non-DGC pairs get clip and regularization ops at this point.
    plain_pairs = append_gradient_clip_ops(plain_pairs)
    plain_pairs = append_regularization_ops(plain_pairs, self.regularization)

    params_grads = sorted(plain_pairs + dgc_pairs, key=by_param_name)

    optimize_ops = self._create_optimization_pass(params_grads)
    if table_optimize_op is not None:
        optimize_ops.append(table_optimize_op)
        params_grads.append(table_param_and_grad)

    return optimize_ops
class
LarsMomentumOptimizer
(
Optimizer
):
"""
...
...
python/paddle/fluid/tests/unittests/test_dgc_momentum_op.py
浏览文件 @
8ac7687e
...
...
@@ -34,16 +34,19 @@ class TestDGCMomentumOp1(unittest.TestCase):
self
.
op_type
=
"dgc_momentum"
self
.
dtype
=
np
.
float32
nranks_val
=
2
param
=
np
.
random
.
random
((
123
,
321
)).
astype
(
self
.
dtype
)
grad
=
np
.
random
.
random
((
123
,
321
)).
astype
(
self
.
dtype
)
velocity
=
np
.
zeros
((
123
,
321
)).
astype
(
self
.
dtype
)
learning_rate
=
np
.
array
([
0.001
]).
astype
(
self
.
dtype
)
current_step
=
np
.
full
((
1
),
step
).
astype
(
"float32"
)
nranks
=
np
.
full
((
1
),
nranks_val
).
astype
(
"float32"
)
mu
=
0.0001
use_nesterov
=
False
rampup_begin_step
=
10.0
# get tensor
self
.
param_name
,
self
.
param_tensor
=
self
.
get_tensor
(
'Param'
,
param
)
self
.
grad_name
,
self
.
grad_tensor
=
self
.
get_tensor
(
'Grad'
,
grad
)
self
.
velocity_name
,
self
.
velocity_tensor
=
self
.
get_tensor
(
'Velocity'
,
...
...
@@ -52,6 +55,8 @@ class TestDGCMomentumOp1(unittest.TestCase):
'LearningRate'
,
learning_rate
)
self
.
current_step_name
,
self
.
current_step_tensor
=
self
.
get_tensor
(
'current_step'
,
current_step
,
core
.
CPUPlace
())
self
.
nranks_name
,
self
.
nranks_tensor
=
self
.
get_tensor
(
'nranks'
,
nranks
,
core
.
CPUPlace
())
self
.
kwargs
=
{
# inputs
...
...
@@ -60,6 +65,7 @@ class TestDGCMomentumOp1(unittest.TestCase):
'Velocity'
:
self
.
velocity_name
,
'LearningRate'
:
self
.
learning_rate_name
,
'current_step'
:
self
.
current_step_name
,
'nranks'
:
self
.
nranks_name
,
# attrs
'mu'
:
mu
,
...
...
@@ -68,17 +74,18 @@ class TestDGCMomentumOp1(unittest.TestCase):
# outputs
'ParamOut'
:
self
.
param_name
,
'VelocityOut'
:
self
.
velocity_name
'VelocityOut'
:
self
.
velocity_name
,
'Grad_out'
:
self
.
grad_name
,
}
velocity_out
=
mu
*
velocity
+
grad
velocity_out
=
mu
*
velocity
+
grad
/
nranks
if
use_nesterov
:
param_out
=
param
-
grad
*
learning_rate
-
\
velocity_out
*
mu
*
learning_rate
else
:
param_out
=
param
-
learning_rate
*
velocity_out
sgd_out
=
param
-
learning_rate
*
grad
sgd_out
=
param
-
learning_rate
*
grad
/
nranks
self
.
outputs
=
{
'ParamOut'
:
param_out
,
...
...
python/paddle/fluid/tests/unittests/test_dgc_op.py
浏览文件 @
8ac7687e
...
...
@@ -44,6 +44,9 @@ class TestDGCOp(unittest.TestCase):
self
.
grad_name
=
"Grad"
self
.
grad
=
np
.
random
.
random
(
size
).
astype
(
"float32"
)
self
.
param_name
=
"Param"
self
.
param
=
np
.
random
.
random
(
size
).
astype
(
"float32"
)
self
.
current_step_name
=
"current_step"
self
.
current_step
=
np
.
full
((
1
),
0.0
).
astype
(
"float32"
)
...
...
@@ -66,6 +69,9 @@ class TestDGCOp(unittest.TestCase):
self
.
grad_tensor
=
self
.
scope
.
var
(
self
.
grad_name
).
get_tensor
()
self
.
grad_tensor
.
set
(
self
.
grad
,
place
)
self
.
param_tensor
=
self
.
scope
.
var
(
self
.
param_name
).
get_tensor
()
self
.
param_tensor
.
set
(
self
.
param
,
place
)
self
.
current_step_tensor
=
self
.
scope
.
var
(
self
.
current_step_name
).
get_tensor
()
self
.
current_step_tensor
.
set
(
self
.
current_step
,
core
.
CPUPlace
())
...
...
@@ -96,6 +102,7 @@ class TestDGCOp(unittest.TestCase):
'U'
:
self
.
u_name
,
'V'
:
self
.
v_name
,
'Grad'
:
self
.
grad_name
,
'Param'
:
self
.
param_name
,
'current_step'
:
self
.
current_step_name
,
'nranks'
:
self
.
nranks_name
,
...
...
@@ -113,6 +120,8 @@ class TestDGCOp(unittest.TestCase):
'use_nesterov'
:
True
,
'rampup_begin_step'
:
float
(
0.0
),
'rampup_step'
:
float
(
10.0
),
'regular_coeff'
:
float
(
1e-4
),
'regular_type'
:
int
(
2
),
}
dgc_op
=
Operator
(
'dgc'
,
**
kwargs
)
...
...
python/paddle/fluid/tests/unittests/test_dgc_optimizer.py
浏览文件 @
8ac7687e
...
...
@@ -18,6 +18,7 @@ import unittest
import
paddle.fluid.framework
as
framework
import
paddle.fluid.optimizer
as
optimizer
import
paddle.fluid.regularizer
as
regularizer
import
paddle.compat
as
cpt
from
paddle.fluid.backward
import
append_backward
from
paddle.fluid.transpiler.details
import
program_to_code
...
...
@@ -31,7 +32,10 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
def
get_velocity_str
(
self
):
return
self
.
_u_velocity_acc_str
def
check_dgc_momentum_optimizer
(
self
,
dims
=
[
5
,
10
,
8
],
name
=
"momentum"
):
def
check_dgc_momentum_optimizer
(
self
,
dims
=
[
5
,
10
,
8
],
name
=
"momentum"
,
regularization
=
None
):
init_program
=
framework
.
Program
()
program
=
framework
.
Program
()
block
=
program
.
global_block
()
...
...
@@ -58,8 +62,12 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
outputs
=
{
"Out"
:
mul_out
},
attrs
=
{
"x_num_col_dims"
:
1
})
learning_rate
=
0.01
dgc_momentum_optimizer
=
self
.
MockDGCMomentum
(
learning_rate
=
learning_rate
,
momentum
=
0.2
,
rampup_begin_step
=
0
)
learning_rate
=
learning_rate
,
momentum
=
0.2
,
rampup_begin_step
=
0
,
regularization
=
regularization
)
mean_out
=
block
.
create_var
(
dtype
=
"float32"
,
shape
=
[
1
],
lod_level
=
0
,
name
=
"mean.out"
)
block
.
append_op
(
...
...
@@ -96,12 +104,15 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
program_to_code
(
program
,
fout
=
f
)
def
test_momentum_without_dgc
(
self
):
self
.
check_dgc_momentum_optimizer
()
self
.
check_dgc_momentum_optimizer
(
regularization
=
regularizer
.
L1Decay
(
1e-4
))
def
test_momentum_with_dgc
(
self
):
# 16 * 1024 = 16384, use dgc momentum
self
.
check_dgc_momentum_optimizer
(
dims
=
[
16
,
1024
,
8
],
name
=
"dgc_momentum"
)
dims
=
[
16
,
1024
,
8
],
name
=
"dgc_momentum"
,
regularization
=
regularizer
.
L2Decay
(
1e-4
))
if
__name__
==
'__main__'
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录