机器未来 / Paddle (forked from PaddlePaddle / Paddle)
Commit 7ddb93d0
Authored by zhangkeliang on Sep 19, 2020
Parent: 6fc74bba

refactor momentum op to combine weight

Showing 5 changed files with 501 additions and 22 deletions (+501 -22):

  paddle/fluid/operators/optimizers/momentum_op.cc          +5    -0
  paddle/fluid/operators/optimizers/momentum_op.h           +98   -22
  python/paddle/fluid/contrib/__init__.py                   +3    -0
  python/paddle/fluid/contrib/optimizer.py                  +228  -0
  python/paddle/fluid/tests/unittests/test_momentum_op.py   +167  -0
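The change folds L2 weight decay into the momentum update itself ("combine weight" in the commit message): when the new regularization_method attribute is "l2_decay", the kernel adds regularization_coeff * param to the gradient before the velocity/parameter update instead of relying on a separate regularization op. As a rough reference only (plain NumPy-style pseudrefunction; the names are illustrative and not part of the commit), the fused dense update behaves like:

    def fused_momentum_update(param, grad, velocity, lr, mu,
                              use_nesterov=False,
                              regularization_method="",
                              regularization_coeff=1.0):
        # Inputs are NumPy arrays (param, grad, velocity) and scalars (lr, mu).
        # Fold L2 decay into the gradient, as the refactored op does.
        if regularization_method == "l2_decay":
            grad = grad + regularization_coeff * param
        velocity_out = mu * velocity + grad
        if use_nesterov:
            param_out = param - (grad + mu * velocity_out) * lr
        else:
            param_out = param - lr * velocity_out
        return param_out, velocity_out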
paddle/fluid/operators/optimizers/momentum_op.cc

@@ -61,6 +61,11 @@ void MomentumOpMaker::Make() {
                 "(bool, default false) "
                 "Use Nesterov Momentum")
       .SetDefault(false);
+  AddAttr<std::string>("regularization_method",
+                       "(string) regularization_method")
+      .SetDefault("");
+  AddAttr<float>("regularization_coeff", "(float) regularization_coeff")
+      .SetDefault(1.0);
   AddComment(R"DOC(
 Momentum Optimizer.
paddle/fluid/operators/optimizers/momentum_op.h

@@ -29,6 +29,12 @@ using framework::SelectedRows;
 struct NoNesterov;
 struct UseNesterov;
 
+enum class RegularizationFlag {
+  kNONE = 0,
+  kL1DECAY = 1,  // do not need support right now
+  kL2DECAY = 2,
+};
+
 class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override;

@@ -100,6 +106,8 @@ class CPUDenseMomentumFunctor {
   const Tensor* learning_rate;
   const T mu;
   const T use_nesterov;
+  const RegularizationFlag regularization_flag;
+  const T regularization_coeff;
   Tensor* param_out;
   Tensor* velocity_out;

@@ -107,13 +115,17 @@ class CPUDenseMomentumFunctor {
   CPUDenseMomentumFunctor(const Tensor* param, const Tensor* grad,
                           const Tensor* velocity, const Tensor* learning_rate,
                           const T mu, const bool use_nesterov,
-                          Tensor* param_out, Tensor* velocity_out)
+                          const RegularizationFlag regularization_flag,
+                          const T regularization_coeff, Tensor* param_out,
+                          Tensor* velocity_out)
       : param(param),
         grad(grad),
         velocity(velocity),
         learning_rate(learning_rate),
         mu(mu),
         use_nesterov(use_nesterov),
+        regularization_flag(regularization_flag),
+        regularization_coeff(regularization_coeff),
         param_out(param_out),
         velocity_out(velocity_out) {}

@@ -126,11 +138,20 @@ class CPUDenseMomentumFunctor {
     auto g = framework::EigenVector<T>::Flatten(*grad);
     auto* lr = learning_rate->data<T>();
 
-    v_out = v * mu + g;
-    if (use_nesterov) {
-      p_out = p - (g + v_out * mu) * lr[0];
-    } else {
-      p_out = p - lr[0] * v_out;
+    if (regularization_flag == RegularizationFlag::kL2DECAY) {
+      v_out = v * mu + p * regularization_coeff + g;
+      if (use_nesterov) {
+        p_out = p - (p * regularization_coeff + g + v_out * mu) * lr[0];
+      } else {
+        p_out = p - lr[0] * v_out;
+      }
+    } else {
+      v_out = v * mu + g;
+      if (use_nesterov) {
+        p_out = p - (g + v_out * mu) * lr[0];
+      } else {
+        p_out = p - lr[0] * v_out;
+      }
     }
   }
 };
@@ -152,11 +173,14 @@ class DenseMomentumFunctor<T, UseNesterov> {
   const int64_t num_;
   T* p_out_;
   T* v_out_;
+  const RegularizationFlag regularization_flag;
+  const T regularization_coeff;
 
  public:
   DenseMomentumFunctor(const T* p, const T* g, const T* v,
                        const T* learning_rate, const T mu, const int64_t num,
-                       T* p_out, T* v_out)
+                       const RegularizationFlag regularization_flag,
+                       const T regularization_coeff, T* p_out, T* v_out)
       : p_(p),
         g_(g),
         v_(v),

@@ -164,13 +188,20 @@ class DenseMomentumFunctor<T, UseNesterov> {
         mu_(mu),
         num_(num),
         p_out_(p_out),
-        v_out_(v_out) {}
+        v_out_(v_out),
+        regularization_flag(regularization_flag),
+        regularization_coeff(regularization_coeff) {}
   inline HOSTDEVICE void operator()(size_t i) const {
     // put memory access in register
     const T p = p_[i];
-    const T g = g_[i];
+    T g = g_[i];
     const T lr = lr_[0];
     const T v = v_[i];
+
+    g = regularization_flag == RegularizationFlag::kL2DECAY
+            ? g + regularization_coeff * p
+            : g;
+
     T v_out = v * mu_ + g;
     T p_out = p - (g + v_out * mu_) * lr;
     // write reigster to memory

@@ -190,11 +221,14 @@ class DenseMomentumFunctor<T, NoNesterov> {
   const int64_t num_;
   T* p_out_;
   T* v_out_;
+  const RegularizationFlag regularization_flag;
+  const T regularization_coeff;
 
  public:
   DenseMomentumFunctor(const T* p, const T* g, const T* v,
                        const T* learning_rate, const T mu, const int64_t num,
-                       T* p_out, T* v_out)
+                       const RegularizationFlag regularization_flag,
+                       const T regularization_coeff, T* p_out, T* v_out)
       : p_(p),
         g_(g),
         v_(v),

@@ -202,13 +236,20 @@ class DenseMomentumFunctor<T, NoNesterov> {
         mu_(mu),
         num_(num),
         p_out_(p_out),
-        v_out_(v_out) {}
+        v_out_(v_out),
+        regularization_flag(regularization_flag),
+        regularization_coeff(regularization_coeff) {}
   inline HOSTDEVICE void operator()(size_t i) const {
     // put memory access in register
     const T p = p_[i];
-    const T g = g_[i];
+    T g = g_[i];
     const T lr = lr_[0];
     const T v = v_[i];
+
+    g = regularization_flag == RegularizationFlag::kL2DECAY
+            ? g + regularization_coeff * p
+            : g;
+
     T v_out = v * mu_ + g;
     T p_out = p - lr * v_out;
     // write reigster to memory
@@ -233,11 +274,15 @@ class SparseMomentumFunctor<T, UseNesterov> {
   const int64_t row_height_;
   T* p_out_;
   T* v_out_;
+  const RegularizationFlag regularization_flag;
+  const T regularization_coeff;
 
  public:
   SparseMomentumFunctor(const T* p, const T* g, const T* v, const T* lr,
                         const T mu, const int64_t* rows, int64_t row_numel,
-                        int64_t row_height, T* p_out, T* v_out)
+                        int64_t row_height,
+                        const RegularizationFlag regularization_flag,
+                        const T regularization_coeff, T* p_out, T* v_out)
       : p_(p),
         g_(g),
         v_(v),

@@ -247,7 +292,9 @@ class SparseMomentumFunctor<T, UseNesterov> {
         row_numel_(row_numel),
         row_height_(row_height),
         p_out_(p_out),
-        v_out_(v_out) {}
+        v_out_(v_out),
+        regularization_flag(regularization_flag),
+        regularization_coeff(regularization_coeff) {}
   inline HOSTDEVICE void operator()(size_t i) {
     auto row_idx =

@@ -258,6 +305,11 @@ class SparseMomentumFunctor<T, UseNesterov> {
     const T p = p_[i];
     const T lr = lr_[0];
     const T v = v_[i];
+
+    g = regularization_flag == RegularizationFlag::kL2DECAY
+            ? g + regularization_coeff * p
+            : g;
+
     T v_out = v * mu_ + g;
     T p_out = p - (g + v_out * mu_) * lr;
     // write reigster to memory

@@ -279,11 +331,15 @@ class SparseMomentumFunctor<T, NoNesterov> {
   const int64_t row_height_;
   T* p_out_;
   T* v_out_;
+  const RegularizationFlag regularization_flag;
+  const T regularization_coeff;
 
  public:
   SparseMomentumFunctor(const T* p, const T* g, const T* v, const T* lr,
                         const T mu, const int64_t* rows, int64_t row_numel,
-                        int64_t row_height, T* p_out, T* v_out)
+                        int64_t row_height,
+                        const RegularizationFlag regularization_flag,
+                        const T regularization_coeff, T* p_out, T* v_out)
       : p_(p),
         g_(g),
         v_(v),

@@ -293,7 +349,9 @@ class SparseMomentumFunctor<T, NoNesterov> {
         row_numel_(row_numel),
         row_height_(row_height),
         p_out_(p_out),
-        v_out_(v_out) {}
+        v_out_(v_out),
+        regularization_flag(regularization_flag),
+        regularization_coeff(regularization_coeff) {}
   inline HOSTDEVICE void operator()(size_t i) {
     auto row_idx =

@@ -304,6 +362,11 @@ class SparseMomentumFunctor<T, NoNesterov> {
     const T p = p_[i];
     const T lr = lr_[0];
     const T v = v_[i];
+
+    g = regularization_flag == RegularizationFlag::kL2DECAY
+            ? g + regularization_coeff * p
+            : g;
+
     T v_out = v * mu_ + g;
     T p_out = p - v_out * lr;
     // write reigster to memory
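In the SelectedRows path above, only the rows present in the merged gradient carry a non-zero gradient, but the L2 term is applied to every parameter row, matching the dense case. A rough reference of that behaviour (illustrative only; the function and variable names are assumptions, not the kernel's), consistent with the expected values computed in the new sparse test further below:

    import numpy as np

    def sparse_momentum_update(param, grad_rows, rows, velocity, lr, mu,
                               use_nesterov=False,
                               regularization_method="",
                               regularization_coeff=1.0):
        # Scatter the sparse gradient into a dense buffer; untouched rows stay zero.
        grad = np.zeros_like(param)
        grad[rows] = grad_rows
        # From here on the update matches the dense functor.
        if regularization_method == "l2_decay":
            grad = grad + regularization_coeff * param
        velocity_out = mu * velocity + grad
        if use_nesterov:
            param_out = param - (grad + mu * velocity_out) * lr
        else:
            param_out = param - lr * velocity_out
        return param_out, velocity_out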
@@ -316,6 +379,16 @@ template <typename DeviceContext, typename T>
 class MomentumOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    std::string regularization_method =
+        ctx.Attr<std::string>("regularization_method");
+    T regularization_coeff =
+        static_cast<T>(ctx.Attr<float>("regularization_coeff"));
+    RegularizationFlag regularization_flag{
+        RegularizationFlag::kNONE};  // disable regularization
+    if (regularization_method == "l2_decay") {
+      regularization_flag = RegularizationFlag::kL2DECAY;
+    }
+
     T mu = static_cast<T>(ctx.Attr<float>("mu"));
     bool use_nesterov = ctx.Attr<bool>("use_nesterov");

@@ -324,6 +397,7 @@ class MomentumOpKernel : public framework::OpKernel<T> {
     auto param_out = ctx.Output<framework::Tensor>("ParamOut");
     auto* velocity = ctx.Input<framework::Tensor>("Velocity");
     auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
+
     param_out->mutable_data<T>(ctx.GetPlace());
     velocity_out->mutable_data<T>(ctx.GetPlace());

@@ -331,9 +405,9 @@ class MomentumOpKernel : public framework::OpKernel<T> {
     if (grad_var->IsType<framework::LoDTensor>()) {
       auto grad = ctx.Input<framework::Tensor>("Grad");
       if (platform::is_cpu_place(ctx.GetPlace())) {
-        CPUDenseMomentumFunctor<T> functor(param, grad, velocity,
-                                           learning_rate, mu, use_nesterov,
-                                           param_out, velocity_out);
+        CPUDenseMomentumFunctor<T> functor(
+            param, grad, velocity, learning_rate, mu, use_nesterov,
+            regularization_flag, regularization_coeff, param_out, velocity_out);
         functor();
       } else if (platform::is_gpu_place(ctx.GetPlace())) {
         platform::ForRange<DeviceContext> for_range(

@@ -342,16 +416,16 @@ class MomentumOpKernel : public framework::OpKernel<T> {
       if (use_nesterov) {
         DenseMomentumFunctor<T, UseNesterov> functor(
             param->data<T>(), grad->data<T>(), velocity->data<T>(),
-            learning_rate->data<T>(), mu, param->numel(),
-            param_out->mutable_data<T>(ctx.GetPlace()),
+            learning_rate->data<T>(), mu, param->numel(), regularization_flag,
+            regularization_coeff,
+            param_out->mutable_data<T>(ctx.GetPlace()),
             velocity_out->mutable_data<T>(ctx.GetPlace()));
         for_range(functor);
       } else {
         DenseMomentumFunctor<T, NoNesterov> functor(
             param->data<T>(), grad->data<T>(), velocity->data<T>(),
-            learning_rate->data<T>(), mu, param->numel(),
-            param_out->mutable_data<T>(ctx.GetPlace()),
+            learning_rate->data<T>(), mu, param->numel(), regularization_flag,
+            regularization_coeff,
+            param_out->mutable_data<T>(ctx.GetPlace()),
             velocity_out->mutable_data<T>(ctx.GetPlace()));
         for_range(functor);
       }

@@ -384,6 +458,7 @@ class MomentumOpKernel : public framework::OpKernel<T> {
             param->data<T>(), merged_grad->value().data<T>(),
             velocity->data<T>(), learning_rate->data<T>(), mu, rows, row_numel,
             static_cast<int64_t>(merged_grad->rows().size()),
+            regularization_flag, regularization_coeff,
             param_out->mutable_data<T>(ctx.GetPlace()),
             velocity_out->mutable_data<T>(ctx.GetPlace()));
         for_range(functor);

@@ -393,6 +468,7 @@ class MomentumOpKernel : public framework::OpKernel<T> {
             param->data<T>(), merged_grad->value().data<T>(),
             velocity->data<T>(), learning_rate->data<T>(), mu, rows, row_numel,
             static_cast<int64_t>(merged_grad->rows().size()),
+            regularization_flag, regularization_coeff,
             param_out->mutable_data<T>(ctx.GetPlace()),
             velocity_out->mutable_data<T>(ctx.GetPlace()));
         for_range(functor);
python/paddle/fluid/contrib/__init__.py

@@ -35,6 +35,8 @@ from . import mixed_precision
 from .mixed_precision import *
 from . import layers
 from .layers import *
+from . import optimizer
+from .optimizer import *
 
 __all__ = []
 __all__ += decoder.__all__

@@ -46,3 +48,4 @@ __all__ += utils.__all__
 __all__ += extend_optimizer.__all__
 __all__ += ['mixed_precision']
 __all__ += layers.__all__
+__all__ += optimizer.__all__
python/paddle/fluid/contrib/optimizer.py (new file, 0 → 100644)

# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ..optimizer import Optimizer
from ..regularizer import L1DecayRegularizer
from ..regularizer import L2DecayRegularizer
from .. import framework
from .. import core
from ..framework import program_guard
from ..clip import append_gradient_clip_ops

__all__ = ['Momentum']


class Momentum(Optimizer):
    """
    Simple Momentum optimizer with velocity state

    This optimizer has a flag for Nestrov Momentum.

    The update equations are as follows:

    .. math::

        & velocity = mu * velocity + gradient

        & if (use\_nesterov):

        &\quad param = param - (gradient + mu * velocity) * learning\_rate

        & else:

        &\quad param = param - learning\_rate * velocity

    Parameters:
        learning_rate (float|Variable): The learning rate used to update parameters. \
            Can be a float value or a Variable with one float value as data element.
        momentum (float): Momentum factor
        parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, at this time all parameters will be updated.
        use_nesterov (bool, optional): Enables Nesterov momentum, default is false.
        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
            :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three cliping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): This parameter is used by developers to print debugging information. \
            For details, please refer to :ref:`api_guide_Name`. Default is None.

    Examples:
        .. code-block:: python

            import paddle
            import paddle.fluid as fluid
            import numpy as np

            place = fluid.CPUPlace()
            main = fluid.Program()
            with fluid.program_guard(main):
                x = fluid.layers.data(name='x', shape=[13], dtype='float32')
                y = fluid.layers.data(name='y', shape=[1], dtype='float32')
                y_predict = fluid.layers.fc(input=x, size=1, act=None)
                cost = fluid.layers.square_error_cost(input=y_predict, label=y)
                avg_cost = fluid.layers.mean(cost)

                moment_optimizer = fluid.optimizer.MomentumOptimizer(learning_rate=0.001, momentum=0.9)
                moment_optimizer.minimize(avg_cost)

                fetch_list = [avg_cost]
                train_reader = paddle.batch(
                    paddle.dataset.uci_housing.train(), batch_size=1)
                feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
                exe = fluid.Executor(place)
                exe.run(fluid.default_startup_program())
                for data in train_reader():
                    exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
    """
    _velocity_acc_str = "velocity"

    def __init__(self,
                 learning_rate,
                 momentum,
                 parameter_list=None,
                 use_nesterov=False,
                 regularization=None,
                 grad_clip=None,
                 name=None):
        assert learning_rate is not None
        assert momentum is not None
        super(Momentum, self).__init__(
            learning_rate=learning_rate,
            parameter_list=parameter_list,
            regularization=regularization,
            grad_clip=grad_clip,
            name=name)
        self.type = "momentum"
        self._momentum = momentum
        self._use_nesterov = bool(use_nesterov)
        self._regularization_method = ""
        self._regularization_coef = 0
        if (isinstance(regularization, L2DecayRegularizer)):
            self._regularization_method = "l2_decay"
            self._regularization_coef = regularization._regularization_coeff
        if (isinstance(regularization, L1DecayRegularizer)):
            self._regularization_method = "l1_decay"
            self._regularization_coef = regularization._regularization_coeff

    def _create_accumulators(self, block, parameters):
        assert isinstance(block, framework.Block)

        for p in parameters:
            self._add_accumulator(self._velocity_acc_str, p)

    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, framework.Block)

        velocity_acc = self._get_accumulator(self._velocity_acc_str,
                                             param_and_grad[0])
        lr = self._create_param_lr(param_and_grad)

        if framework.in_dygraph_mode():
            _, _ = core.ops.momentum(param_and_grad[0], param_and_grad[1],
                                     velocity_acc, lr, param_and_grad[0],
                                     velocity_acc, 'mu', self._momentum,
                                     'use_nesterov', self._use_nesterov)
            return None

        attrs = {
            "mu": self._momentum,
            "use_nesterov": self._use_nesterov,
            "regularization_method": self._regularization_method,
            "regularization_coeff": self._regularization_coef
        }
        inputs = {
            "Param": [param_and_grad[0]],
            "Grad": [param_and_grad[1]],
            "Velocity": [velocity_acc],
            "LearningRate": [lr]
        }

        outputs = {
            "ParamOut": [param_and_grad[0]],
            "VelocityOut": [velocity_acc]
        }
        # create the momentum optimize op
        momentum_op = block.append_op(
            type=self.type,
            inputs=inputs,
            outputs=outputs,
            attrs=attrs,
            stop_gradient=True)

        return momentum_op

    def apply_gradients(self, params_grads):
        """
        Second part of `minimize`, appending optimization operators for
        given `params_grads` pairs.

        Args:
            params_grads (list): list of (param, grad) pair to do optimization.

        Returns:
            list: A list of operators appended to the current program.

        Examples:
            .. code-block:: python

                import paddle.fluid as fluid
                loss = network()
                optimizer = fluid.optimizer.SGD(learning_rate=0.1)
                params_grads = optimizer.backward(loss)
                # you may append operations for params_grads here
                # ...
                optimizer.apply_gradients(params_grads)
        """
        params_grads = sorted(params_grads, key=lambda x: x[0].name)

        # 'optimizer(grad_clip)' or 'set_gradient_clip'
        if self._grad_clip is not None:
            params_grads = self._grad_clip(params_grads)
        else:
            params_grads = append_gradient_clip_ops(params_grads)

        optimize_ops = self._create_optimization_pass(params_grads)
        return optimize_ops

    def apply_optimize(self, loss, startup_program, params_grads):
        """
        Second part of `minimize`, appending optimization operators for
        given `params_grads` pairs.

        Args:
            loss (Variable): loss variable to run optimizations.
            startup_program (Program): startup_program for initializing parameters
                in `parameter_list`.
            params_grads (list): list of (param, grad) pair to do optimization.

        Returns:
            list: A list of operators appended to the current program.
        """
        if framework.in_dygraph_mode():
            with program_guard(framework.default_main_program(),
                               framework.default_startup_program()):
                if self._grad_clip is not None:
                    params_grads = self._grad_clip(params_grads)
                optimize_ops = self._create_optimization_pass(params_grads)
        else:
            program = loss.block.program
            with program_guard(program, startup_program):
                optimize_ops = self.apply_gradients(params_grads)
        return optimize_ops
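With the exports added in contrib/__init__.py above, the new class is reachable from paddle.fluid.contrib. A minimal usage sketch (not part of the commit; avg_cost stands for any scalar loss already built in the current program):

    import paddle.fluid as fluid
    from paddle.fluid.contrib.optimizer import Momentum

    # Passing an L2Decay regularizer makes the optimizer emit the momentum op
    # with regularization_method='l2_decay' and the matching coefficient,
    # so the weight decay is fused into the op instead of added as separate ops.
    optimizer = Momentum(
        learning_rate=0.001,
        momentum=0.9,
        use_nesterov=True,
        regularization=fluid.regularizer.L2Decay(regularization_coeff=0.9))
    optimizer.minimize(avg_cost)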
python/paddle/fluid/tests/unittests/test_momentum_op.py

@@ -279,5 +279,172 @@ class TestMomentumV2(unittest.TestCase):
        self.assertRaises(ValueError, paddle.optimizer.Momentum, momentum=None)


class TestMomentumOpWithDecay(OpTest):
    def setUp(self):
        self.op_type = "momentum"
        self.dtype = np.float32
        self.use_nesterov = True
        self.regularization_method = 'l2_decay'
        self.regularization_coeff = 0.9
        self.init_config()

        param = np.random.random((123, 321)).astype(self.dtype)
        grad = np.random.random((123, 321)).astype(self.dtype)
        velocity = np.zeros((123, 321)).astype(self.dtype)
        learning_rate = np.array([0.001]).astype(self.dtype)
        mu = 0.0001
        use_nesterov = self.use_nesterov
        regularization_method = self.regularization_method
        regularization_coeff = self.regularization_coeff

        self.inputs = {
            'Param': param,
            'Grad': grad,
            'Velocity': velocity,
            'LearningRate': learning_rate
        }

        self.attrs = {
            'mu': mu,
            'use_nesterov': use_nesterov,
            'regularization_method': regularization_method,
            'regularization_coeff': regularization_coeff
        }

        param_decay = regularization_coeff * param
        grad_new = grad + param_decay
        grad = grad_new

        velocity_out = mu * velocity + grad
        if use_nesterov:
            param_out = param - grad * learning_rate - \
                velocity_out * mu * learning_rate
        else:
            param_out = param - learning_rate * velocity_out

        self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}

    def init_config(self):
        pass

    def test_check_output(self):
        self.check_output()


class TestMomentumOpWithDecayFP16(TestMomentumOpWithDecay):
    def init_config(self):
        self.dtype = np.float16

    def test_check_output(self):
        self.check_output(atol=1e-3)


class TestMomentumOpWithDecay2(OpTest):
    def init_config(self):
        self.use_nesterov = False


class TestSparseMomentumOpWithDecay(unittest.TestCase):
    def setUp(self):
        self.use_nesterov = False

    def check_with_place(self, place):
        self.init_kernel()
        scope = core.Scope()
        # create and initialize Grad Variable
        height = 10
        rows = [0, 4, 7]
        row_numel = 12
        mu = 1.0
        use_nesterov = self.use_nesterov
        regularization_method = 'l2_decay'
        regularization_coeff = 0.9

        # create and initialize Param Variable
        param = scope.var('Param').get_tensor()
        param_array = np.full((height, row_numel), 5.0).astype("float32")
        param.set(param_array, place)
        param_out = scope.var("ParamOut").get_tensor()
        param_out_array = np.full((height, row_numel), 0.0).astype("float32")
        param_out.set(param_out_array, place)

        grad_selected_rows = scope.var('Grad').get_selected_rows()
        grad_selected_rows.set_height(height)
        grad_selected_rows.set_rows(rows)
        grad_np_array = np.ones((len(rows), row_numel)).astype("float32")
        grad_np_array[0, 0] = 2.0
        grad_np_array[2, 8] = 4.0
        grad_tensor = grad_selected_rows.get_tensor()
        grad_tensor.set(grad_np_array, place)

        velocity = scope.var('Velocity').get_tensor()
        velocity_np_array = np.ones((height, row_numel)).astype("float32")
        velocity.set(velocity_np_array, place)
        velocity_out = scope.var('VelocityOut').get_tensor()
        velocity_out_np_array = np.full((height, row_numel),
                                        0.0).astype("float32")
        velocity_out.set(velocity_out_np_array, place)

        # create and initialize LeraningRate Variable
        lr = scope.var('LearningRate').get_tensor()
        lr_array = np.full((1), 2.0).astype("float32")
        lr.set(lr_array, place)

        # create and run operator
        op = Operator(
            "momentum",
            Param='Param',
            Grad='Grad',
            Velocity='Velocity',
            ParamOut='ParamOut',
            VelocityOut='VelocityOut',
            LearningRate='LearningRate',
            mu=mu,
            use_nesterov=use_nesterov,
            regularization_method=regularization_method,
            regularization_coeff=regularization_coeff)
        op.run(scope, place)

        # get and compare result
        param_out_np_array = np.array(param_out)
        velocity_out_np_array = np.array(velocity_out)

        # TODO(dzh): add a more suitable general numpy interface
        # for sparse update.
        _grad_np_array = np.full((height, row_numel), 0.0).astype("float32")
        for i in range(len(rows)):
            _grad_np_array[rows[i]] = grad_np_array[i]

        _param = param_array

        _param_decay = regularization_coeff * _param
        _grad_np_array_new = _grad_np_array + _param_decay
        _grad_np_array = _grad_np_array_new

        _velocity_out = mu * velocity_np_array + _grad_np_array
        if use_nesterov:
            _param_out = _param - (_grad_np_array + _velocity_out * mu
                                   ) * lr_array
        else:
            _param_out = _param - lr_array * _velocity_out

        self.assertTrue((_velocity_out == velocity_out_np_array).all())
        self.assertTrue((_param_out == param_out_np_array).all())

    def init_kernel(self):
        pass

    def test_sparse_momentum(self):
        places = [core.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(core.CUDAPlace(0))
        for place in places:
            self.check_with_place(place)


class TestSparseMomentumOpWithDecay2(TestSparseMomentumOpWithDecay):
    def init_kernel(self):
        self.use_nesterov = True


if __name__ == "__main__":
    unittest.main()