机器未来 / Paddle (forked from PaddlePaddle / Paddle)
Commit 735a2db0 (unverified)
Authored Dec 03, 2019 by Kaipeng Deng; committed via GitHub on Dec 03, 2019
[cherry-pick] add Adam beta1/beta2 support Variable (#21433)
* add Adam beta1/beta2 support Variable. test=develop
Parent: 2660107c
Showing 11 changed files with 402 additions and 68 deletions (+402 -68)
paddle/fluid/operators/optimizers/adam_op.cc              +55 -21
paddle/fluid/operators/optimizers/adam_op.h               +21 -2
paddle/fluid/operators/scale_op.cc                        +18 -2
paddle/fluid/operators/scale_op.h                         +17 -1
python/paddle/fluid/layers/layer_function_generator.py    +2 -0
python/paddle/fluid/layers/nn.py                          +33 -9
python/paddle/fluid/optimizer.py                          +114 -29
python/paddle/fluid/tests/unittests/test_adam_op.py       +97 -2
python/paddle/fluid/tests/unittests/test_layers.py        +14 -0
python/paddle/fluid/tests/unittests/test_scale_op.py      +23 -0
python/paddle/fluid/transpiler/distribute_transpiler.py   +8 -2
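The user-facing effect of this commit, in brief: `fluid.optimizer.AdamOptimizer` (and `fluid.layers.scale`) now accept a 1-element float32 Variable where previously only a Python float was allowed, so beta1/beta2 can be changed during training. Below is a minimal sketch of that usage, assembled from the updated docstrings and tests in this diff; the small network and variable names are illustrative, not part of the commit, and the full runnable examples appear in the `optimizer.py` and `nn.py` docstrings further down.

```python
import numpy as np
import paddle.fluid as fluid

main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
    x = fluid.data(name='x', shape=[None, 4], dtype='float32')
    loss = fluid.layers.reduce_mean(fluid.layers.fc(input=x, size=1))

    # beta1/beta2 live in 1-element float32 Variables, so they can be
    # re-assigned (e.g. decayed) between optimization steps.
    beta1 = fluid.layers.create_global_var(
        shape=[1], value=0.9, dtype='float32', persistable=True)
    beta2 = fluid.layers.create_global_var(
        shape=[1], value=0.999, dtype='float32', persistable=True)
    fluid.optimizer.AdamOptimizer(
        learning_rate=0.01, beta1=beta1, beta2=beta2).minimize(loss)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup)
exe.run(main,
        feed={'x': np.random.rand(2, 4).astype('float32')},
        fetch_list=[loss])
```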
paddle/fluid/operators/optimizers/adam_op.cc
@@ -20,27 +20,50 @@ namespace operators {
 using Tensor = framework::Tensor;

 void AdamOp::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("Param"),
-                 "Input(Param) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                 "Input(Grad) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Moment1"),
-                 "Input(Moment1) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Moment2"),
-                 "Input(Moment2) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
-                 "Input(LearningRate) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
-                 "Input(Beta1Pow) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"),
-                 "Input(Beta2Pow) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                 "Output(ParamOut) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"),
-                 "Output(Moment1Out) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"),
-                 "Output(Moment2Out) of AdamOp should not be null.");
+  PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
+                    platform::errors::NotFound(
+                        "Input(Param) of AdamOp should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
+                    platform::errors::NotFound(
+                        "Input(Grad) of AdamOp should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasInput("Moment1"), true,
+                    platform::errors::NotFound(
+                        "Input(Moment1) of AdamOp should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasInput("Moment2"), true,
+                    platform::errors::NotFound(
+                        "Input(Moment2) of AdamOp should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true,
+                    platform::errors::NotFound(
+                        "Input(LearningRate) of AdamOp should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasInput("Beta1Pow"), true,
+                    platform::errors::NotFound(
+                        "Input(Beta1Pow) of AdamOp should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasInput("Beta2Pow"), true,
+                    platform::errors::NotFound(
+                        "Input(Beta2Pow) of AdamOp should not be null."));
+  if (ctx->IsRuntime() && ctx->HasInput("Beta1Tensor")) {
+    auto beta1 = ctx->Inputs("Beta1Tensor");
+    PADDLE_ENFORCE_EQ(beta1.size(), 1,
+                      platform::errors::InvalidArgument(
+                          "Input(Beta1Tensor) size must be 1"));
+  }
+  if (ctx->IsRuntime() && ctx->HasInput("Beta2Tensor")) {
+    auto beta2 = ctx->Inputs("Beta2Tensor");
+    PADDLE_ENFORCE_EQ(beta2.size(), 1,
+                      platform::errors::InvalidArgument(
+                          "Input(Beta2Tensor) size must be 1"));
+  }
+  PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
+                    platform::errors::NotFound(
+                        "Output(ParamOut) of AdamOp should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasOutput("Moment1Out"), true,
+                    platform::errors::NotFound(
+                        "Output(Moment1Out) of AdamOp should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasOutput("Moment2Out"), true,
+                    platform::errors::NotFound(
+                        "Output(Moment2Out) of AdamOp should not be null."));

   auto lr_dims = ctx->GetInputDim("LearningRate");
   PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
   ...
@@ -93,6 +116,17 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
     AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator");
+    AddInput("Beta1Tensor",
+             "(Tensor<float32>, optional) If provided, Adam will use this "
+             "as beta1, this has a higher priority than attr(beta1), the "
+             "shape of this tensor MUST BE [1].")
+        .AsDispensable();
+    AddInput("Beta2Tensor",
+             "(Tensor<float32>, optional) If provided, Adam will use this "
+             "as beta2, this has a higher priority than attr(beta2), the "
+             "shape of this tensor MUST BE [1].")
+        .AsDispensable();

     AddOutput("ParamOut", "(Tensor) Output parameter");
     AddOutput("Moment1Out", "(Tensor) Output first moment");
     AddOutput("Moment2Out", "(Tensor) Output second moment");
   ...
paddle/fluid/operators/optimizers/adam_op.h
@@ -29,6 +29,16 @@ namespace operators {

 namespace scatter = paddle::operators::math::scatter;

+static inline float GetAttrFromTensor(const framework::Tensor* tensor) {
+  const float* tensor_data = tensor->data<float>();
+  framework::Tensor cpu_tensor;
+  if (platform::is_gpu_place(tensor->place())) {
+    TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
+    tensor_data = cpu_tensor.data<float>();
+  }
+  return tensor_data[0];
+}
+
 class AdamOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
   ...
@@ -367,8 +377,6 @@ class AdamOpKernel : public framework::OpKernel<T> {
     int64_t min_row_size_to_use_multithread =
         ctx.Attr<int64_t>("min_row_size_to_use_multithread");
     bool lazy_mode = ctx.Attr<bool>("lazy_mode");
-    T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
-    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
     T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
     auto& param = Ref(ctx.Input<LoDTensor>("Param"), "Must set Param");
     // auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad");
   ...
@@ -390,6 +398,17 @@ class AdamOpKernel : public framework::OpKernel<T> {
     auto& mom2_out =
         Ref(ctx.Output<LoDTensor>("Moment2Out"), "Must set Moment1Out");

+    T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
+    if (ctx.HasInput("Beta1Tensor")) {
+      auto* beta1_tensor = ctx.Input<framework::Tensor>("Beta1Tensor");
+      beta1 = static_cast<T>(GetAttrFromTensor(beta1_tensor));
+    }
+    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
+    if (ctx.HasInput("Beta2Tensor")) {
+      auto* beta2_tensor = ctx.Input<framework::Tensor>("Beta2Tensor");
+      beta2 = static_cast<T>(GetAttrFromTensor(beta2_tensor));
+    }
+
     if (grad_var->IsType<framework::LoDTensor>()) {
       auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad");
   ...
paddle/fluid/operators/scale_op.cc
@@ -34,6 +34,14 @@ class ScaleOp : public framework::OperatorWithKernel {
                    "Input(X) of ScaleOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of ScaleOp should not be null.");
+
+    if (ctx->IsRuntime() && ctx->HasInput("ScaleTensor")) {
+      auto scale = ctx->Inputs("ScaleTensor");
+      PADDLE_ENFORCE_EQ(scale.size(), 1,
+                        platform::errors::InvalidArgument(
+                            "Input(ScaleTensor) size must be 1"));
+    }
+
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
     ctx->ShareLoD("X", /*->*/ "Out");
   }
   ...
@@ -43,6 +51,11 @@ class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X", "(Tensor) Input tensor of scale operator.");
+    AddInput("ScaleTensor",
+             "(Tensor) If provided, use this as "
+             "scale factor, this has a higher priority than "
+             "attr(scale), the shape of this tensor MUST BE 1.")
+        .AsDispensable();
     AddOutput("Out", "(Tensor) Output tensor of scale operator.");
     AddComment(R"DOC(
 **Scale operator**
   ...
@@ -89,6 +102,9 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker {
     auto* grad_op = new framework::OpDesc();
     grad_op->SetType("scale");
     grad_op->SetInput("X", OutputGrad("Out"));
+    if (ForwardOp().Inputs().count("ScaleTensor") > 0) {
+      grad_op->SetInput("ScaleTensor", Input("ScaleTensor"));
+    }
     grad_op->SetOutput("Out", InputGrad("X"));
     grad_op->SetAttr("scale", GetAttr("scale"));
     grad_op->SetAttr("bias", 0.0f);
   ...
@@ -97,14 +113,14 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker {
   }
 };

-using ScaleOpInplace = framework::SingleOpInplaceInToOut;
+DECLARE_INPLACE_OP_INFERER(ScaleOpInplaceInferer, {"X", "Out"});
 }  // namespace operators
 }  // namespace paddle

 namespace ops = paddle::operators;

 REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker,
-                  ops::ScaleOpVarTypeInference, ops::ScaleOpInplace);
+                  ops::ScaleOpVarTypeInference, ops::ScaleOpInplaceInferer);
 REGISTER_OP_CPU_KERNEL(
     scale, ops::ScaleKernel<paddle::platform::CPUDeviceContext, float>,
     ops::ScaleKernel<paddle::platform::CPUDeviceContext, double>,
   ...
paddle/fluid/operators/scale_op.h
@@ -19,6 +19,17 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

+static inline float GetAttrFromTensor(const framework::Tensor* tensor) {
+  const float* tensor_data = tensor->data<float>();
+  framework::Tensor cpu_tensor;
+  if (platform::is_gpu_place(tensor->place())) {
+    TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
+    tensor_data = cpu_tensor.data<float>();
+  }
+  return tensor_data[0];
+}
+
 template <typename DeviceContext, typename T>
 class ScaleKernel : public framework::OpKernel<T> {
  public:
   ...
@@ -26,10 +37,15 @@ class ScaleKernel : public framework::OpKernel<T> {
     auto* in_var = ctx.InputVar("X");
     auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var);

-    auto scale = static_cast<T>(ctx.Attr<float>("scale"));
     auto bias = static_cast<T>(ctx.Attr<float>("bias"));
     auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");

+    auto scale = static_cast<T>(ctx.Attr<float>("scale"));
+    if (ctx.HasInput("ScaleTensor")) {
+      auto* scale_tensor = ctx.Input<framework::Tensor>("ScaleTensor");
+      scale = GetAttrFromTensor(scale_tensor);
+    }
+
     auto* out_var = ctx.OutputVar("Out");
     if (in_var->IsType<framework::SelectedRows>() && in_var != out_var) {
       auto& in_slr = in_var->Get<framework::SelectedRows>();
   ...
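As the ScaleOpMaker comment above states, a provided ScaleTensor takes priority over the `scale` attribute; the kernel change implements that by reading the attribute first and then overwriting it when the optional input is fed. Below is a hedged plain-Python restatement of that precedence rule; the helper name is illustrative and not part of the codebase.

```python
def resolve_scale(attr_scale, scale_tensor=None):
    # Mirrors ScaleKernel's logic: start from the compile-time attribute,
    # then let an optional 1-element ScaleTensor input override it at run time.
    scale = float(attr_scale)
    if scale_tensor is not None:
        scale = float(scale_tensor[0])
    return scale
```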
python/paddle/fluid/layers/layer_function_generator.py
@@ -174,6 +174,8 @@ def generate_layer_fn(op_type):
             if not isinstance(val, list) and not isinstance(val, tuple):
                 val = [val]
             if len(val) == 0:
+                if len(args) == 0:
+                    continue
                 val = [args[0]]
                 args = args[1:]
   ...
python/paddle/fluid/layers/nn.py
@@ -14074,7 +14074,7 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
     Args:
         x(Variable): Input N-D Tensor of scale operator. Data type can be float32, float64, int8, int16, int32, int64, uint8.
-        scale(float): The scale factor of the input.
+        scale(float|Variable): The scale factor of the input, it should be a float number or a Variable with shape [1] and data type as float32.
         bias(float): The bias to be put on the input.
         bias_after_scale(bool): Apply bias addition after or before scaling. It is useful for numeric stability in some circumstances.
         act(str, optional): Activation applied to the output such as tanh, softmax, sigmoid, relu.
   ...
@@ -14099,6 +14099,27 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
             res = exe.run(fluid.default_main_program(), feed={'x':img}, fetch_list=[output])
             print(res) # [array([[ 3.,  5.,  7.], [ 9., 11., 13.]], dtype=float32)]

+        .. code-block:: python
+
+            # scale with parameter scale as Variable
+            import paddle.fluid as fluid
+            import numpy as np
+
+            inputs = fluid.layers.data(name="x", shape=[2, 3], dtype='float32')
+            scale = fluid.layers.data(name="scale", shape=[1], dtype='float32',
+                                      append_batch_size=False)
+            output = fluid.layers.scale(inputs, scale = scale, bias = 1.0)
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            exe.run(fluid.default_startup_program())
+
+            img = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32)
+            scale_np = np.array([2.]).astype(np.float32)
+
+            res = exe.run(fluid.default_main_program(), feed={'x':img, 'scale':scale_np}, fetch_list=[output])
+            print(res) # [array([[ 3.,  5.,  7.], [ 9., 11., 13.]], dtype=float32)]
+
     """
     helper = LayerHelper('scale', **locals())
   ...
@@ -14108,15 +14129,18 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
         out = helper.create_variable(
             name=name, dtype=x.dtype, persistable=False)

+    inputs = {'X': x}
+    attrs = {
+        'bias': float(bias),
+        'bias_after_scale': bias_after_scale,
+    }
+    if isinstance(scale, Variable):
+        inputs['ScaleTensor'] = scale
+    else:
+        attrs['scale'] = float(scale)
+
     helper.append_op(
-        type='scale',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={
-            'scale': float(scale),
-            'bias': float(bias),
-            'bias_after_scale': bias_after_scale
-        })
+        type='scale', inputs=inputs, outputs={'Out': out}, attrs=attrs)
     return helper.append_activation(out)
   ...
python/paddle/fluid/optimizer.py
@@ -1484,9 +1484,11 @@ class AdamOptimizer(Optimizer):
     Args:
         learning_rate (float|Variable, optional): The learning rate used to update ``Parameter``.
             It can be a float value or a ``Variable`` with a float type. The default value is 0.001.
-        beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
+        beta1 (float|Variable, optional): The exponential decay rate for the 1st moment estimates.
+            It should be a float number or a Variable with shape [1] and data type as float32.
             The default value is 0.9.
-        beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
+        beta2 (float|Variable, optional): The exponential decay rate for the 2nd moment estimates.
+            It should be a float number or a Variable with shape [1] and data type as float32.
             The default value is 0.999.
         epsilon (float, optional): A small float value for numerical stability.
             The default value is 1e-08.
   ...
@@ -1530,6 +1532,64 @@ class AdamOptimizer(Optimizer):
                 for data in train_reader():
                     exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)

+        .. code-block:: python
+
+            # Adam with beta1/beta2 as Variable
+            import paddle
+            import paddle.fluid as fluid
+            import paddle.fluid.layers.learning_rate_scheduler as lr_scheduler
+
+            place = fluid.CPUPlace()
+            main = fluid.Program()
+            with fluid.program_guard(main):
+                x = fluid.data(name='x', shape=[None, 13], dtype='float32')
+                y = fluid.data(name='y', shape=[None, 1], dtype='float32')
+                y_predict = fluid.layers.fc(input=x, size=1, act=None)
+                cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+                avg_cost = fluid.layers.mean(cost)
+
+                # define beta decay variable
+                def get_decayed_betas(beta1_init, beta2_init, decay_steps, decay_rate):
+                    global_step = lr_scheduler._decay_step_counter()
+
+                    beta1 = fluid.layers.create_global_var(
+                        shape=[1],
+                        value=float(beta1_init),
+                        dtype='float32',
+                        # set persistable for save checkpoints and resume
+                        persistable=True,
+                        name="beta1")
+                    beta2 = fluid.layers.create_global_var(
+                        shape=[1],
+                        value=float(beta2_init),
+                        dtype='float32',
+                        # set persistable for save checkpoints and resume
+                        persistable=True,
+                        name="beta2")
+                    div_res = global_step / decay_steps
+                    decayed_beta1 = beta1_init * (decay_rate**div_res)
+                    decayed_beta2 = beta2_init * (decay_rate**div_res)
+                    fluid.layers.assign(decayed_beta1, beta1)
+                    fluid.layers.assign(decayed_beta2, beta2)
+
+                    return beta1, beta2
+
+                beta1, beta2 = get_decayed_betas(0.9, 0.99, 1e5, 0.9)
+                adam_optimizer = fluid.optimizer.AdamOptimizer(
+                                                    learning_rate=0.01,
+                                                    beta1=beta1,
+                                                    beta2=beta2)
+                adam_optimizer.minimize(avg_cost)
+
+                fetch_list = [avg_cost]
+                train_reader = paddle.batch(
+                    paddle.dataset.uci_housing.train(), batch_size=1)
+                feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+                exe = fluid.Executor(place)
+                exe.run(fluid.default_startup_program())
+                for data in train_reader():
+                    exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
     """
     _moment1_acc_str = "moment1"
     _moment2_acc_str = "moment2"
   ...
@@ -1569,13 +1629,15 @@ class AdamOptimizer(Optimizer):
                 name=self._beta1_pow_acc_str,
                 param=p,
                 dtype='float32',
-                fill_value=self._beta1,
+                fill_value=0.9 if isinstance(self._beta1, Variable) \
+                        else self._beta1,
                 shape=[1])
             self._add_accumulator(
                 name=self._beta2_pow_acc_str,
                 param=p,
                 dtype='float32',
-                fill_value=self._beta2,
+                fill_value=0.999 if isinstance(self._beta2, Variable) \
+                        else self._beta2,
                 shape=[1])

     def _append_optimize_op(self, block, param_and_grad):
   ...
@@ -1591,29 +1653,40 @@ class AdamOptimizer(Optimizer):
                                               param_and_grad[0])

         # create the adam optimize op
+        inputs = {
+            "Param": param_and_grad[0],
+            "Grad": param_and_grad[1],
+            "LearningRate": self._create_param_lr(param_and_grad),
+            "Moment1": moment1,
+            "Moment2": moment2,
+            "Beta1Pow": beta1_pow_acc,
+            "Beta2Pow": beta2_pow_acc
+        }
+        outputs = {
+            "ParamOut": param_and_grad[0],
+            "Moment1Out": moment1,
+            "Moment2Out": moment2
+        }
+        attrs = {
+            "epsilon": self._epsilon,
+            "lazy_mode": self._lazy_mode,
+            "min_row_size_to_use_multithread": 1000
+        }
+
+        if isinstance(self._beta1, Variable):
+            inputs['Beta1Tensor'] = self._beta1
+        else:
+            attrs['beta1'] = self._beta1
+        if isinstance(self._beta2, Variable):
+            inputs['Beta2Tensor'] = self._beta2
+        else:
+            attrs['beta2'] = self._beta2
+
         adam_op = block.append_op(
             type=self.type,
-            inputs={
-                "Param": param_and_grad[0],
-                "Grad": param_and_grad[1],
-                "LearningRate": self._create_param_lr(param_and_grad),
-                "Moment1": moment1,
-                "Moment2": moment2,
-                "Beta1Pow": beta1_pow_acc,
-                "Beta2Pow": beta2_pow_acc
-            },
-            outputs={
-                "ParamOut": param_and_grad[0],
-                "Moment1Out": moment1,
-                "Moment2Out": moment2
-            },
-            attrs={
-                "beta1": self._beta1,
-                "beta2": self._beta2,
-                "epsilon": self._epsilon,
-                "lazy_mode": self._lazy_mode,
-                "min_row_size_to_use_multithread": 1000
-            },
+            inputs=inputs,
+            outputs=outputs,
+            attrs=attrs,
             stop_gradient=True)

         return adam_op
   ...
@@ -1632,18 +1705,30 @@
                                                   param)
             beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
                                                   param)
+            inputs = {"X": beta1_pow_acc}
+            attrs = {}
+            if isinstance(self._beta1, Variable):
+                inputs['ScaleTensor'] = self._beta1
+            else:
+                attrs['scale'] = self._beta1
             main_block.append_op(
                 type="scale",
-                inputs={"X": beta1_pow_acc},
+                inputs=inputs,
                 outputs={"Out": beta1_pow_acc},
-                attrs={"scale": self._beta1},
+                attrs=attrs,
                 stop_gradient=True)

+            inputs = {"X": beta2_pow_acc}
+            attrs = {}
+            if isinstance(self._beta2, Variable):
+                inputs['ScaleTensor'] = self._beta2
+            else:
+                attrs['scale'] = self._beta2
             main_block.append_op(
                 type="scale",
-                inputs={"X": beta2_pow_acc},
+                inputs=inputs,
                 outputs={"Out": beta2_pow_acc},
-                attrs={"scale": self._beta2},
+                attrs=attrs,
                 stop_gradient=True)
   ...
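The `_finish_update` change above keeps the existing behavior — after each step the beta-power accumulators are multiplied by beta via an appended `scale` op — but routes the multiplier through the `ScaleTensor` input when beta is a Variable. A hedged plain-Python equivalent of what those two appended ops compute (names are illustrative, not Paddle code):

```python
def update_beta_pow(beta1_pow_acc, beta2_pow_acc, beta1, beta2):
    # Each appended "scale" op computes Out = X * scale (bias defaults to 0);
    # the scale value comes from the float attr, or from the fed ScaleTensor
    # when beta1/beta2 are Variables.
    return beta1_pow_acc * beta1, beta2_pow_acc * beta2
```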
python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -19,6 +19,7 @@ import numpy as np
 from op_test import OpTest
 from paddle.fluid import core
 from paddle.fluid.op import Operator
+import paddle.fluid as fluid


 class TestAdamOp1(OpTest):
   ...
@@ -183,10 +184,17 @@ def adam_step(inputs, attributes):
     beta1_pow = inputs['Beta1Pow']
     beta2_pow = inputs['Beta2Pow']

-    beta1 = attributes['beta1']
-    beta2 = attributes['beta2']
     epsilon = attributes['epsilon']

+    if 'beta1' in attributes:
+        beta1 = attributes['beta1']
+    else:
+        beta1 = inputs['Beta1Tensor'][0]
+    if 'beta2' in attributes:
+        beta2 = attributes['beta2']
+    else:
+        beta2 = inputs['Beta2Tensor'][0]
+
     moment1_out = beta1 * moment1 + (1 - beta1) * grad
     moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad)
     lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
   ...
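The tail of `adam_step` is collapsed above. For reference, here is a hedged numpy sketch of the complete reference update the test reproduces; the first three lines mirror the diff, while the `param_out` line follows the standard Adam formula and is an assumption about the elided code rather than a verbatim copy.

```python
import numpy as np

def adam_reference_step(param, grad, moment1, moment2, lr,
                        beta1, beta2, beta1_pow, beta2_pow, epsilon):
    # Plain-numpy version of one Adam update, matching the lines shown above.
    moment1_out = beta1 * moment1 + (1 - beta1) * grad
    moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad)
    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
    param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon))
    return param_out, moment1_out, moment2_out
```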
@@ -330,5 +338,92 @@ class TestSparseAdamOp(unittest.TestCase):
             self.check_with_place(place, lazy_mode)


+class TestAdamOpBetaVariable(OpTest):
+    def setUp(self):
+        '''Test Adam Op with beta as Variable
+        '''
+        self.op_type = "adam"
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        # The second moment is positive
+        moment2 = np.random.random((102, 105)).astype("float32")
+        beta1 = 0.85
+        beta2 = 0.95
+
+        learning_rate = 0.001
+        epsilon = 1e-8
+        beta1_pow = beta1**10
+        beta2_pow = beta2**10
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment1': moment1,
+            'Moment2': moment2,
+            'LearningRate': np.array([learning_rate]).astype("float32"),
+            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
+            'Beta2Pow': np.array([beta2_pow]).astype("float32"),
+            "Beta1Tensor": np.array([beta1]).astype("float32"),
+            "Beta2Tensor": np.array([beta2]).astype("float32"),
+        }
+
+        attributes = {'epsilon': epsilon}
+
+        param_out, moment1_out, \
+            moment2_out = adam_step(self.inputs, attributes)
+
+        self.outputs = {
+            'Moment1Out': moment1_out,
+            'Moment2Out': moment2_out,
+            'ParamOut': param_out
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAdamOptimizerBetaVariable(unittest.TestCase):
+    def test_adam_optimizer(self):
+        def test_with_place(place, shape):
+            exe = fluid.Executor(place)
+
+            train_prog = fluid.Program()
+            startup = fluid.Program()
+            with fluid.program_guard(train_prog, startup):
+                with fluid.unique_name.guard():
+                    data = fluid.data(name="data", shape=shape)
+                    conv = fluid.layers.conv2d(data, 8, 3)
+                    loss = fluid.layers.reduce_mean(conv)
+
+                    beta1 = fluid.layers.create_global_var(
+                        shape=[1], value=0.85, dtype='float32', persistable=True)
+                    beta2 = fluid.layers.create_global_var(
+                        shape=[1], value=0.95, dtype='float32', persistable=True)
+                    opt = fluid.optimizer.Adam(
+                        learning_rate=1e-5, beta1=beta1, beta2=beta2)
+                    opt.minimize(loss)
+
+            exe.run(startup)
+            data_np = np.random.random(shape).astype('float32')
+            rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
+            assert rets[0] is not None
+
+        shape = [2, 3, 8, 8]
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for place in places:
+            test_with_place(place, shape)
+
+
 if __name__ == "__main__":
     unittest.main()
python/paddle/fluid/tests/unittests/test_layers.py
@@ -2425,6 +2425,20 @@ class TestBook(LayerTest):
             out = layers.slice(input, axes=axes, starts=starts, ends=ends)
             return out

+    def make_scale_variable(self):
+        with program_guard(fluid.default_main_program(),
+                           fluid.default_startup_program()):
+            input = self._get_data(
+                name="input", shape=[3, 4, 5, 6], dtype='float32')
+            scale_var = self._get_data(
+                name="scale",
+                shape=[1],
+                dtype='float32',
+                append_batch_size=False)
+            out = layers.scale(input, scale=scale_var)
+            return out
+
     def make_softshrink(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
   ...
python/paddle/fluid/tests/unittests/test_scale_op.py
@@ -42,6 +42,29 @@ class TestScaleOp(OpTest):
         self.check_grad(['X'], 'Out')


+class TestScaleOpScaleVariable(OpTest):
+    def setUp(self):
+        self.op_type = "scale"
+        self.dtype = np.float32
+        self.init_dtype_type()
+        self.scale = -2.3
+        self.inputs = {
+            'X': np.random.random((10, 10)).astype(self.dtype),
+            'ScaleTensor': np.array([self.scale]).astype('float32')
+        }
+        self.attrs = {}
+        self.outputs = {'Out': self.inputs['X'] * self.dtype(self.scale)}
+
+    def init_dtype_type(self):
+        pass
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
 class TestScaleOpSelectedRows(unittest.TestCase):
     def init_dtype_type(self):
         pass
   ...
python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -1440,7 +1440,10 @@ class DistributeTranspiler(object):
                                              param_name, endpoint)
                     break

         for key in opt_op.input_names:
-            if key in ["Param", "Grad", "LearningRate"]:
+            if key in [
+                    "Param", "Grad", "LearningRate", "Beta1Tensor",
+                    "Beta2Tensor"
+            ]:
                 continue
             origin_var = self.origin_program.global_block().vars[opt_op.input(
                 key)[0]]
   ...
@@ -2204,7 +2207,10 @@ class DistributeTranspiler(object):
         for key in opt_op.input_names:
             new_shape = None
-            if key in ["Param", "Grad", "LearningRate"]:
+            if key in [
+                    "Param", "Grad", "LearningRate", "Beta1Tensor",
+                    "Beta2Tensor"
+            ]:
                 continue
             var = self.origin_program.global_block().vars[opt_op.input(key)[0]]
             param_var = new_inputs["Param"]
   ...