未验证 提交 ebfb720a 编写于 作者: K Kaipeng Deng 提交者: GitHub

add Adam beta1/beta2 support Variable (#21234)

* add Adam beta1/beta2 support Variable. test=develop
上级 09696d5d
...@@ -20,27 +20,50 @@ namespace operators { ...@@ -20,27 +20,50 @@ namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
void AdamOp::InferShape(framework::InferShapeContext* ctx) const { void AdamOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("Param"), PADDLE_ENFORCE_EQ(
"Input(Param) of AdamOp should not be null."); ctx->HasInput("Param"), true,
PADDLE_ENFORCE(ctx->HasInput("Grad"), platform::errors::NotFound("Input(Param) of AdamOp should not be null."));
"Input(Grad) of AdamOp should not be null."); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE(ctx->HasInput("Moment1"), ctx->HasInput("Grad"), true,
"Input(Moment1) of AdamOp should not be null."); platform::errors::NotFound("Input(Grad) of AdamOp should not be null."));
PADDLE_ENFORCE(ctx->HasInput("Moment2"), PADDLE_ENFORCE_EQ(ctx->HasInput("Moment1"), true,
"Input(Moment2) of AdamOp should not be null."); platform::errors::NotFound(
PADDLE_ENFORCE(ctx->HasInput("LearningRate"), "Input(Moment1) of AdamOp should not be null."));
"Input(LearningRate) of AdamOp should not be null."); PADDLE_ENFORCE_EQ(ctx->HasInput("Moment2"), true,
PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), platform::errors::NotFound(
"Input(Beta1Pow) of AdamOp should not be null."); "Input(Moment2) of AdamOp should not be null."));
PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"), PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true,
"Input(Beta2Pow) of AdamOp should not be null."); platform::errors::NotFound(
"Input(LearningRate) of AdamOp should not be null."));
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), PADDLE_ENFORCE_EQ(ctx->HasInput("Beta1Pow"), true,
"Output(ParamOut) of AdamOp should not be null."); platform::errors::NotFound(
PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"), "Input(Beta1Pow) of AdamOp should not be null."));
"Output(Moment1Out) of AdamOp should not be null."); PADDLE_ENFORCE_EQ(ctx->HasInput("Beta2Pow"), true,
PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"), platform::errors::NotFound(
"Output(Moment2Out) of AdamOp should not be null."); "Input(Beta2Pow) of AdamOp should not be null."));
if (ctx->IsRuntime() && ctx->HasInput("Beta1Tensor")) {
auto beta1 = ctx->Inputs("Beta1Tensor");
PADDLE_ENFORCE_EQ(
beta1.size(), 1,
platform::errors::InvalidArgument("Input(Beta1Tensor) size must be 1"));
}
if (ctx->IsRuntime() && ctx->HasInput("Beta2Tensor")) {
auto beta2 = ctx->Inputs("Beta2Tensor");
PADDLE_ENFORCE_EQ(
beta2.size(), 1,
platform::errors::InvalidArgument("Input(Beta2Tensor) size must be 1"));
}
PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
platform::errors::NotFound(
"Output(ParamOut) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Moment1Out"), true,
platform::errors::NotFound(
"Output(Moment1Out) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Moment2Out"), true,
platform::errors::NotFound(
"Output(Moment2Out) of AdamOp should not be null."));
auto lr_dims = ctx->GetInputDim("LearningRate"); auto lr_dims = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_NE(framework::product(lr_dims), 0, PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
...@@ -93,6 +116,17 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -93,6 +116,17 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator"); AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator"); AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator");
AddInput("Beta1Tensor",
"(Tensor<float32>, optional) If provided, Adam will use this "
"as beta1, this has a higher priority than attr(beta1), the "
"shape of this tensor MUST BE [1].")
.AsDispensable();
AddInput("Beta2Tensor",
"(Tensor<float32>, optional) If provided, Adam will use this "
"as beta2, this has a higher priority than attr(beta2), the "
"shape of this tensor MUST BE [1].")
.AsDispensable();
AddOutput("ParamOut", "(Tensor) Output parameter"); AddOutput("ParamOut", "(Tensor) Output parameter");
AddOutput("Moment1Out", "(Tensor) Output first moment"); AddOutput("Moment1Out", "(Tensor) Output first moment");
AddOutput("Moment2Out", "(Tensor) Output second moment"); AddOutput("Moment2Out", "(Tensor) Output second moment");
......
...@@ -29,6 +29,16 @@ namespace operators { ...@@ -29,6 +29,16 @@ namespace operators {
namespace scatter = paddle::operators::math::scatter; namespace scatter = paddle::operators::math::scatter;
// Returns element 0 of `tensor`, read as float, as a host-side value.
//
// When the tensor is placed on a GPU it is first copied synchronously into a
// temporary CPU tensor; otherwise its buffer is read directly.
//
// NOTE(review): the scale op header in this same change defines an identical
// `static inline` helper in the same namespace; a translation unit including
// both headers would see two definitions -- confirm and consider sharing one.
static inline float GetAttrFromTensor(const framework::Tensor* tensor) {
  // Fetch the typed pointer up front, exactly as before, regardless of place.
  const float* source = tensor->data<float>();
  if (!platform::is_gpu_place(tensor->place())) {
    return source[0];
  }
  // Not a GPU-resident buffer on the fast path above; here we need a host copy.
  framework::Tensor host_copy;
  TensorCopySync(*tensor, platform::CPUPlace(), &host_copy);
  return host_copy.data<float>()[0];
}
class AdamOp : public framework::OperatorWithKernel { class AdamOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -367,8 +377,6 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -367,8 +377,6 @@ class AdamOpKernel : public framework::OpKernel<T> {
int64_t min_row_size_to_use_multithread = int64_t min_row_size_to_use_multithread =
ctx.Attr<int64_t>("min_row_size_to_use_multithread"); ctx.Attr<int64_t>("min_row_size_to_use_multithread");
bool lazy_mode = ctx.Attr<bool>("lazy_mode"); bool lazy_mode = ctx.Attr<bool>("lazy_mode");
T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon")); T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
auto& param = Ref(ctx.Input<LoDTensor>("Param"), "Must set Param"); auto& param = Ref(ctx.Input<LoDTensor>("Param"), "Must set Param");
// auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad"); // auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad");
...@@ -390,6 +398,17 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -390,6 +398,17 @@ class AdamOpKernel : public framework::OpKernel<T> {
auto& mom2_out = auto& mom2_out =
Ref(ctx.Output<LoDTensor>("Moment2Out"), "Must set Moment1Out"); Ref(ctx.Output<LoDTensor>("Moment2Out"), "Must set Moment1Out");
T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
if (ctx.HasInput("Beta1Tensor")) {
auto* beta1_tensor = ctx.Input<framework::Tensor>("Beta1Tensor");
beta1 = static_cast<T>(GetAttrFromTensor(beta1_tensor));
}
T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
if (ctx.HasInput("Beta2Tensor")) {
auto* beta2_tensor = ctx.Input<framework::Tensor>("Beta2Tensor");
beta2 = static_cast<T>(GetAttrFromTensor(beta2_tensor));
}
if (grad_var->IsType<framework::LoDTensor>()) { if (grad_var->IsType<framework::LoDTensor>()) {
auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad"); auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad");
......
...@@ -34,6 +34,14 @@ class ScaleOp : public framework::OperatorWithKernel { ...@@ -34,6 +34,14 @@ class ScaleOp : public framework::OperatorWithKernel {
"Input(X) of ScaleOp should not be null."); "Input(X) of ScaleOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of ScaleOp should not be null."); "Output(Out) of ScaleOp should not be null.");
if (ctx->IsRuntime() && ctx->HasInput("ScaleTensor")) {
auto scale = ctx->Inputs("ScaleTensor");
PADDLE_ENFORCE_EQ(scale.size(), 1,
platform::errors::InvalidArgument(
"Input(ScaleTensor) size must be 1"));
}
ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
} }
...@@ -43,6 +51,11 @@ class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -43,6 +51,11 @@ class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput("X", "(Tensor) Input tensor of scale operator."); AddInput("X", "(Tensor) Input tensor of scale operator.");
AddInput("ScaleTensor",
"(Tensor) If provided, use this as "
"scale factor, this has a higher priority than "
"attr(scale), the shape of this tensor MUST BE 1.")
.AsDispensable();
AddOutput("Out", "(Tensor) Output tensor of scale operator."); AddOutput("Out", "(Tensor) Output tensor of scale operator.");
AddComment(R"DOC( AddComment(R"DOC(
**Scale operator** **Scale operator**
...@@ -90,6 +103,9 @@ class ScaleGradMaker : public framework::SingleGradOpMaker<T> { ...@@ -90,6 +103,9 @@ class ScaleGradMaker : public framework::SingleGradOpMaker<T> {
auto *grad_op = new T(); auto *grad_op = new T();
grad_op->SetType("scale"); grad_op->SetType("scale");
grad_op->SetInput("X", this->OutputGrad("Out")); grad_op->SetInput("X", this->OutputGrad("Out"));
if (this->HasInput("ScaleTensor") > 0) {
grad_op->SetInput("ScaleTensor", this->Input("ScaleTensor"));
}
grad_op->SetOutput("Out", this->InputGrad("X")); grad_op->SetOutput("Out", this->InputGrad("X"));
grad_op->SetAttr("scale", this->GetAttr("scale")); grad_op->SetAttr("scale", this->GetAttr("scale"));
grad_op->SetAttr("bias", 0.0f); grad_op->SetAttr("bias", 0.0f);
......
...@@ -19,6 +19,17 @@ limitations under the License. */ ...@@ -19,6 +19,17 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
// Extracts the first float stored in `tensor` as a plain host scalar.
//
// A tensor living on a GPU place is copied (synchronously) to a scratch CPU
// tensor before the read; any other tensor is read in place.
//
// NOTE(review): the adam op header in this same change carries a
// byte-identical `static inline` helper in the same namespace; including both
// headers together would redefine it -- confirm and consider a shared utility.
static inline float GetAttrFromTensor(const framework::Tensor* tensor) {
  // The typed data pointer is requested first, matching the original order.
  const float* values = tensor->data<float>();
  const bool on_gpu = platform::is_gpu_place(tensor->place());
  if (on_gpu) {
    framework::Tensor staged;
    TensorCopySync(*tensor, platform::CPUPlace(), &staged);
    return staged.data<float>()[0];
  }
  return values[0];
}
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class ScaleKernel : public framework::OpKernel<T> { class ScaleKernel : public framework::OpKernel<T> {
public: public:
...@@ -26,10 +37,15 @@ class ScaleKernel : public framework::OpKernel<T> { ...@@ -26,10 +37,15 @@ class ScaleKernel : public framework::OpKernel<T> {
auto* in_var = ctx.InputVar("X"); auto* in_var = ctx.InputVar("X");
auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var); auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var);
auto scale = static_cast<T>(ctx.Attr<float>("scale"));
auto bias = static_cast<T>(ctx.Attr<float>("bias")); auto bias = static_cast<T>(ctx.Attr<float>("bias"));
auto bias_after_scale = ctx.Attr<bool>("bias_after_scale"); auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");
auto scale = static_cast<T>(ctx.Attr<float>("scale"));
if (ctx.HasInput("ScaleTensor")) {
auto* scale_tensor = ctx.Input<framework::Tensor>("ScaleTensor");
scale = GetAttrFromTensor(scale_tensor);
}
auto* out_var = ctx.OutputVar("Out"); auto* out_var = ctx.OutputVar("Out");
if (in_var->IsType<framework::SelectedRows>() && in_var != out_var) { if (in_var->IsType<framework::SelectedRows>() && in_var != out_var) {
auto& in_slr = in_var->Get<framework::SelectedRows>(); auto& in_slr = in_var->Get<framework::SelectedRows>();
......
...@@ -174,6 +174,8 @@ def generate_layer_fn(op_type): ...@@ -174,6 +174,8 @@ def generate_layer_fn(op_type):
if not isinstance(val, list) and not isinstance(val, tuple): if not isinstance(val, list) and not isinstance(val, tuple):
val = [val] val = [val]
if len(val) == 0: if len(val) == 0:
if len(args) == 0:
continue
val = [args[0]] val = [args[0]]
args = args[1:] args = args[1:]
......
...@@ -10153,7 +10153,7 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): ...@@ -10153,7 +10153,7 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
Args: Args:
x(Variable): Input N-D Tensor of scale operator. Data type can be float32, float64, int8, int16, int32, int64, uint8. x(Variable): Input N-D Tensor of scale operator. Data type can be float32, float64, int8, int16, int32, int64, uint8.
scale(float): The scale factor of the input. scale(float|Variable): The scale factor of the input, it should be a float number or a Variable with shape [1] and data type as float32.
bias(float): The bias to be put on the input. bias(float): The bias to be put on the input.
bias_after_scale(bool): Apply bias addition after or before scaling. It is useful for numeric stability in some circumstances. bias_after_scale(bool): Apply bias addition after or before scaling. It is useful for numeric stability in some circumstances.
act(str, optional): Activation applied to the output such as tanh, softmax, sigmoid, relu. act(str, optional): Activation applied to the output such as tanh, softmax, sigmoid, relu.
...@@ -10178,6 +10178,27 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): ...@@ -10178,6 +10178,27 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
res = exe.run(fluid.default_main_program(), feed={'x':img}, fetch_list=[output]) res = exe.run(fluid.default_main_program(), feed={'x':img}, fetch_list=[output])
print(res) # [array([[ 3., 5., 7.], [ 9., 11., 13.]], dtype=float32)] print(res) # [array([[ 3., 5., 7.], [ 9., 11., 13.]], dtype=float32)]
.. code-block:: python
# scale with parameter scale as Variable
import paddle.fluid as fluid
import numpy as np
inputs = fluid.layers.data(name="x", shape=[2, 3], dtype='float32')
scale = fluid.layers.data(name="scale", shape=[1], dtype='float32',
append_batch_size=False)
output = fluid.layers.scale(inputs, scale = scale, bias = 1.0)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
img = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32)
scale_np = np.array([2.]).astype(np.float32)
res = exe.run(fluid.default_main_program(), feed={'x':img, 'scale':scale_np}, fetch_list=[output])
print(res) # [array([[ 3., 5., 7.], [ 9., 11., 13.]], dtype=float32)]
""" """
helper = LayerHelper('scale', **locals()) helper = LayerHelper('scale', **locals())
...@@ -10187,15 +10208,18 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): ...@@ -10187,15 +10208,18 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
out = helper.create_variable( out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False) name=name, dtype=x.dtype, persistable=False)
inputs = {'X': x}
attrs = {
'bias': float(bias),
'bias_after_scale': bias_after_scale,
}
if isinstance(scale, Variable):
inputs['ScaleTensor'] = scale
else:
attrs['scale'] = float(scale)
helper.append_op( helper.append_op(
type='scale', type='scale', inputs=inputs, outputs={'Out': out}, attrs=attrs)
inputs={'X': x},
outputs={'Out': out},
attrs={
'scale': float(scale),
'bias': float(bias),
'bias_after_scale': bias_after_scale
})
return helper.append_activation(out) return helper.append_activation(out)
......
...@@ -1484,9 +1484,11 @@ class AdamOptimizer(Optimizer): ...@@ -1484,9 +1484,11 @@ class AdamOptimizer(Optimizer):
Args: Args:
learning_rate (float|Variable, optional): The learning rate used to update ``Parameter``. learning_rate (float|Variable, optional): The learning rate used to update ``Parameter``.
It can be a float value or a ``Variable`` with a float type. The default value is 0.001. It can be a float value or a ``Variable`` with a float type. The default value is 0.001.
beta1 (float, optional): The exponential decay rate for the 1st moment estimates. beta1 (float|Variable, optional): The exponential decay rate for the 1st moment estimates.
It should be a float number or a Variable with shape [1] and data type as float32.
The default value is 0.9. The default value is 0.9.
beta2 (float, optional): The exponential decay rate for the 2nd moment estimates. beta2 (float|Variable, optional): The exponential decay rate for the 2nd moment estimates.
It should be a float number or a Variable with shape [1] and data type as float32.
The default value is 0.999. The default value is 0.999.
epsilon (float, optional): A small float value for numerical stability. epsilon (float, optional): A small float value for numerical stability.
The default value is 1e-08. The default value is 1e-08.
...@@ -1530,6 +1532,64 @@ class AdamOptimizer(Optimizer): ...@@ -1530,6 +1532,64 @@ class AdamOptimizer(Optimizer):
for data in train_reader(): for data in train_reader():
exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
.. code-block:: python
# Adam with beta1/beta2 as Variable
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers.learning_rate_scheduler as lr_scheduler
place = fluid.CPUPlace()
main = fluid.Program()
with fluid.program_guard(main):
x = fluid.data(name='x', shape=[None, 13], dtype='float32')
y = fluid.data(name='y', shape=[None, 1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
# define beta decay variable
def get_decayed_betas(beta1_init, beta2_init, decay_steps, decay_rate):
global_step = lr_scheduler._decay_step_counter()
beta1 = fluid.layers.create_global_var(
shape=[1],
value=float(beta1_init),
dtype='float32',
# set persistable for save checkpoints and resume
persistable=True,
name="beta1")
beta2 = fluid.layers.create_global_var(
shape=[1],
value=float(beta2_init),
dtype='float32',
# set persistable for save checkpoints and resume
persistable=True,
name="beta2")
div_res = global_step / decay_steps
decayed_beta1 = beta1_init * (decay_rate**div_res)
decayed_beta2 = beta2_init * (decay_rate**div_res)
fluid.layers.assign(decayed_beta1, beta1)
fluid.layers.assign(decayed_beta2, beta2)
return beta1, beta2
beta1, beta2 = get_decayed_betas(0.9, 0.99, 1e5, 0.9)
adam_optimizer = fluid.optimizer.AdamOptimizer(
learning_rate=0.01,
beta1=beta1,
beta2=beta2)
adam_optimizer.minimize(avg_cost)
fetch_list = [avg_cost]
train_reader = paddle.batch(
paddle.dataset.uci_housing.train(), batch_size=1)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
for data in train_reader():
exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
""" """
_moment1_acc_str = "moment1" _moment1_acc_str = "moment1"
_moment2_acc_str = "moment2" _moment2_acc_str = "moment2"
...@@ -1569,13 +1629,15 @@ class AdamOptimizer(Optimizer): ...@@ -1569,13 +1629,15 @@ class AdamOptimizer(Optimizer):
name=self._beta1_pow_acc_str, name=self._beta1_pow_acc_str,
param=p, param=p,
dtype='float32', dtype='float32',
fill_value=self._beta1, fill_value=0.9 if isinstance(self._beta1, Variable) \
else self._beta1,
shape=[1]) shape=[1])
self._add_accumulator( self._add_accumulator(
name=self._beta2_pow_acc_str, name=self._beta2_pow_acc_str,
param=p, param=p,
dtype='float32', dtype='float32',
fill_value=self._beta2, fill_value=0.999 if isinstance(self._beta2, Variable) \
else self._beta2,
shape=[1]) shape=[1])
def _append_optimize_op(self, block, param_and_grad): def _append_optimize_op(self, block, param_and_grad):
...@@ -1591,29 +1653,40 @@ class AdamOptimizer(Optimizer): ...@@ -1591,29 +1653,40 @@ class AdamOptimizer(Optimizer):
param_and_grad[0]) param_and_grad[0])
# create the adam optimize op # create the adam optimize op
inputs = {
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"LearningRate": self._create_param_lr(param_and_grad),
"Moment1": moment1,
"Moment2": moment2,
"Beta1Pow": beta1_pow_acc,
"Beta2Pow": beta2_pow_acc
}
outputs = {
"ParamOut": param_and_grad[0],
"Moment1Out": moment1,
"Moment2Out": moment2
}
attrs = {
"epsilon": self._epsilon,
"lazy_mode": self._lazy_mode,
"min_row_size_to_use_multithread": 1000
}
if isinstance(self._beta1, Variable):
inputs['Beta1Tensor'] = self._beta1
else:
attrs['beta1'] = self._beta1
if isinstance(self._beta2, Variable):
inputs['Beta2Tensor'] = self._beta2
else:
attrs['beta2'] = self._beta2
adam_op = block.append_op( adam_op = block.append_op(
type=self.type, type=self.type,
inputs={ inputs=inputs,
"Param": param_and_grad[0], outputs=outputs,
"Grad": param_and_grad[1], attrs=attrs,
"LearningRate": self._create_param_lr(param_and_grad),
"Moment1": moment1,
"Moment2": moment2,
"Beta1Pow": beta1_pow_acc,
"Beta2Pow": beta2_pow_acc
},
outputs={
"ParamOut": param_and_grad[0],
"Moment1Out": moment1,
"Moment2Out": moment2
},
attrs={
"beta1": self._beta1,
"beta2": self._beta2,
"epsilon": self._epsilon,
"lazy_mode": self._lazy_mode,
"min_row_size_to_use_multithread": 1000
},
stop_gradient=True) stop_gradient=True)
return adam_op return adam_op
...@@ -1632,18 +1705,30 @@ class AdamOptimizer(Optimizer): ...@@ -1632,18 +1705,30 @@ class AdamOptimizer(Optimizer):
param) param)
beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
param) param)
inputs = {"X": beta1_pow_acc}
attrs = {}
if isinstance(self._beta1, Variable):
inputs['ScaleTensor'] = self._beta1
else:
attrs['scale'] = self._beta1
main_block.append_op( main_block.append_op(
type="scale", type="scale",
inputs={"X": beta1_pow_acc}, inputs=inputs,
outputs={"Out": beta1_pow_acc}, outputs={"Out": beta1_pow_acc},
attrs={"scale": self._beta1}, attrs=attrs,
stop_gradient=True) stop_gradient=True)
inputs = {"X": beta2_pow_acc}
attrs = {}
if isinstance(self._beta2, Variable):
inputs['ScaleTensor'] = self._beta2
else:
attrs['scale'] = self._beta2
main_block.append_op( main_block.append_op(
type="scale", type="scale",
inputs={"X": beta2_pow_acc}, inputs=inputs,
outputs={"Out": beta2_pow_acc}, outputs={"Out": beta2_pow_acc},
attrs={"scale": self._beta2}, attrs=attrs,
stop_gradient=True) stop_gradient=True)
......
...@@ -19,6 +19,7 @@ import numpy as np ...@@ -19,6 +19,7 @@ import numpy as np
from op_test import OpTest from op_test import OpTest
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.op import Operator from paddle.fluid.op import Operator
import paddle.fluid as fluid
class TestAdamOp1(OpTest): class TestAdamOp1(OpTest):
...@@ -183,10 +184,17 @@ def adam_step(inputs, attributes): ...@@ -183,10 +184,17 @@ def adam_step(inputs, attributes):
beta1_pow = inputs['Beta1Pow'] beta1_pow = inputs['Beta1Pow']
beta2_pow = inputs['Beta2Pow'] beta2_pow = inputs['Beta2Pow']
beta1 = attributes['beta1']
beta2 = attributes['beta2']
epsilon = attributes['epsilon'] epsilon = attributes['epsilon']
if 'beta1' in attributes:
beta1 = attributes['beta1']
else:
beta1 = inputs['Beta1Tensor'][0]
if 'beta2' in attributes:
beta2 = attributes['beta2']
else:
beta2 = inputs['Beta2Tensor'][0]
moment1_out = beta1 * moment1 + (1 - beta1) * grad moment1_out = beta1 * moment1 + (1 - beta1) * grad
moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad) moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad)
lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow) lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
...@@ -330,5 +338,92 @@ class TestSparseAdamOp(unittest.TestCase): ...@@ -330,5 +338,92 @@ class TestSparseAdamOp(unittest.TestCase):
self.check_with_place(place, lazy_mode) self.check_with_place(place, lazy_mode)
class TestAdamOpBetaVariable(OpTest):
    """Op-level check of ``adam`` when beta1/beta2 are given as tensor inputs.

    No ``beta1``/``beta2`` attributes are supplied, so the reference
    ``adam_step`` takes both values from ``Beta1Tensor``/``Beta2Tensor``.
    """

    def setUp(self):
        """Build random inputs and the expected outputs from ``adam_step``."""
        self.op_type = "adam"
        dims = (102, 105)

        def as_f32(value):
            # Wrap a Python scalar into a one-element float32 array.
            return np.array([value]).astype("float32")

        # Random draws are made in the order: param, grad, moment1, moment2.
        param = np.random.uniform(-1, 1, dims).astype("float32")
        grad = np.random.uniform(-1, 1, dims).astype("float32")
        moment1 = np.random.uniform(-1, 1, dims).astype("float32")
        # The second moment is positive, so sample it from [0, 1).
        moment2 = np.random.random(dims).astype("float32")

        beta1 = 0.85
        beta2 = 0.95
        learning_rate = 0.001
        epsilon = 1e-8

        # Power accumulators are seeded with beta ** 10.
        beta1_pow = beta1**10
        beta2_pow = beta2**10

        self.inputs = {
            'Param': param,
            'Grad': grad,
            'Moment1': moment1,
            'Moment2': moment2,
            'LearningRate': as_f32(learning_rate),
            'Beta1Pow': as_f32(beta1_pow),
            'Beta2Pow': as_f32(beta2_pow),
            "Beta1Tensor": as_f32(beta1),
            "Beta2Tensor": as_f32(beta2),
        }

        # Only epsilon is passed as an attribute; the betas come from inputs.
        expected_param, expected_m1, expected_m2 = adam_step(
            self.inputs, {'epsilon': epsilon})

        self.outputs = {
            'Moment1Out': expected_m1,
            'Moment2Out': expected_m2,
            'ParamOut': expected_param
        }

    def test_check_output(self):
        """Compare the operator's outputs against ``self.outputs``."""
        self.check_output()
class TestAdamOptimizerBetaVariable(unittest.TestCase):
    """Smoke test: the Adam optimizer accepts Variables for beta1/beta2."""

    def test_adam_optimizer(self):
        """Run one training step on each available place and fetch the loss."""

        def run_on(place, shape):
            # Build a tiny conv network, minimize it with Adam whose betas
            # are persistable global variables, then execute a single step.
            executor = fluid.Executor(place)
            main_program = fluid.Program()
            startup_program = fluid.Program()
            with fluid.program_guard(main_program, startup_program):
                with fluid.unique_name.guard():
                    data = fluid.data(name="data", shape=shape)
                    conv = fluid.layers.conv2d(data, 8, 3)
                    loss = fluid.layers.reduce_mean(conv)

                    # beta1 is created before beta2, as in the original.
                    beta1, beta2 = [
                        fluid.layers.create_global_var(
                            shape=[1],
                            value=initial,
                            dtype='float32',
                            persistable=True) for initial in (0.85, 0.95)
                    ]
                    optimizer = fluid.optimizer.Adam(
                        learning_rate=1e-5, beta1=beta1, beta2=beta2)
                    optimizer.minimize(loss)

            executor.run(startup_program)
            feed_np = np.random.random(shape).astype('float32')
            fetched = executor.run(main_program,
                                   feed={"data": feed_np},
                                   fetch_list=[loss])
            assert fetched[0] is not None

        input_shape = [2, 3, 8, 8]
        targets = [fluid.CPUPlace()]
        # Also exercise the GPU path when this build has CUDA support.
        if core.is_compiled_with_cuda():
            targets.append(fluid.CUDAPlace(0))
        for target in targets:
            run_on(target, input_shape)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -2428,6 +2428,20 @@ class TestBook(LayerTest): ...@@ -2428,6 +2428,20 @@ class TestBook(LayerTest):
out = layers.slice(input, axes=axes, starts=starts, ends=ends) out = layers.slice(input, axes=axes, starts=starts, ends=ends)
return out return out
def make_scale_variable(self):
    """Build ``layers.scale`` with the scale factor given as a Variable.

    Returns the output Variable of the scale layer, created inside the
    default main/startup programs.
    """
    main_prog = fluid.default_main_program()
    startup_prog = fluid.default_startup_program()
    with program_guard(main_prog, startup_prog):
        x = self._get_data(
            name="input", shape=[3, 4, 5, 6], dtype='float32')
        # The scale factor is a shape-[1] input with no batch dim prepended.
        scale_factor = self._get_data(
            name="scale",
            shape=[1],
            dtype='float32',
            append_batch_size=False)
        return layers.scale(x, scale=scale_factor)
def make_softshrink(self): def make_softshrink(self):
with program_guard(fluid.default_main_program(), with program_guard(fluid.default_main_program(),
fluid.default_startup_program()): fluid.default_startup_program()):
......
...@@ -42,6 +42,29 @@ class TestScaleOp(OpTest): ...@@ -42,6 +42,29 @@ class TestScaleOp(OpTest):
self.check_grad(['X'], 'Out') self.check_grad(['X'], 'Out')
class TestScaleOpScaleVariable(OpTest):
    """Op-level check of ``scale`` with the factor passed via ``ScaleTensor``.

    No ``scale`` attribute is set; the expected output is ``X`` multiplied by
    the scalar stored in the ``ScaleTensor`` input.
    """

    def init_dtype_type(self):
        """Hook called from ``setUp`` after ``self.dtype`` is set; no-op here."""

    def setUp(self):
        """Prepare inputs, empty attrs and the expected output."""
        self.op_type = "scale"
        self.dtype = np.float32
        self.init_dtype_type()
        self.scale = -2.3

        x = np.random.random((10, 10)).astype(self.dtype)
        # The scale tensor is always float32, independent of self.dtype.
        scale_tensor = np.array([self.scale]).astype('float32')

        self.inputs = {'X': x, 'ScaleTensor': scale_tensor}
        self.attrs = {}
        self.outputs = {'Out': x * self.dtype(self.scale)}

    def test_check_output(self):
        """Compare the operator's output against ``self.outputs``."""
        self.check_output()

    def test_check_grad(self):
        """Check the gradient of ``Out`` with respect to ``X``."""
        self.check_grad(['X'], 'Out')
class TestScaleOpSelectedRows(unittest.TestCase): class TestScaleOpSelectedRows(unittest.TestCase):
def init_dtype_type(self): def init_dtype_type(self):
pass pass
......
...@@ -1440,7 +1440,10 @@ class DistributeTranspiler(object): ...@@ -1440,7 +1440,10 @@ class DistributeTranspiler(object):
param_name, endpoint) param_name, endpoint)
break break
for key in opt_op.input_names: for key in opt_op.input_names:
if key in ["Param", "Grad", "LearningRate"]: if key in [
"Param", "Grad", "LearningRate", "Beta1Tensor",
"Beta2Tensor"
]:
continue continue
origin_var = self.origin_program.global_block().vars[ origin_var = self.origin_program.global_block().vars[
opt_op.input(key)[0]] opt_op.input(key)[0]]
...@@ -2204,7 +2207,10 @@ class DistributeTranspiler(object): ...@@ -2204,7 +2207,10 @@ class DistributeTranspiler(object):
for key in opt_op.input_names: for key in opt_op.input_names:
new_shape = None new_shape = None
if key in ["Param", "Grad", "LearningRate"]: if key in [
"Param", "Grad", "LearningRate", "Beta1Tensor",
"Beta2Tensor"
]:
continue continue
var = self.origin_program.global_block().vars[opt_op.input(key)[0]] var = self.origin_program.global_block().vars[opt_op.input(key)[0]]
param_var = new_inputs["Param"] param_var = new_inputs["Param"]
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册