[cherry-pick] add Adam beta1/beta2 support Variable (#21433)

* add Adam beta1/beta2 support Variable. test=develop

[cherry-pick] add Adam beta1/beta2 support Variable (#21433)
* add Adam beta1/beta2 support Variable. test=develop
735a2db0 · Kaipeng Deng · GitHub · 2660107c · 735a2db0 · 735a2db0
11 changed files
--- a/paddle/fluid/operators/optimizers/adam_op.cc
+++ b/paddle/fluid/operators/optimizers/adam_op.cc
@@ -20,27 +20,50 @@ namespace operators {
 using Tensor = framework::Tensor;
 void AdamOp::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("Param"),
+  PADDLE_ENFORCE_EQ(
-                 "Input(Param) of AdamOp should not be null.");
+      ctx->HasInput("Param"), true,
-  PADDLE_ENFORCE(ctx->HasInput("Grad"),
+      platform::errors::NotFound("Input(Param) of AdamOp should not be null."));
-                 "Input(Grad) of AdamOp should not be null.");
+  PADDLE_ENFORCE_EQ(
-  PADDLE_ENFORCE(ctx->HasInput("Moment1"),
+      ctx->HasInput("Grad"), true,
-                 "Input(Moment1) of AdamOp should not be null.");
+      platform::errors::NotFound("Input(Grad) of AdamOp should not be null."));
-  PADDLE_ENFORCE(ctx->HasInput("Moment2"),
+  PADDLE_ENFORCE_EQ(ctx->HasInput("Moment1"), true,
-                 "Input(Moment2) of AdamOp should not be null.");
+                    platform::errors::NotFound(
-  PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                        "Input(Moment1) of AdamOp should not be null."));
-                 "Input(LearningRate) of AdamOp should not be null.");
+  PADDLE_ENFORCE_EQ(ctx->HasInput("Moment2"), true,
-  PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
+                    platform::errors::NotFound(
-                 "Input(Beta1Pow) of AdamOp should not be null.");
+                        "Input(Moment2) of AdamOp should not be null."));
-  PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"),
+  PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true,
-                 "Input(Beta2Pow) of AdamOp should not be null.");
+                    platform::errors::NotFound(
+                        "Input(LearningRate) of AdamOp should not be null."));
-  PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+  PADDLE_ENFORCE_EQ(ctx->HasInput("Beta1Pow"), true,
-                 "Output(ParamOut) of AdamOp should not be null.");
+                    platform::errors::NotFound(
-  PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"),
+                        "Input(Beta1Pow) of AdamOp should not be null."));
-                 "Output(Moment1Out) of AdamOp should not be null.");
+  PADDLE_ENFORCE_EQ(ctx->HasInput("Beta2Pow"), true,
-  PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"),
+                    platform::errors::NotFound(
-                 "Output(Moment2Out) of AdamOp should not be null.");
+                        "Input(Beta2Pow) of AdamOp should not be null."));
+  if (ctx->IsRuntime() && ctx->HasInput("Beta1Tensor")) {
+    auto beta1 = ctx->Inputs("Beta1Tensor");
+    PADDLE_ENFORCE_EQ(
+        beta1.size(), 1,
+        platform::errors::InvalidArgument("Input(Beta1Tensor) size must be 1"));
+  }
+  if (ctx->IsRuntime() && ctx->HasInput("Beta2Tensor")) {
+    auto beta2 = ctx->Inputs("Beta2Tensor");
+    PADDLE_ENFORCE_EQ(
+        beta2.size(), 1,
+        platform::errors::InvalidArgument("Input(Beta2Tensor) size must be 1"));
+  }
+  PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
+                    platform::errors::NotFound(
+                        "Output(ParamOut) of AdamOp should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasOutput("Moment1Out"), true,
+                    platform::errors::NotFound(
+                        "Output(Moment1Out) of AdamOp should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasOutput("Moment2Out"), true,
+                    platform::errors::NotFound(
+                        "Output(Moment2Out) of AdamOp should not be null."));
  auto lr_dims = ctx->GetInputDim("LearningRate");
  PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
@@ -93,6 +116,17 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
    AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator");
+    AddInput("Beta1Tensor",
+             "(Tensor<float32>, optional) If provided, Adam will use this "
+             "as beta1, this has a higher priority than attr(beta1), the "
+             "shape of this tensor MUST BE [1].")
+        .AsDispensable();
+    AddInput("Beta2Tensor",
+             "(Tensor<float32>, optional) If provided, Adam will use this "
+             "as beta2, this has a higher priority than attr(beta2), the "
+             "shape of this tensor MUST BE [1].")
+        .AsDispensable();
    AddOutput("ParamOut", "(Tensor) Output parameter");
    AddOutput("Moment1Out", "(Tensor) Output first moment");
    AddOutput("Moment2Out", "(Tensor) Output second moment");

--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -29,6 +29,16 @@ namespace operators {
 namespace scatter = paddle::operators::math::scatter;
+static inline float GetAttrFromTensor(const framework::Tensor* tensor) {  // Returns element 0 of a float tensor as a plain float.
+  const float* tensor_data = tensor->data<float>();  // Pointer into the tensor's own storage; replaced below when the tensor is on a GPU place.
+  framework::Tensor cpu_tensor;  // Declared outside the `if` so the copied data stays alive until the return.
+  if (platform::is_gpu_place(tensor->place())) {
+    TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);  // Copy into a CPUPlace tensor before dereferencing.
+    tensor_data = cpu_tensor.data<float>();
+  }
+  return tensor_data[0];  // Only the first element is read. NOTE(review): the element count is not checked here.
+}  // NOTE(review): this commit adds an identical static inline helper to scale_op.h, also in namespace operators; a translation unit including both headers would see a redefinition -- confirm.
 class AdamOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
@@ -367,8 +377,6 @@ class AdamOpKernel : public framework::OpKernel<T> {
    int64_t min_row_size_to_use_multithread =
        ctx.Attr<int64_t>("min_row_size_to_use_multithread");
    bool lazy_mode = ctx.Attr<bool>("lazy_mode");
-    T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
-    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
    auto& param = Ref(ctx.Input<LoDTensor>("Param"), "Must set Param");
    // auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad");
@@ -390,6 +398,17 @@ class AdamOpKernel : public framework::OpKernel<T> {
    auto& mom2_out =
        Ref(ctx.Output<LoDTensor>("Moment2Out"), "Must set Moment1Out");
+    T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
+    if (ctx.HasInput("Beta1Tensor")) {
+      auto* beta1_tensor = ctx.Input<framework::Tensor>("Beta1Tensor");
+      beta1 = static_cast<T>(GetAttrFromTensor(beta1_tensor));
+    }
+    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
+    if (ctx.HasInput("Beta2Tensor")) {
+      auto* beta2_tensor = ctx.Input<framework::Tensor>("Beta2Tensor");
+      beta2 = static_cast<T>(GetAttrFromTensor(beta2_tensor));
+    }
    if (grad_var->IsType<framework::LoDTensor>()) {
      auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad");

--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -34,6 +34,14 @@ class ScaleOp : public framework::OperatorWithKernel {
                   "Input(X) of ScaleOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of ScaleOp should not be null.");
+    if (ctx->IsRuntime() && ctx->HasInput("ScaleTensor")) {
+      auto scale = ctx->Inputs("ScaleTensor");
+      PADDLE_ENFORCE_EQ(scale.size(), 1,
+                        platform::errors::InvalidArgument(
+                            "Input(ScaleTensor) size must be 1"));
+    }
    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
    ctx->ShareLoD("X", /*->*/ "Out");
  }
@@ -43,6 +51,11 @@ class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X", "(Tensor) Input tensor of scale operator.");
+    AddInput("ScaleTensor",
+             "(Tensor) If provided, use this as "
+             "scale factor, this has a higher priority than "
+             "attr(scale), the shape of this tensor MUST BE 1.")
+        .AsDispensable();
    AddOutput("Out", "(Tensor) Output tensor of scale operator.");
    AddComment(R"DOC(
 **Scale operator**
@@ -89,6 +102,9 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker {
    auto *grad_op = new framework::OpDesc();
    grad_op->SetType("scale");
    grad_op->SetInput("X", OutputGrad("Out"));
+    if (ForwardOp().Inputs().count("ScaleTensor") > 0) {
+      grad_op->SetInput("ScaleTensor", Input("ScaleTensor"));
+    }
    grad_op->SetOutput("Out", InputGrad("X"));
    grad_op->SetAttr("scale", GetAttr("scale"));
    grad_op->SetAttr("bias", 0.0f);
@@ -97,14 +113,14 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker {
  }
 };
-using ScaleOpInplace = framework::SingleOpInplaceInToOut;
+DECLARE_INPLACE_OP_INFERER(ScaleOpInplaceInferer, {"X", "Out"});
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker,
-                  ops::ScaleOpVarTypeInference, ops::ScaleOpInplace);
+                  ops::ScaleOpVarTypeInference, ops::ScaleOpInplaceInferer);
 REGISTER_OP_CPU_KERNEL(
    scale, ops::ScaleKernel<paddle::platform::CPUDeviceContext, float>,
    ops::ScaleKernel<paddle::platform::CPUDeviceContext, double>,

--- a/paddle/fluid/operators/scale_op.h
+++ b/paddle/fluid/operators/scale_op.h
@@ -19,6 +19,17 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
+static inline float GetAttrFromTensor(const framework::Tensor* tensor) {  // Returns element 0 of a float tensor as a plain float.
+  const float* tensor_data = tensor->data<float>();  // Pointer into the tensor's own storage; replaced below when the tensor is on a GPU place.
+  framework::Tensor cpu_tensor;  // Declared outside the `if` so the copied data stays alive until the return.
+  if (platform::is_gpu_place(tensor->place())) {
+    TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);  // Copy into a CPUPlace tensor before dereferencing.
+    tensor_data = cpu_tensor.data<float>();
+  }
+  return tensor_data[0];  // Only the first element is read. NOTE(review): the element count is not checked here.
+}  // NOTE(review): this commit adds an identical static inline helper to adam_op.h, also in namespace operators; a translation unit including both headers would see a redefinition -- confirm.
 template <typename DeviceContext, typename T>
 class ScaleKernel : public framework::OpKernel<T> {
 public:
@@ -26,10 +37,15 @@ class ScaleKernel : public framework::OpKernel<T> {
    auto* in_var = ctx.InputVar("X");
    auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var);
-    auto scale = static_cast<T>(ctx.Attr<float>("scale"));
    auto bias = static_cast<T>(ctx.Attr<float>("bias"));
    auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");
+    auto scale = static_cast<T>(ctx.Attr<float>("scale"));
+    if (ctx.HasInput("ScaleTensor")) {
+      auto* scale_tensor = ctx.Input<framework::Tensor>("ScaleTensor");
+      scale = GetAttrFromTensor(scale_tensor);
+    }
    auto* out_var = ctx.OutputVar("Out");
    if (in_var->IsType<framework::SelectedRows>() && in_var != out_var) {
      auto& in_slr = in_var->Get<framework::SelectedRows>();

--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -174,6 +174,8 @@ def generate_layer_fn(op_type):
            if not isinstance(val, list) and not isinstance(val, tuple):
                val = [val]
            if len(val) == 0:
+                if len(args) == 0:
+                    continue
                val = [args[0]]
                args = args[1:]

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -14074,7 +14074,7 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
    Args:
        x(Variable): Input N-D Tensor of scale operator. Data type can be float32, float64, int8, int16, int32, int64, uint8.
-        scale(float): The scale factor of the input.
+        scale(float|Variable): The scale factor of the input, it should be a float number or a Variable with shape [1] and data type as float32.
        bias(float): The bias to be put on the input.
        bias_after_scale(bool): Apply bias addition after or before scaling. It is useful for numeric stability in some circumstances.
        act(str, optional): Activation applied to the output such as tanh, softmax, sigmoid, relu.
@@ -14099,6 +14099,27 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
            res = exe.run(fluid.default_main_program(), feed={'x':img}, fetch_list=[output])
            print(res) # [array([[ 3.,  5.,  7.], [ 9., 11., 13.]], dtype=float32)]
+        .. code-block:: python
+            # scale with parameter scale as Variable
+            import paddle.fluid as fluid
+            import numpy as np
+            inputs = fluid.layers.data(name="x", shape=[2, 3], dtype='float32')
+            scale = fluid.layers.data(name="scale", shape=[1], dtype='float32',
+                                      append_batch_size=False)
+            output = fluid.layers.scale(inputs, scale = scale, bias = 1.0)
+            exe = fluid.Executor(fluid.CPUPlace())
+            exe.run(fluid.default_startup_program())
+            img = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32)
+            scale_np = np.array([2.]).astype(np.float32)
+            res = exe.run(fluid.default_main_program(), feed={'x':img, 'scale':scale_np}, fetch_list=[output])
+            print(res) # [array([[ 3.,  5.,  7.], [ 9., 11., 13.]], dtype=float32)]
    """
    helper = LayerHelper('scale', **locals())
@@ -14108,15 +14129,18 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
        out = helper.create_variable(
            name=name, dtype=x.dtype, persistable=False)
+    inputs = {'X': x}
+    attrs = {
+        'bias': float(bias),
+        'bias_after_scale': bias_after_scale,
+    }
+    if isinstance(scale, Variable):
+        inputs['ScaleTensor'] = scale
+    else:
+        attrs['scale'] = float(scale)
    helper.append_op(
-        type='scale',
+        type='scale', inputs=inputs, outputs={'Out': out}, attrs=attrs)
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={
-            'scale': float(scale),
-            'bias': float(bias),
-            'bias_after_scale': bias_after_scale
-        })
    return helper.append_activation(out)

--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -1484,9 +1484,11 @@ class AdamOptimizer(Optimizer):
    Args:
        learning_rate (float|Variable, optional): The learning rate used to update ``Parameter``.
            It can be a float value or a ``Variable`` with a float type. The default value is 0.001.
-        beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
+        beta1 (float|Variable, optional): The exponential decay rate for the 1st moment estimates.
+            It should be a float number or a Variable with shape [1] and data type as float32.
            The default value is 0.9.
-        beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
+        beta2 (float|Variable, optional): The exponential decay rate for the 2nd moment estimates.
+            It should be a float number or a Variable with shape [1] and data type as float32.
            The default value is 0.999.
        epsilon (float, optional): A small float value for numerical stability.
            The default value is 1e-08.
@@ -1530,6 +1532,64 @@ class AdamOptimizer(Optimizer):
                for data in train_reader():
                    exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
+        .. code-block:: python
+            # Adam with beta1/beta2 as Variable
+            import paddle
+            import paddle.fluid as fluid
+            import paddle.fluid.layers.learning_rate_scheduler as lr_scheduler
+            place = fluid.CPUPlace()
+            main = fluid.Program()
+            with fluid.program_guard(main):
+                x = fluid.data(name='x', shape=[None, 13], dtype='float32')
+                y = fluid.data(name='y', shape=[None, 1], dtype='float32')
+                y_predict = fluid.layers.fc(input=x, size=1, act=None)
+                cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+                avg_cost = fluid.layers.mean(cost)
+                # define beta decay variable
+                def get_decayed_betas(beta1_init, beta2_init, decay_steps, decay_rate):
+                    global_step = lr_scheduler._decay_step_counter()
+                    beta1 = fluid.layers.create_global_var(
+                        shape=[1],
+                        value=float(beta1_init),
+                        dtype='float32',
+                        # set persistable for save checkpoints and resume
+                        persistable=True,
+                        name="beta1")
+                    beta2 = fluid.layers.create_global_var(
+                        shape=[1],
+                        value=float(beta2_init),
+                        dtype='float32',
+                        # set persistable for save checkpoints and resume
+                        persistable=True,
+                        name="beta2")
+                    div_res = global_step / decay_steps
+                    decayed_beta1 = beta1_init * (decay_rate**div_res)
+                    decayed_beta2 = beta2_init * (decay_rate**div_res)
+                    fluid.layers.assign(decayed_beta1, beta1)
+                    fluid.layers.assign(decayed_beta2, beta2)
+                    return beta1, beta2
+                beta1, beta2 = get_decayed_betas(0.9, 0.99, 1e5, 0.9)
+                adam_optimizer = fluid.optimizer.AdamOptimizer(
+                                                    learning_rate=0.01,
+                                                    beta1=beta1,
+                                                    beta2=beta2)
+                adam_optimizer.minimize(avg_cost)
+                fetch_list = [avg_cost]
+                train_reader = paddle.batch(
+                    paddle.dataset.uci_housing.train(), batch_size=1)
+                feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+                exe = fluid.Executor(place)
+                exe.run(fluid.default_startup_program())
+                for data in train_reader():
+                    exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
    """
    _moment1_acc_str = "moment1"
    _moment2_acc_str = "moment2"
@@ -1569,13 +1629,15 @@ class AdamOptimizer(Optimizer):
                name=self._beta1_pow_acc_str,
                param=p,
                dtype='float32',
-                fill_value=self._beta1,
+                fill_value=0.9 if isinstance(self._beta1, Variable) \
+                        else self._beta1,
                shape=[1])
            self._add_accumulator(
                name=self._beta2_pow_acc_str,
                param=p,
                dtype='float32',
-                fill_value=self._beta2,
+                fill_value=0.999 if isinstance(self._beta2, Variable) \
+                        else self._beta2,
                shape=[1])
    def _append_optimize_op(self, block, param_and_grad):
@@ -1591,29 +1653,40 @@ class AdamOptimizer(Optimizer):
                                              param_and_grad[0])
        # create the adam optimize op
+        inputs = {
+            "Param": param_and_grad[0],
+            "Grad": param_and_grad[1],
+            "LearningRate": self._create_param_lr(param_and_grad),
+            "Moment1": moment1,
+            "Moment2": moment2,
+            "Beta1Pow": beta1_pow_acc,
+            "Beta2Pow": beta2_pow_acc
+        }
+        outputs = {
+            "ParamOut": param_and_grad[0],
+            "Moment1Out": moment1,
+            "Moment2Out": moment2
+        }
+        attrs = {
+            "epsilon": self._epsilon,
+            "lazy_mode": self._lazy_mode,
+            "min_row_size_to_use_multithread": 1000
+        }
+        if isinstance(self._beta1, Variable):
+            inputs['Beta1Tensor'] = self._beta1
+        else:
+            attrs['beta1'] = self._beta1
+        if isinstance(self._beta2, Variable):
+            inputs['Beta2Tensor'] = self._beta2
+        else:
+            attrs['beta2'] = self._beta2
        adam_op = block.append_op(
            type=self.type,
-            inputs={
+            inputs=inputs,
-                "Param": param_and_grad[0],
+            outputs=outputs,
-                "Grad": param_and_grad[1],
+            attrs=attrs,
-                "LearningRate": self._create_param_lr(param_and_grad),
-                "Moment1": moment1,
-                "Moment2": moment2,
-                "Beta1Pow": beta1_pow_acc,
-                "Beta2Pow": beta2_pow_acc
-            },
-            outputs={
-                "ParamOut": param_and_grad[0],
-                "Moment1Out": moment1,
-                "Moment2Out": moment2
-            },
-            attrs={
-                "beta1": self._beta1,
-                "beta2": self._beta2,
-                "epsilon": self._epsilon,
-                "lazy_mode": self._lazy_mode,
-                "min_row_size_to_use_multithread": 1000
-            },
            stop_gradient=True)
        return adam_op
@@ -1632,18 +1705,30 @@ class AdamOptimizer(Optimizer):
                                                      param)
                beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
                                                      param)
+                inputs = {"X": beta1_pow_acc}
+                attrs = {}
+                if isinstance(self._beta1, Variable):
+                    inputs['ScaleTensor'] = self._beta1
+                else:
+                    attrs['scale'] = self._beta1
                main_block.append_op(
                    type="scale",
-                    inputs={"X": beta1_pow_acc},
+                    inputs=inputs,
                    outputs={"Out": beta1_pow_acc},
-                    attrs={"scale": self._beta1},
+                    attrs=attrs,
                    stop_gradient=True)
+                inputs = {"X": beta2_pow_acc}
+                attrs = {}
+                if isinstance(self._beta2, Variable):
+                    inputs['ScaleTensor'] = self._beta2
+                else:
+                    attrs['scale'] = self._beta2
                main_block.append_op(
                    type="scale",
-                    inputs={"X": beta2_pow_acc},
+                    inputs=inputs,
                    outputs={"Out": beta2_pow_acc},
-                    attrs={"scale": self._beta2},
+                    attrs=attrs,
                    stop_gradient=True)

--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -19,6 +19,7 @@ import numpy as np
 from op_test import OpTest
 from paddle.fluid import core
 from paddle.fluid.op import Operator
+import paddle.fluid as fluid
 class TestAdamOp1(OpTest):
@@ -183,10 +184,17 @@ def adam_step(inputs, attributes):
    beta1_pow = inputs['Beta1Pow']
    beta2_pow = inputs['Beta2Pow']
-    beta1 = attributes['beta1']
-    beta2 = attributes['beta2']
    epsilon = attributes['epsilon']
+    if 'beta1' in attributes:
+        beta1 = attributes['beta1']
+    else:
+        beta1 = inputs['Beta1Tensor'][0]
+    if 'beta2' in attributes:
+        beta2 = attributes['beta2']
+    else:
+        beta2 = inputs['Beta2Tensor'][0]
    moment1_out = beta1 * moment1 + (1 - beta1) * grad
    moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad)
    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
@@ -330,5 +338,92 @@ class TestSparseAdamOp(unittest.TestCase):
                self.check_with_place(place, lazy_mode)
+class TestAdamOpBetaVariable(OpTest):  # Op-level test: "adam" with beta1/beta2 supplied as Beta1Tensor/Beta2Tensor inputs.
+    def setUp(self):
+        '''Test Adam Op with beta as Variable
+        '''
+        self.op_type = "adam"
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        # The second moment is positive
+        moment2 = np.random.random((102, 105)).astype("float32")
+        beta1 = 0.85
+        beta2 = 0.95
+        learning_rate = 0.001
+        epsilon = 1e-8
+        beta1_pow = beta1**10  # Beta1Pow input is beta1 raised to the 10th power.
+        beta2_pow = beta2**10  # Beta2Pow input is beta2 raised to the 10th power.
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment1': moment1,
+            'Moment2': moment2,
+            'LearningRate': np.array([learning_rate]).astype("float32"),
+            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
+            'Beta2Pow': np.array([beta2_pow]).astype("float32"),
+            "Beta1Tensor": np.array([beta1]).astype("float32"),  # shape [1], float32
+            "Beta2Tensor": np.array([beta2]).astype("float32"),  # shape [1], float32
+        }
+        attributes = {'epsilon': epsilon}  # No 'beta1'/'beta2' keys, so adam_step falls back to inputs['Beta1Tensor'][0] / inputs['Beta2Tensor'][0]. NOTE(review): this dict is local; self.attrs is not set in this method.
+        param_out, moment1_out, \
+            moment2_out = adam_step(self.inputs, attributes)  # Reference result computed by the numpy helper.
+        self.outputs = {
+            'Moment1Out': moment1_out,
+            'Moment2Out': moment2_out,
+            'ParamOut': param_out
+        }
+    def test_check_output(self):
+        self.check_output()  # Only the forward output is checked; no gradient check is defined for this class.
+class TestAdamOptimizerBetaVariable(unittest.TestCase):  # Program-level smoke test: fluid.optimizer.Adam with beta1/beta2 passed as global variables.
+    def test_adam_optimizer(self):
+        def test_with_place(place, shape):  # Builds and runs one training step on the given place.
+            exe = fluid.Executor(place)
+            train_prog = fluid.Program()
+            startup = fluid.Program()
+            with fluid.program_guard(train_prog, startup):
+                with fluid.unique_name.guard():
+                    data = fluid.data(name="data", shape=shape)
+                    conv = fluid.layers.conv2d(data, 8, 3)
+                    loss = fluid.layers.reduce_mean(conv)
+                    beta1 = fluid.layers.create_global_var(  # shape-[1] float32 variable initialised to 0.85
+                        shape=[1],
+                        value=0.85,
+                        dtype='float32',
+                        persistable=True)
+                    beta2 = fluid.layers.create_global_var(  # shape-[1] float32 variable initialised to 0.95
+                        shape=[1],
+                        value=0.95,
+                        dtype='float32',
+                        persistable=True)
+                    opt = fluid.optimizer.Adam(
+                        learning_rate=1e-5, beta1=beta1, beta2=beta2)  # betas are the variables created above, not floats
+                    opt.minimize(loss)
+            exe.run(startup)
+            data_np = np.random.random(shape).astype('float32')
+            rets = exe.run(train_prog,
+                           feed={"data": data_np},
+                           fetch_list=[loss])
+            assert rets[0] is not None  # Only checks that a loss value was fetched; the value itself is not verified.
+        shape = [2, 3, 8, 8]
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))  # Also run on GPU 0 when the build has CUDA.
+        for place in places:
+            test_with_place(place, shape)
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -2425,6 +2425,20 @@ class TestBook(LayerTest):
            out = layers.slice(input, axes=axes, starts=starts, ends=ends)
            return out
+    def make_scale_variable(self):  # Builds layers.scale with the scale factor given as a variable and returns the output.
+        with program_guard(fluid.default_main_program(),
+                           fluid.default_startup_program()):
+            input = self._get_data(
+                name="input", shape=[3, 4, 5, 6], dtype='float32')
+            scale_var = self._get_data(  # shape [1]; append_batch_size=False is passed so the shape is used as given -- TODO confirm against _get_data
+                name="scale",
+                shape=[1],
+                dtype='float32',
+                append_batch_size=False)
+            out = layers.scale(input, scale=scale_var)  # scale is a variable here rather than a float
+            return out
    def make_softshrink(self):
        with program_guard(fluid.default_main_program(),
                           fluid.default_startup_program()):

--- a/python/paddle/fluid/tests/unittests/test_scale_op.py
+++ b/python/paddle/fluid/tests/unittests/test_scale_op.py
@@ -42,6 +42,29 @@ class TestScaleOp(OpTest):
        self.check_grad(['X'], 'Out')
+class TestScaleOpScaleVariable(OpTest):  # Op-level test: "scale" with the factor supplied through the ScaleTensor input.
+    def setUp(self):
+        self.op_type = "scale"
+        self.dtype = np.float32
+        self.init_dtype_type()  # Hook called after the default dtype is set; a no-op in this class.
+        self.scale = -2.3
+        self.inputs = {
+            'X': np.random.random((10, 10)).astype(self.dtype),
+            'ScaleTensor': np.array([self.scale]).astype('float32')  # always float32, independent of self.dtype
+        }
+        self.attrs = {}  # No 'scale' attribute is set; the factor is provided only via ScaleTensor.
+        self.outputs = {'Out': self.inputs['X'] * self.dtype(self.scale)}  # Expected output: X multiplied by the scale cast to self.dtype.
+    def init_dtype_type(self):
+        pass
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')  # Checks the gradient of Out with respect to X.
 class TestScaleOpSelectedRows(unittest.TestCase):
    def init_dtype_type(self):
        pass

--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -1440,7 +1440,10 @@ class DistributeTranspiler(object):
                            param_name, endpoint)
                        break
                for key in opt_op.input_names:
-                    if key in ["Param", "Grad", "LearningRate"]:
+                    if key in [
+                            "Param", "Grad", "LearningRate", "Beta1Tensor",
+                            "Beta2Tensor"
+                    ]:
                        continue
                    origin_var = self.origin_program.global_block().vars[
                        opt_op.input(key)[0]]
@@ -2204,7 +2207,10 @@ class DistributeTranspiler(object):
        for key in opt_op.input_names:
            new_shape = None
-            if key in ["Param", "Grad", "LearningRate"]:
+            if key in [
+                    "Param", "Grad", "LearningRate", "Beta1Tensor",
+                    "Beta2Tensor"
+            ]:
                continue
            var = self.origin_program.global_block().vars[opt_op.input(key)[0]]
            param_var = new_inputs["Param"]