diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc
index c5a6fe5875baa2e1cba160b1a020916c2f42a285..3d6fbeb283594c1b5e97c68a8434b89cd3991b20 100644
--- a/paddle/fluid/operators/optimizers/adam_op.cc
+++ b/paddle/fluid/operators/optimizers/adam_op.cc
@@ -20,27 +20,50 @@ namespace operators {
 using Tensor = framework::Tensor;
 
 void AdamOp::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("Param"),
-                 "Input(Param) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                 "Input(Grad) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Moment1"),
-                 "Input(Moment1) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Moment2"),
-                 "Input(Moment2) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
-                 "Input(LearningRate) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
-                 "Input(Beta1Pow) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"),
-                 "Input(Beta2Pow) of AdamOp should not be null.");
-
-  PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                 "Output(ParamOut) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"),
-                 "Output(Moment1Out) of AdamOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"),
-                 "Output(Moment2Out) of AdamOp should not be null.");
+  PADDLE_ENFORCE_EQ(
+      ctx->HasInput("Param"), true,
+      platform::errors::NotFound("Input(Param) of AdamOp should not be null."));
+  PADDLE_ENFORCE_EQ(
+      ctx->HasInput("Grad"), true,
+      platform::errors::NotFound("Input(Grad) of AdamOp should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasInput("Moment1"), true,
+                    platform::errors::NotFound(
+                        "Input(Moment1) of AdamOp should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasInput("Moment2"), true,
+                    platform::errors::NotFound(
+                        "Input(Moment2) of AdamOp should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true,
+                    platform::errors::NotFound(
+                        "Input(LearningRate) of AdamOp should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasInput("Beta1Pow"), true,
+                    platform::errors::NotFound(
+                        "Input(Beta1Pow) of AdamOp should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasInput("Beta2Pow"), true,
+                    platform::errors::NotFound(
+                        "Input(Beta2Pow) of AdamOp should not be null."));
+
+  if (ctx->IsRuntime() && ctx->HasInput("Beta1Tensor")) {
+    auto beta1 = ctx->Inputs("Beta1Tensor");
+    PADDLE_ENFORCE_EQ(
+        beta1.size(), 1,
+        platform::errors::InvalidArgument("Input(Beta1Tensor) size must be 1"));
+  }
+  if (ctx->IsRuntime() && ctx->HasInput("Beta2Tensor")) {
+    auto beta2 = ctx->Inputs("Beta2Tensor");
+    PADDLE_ENFORCE_EQ(
+        beta2.size(), 1,
+        platform::errors::InvalidArgument("Input(Beta2Tensor) size must be 1"));
+  }
+
+  PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
+                    platform::errors::NotFound(
+                        "Output(ParamOut) of AdamOp should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasOutput("Moment1Out"), true,
+                    platform::errors::NotFound(
+                        "Output(Moment1Out) of AdamOp should not be null."));
+  PADDLE_ENFORCE_EQ(ctx->HasOutput("Moment2Out"), true,
+                    platform::errors::NotFound(
+                        "Output(Moment2Out) of AdamOp should not be null."));
 
   auto lr_dims = ctx->GetInputDim("LearningRate");
   PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
@@ -93,6 +116,17 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
     AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator");
AddInput("Beta1Tensor", + "(Tensor, optional) If provided, Adam will use this " + "as beta1, this has a higher priority than attr(beta1), the " + "shape of this tensor MUST BE [1].") + .AsDispensable(); + AddInput("Beta2Tensor", + "(Tensor, optional) If provided, Adam will use this " + "as beta2, this has a higher priority than attr(beta2), the " + "shape of this tensor MUST BE [1].") + .AsDispensable(); + AddOutput("ParamOut", "(Tensor) Output parameter"); AddOutput("Moment1Out", "(Tensor) Output first moment"); AddOutput("Moment2Out", "(Tensor) Output second moment"); diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 1cc34f11d09e9ec1868249f20fcc1b189efb0589..95e4d22b06fb766ca8aa64307d1890d84f1ae3f0 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -29,6 +29,16 @@ namespace operators { namespace scatter = paddle::operators::math::scatter; +static inline float GetAttrFromTensor(const framework::Tensor* tensor) { + const float* tensor_data = tensor->data(); + framework::Tensor cpu_tensor; + if (platform::is_gpu_place(tensor->place())) { + TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor); + tensor_data = cpu_tensor.data(); + } + return tensor_data[0]; +} + class AdamOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -367,8 +377,6 @@ class AdamOpKernel : public framework::OpKernel { int64_t min_row_size_to_use_multithread = ctx.Attr("min_row_size_to_use_multithread"); bool lazy_mode = ctx.Attr("lazy_mode"); - T beta1 = static_cast(ctx.Attr("beta1")); - T beta2 = static_cast(ctx.Attr("beta2")); T epsilon = static_cast(ctx.Attr("epsilon")); auto& param = Ref(ctx.Input("Param"), "Must set Param"); // auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); @@ -390,6 +398,17 @@ class AdamOpKernel : public framework::OpKernel { auto& mom2_out = Ref(ctx.Output("Moment2Out"), "Must set Moment1Out"); + T beta1 = static_cast(ctx.Attr("beta1")); + if (ctx.HasInput("Beta1Tensor")) { + auto* beta1_tensor = ctx.Input("Beta1Tensor"); + beta1 = static_cast(GetAttrFromTensor(beta1_tensor)); + } + T beta2 = static_cast(ctx.Attr("beta2")); + if (ctx.HasInput("Beta2Tensor")) { + auto* beta2_tensor = ctx.Input("Beta2Tensor"); + beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); + } + if (grad_var->IsType()) { auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index 383e7940fa56586d17cf8819c463c201fbd24050..4448a414af97b688e22e577c0d05c2337792616b 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -34,6 +34,14 @@ class ScaleOp : public framework::OperatorWithKernel { "Input(X) of ScaleOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of ScaleOp should not be null."); + + if (ctx->IsRuntime() && ctx->HasInput("ScaleTensor")) { + auto scale = ctx->Inputs("ScaleTensor"); + PADDLE_ENFORCE_EQ(scale.size(), 1, + platform::errors::InvalidArgument( + "Input(ScaleTensor) size must be 1")); + } + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->ShareLoD("X", /*->*/ "Out"); } @@ -43,6 +51,11 @@ class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", "(Tensor) Input tensor of scale operator."); + AddInput("ScaleTensor", + "(Tensor) If provided, use this as " + "scale factor, this has a higher priority than " + "attr(scale), the 
shape of this tensor MUST BE 1.") + .AsDispensable(); AddOutput("Out", "(Tensor) Output tensor of scale operator."); AddComment(R"DOC( **Scale operator** @@ -89,6 +102,9 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker { auto *grad_op = new framework::OpDesc(); grad_op->SetType("scale"); grad_op->SetInput("X", OutputGrad("Out")); + if (ForwardOp().Inputs().count("ScaleTensor") > 0) { + grad_op->SetInput("ScaleTensor", Input("ScaleTensor")); + } grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetAttr("scale", GetAttr("scale")); grad_op->SetAttr("bias", 0.0f); @@ -97,14 +113,14 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker { } }; -using ScaleOpInplace = framework::SingleOpInplaceInToOut; +DECLARE_INPLACE_OP_INFERER(ScaleOpInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker, - ops::ScaleOpVarTypeInference, ops::ScaleOpInplace); + ops::ScaleOpVarTypeInference, ops::ScaleOpInplaceInferer); REGISTER_OP_CPU_KERNEL( scale, ops::ScaleKernel, ops::ScaleKernel, diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h index 96b8b00b429df72569ef2a292c8a600c56159f19..909bb3d00442fa8a8618d46c904459c51ef92443 100644 --- a/paddle/fluid/operators/scale_op.h +++ b/paddle/fluid/operators/scale_op.h @@ -19,6 +19,17 @@ limitations under the License. */ namespace paddle { namespace operators { + +static inline float GetAttrFromTensor(const framework::Tensor* tensor) { + const float* tensor_data = tensor->data(); + framework::Tensor cpu_tensor; + if (platform::is_gpu_place(tensor->place())) { + TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor); + tensor_data = cpu_tensor.data(); + } + return tensor_data[0]; +} + template class ScaleKernel : public framework::OpKernel { public: @@ -26,10 +37,15 @@ class ScaleKernel : public framework::OpKernel { auto* in_var = ctx.InputVar("X"); auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var); - auto scale = static_cast(ctx.Attr("scale")); auto bias = static_cast(ctx.Attr("bias")); auto bias_after_scale = ctx.Attr("bias_after_scale"); + auto scale = static_cast(ctx.Attr("scale")); + if (ctx.HasInput("ScaleTensor")) { + auto* scale_tensor = ctx.Input("ScaleTensor"); + scale = GetAttrFromTensor(scale_tensor); + } + auto* out_var = ctx.OutputVar("Out"); if (in_var->IsType() && in_var != out_var) { auto& in_slr = in_var->Get(); diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 255418fbde6ffe6bd685ce163c7a02d624b57cf1..e217cb27a2b231acd6d0c303e48180caa30e0e01 100755 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -174,6 +174,8 @@ def generate_layer_fn(op_type): if not isinstance(val, list) and not isinstance(val, tuple): val = [val] if len(val) == 0: + if len(args) == 0: + continue val = [args[0]] args = args[1:] diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index b0e3e71cdb6550177872482b327d76d1ce541b4e..f696353b5d816945c179095de838c689f322d20f 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -14074,7 +14074,7 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): Args: x(Variable): Input N-D Tensor of scale operator. Data type can be float32, float64, int8, int16, int32, int64, uint8. 
-        scale(float): The scale factor of the input.
+        scale(float|Variable): The scale factor of the input, it should be a float number or a Variable with shape [1] and data type as float32.
         bias(float): The bias to be put on the input.
         bias_after_scale(bool): Apply bias addition after or before scaling. It is useful for numeric stability in some circumstances.
         act(str, optional): Activation applied to the output such as tanh, softmax, sigmoid, relu.
@@ -14099,6 +14099,27 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
             res = exe.run(fluid.default_main_program(), feed={'x':img}, fetch_list=[output])
             print(res) # [array([[ 3., 5., 7.], [ 9., 11., 13.]], dtype=float32)]
 
+    .. code-block:: python
+
+        # scale with parameter scale as Variable
+        import paddle.fluid as fluid
+        import numpy as np
+
+        inputs = fluid.layers.data(name="x", shape=[2, 3], dtype='float32')
+        scale = fluid.layers.data(name="scale", shape=[1], dtype='float32',
+                                  append_batch_size=False)
+        output = fluid.layers.scale(inputs, scale = scale, bias = 1.0)
+
+        exe = fluid.Executor(fluid.CPUPlace())
+        exe.run(fluid.default_startup_program())
+
+        img = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32)
+        scale_np = np.array([2.]).astype(np.float32)
+
+        res = exe.run(fluid.default_main_program(), feed={'x':img, 'scale':scale_np}, fetch_list=[output])
+        print(res) # [array([[ 3., 5., 7.], [ 9., 11., 13.]], dtype=float32)]
+
     """
 
     helper = LayerHelper('scale', **locals())
@@ -14108,15 +14129,18 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
         out = helper.create_variable(
             name=name, dtype=x.dtype, persistable=False)
 
+    inputs = {'X': x}
+    attrs = {
+        'bias': float(bias),
+        'bias_after_scale': bias_after_scale,
+    }
+    if isinstance(scale, Variable):
+        inputs['ScaleTensor'] = scale
+    else:
+        attrs['scale'] = float(scale)
+
     helper.append_op(
-        type='scale',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={
-            'scale': float(scale),
-            'bias': float(bias),
-            'bias_after_scale': bias_after_scale
-        })
+        type='scale', inputs=inputs, outputs={'Out': out}, attrs=attrs)
     return helper.append_activation(out)
 
 
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 907d511ff3667734b4711bd8e8e51bfeb2f5ef72..168afd4a52cd16f3ad68b4163f6786bed5747a37 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -1484,9 +1484,11 @@ class AdamOptimizer(Optimizer):
     Args:
         learning_rate (float|Variable, optional): The learning rate used to update ``Parameter``.
             It can be a float value or a ``Variable`` with a float type. The default value is 0.001.
-        beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
+        beta1 (float|Variable, optional): The exponential decay rate for the 1st moment estimates.
+            It should be a float number or a Variable with shape [1] and data type as float32.
             The default value is 0.9.
-        beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
+        beta2 (float|Variable, optional): The exponential decay rate for the 2nd moment estimates.
+            It should be a float number or a Variable with shape [1] and data type as float32.
             The default value is 0.999.
         epsilon (float, optional): A small float value for numerical stability.
             The default value is 1e-08.
@@ -1530,6 +1532,64 @@ class AdamOptimizer(Optimizer):
             for data in train_reader():
                 exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
 
+        .. code-block:: python
+
+            # Adam with beta1/beta2 as Variable
+            import paddle
+            import paddle.fluid as fluid
+            import paddle.fluid.layers.learning_rate_scheduler as lr_scheduler
+
+            place = fluid.CPUPlace()
+            main = fluid.Program()
+            with fluid.program_guard(main):
+                x = fluid.data(name='x', shape=[None, 13], dtype='float32')
+                y = fluid.data(name='y', shape=[None, 1], dtype='float32')
+                y_predict = fluid.layers.fc(input=x, size=1, act=None)
+                cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+                avg_cost = fluid.layers.mean(cost)
+
+                # define beta decay variable
+                def get_decayed_betas(beta1_init, beta2_init, decay_steps, decay_rate):
+                    global_step = lr_scheduler._decay_step_counter()
+
+                    beta1 = fluid.layers.create_global_var(
+                        shape=[1],
+                        value=float(beta1_init),
+                        dtype='float32',
+                        # set persistable for save checkpoints and resume
+                        persistable=True,
+                        name="beta1")
+                    beta2 = fluid.layers.create_global_var(
+                        shape=[1],
+                        value=float(beta2_init),
+                        dtype='float32',
+                        # set persistable for save checkpoints and resume
+                        persistable=True,
+                        name="beta2")
+
+                    div_res = global_step / decay_steps
+                    decayed_beta1 = beta1_init * (decay_rate**div_res)
+                    decayed_beta2 = beta2_init * (decay_rate**div_res)
+                    fluid.layers.assign(decayed_beta1, beta1)
+                    fluid.layers.assign(decayed_beta2, beta2)
+
+                    return beta1, beta2
+
+                beta1, beta2 = get_decayed_betas(0.9, 0.99, 1e5, 0.9)
+                adam_optimizer = fluid.optimizer.AdamOptimizer(
+                    learning_rate=0.01,
+                    beta1=beta1,
+                    beta2=beta2)
+                adam_optimizer.minimize(avg_cost)
+
+            fetch_list = [avg_cost]
+            train_reader = paddle.batch(
+                paddle.dataset.uci_housing.train(), batch_size=1)
+            feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            for data in train_reader():
+                exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
     """
 
     _moment1_acc_str = "moment1"
     _moment2_acc_str = "moment2"
@@ -1569,13 +1629,15 @@ class AdamOptimizer(Optimizer):
                 name=self._beta1_pow_acc_str,
                 param=p,
                 dtype='float32',
-                fill_value=self._beta1,
+                fill_value=0.9 if isinstance(self._beta1, Variable) \
+                    else self._beta1,
                 shape=[1])
             self._add_accumulator(
                 name=self._beta2_pow_acc_str,
                 param=p,
                 dtype='float32',
-                fill_value=self._beta2,
+                fill_value=0.999 if isinstance(self._beta2, Variable) \
+                    else self._beta2,
                 shape=[1])
 
     def _append_optimize_op(self, block, param_and_grad):
@@ -1591,29 +1653,40 @@ class AdamOptimizer(Optimizer):
             param_and_grad[0])
 
         # create the adam optimize op
+        inputs = {
+            "Param": param_and_grad[0],
+            "Grad": param_and_grad[1],
+            "LearningRate": self._create_param_lr(param_and_grad),
+            "Moment1": moment1,
+            "Moment2": moment2,
+            "Beta1Pow": beta1_pow_acc,
+            "Beta2Pow": beta2_pow_acc
+        }
+        outputs = {
+            "ParamOut": param_and_grad[0],
+            "Moment1Out": moment1,
+            "Moment2Out": moment2
+        }
+        attrs = {
+            "epsilon": self._epsilon,
+            "lazy_mode": self._lazy_mode,
+            "min_row_size_to_use_multithread": 1000
+        }
+
+        if isinstance(self._beta1, Variable):
+            inputs['Beta1Tensor'] = self._beta1
+        else:
+            attrs['beta1'] = self._beta1
+        if isinstance(self._beta2, Variable):
+            inputs['Beta2Tensor'] = self._beta2
+        else:
+            attrs['beta2'] = self._beta2
+
         adam_op = block.append_op(
             type=self.type,
-            inputs={
-                "Param": param_and_grad[0],
-                "Grad": param_and_grad[1],
-                "LearningRate": self._create_param_lr(param_and_grad),
-                "Moment1": moment1,
-                "Moment2": moment2,
-                "Beta1Pow": beta1_pow_acc,
-                "Beta2Pow": beta2_pow_acc
-            },
-            outputs={
-                "ParamOut": param_and_grad[0],
-                "Moment1Out": moment1,
"Moment2Out": moment2 - }, - attrs={ - "beta1": self._beta1, - "beta2": self._beta2, - "epsilon": self._epsilon, - "lazy_mode": self._lazy_mode, - "min_row_size_to_use_multithread": 1000 - }, + inputs=inputs, + outputs=outputs, + attrs=attrs, stop_gradient=True) return adam_op @@ -1632,18 +1705,30 @@ class AdamOptimizer(Optimizer): param) beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, param) + inputs = {"X": beta1_pow_acc} + attrs = {} + if isinstance(self._beta1, Variable): + inputs['ScaleTensor'] = self._beta1 + else: + attrs['scale'] = self._beta1 main_block.append_op( type="scale", - inputs={"X": beta1_pow_acc}, + inputs=inputs, outputs={"Out": beta1_pow_acc}, - attrs={"scale": self._beta1}, + attrs=attrs, stop_gradient=True) + inputs = {"X": beta2_pow_acc} + attrs = {} + if isinstance(self._beta2, Variable): + inputs['ScaleTensor'] = self._beta2 + else: + attrs['scale'] = self._beta2 main_block.append_op( type="scale", - inputs={"X": beta2_pow_acc}, + inputs=inputs, outputs={"Out": beta2_pow_acc}, - attrs={"scale": self._beta2}, + attrs=attrs, stop_gradient=True) diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 15f277cdc0aca30b8c768b6a6ee20e44880b2304..e3cab0630b98a414e30a5b4f344dc0f9caf6c8c5 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -19,6 +19,7 @@ import numpy as np from op_test import OpTest from paddle.fluid import core from paddle.fluid.op import Operator +import paddle.fluid as fluid class TestAdamOp1(OpTest): @@ -183,10 +184,17 @@ def adam_step(inputs, attributes): beta1_pow = inputs['Beta1Pow'] beta2_pow = inputs['Beta2Pow'] - beta1 = attributes['beta1'] - beta2 = attributes['beta2'] epsilon = attributes['epsilon'] + if 'beta1' in attributes: + beta1 = attributes['beta1'] + else: + beta1 = inputs['Beta1Tensor'][0] + if 'beta2' in attributes: + beta2 = attributes['beta2'] + else: + beta2 = inputs['Beta2Tensor'][0] + moment1_out = beta1 * moment1 + (1 - beta1) * grad moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad) lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow) @@ -330,5 +338,92 @@ class TestSparseAdamOp(unittest.TestCase): self.check_with_place(place, lazy_mode) +class TestAdamOpBetaVariable(OpTest): + def setUp(self): + '''Test Adam Op with beta as Variable + ''' + self.op_type = "adam" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + beta1 = 0.85 + beta2 = 0.95 + + learning_rate = 0.001 + epsilon = 1e-8 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), + "Beta1Tensor": np.array([beta1]).astype("float32"), + "Beta2Tensor": np.array([beta2]).astype("float32"), + } + + attributes = {'epsilon': epsilon} + + param_out, moment1_out, \ + moment2_out = adam_step(self.inputs, attributes) + + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out + } + + def test_check_output(self): + self.check_output() + + +class 
+class TestAdamOptimizerBetaVariable(unittest.TestCase):
+    def test_adam_optimizer(self):
+        def test_with_place(place, shape):
+            exe = fluid.Executor(place)
+
+            train_prog = fluid.Program()
+            startup = fluid.Program()
+            with fluid.program_guard(train_prog, startup):
+                with fluid.unique_name.guard():
+                    data = fluid.data(name="data", shape=shape)
+                    conv = fluid.layers.conv2d(data, 8, 3)
+                    loss = fluid.layers.reduce_mean(conv)
+
+                    beta1 = fluid.layers.create_global_var(
+                        shape=[1],
+                        value=0.85,
+                        dtype='float32',
+                        persistable=True)
+                    beta2 = fluid.layers.create_global_var(
+                        shape=[1],
+                        value=0.95,
+                        dtype='float32',
+                        persistable=True)
+                    opt = fluid.optimizer.Adam(
+                        learning_rate=1e-5, beta1=beta1, beta2=beta2)
+                    opt.minimize(loss)
+
+            exe.run(startup)
+            data_np = np.random.random(shape).astype('float32')
+            rets = exe.run(train_prog,
+                           feed={"data": data_np},
+                           fetch_list=[loss])
+            assert rets[0] is not None
+
+        shape = [2, 3, 8, 8]
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for place in places:
+            test_with_place(place, shape)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 6db2267fca9a26f8ba9686e17a1dc9f517e9fb4b..ca7efebfb79943127d536e83502ef81c585e3b0c 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -2425,6 +2425,20 @@ class TestBook(LayerTest):
             out = layers.slice(input, axes=axes, starts=starts, ends=ends)
         return out
 
+    def make_scale_variable(self):
+        with program_guard(fluid.default_main_program(),
+                           fluid.default_startup_program()):
+            input = self._get_data(
+                name="input", shape=[3, 4, 5, 6], dtype='float32')
+            scale_var = self._get_data(
+                name="scale",
+                shape=[1],
+                dtype='float32',
+                append_batch_size=False)
+
+            out = layers.scale(input, scale=scale_var)
+            return out
+
     def make_softshrink(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
diff --git a/python/paddle/fluid/tests/unittests/test_scale_op.py b/python/paddle/fluid/tests/unittests/test_scale_op.py
index 9893c92ad68f4d460c4bb428bb44a30df25fd6e0..c53db9eb78ce310c9732ae21a8117d955601198f 100644
--- a/python/paddle/fluid/tests/unittests/test_scale_op.py
+++ b/python/paddle/fluid/tests/unittests/test_scale_op.py
@@ -42,6 +42,29 @@ class TestScaleOp(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+class TestScaleOpScaleVariable(OpTest):
+    def setUp(self):
+        self.op_type = "scale"
+        self.dtype = np.float32
+        self.init_dtype_type()
+        self.scale = -2.3
+        self.inputs = {
+            'X': np.random.random((10, 10)).astype(self.dtype),
+            'ScaleTensor': np.array([self.scale]).astype('float32')
+        }
+        self.attrs = {}
+        self.outputs = {'Out': self.inputs['X'] * self.dtype(self.scale)}
+
+    def init_dtype_type(self):
+        pass
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
 class TestScaleOpSelectedRows(unittest.TestCase):
     def init_dtype_type(self):
         pass
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index ddf09d8979b2a4632a7ff90663aa1d51391a4984..48339d9605ef987d9c972a0a6efd2a44fa8f33ab 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -1440,7 +1440,10 @@ class DistributeTranspiler(object):
                             param_name, endpoint)
                     break
             for key in opt_op.input_names:
-                if key in ["Param", "Grad", "LearningRate"]:
+                if key in [
+                        "Param", "Grad", "LearningRate", "Beta1Tensor",
+                        "Beta2Tensor"
+                ]:
                     continue
                 origin_var = self.origin_program.global_block().vars[
                     opt_op.input(key)[0]]
@@ -2204,7 +2207,10 @@ class DistributeTranspiler(object):
 
         for key in opt_op.input_names:
             new_shape = None
-            if key in ["Param", "Grad", "LearningRate"]:
+            if key in [
+                    "Param", "Grad", "LearningRate", "Beta1Tensor",
+                    "Beta2Tensor"
+            ]:
                 continue
             var = self.origin_program.global_block().vars[opt_op.input(key)[0]]
             param_var = new_inputs["Param"]
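
For reference, a minimal NumPy sketch of the update these tensor inputs feed: it mirrors the adam_step helper in test_adam_op.py plus the Beta1Pow/Beta2Pow rescaling that _finish_update now routes through the scale op (via ScaleTensor when beta1/beta2 are Variables). The shapes and values below are illustrative assumptions.

import numpy as np

def adam_reference(param, grad, moment1, moment2, lr,
                   beta1_pow, beta2_pow, beta1, beta2, epsilon=1e-8):
    # Same math as adam_step() in test_adam_op.py, with beta1/beta2 taken
    # from the tensor inputs rather than the op attributes.
    moment1_out = beta1 * moment1 + (1 - beta1) * grad
    moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad)
    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
    param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon))
    return param_out, moment1_out, moment2_out

# Illustrative data; shapes and values are assumptions for this sketch.
param = np.random.uniform(-1, 1, (4, 5)).astype("float32")
grad = np.random.uniform(-1, 1, (4, 5)).astype("float32")
moment1 = np.zeros_like(param)
moment2 = np.zeros_like(param)
beta1, beta2, lr = 0.85, 0.95, 0.001
beta1_pow, beta2_pow = beta1, beta2  # accumulator values after one step

param, moment1, moment2 = adam_reference(param, grad, moment1, moment2, lr,
                                         beta1_pow, beta2_pow, beta1, beta2)

# _finish_update advances the accumulators with a scale op whose factor comes
# from ScaleTensor when beta1/beta2 are Variables: pow_acc *= beta.
beta1_pow *= beta1
beta2_pow *= beta2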