Unverified commit 183a74db authored by RedContritio, committed by GitHub

support auto generate for op adam, adamw and merged_adam optimizer (#52711)

* support auto generate for op adam optimizer

* remove unnecessary files

* support auto generate for op adamw optimizer

* support auto generate for op merged_adam optimizer

* use manual_signature in adam_
Parent fe6abd4d
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/optimizers/adam_op.h"
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(adam,
AdamInferMetaFunctor,
PD_INFER_META(phi::AdamInferMeta));
REGISTER_OPERATOR(
adam,
ops::AdamOp,
ops::AdamOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
AdamInferMetaFunctor);
REGISTER_OP_VERSION(adam)
.AddCheckpoint(
R"ROC(
Upgrade adam add 1 attribute [multi_precision].
)ROC",
paddle::framework::compatible::OpVersionDesc().NewAttr(
"multi_precision",
"(bool) Whether to use multi-precision during weight updating.",
false))
.AddCheckpoint(
R"ROC(
Upgrade adam, add 1 dispensable input [EpsilonTensor].
)ROC",
paddle::framework::compatible::OpVersionDesc().NewInput(
"EpsilonTensor",
"If provided, Adam will use this as epsilon, "
"this has a higher priority than attr(epsilon). "
"For better performance in npu kernel. "))
.AddCheckpoint(
R"ROC(
Upgrade adam, add 1 attribute [use_global_beta_pow].
)ROC",
paddle::framework::compatible::OpVersionDesc().NewAttr(
"use_global_beta_pow",
"If true, Adam will use global beta_pow for whole model "
"instead of creating beta_pow for each parameter."
"In that case, the outputs(Beta1PowOut, Beta2PowOut) will not be "
"used in adam op, "
"and beta_pow will be updated after all adam op in the model.",
false))
.AddCheckpoint(
R"ROC(
Upgrade adam, add 1 dispensable input [SkipUpdate].
)ROC",
paddle::framework::compatible::OpVersionDesc().NewInput(
"SkipUpdate", "If the value is true, Adam will skip the update."));
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
class AdamOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext &ctx) const {
auto input_data_type =
OperatorWithKernel::IndicateVarDataType(ctx, "Param");
return phi::KernelKey(input_data_type, ctx.GetPlace());
}
phi::KernelKey GetKernelTypeForVar(
const std::string &var_name,
const phi::DenseTensor &tensor,
const phi::KernelKey &expected_kernel_type) const {
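// Beta1Pow, Beta2Pow and SkipUpdate may live on the CPU even when Param is
// on an accelerator, so accept any backend for them and do not force a
// device transform.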
if (var_name == "Beta1Pow" || var_name == "Beta2Pow" ||
var_name == "SkipUpdate") {
return phi::KernelKey(phi::Backend::ALL_BACKEND,
expected_kernel_type.layout(),
expected_kernel_type.dtype());
} else {
return phi::KernelKey(
tensor.place(), tensor.layout(), expected_kernel_type.dtype());
}
}
};
class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Param", "(Tensor) Input parameter");
AddInput("Grad", "(Tensor) Input gradient");
AddInput("LearningRate", "(Tensor) Learning rate");
AddInput("Moment1", "(Tensor) Input first moment");
AddInput("Moment2", "(Tensor) Input second moment");
AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator");
AddInput("Beta1Tensor",
"(Tensor<float32>, optional) If provided, Adam will use this "
"as beta1, this has a higher priority than attr(beta1), the "
"shape of this tensor MUST BE [1].")
.AsDispensable();
AddInput("Beta2Tensor",
"(Tensor<float32>, optional) If provided, Adam will use this "
"as beta2, this has a higher priority than attr(beta2), the "
"shape of this tensor MUST BE [1].")
.AsDispensable();
AddInput("EpsilonTensor",
"(Tensor<float32>, optional) If provided, Adam will use this "
"as epsilon, this has a higher priority than attr(epsilon), the "
"shape of this tensor MUST BE [1].")
.AsDispensable();
AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable();
AddInput("SkipUpdate", "(Tensor<bool>, optional), Skip the update or not.")
.AsDispensable();
AddOutput("ParamOut", "(Tensor) Output parameter");
AddOutput("Moment1Out", "(Tensor) Output first moment");
AddOutput("Moment2Out", "(Tensor) Output second moment");
AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator");
AddOutput("Beta2PowOut", "(Tensor) Output beta2 power accumulator");
AddOutput("MasterParamOut",
"The updated FP32 master weight for AMP. "
"It shared memory with Input(MasterParam).")
.AsDispensable();
AddAttr<float>("beta1",
"(float, default 0.9) "
"Exponential decay rate for the "
"first moment estimates.")
.SetDefault(0.9f);
AddAttr<float>("beta2",
"(float, default 0.999) "
"exponential decay rate for the "
"second moment estimates.")
.SetDefault(0.999f);
AddAttr<float>("epsilon",
"(float, default 1.0e-8) "
"Constant for numerical stability")
.SetDefault(1.0e-8f);
AddAttr<bool>(
"lazy_mode",
"(bool, default false) "
"only update the parameter that has gradient in sparse update")
.SetDefault(false);
AddAttr<int64_t>("min_row_size_to_use_multithread",
"(int64_t, default 0) "
"when not zero, if param row size is larger then "
"min_row_size_to_use_multithread and "
"inner_op_parallelism is larger then 0, sparse update "
"will run in multithread mode")
.SetDefault(1000);
AddAttr<bool>("multi_precision",
"(bool, default false) "
"Whether to use multi-precision during weight updating.")
.SetDefault(false);
// TODO(zhiqiu): We could set Beta1PowOut and Beta2PowOut
// as dispensable since they are not used when use_global_beta_pow is true.
AddAttr<bool>("use_global_beta_pow",
"(bool, default false) "
"Whether to use global beta_pow for whole model instead of "
"creating beta_pow for each parameter.")
.SetDefault(false);
AddComment(R"DOC(
Adam Optimizer.
This implements the Adam optimizer from Section 2 of the Adam
paper : https://arxiv.org/abs/1412.6980.
Adam is a first-order gradient-based optimization method based on
adaptive estimates of lower-order moments.
Adam updates:
$$
moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\
moment\_2\_out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\
learning\_rate = learning\_rate *
\frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\
param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
$$
)DOC");
}
};
} // namespace operators
} // namespace paddle
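For reference, the update rule stated in the DOC string above can be written as a small self-contained sketch. This is plain illustrative C++ with hypothetical names (AdamState, AdamStep); the real computation lives in the phi adam kernels, not in this operator definition.

#include <cmath>

// One Adam step following the formulas in the DOC string above.
// Illustrative only; not part of the Paddle code base.
struct AdamState {
  float moment1 = 0.f;    // first moment estimate
  float moment2 = 0.f;    // second moment estimate
  float beta1_pow = 1.f;  // running product of beta1
  float beta2_pow = 1.f;  // running product of beta2
};

inline float AdamStep(float param, float grad, float learning_rate,
                      AdamState* s, float beta1 = 0.9f, float beta2 = 0.999f,
                      float epsilon = 1.0e-8f) {
  s->moment1 = beta1 * s->moment1 + (1.f - beta1) * grad;
  s->moment2 = beta2 * s->moment2 + (1.f - beta2) * grad * grad;
  s->beta1_pow *= beta1;
  s->beta2_pow *= beta2;
  const float lr_t = learning_rate * std::sqrt(1.f - s->beta2_pow) /
                     (1.f - s->beta1_pow);
  return param - lr_t * s->moment1 / (std::sqrt(s->moment2) + epsilon);
}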
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/kernels/funcs/selected_rows_functor.h"
namespace paddle {
namespace operators {
namespace scatter = phi::funcs::scatter;
static inline float GetAttrFromTensor(const phi::DenseTensor* tensor) {
const float* tensor_data = tensor->data<float>();
phi::DenseTensor cpu_tensor;
if (platform::is_gpu_place(tensor->place())) {
paddle::framework::TensorCopySync(
*tensor, platform::CPUPlace(), &cpu_tensor);
tensor_data = cpu_tensor.data<float>();
}
if (platform::is_xpu_place(tensor->place())) {
paddle::framework::TensorCopySync(
*tensor, platform::CPUPlace(), &cpu_tensor);
tensor_data = cpu_tensor.data<float>();
}
return tensor_data[0];
}
} // namespace operators
} // namespace paddle
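GetAttrFromTensor above copies a scalar tensor to the CPU when it lives on a GPU or XPU and returns its first element; this is what lets the dispensable Beta1Tensor/Beta2Tensor/EpsilonTensor inputs override the corresponding float attributes. A minimal sketch of a hypothetical caller follows (ResolveBeta1 is illustrative, not Paddle code; it assumes the fluid ExecutionContext API and the header above).

// Prefer the tensor-valued beta1 over the float attribute when it is fed.
static float ResolveBeta1(const paddle::framework::ExecutionContext& ctx) {
  if (ctx.HasInput("Beta1Tensor")) {
    const auto* beta1_tensor = ctx.Input<phi::DenseTensor>("Beta1Tensor");
    // GetAttrFromTensor copies to the CPU first if the tensor is on GPU/XPU.
    return paddle::operators::GetAttrFromTensor(beta1_tensor);
  }
  return ctx.Attr<float>("beta1");
}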
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/operators/optimizers/adam_op.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace paddle {
namespace operators {
class AdamWOp : public AdamOp {
using AdamOp::AdamOp;
};
class AdamWOpMaker : public AdamOpMaker {
public:
void Make() {
AdamOpMaker::Make();
AddAttr<float>("lr_ratio",
"(float, default 1.0) "
"layerwise learning rate decay")
.SetDefault(1.0f);
AddAttr<float>("coeff",
"(float, default 0.01) "
"coeff of the weight decay")
.SetDefault(0.01f);
AddAttr<bool>("with_decay",
"(bool, default false) "
"whether to do weight decay")
.SetDefault(false);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(adamw,
AdamwInferMetaFunctor,
PD_INFER_META(phi::AdamwInferMeta));
REGISTER_OPERATOR(
adamw,
ops::AdamWOp,
ops::AdamWOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
AdamwInferMetaFunctor);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace paddle {
namespace operators {
class MergedAdamOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto param_dtype =
framework::OperatorWithKernel::IndicateVarDataType(ctx, "Param");
return phi::KernelKey(param_dtype, ctx.GetPlace());
}
phi::KernelKey GetKernelTypeForVar(
const std::string& var_name,
const phi::DenseTensor& tensor,
const phi::KernelKey& expected_kernel_type) const override {
if (var_name == "Beta1Pow" || var_name == "Beta2Pow" ||
var_name == "SkipUpdate") {
return phi::KernelKey(phi::Backend::ALL_BACKEND,
expected_kernel_type.layout(),
expected_kernel_type.dtype());
} else {
return phi::KernelKey(
tensor.place(), tensor.layout(), expected_kernel_type.dtype());
}
}
};
class MergedAdamOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Param", "(Tensor, default Tensor<float>) Input parameter")
.AsDuplicable();
AddInput("Grad", "(Tensor, default Tensor<float>) Input gradient")
.AsDuplicable();
AddInput("LearningRate", "(Tensor, default Tensor<float>) Learning rate")
.AsDuplicable();
AddInput("Moment1", "(Tensor, default Tensor<float>) Input first moment")
.AsDuplicable();
AddInput("Moment2", "(Tensor, default Tensor<float>) Input second moment")
.AsDuplicable();
AddInput("Beta1Pow",
"(Tensor, default Tensor<float>) Input beta1 power accumulator")
.AsDuplicable();
AddInput("Beta2Pow",
"(Tensor, default Tensor<float>) Input beta2 power accumulator")
.AsDuplicable();
AddInput("MasterParam", "FP32 master weight for AMP.")
.AsDispensable()
.AsDuplicable();
AddOutput("ParamOut", "(Tensor) Output parameter").AsDuplicable();
AddOutput("Moment1Out", "(Tensor) Output first moment").AsDuplicable();
AddOutput("Moment2Out", "(Tensor) Output second moment").AsDuplicable();
AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator")
.AsDuplicable();
AddOutput("Beta2PowOut", "(Tensor) Output beta2 power accumulator")
.AsDuplicable();
AddOutput("MasterParamOut",
"The updated FP32 master weight for AMP. "
"It shared memory with Input(MasterParam).")
.AsDispensable()
.AsDuplicable();
AddAttr<float>("beta1",
"(float, default 0.9) "
"Exponential decay rate for the "
"first moment estimates.")
.SetDefault(0.9f);
AddAttr<float>("beta2",
"(float, default 0.999) "
"exponential decay rate for the "
"second moment estimates.")
.SetDefault(0.999f);
AddAttr<float>("epsilon",
"(float, default 1.0e-8) "
"Constant for numerical stability")
.SetDefault(1.0e-8f);
AddAttr<bool>("multi_precision",
"(bool, default false) "
"Whether to use multi-precision during weight updating.")
.SetDefault(false);
// TODO(zhiqiu): We could set Beta1PowOut and Beta2PowOut
// as dispensable since they are not used when use_global_beta_pow is true.
AddAttr<bool>("use_global_beta_pow",
"(bool, default false) "
"Whether to use global beta_pow for whole model instead of "
"creating beta_pow for each parameter.")
.SetDefault(false);
AddComment(R"DOC(
Adam Optimizer.
This implements the Adam optimizer from Section 2 of the Adam
paper : https://arxiv.org/abs/1412.6980.
Adam is a first-order gradient-based optimization method based on
adaptive estimates of lower-order moments.
Adam updates:
$$
moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\
moment\_2\_out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\
learning\_rate = learning\_rate *
\frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\
param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
$$
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(merged_adam,
MergedAdamInferMetaFunctor,
PD_INFER_META(phi::MergedAdamInferMeta));
REGISTER_OPERATOR(
merged_adam,
ops::MergedAdamOp,
ops::MergedAdamOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
MergedAdamInferMetaFunctor);
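merged_adam applies the same Adam update to a whole list of parameters in a single op, which is why every input and output above is marked AsDuplicable. A minimal sketch of that fused loop, reusing the illustrative AdamStep helper from the earlier sketch (hypothetical names, not the phi merged_adam kernel):

#include <cstddef>
#include <vector>

// Fused update over a list of parameters, mirroring how merged_adam consumes
// vectors of Param/Grad/Moment/LearningRate tensors. AdamState and AdamStep
// come from the single-parameter sketch above; everything here is illustrative.
void MergedAdamStepSketch(std::vector<float>* params,
                          const std::vector<float>& grads,
                          const std::vector<float>& learning_rates,
                          std::vector<AdamState>* states) {
  for (std::size_t i = 0; i < params->size(); ++i) {
    (*params)[i] =
        AdamStep((*params)[i], grads[i], learning_rates[i], &(*states)[i]);
  }
}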
@@ -9,7 +9,6 @@ register_unity_group(
ftrl_op.cc
lars_momentum_op.cc
proximal_adagrad_op.cc
adam_op.cc
proximal_gd_op.cc
decayed_adagrad_op.cc
adadelta_op.cc
@@ -22,7 +21,6 @@ register_unity_group(
sgd_op.cu
proximal_adagrad_op.cu
adagrad_op.cu
adam_op.cu
decayed_adagrad_op.cu
adadelta_op.cu
lamb_op.cu)
@@ -21,29 +21,6 @@
optional : master_param
inplace : (param -> param_out), (avg_squared_grad -> moment_out), (avg_squared_update -> inf_norm_out), (master_param -> master_param_out)
- op : adam_
args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, Scalar beta1, Scalar beta2, Scalar epsilon, bool lazy_mode, int64_t min_row_size_to_use_multithread, bool multi_precision, bool use_global_beta_pow)
output : Tensor(param_out), Tensor(moment1_out), Tensor(moment2_out), Tensor(beta1_pow_out), Tensor(beta2_pow_out), Tensor(master_param_outs)
infer_meta :
func : AdamInferMeta
kernel :
func : adam {dense, dense, dense, dense, dense, dense, dense, dense, dense -> dense, dense, dense, dense, dense, dense},
adam_dense_param_sparse_grad {dense, selected_rows, dense, dense, dense, dense, dense, dense, dense -> dense, dense, dense, dense, dense, dense}
data_type : param
optional : master_param, skip_update
inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_outs)
- op : adamw_
args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, Scalar beta1, Scalar beta2, Scalar epsilon, float lr_ratio, float coeff, bool with_decay, bool lazy_mode, int64_t min_row_size_to_use_multithread, bool multi_precision, bool use_global_beta_pow)
output : Tensor(param_out), Tensor(moment1_out), Tensor(moment2_out), Tensor(beta1_pow_out), Tensor(beta2_pow_out), Tensor(master_param_outs)
infer_meta :
func : AdamwInferMeta
kernel :
func : adamw
data_type : param
optional : master_param, skip_update
inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_outs)
- op : add
args : (Tensor x, Tensor y)
output : Tensor(out)
@@ -816,17 +793,6 @@
func : mean
backward : mean_grad
- op : merged_adam_
args : (Tensor[] param, Tensor[] grad, Tensor[] learning_rate, Tensor[] moment1, Tensor[] moment2, Tensor[] beta1_pow, Tensor[] beta2_pow, Tensor[] master_param, Scalar beta1, Scalar beta2, Scalar epsilon, bool multi_precision, bool use_global_beta_pow)
output : Tensor[](param_out){param.size()}, Tensor[](moment1_out){param.size()}, Tensor[](moment2_out){param.size()}, Tensor[](beta1_pow_out){param.size()}, Tensor[](beta2_pow_out){param.size()}, Tensor[](master_param_out){param.size()}
infer_meta :
func : MergedAdamInferMeta
optional: master_param
kernel :
func : merged_adam
data_type : param
inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_out)
- op : min
args : (Tensor x, IntArray axis={}, bool keepdim=false)
output : Tensor(out)
......
@@ -50,12 +50,45 @@
outputs :
{ param_out : ParamOut, moment_out : MomentOut, master_param_out : MasterParamOut }
- op : adam_
inputs :
{param: Param, grad: Grad, learning_rate: LearningRate, moment1: Moment1, moment2: Moment2, beta1_pow: Beta1Pow, beta2_pow: Beta2Pow, master_param: MasterParam, skip_update: SkipUpdate}
outputs :
{param_out: ParamOut, moment1_out: Moment1Out, moment2_out: Moment2Out, beta1_pow_out: Beta1PowOut, beta2_pow_out: Beta2PowOut, master_param_out: MasterParamOut}
scalar :
beta1 :
data_type : float
tensor_name : Beta1Tensor
beta2 :
data_type : float
tensor_name : Beta2Tensor
epsilon :
data_type : float
tensor_name : EpsilonTensor
manual_signature : [adam_]
- op : adamax_
inputs :
{param : Param, grad: Grad, learning_rate : LearningRate, moment : Moment, inf_norm : InfNorm, beta1_pow : Beta1Pow, master_param : MasterParam}
outputs :
{param_out : ParamOut, moment_out : MomentOut, inf_norm_out : InfNormOut, master_param_out : MasterParamOut}
- op : adamw_
inputs :
{param: Param, grad: Grad, learning_rate: LearningRate, moment1: Moment1, moment2: Moment2, beta1_pow: Beta1Pow, beta2_pow: Beta2Pow, master_param: MasterParam, skip_update: SkipUpdate}
outputs :
{param_out: ParamOut, moment1_out: Moment1Out, moment2_out: Moment2Out, beta1_pow_out: Beta1PowOut, beta2_pow_out: Beta2PowOut, master_param_out: MasterParamOut}
scalar :
beta1 :
data_type : float
tensor_name : Beta1Tensor
beta2 :
data_type : float
tensor_name : Beta2Tensor
epsilon :
data_type : float
tensor_name : EpsilonTensor
- op : add (elementwise_add)
backward : add_grad (elementwise_add_grad)
extra :
@@ -1454,6 +1487,22 @@
outputs :
out : Out
- op : merged_adam_
inputs :
{param: Param, grad: Grad, learning_rate: LearningRate, moment1: Moment1, moment2: Moment2, beta1_pow: Beta1Pow, beta2_pow: Beta2Pow, master_param: MasterParam}
outputs :
{param_out: ParamOut, moment1_out: Moment1Out, moment2_out: Moment2Out, beta1_pow_out: Beta1PowOut, beta2_pow_out: Beta2PowOut, master_param_out: MasterParamOut}
scalar :
beta1 :
data_type : float
support_tensor : true
beta2 :
data_type : float
support_tensor : true
epsilon :
data_type : float
support_tensor : true
- op : merged_momentum_
inputs :
{param : Param, grad : Grad, velocity : Velocity, learning_rate : LearningRate, master_param : MasterParam}
......
- op : adam_
version :
- checkpoint : Upgrade adam add 1 attribute [multi_precision].
action :
- add_attr : multi_precision
comment : (bool) Whether to use multi-precision during weight updating.
default : "false"
- checkpoint : Upgrade adam, add 1 dispensable input [EpsilonTensor].
action :
- add_input : EpsilonTensor
comment : If provided, Adam will use this as epsilon, this has a higher priority than attr(epsilon). For better performance in npu kernel.
- checkpoint : Upgrade adam, add 1 attribute [use_global_beta_pow].
action :
- add_attr : use_global_beta_pow
comment : If true, Adam will use global beta_pow for whole model instead of creating beta_pow for each parameter. In that case, the outputs(Beta1PowOut, Beta2PowOut) will not be used in adam op, and beta_pow will be updated after all adam op in the model.
default : "false"
- checkpoint : Upgrade adam, add 1 dispensable input [SkipUpdate].
action :
- add_input : SkipUpdate
comment : If the value is true, Adam will skip the update.
- op : affine_grid
version :
- checkpoint : Compatible upgrade of affine_grid, add a new attribute [align_corners].
......
@@ -44,6 +44,18 @@
optional : master_param, master_param_out
inplace : (param -> param_out), (moment -> moment_out), (master_param -> master_param_out)
- op : adam_
args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, Scalar beta1 = 0.9f, Scalar beta2 = 0.999f, Scalar epsilon = 1.0e-8f, bool lazy_mode = false, int64_t min_row_size_to_use_multithread = 1000, bool multi_precision = false, bool use_global_beta_pow = false)
output : Tensor(param_out), Tensor(moment1_out), Tensor(moment2_out), Tensor(beta1_pow_out), Tensor(beta2_pow_out), Tensor(master_param_out)
infer_meta :
func : AdamInferMeta
kernel :
func : adam {dense, dense, dense, dense, dense, dense, dense, dense, dense -> dense, dense, dense, dense, dense, dense},
adam_dense_param_sparse_grad {dense, selected_rows, dense, dense, dense, dense, dense, dense, dense -> dense, dense, dense, dense, dense, dense}
data_type : param
optional : master_param, skip_update, master_param_out
inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_out)
- op : adamax_
args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment, Tensor inf_norm, Tensor beta1_pow, Tensor master_param, float beta1 = 0.9f, float beta2 = 0.999f, float epsilon = 1.0e-8f, bool multi_precision = false)
output : Tensor(param_out), Tensor(moment_out), Tensor(inf_norm_out), Tensor(master_param_out)
@@ -55,6 +67,17 @@
optional : master_param, master_param_out
inplace : (param -> param_out), (moment -> moment_out), (inf_norm -> inf_norm_out), (master_param ->master_param_out)
- op : adamw_
args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, Scalar beta1 = 0.9f, Scalar beta2 = 0.999f, Scalar epsilon = 1.0e-8f, float lr_ratio = 1.0f, float coeff = 0.01f, bool with_decay = false, bool lazy_mode = false, int64_t min_row_size_to_use_multithread = 1000, bool multi_precision = false, bool use_global_beta_pow = false)
output : Tensor(param_out), Tensor(moment1_out), Tensor(moment2_out), Tensor(beta1_pow_out), Tensor(beta2_pow_out), Tensor(master_param_out)
infer_meta :
func : AdamwInferMeta
kernel :
func : adamw
data_type : param
optional : master_param, skip_update, master_param_out
inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_out)
- op : addmm
args : (Tensor input, Tensor x, Tensor y, float beta=1.0, float alpha=1.0)
output : Tensor
@@ -1257,6 +1280,17 @@
kernel :
func : merge_selected_rows {selected_rows -> selected_rows}
- op : merged_adam_
args : (Tensor[] param, Tensor[] grad, Tensor[] learning_rate, Tensor[] moment1, Tensor[] moment2, Tensor[] beta1_pow, Tensor[] beta2_pow, Tensor[] master_param, Scalar beta1 = 0.9f, Scalar beta2 = 0.999f, Scalar epsilon = 1.0e-8f, bool multi_precision = false, bool use_global_beta_pow = false)
output : Tensor[](param_out){param.size()}, Tensor[](moment1_out){param.size()}, Tensor[](moment2_out){param.size()}, Tensor[](beta1_pow_out){param.size()}, Tensor[](beta2_pow_out){param.size()}, Tensor[](master_param_out){param.size()}
infer_meta :
func : MergedAdamInferMeta
kernel :
func : merged_adam
data_type : param
optional: master_param, master_param_out
inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_out)
- op : merged_momentum_
args : (Tensor[] param, Tensor[] grad, Tensor[] velocity, Tensor[] learning_rate, Tensor[] master_param, float mu, bool use_nesterov = false, str[] regularization_method = {}, float[] regularization_coeff = {}, bool multi_precision = false, float rescale_grad = 1.0f)
output : Tensor[](param_out){param.size()}, Tensor[](velocity_out){param.size()}, Tensor[](master_param_out){param.size()}
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include "paddle/phi/core/compat/op_utils.h"
#include "paddle/utils/small_vector.h"
namespace phi {
KernelSignature AdamwOpArgumentMapping(const ArgumentMappingContext& ctx) {
paddle::small_vector<const char*> in_names = {"Param",
"Grad",
"LearningRate",
"Moment1",
"Moment2",
"Beta1Pow",
"Beta2Pow",
"MasterParam",
"SkipUpdate"};
paddle::small_vector<const char*> out_names = {"ParamOut",
"Moment1Out",
"Moment2Out",
"Beta1PowOut",
"Beta2PowOut",
"MasterParamOut"};
paddle::small_vector<const char*> attr_names;
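// Tensor-provided hyperparameters take precedence over the float attributes.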
attr_names.emplace_back(ctx.HasInput("Beta1Tensor") ? "Beta1Tensor"
: "beta1");
attr_names.emplace_back(ctx.HasInput("Beta2Tensor") ? "Beta2Tensor"
: "beta2");
attr_names.emplace_back(ctx.HasInput("EpsilonTensor") ? "EpsilonTensor"
: "epsilon");
attr_names.emplace_back("lr_ratio");
attr_names.emplace_back("coeff");
attr_names.emplace_back("with_decay");
attr_names.emplace_back("lazy_mode");
attr_names.emplace_back("min_row_size_to_use_multithread");
attr_names.emplace_back("multi_precision");
attr_names.emplace_back("use_global_beta_pow");
if (ctx.IsSelectedRowsInput("Grad")) {
return KernelSignature("adamw_dense_param_sparse_grad",
std::move(in_names),
std::move(attr_names),
std::move(out_names));
} else if (ctx.IsDenseTensorInput("Grad")) {
return KernelSignature("adamw",
std::move(in_names),
std::move(attr_names),
std::move(out_names));
} else {
return KernelSignature("unregistered", {}, {}, {});
}
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(adamw, phi::AdamwOpArgumentMapping);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include "paddle/phi/core/compat/op_utils.h"
#include "paddle/utils/small_vector.h"
namespace phi {
KernelSignature MergedAdamOpArgumentMapping(const ArgumentMappingContext& ctx) {
paddle::small_vector<const char*> in_names = {"Param",
"Grad",
"LearningRate",
"Moment1",
"Moment2",
"Beta1Pow",
"Beta2Pow",
"MasterParam"};
paddle::small_vector<const char*> out_names = {"ParamOut",
"Moment1Out",
"Moment2Out",
"Beta1PowOut",
"Beta2PowOut",
"MasterParamOut"};
paddle::small_vector<const char*> attr_names = {
"beta1", "beta2", "epsilon", "multi_precision", "use_global_beta_pow"};
return KernelSignature("merged_adam",
std::move(in_names),
std::move(attr_names),
std::move(out_names));
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(merged_adam, phi::MergedAdamOpArgumentMapping);