Unverified commit 4f07b653 authored by RedContritio, committed by GitHub

support auto generate for op layer_norm (#53178)

* simplify layer_norm_op.cc

* support auto generate for op layer_norm

* update unittest for composite_layer_norm

* remove layer_norm_op.cc from scripts

* replace layer_norm_op with generated_op

* add get_expected_kernel for layer_norm

* update cmake kernel register function for layer_norm_mkldnn_op
Parent 236e742d
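In short, the handwritten layer_norm_op.cc is removed and the operator is now emitted by the op generator from a YAML spec plus a small GetExpectedKernelType hook. As a rough orientation, the generated-op entry added by this commit has the following shape (condensed; infer_meta and other fields omitted — the full entries appear in the diff below):

- op : layer_norm
  args : (Tensor x, Tensor scale, Tensor bias, float epsilon = 1e-5, int begin_norm_axis = 1)
  output : Tensor(out), Tensor(mean), Tensor(variance)
  kernel :
    func : layer_norm
  backward : layer_norm_grad
  intermediate : mean, variance
  optional : scale, bias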
......@@ -103,6 +103,48 @@ function(register_cu_kernel TARGET)
endforeach()
endfunction()
# Just for those mkldnn kernels located in "fluid/operators/mkldnn/", such as 'layer_norm_mkldnn_op.cc'.
# Add other file modes if needed in the future.
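# Example usage (mirrors the call this commit adds to the operators CMake file later in this diff):
#   register_mkldnn_kernel(layer_norm_op SRCS layer_norm_mkldnn_op.cc DEPS ${OP_HEADER_DEPS})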
function(register_mkldnn_kernel TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(register_mkldnn_kernel "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
set(mkldnn_cc_srcs)
set(op_common_deps operator op_registry math_function layer
common_infer_shape_functions)
foreach(mkldnn_src ${register_mkldnn_kernel_SRCS})
if(${mkldnn_src} MATCHES ".*_mkldnn_op.cc$")
list(APPEND mkldnn_cc_srcs mkldnn/${mkldnn_src})
endif()
endforeach()
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
if(${mkldnn_cc_srcs_len} EQUAL 0)
message(
FATAL_ERROR
"The MKLDNN kernel file of ${TARGET} should contains at least one *.*_mkldnn_op.cc file"
)
endif()
if(WITH_MKLDNN)
cc_library(
${TARGET}
SRCS ${mkldnn_cc_srcs}
DEPS ${op_library_DEPS} ${op_common_deps})
endif()
set(OP_LIBRARY
${TARGET} ${OP_LIBRARY}
CACHE INTERNAL "op libs")
foreach(mkldnn_src ${mkldnn_cc_srcs})
set(op_name "")
find_register(${mkldnn_src} "REGISTER_OP_KERNEL" op_name)
if(NOT "${op_name}" STREQUAL "")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, MKLDNN);\n")
endif()
endforeach()
endfunction()
function(op_library TARGET)
# op_library is a function to create an op library. The interface is the same as
# cc_library, but it handles splitting GPU/CPU code and linking some common libraries
......
......@@ -106,6 +106,10 @@ if (WITH_GPU OR WITH_ROCM)
register_cu_kernel(class_center_sample_op SRCS class_center_sample_op.cu DEPS ${OP_HEADER_DEPS})
endif()
if (WITH_MKLDNN)
register_mkldnn_kernel(layer_norm_op SRCS layer_norm_mkldnn_op.cc DEPS ${OP_HEADER_DEPS})
endif()
if (WITH_GPU OR WITH_ROCM)
op_library(activation_op SRCS activation_op.cc activation_op.kps soft_relu_op.cu DEPS ${OP_HEADER_DEPS})
elseif (WITH_XPU_KP)
......
......@@ -250,5 +250,21 @@ phi::KernelKey GetInstanceNormExpectedKernelType(
return phi::KernelKey(input_data_type, ctx.GetPlace());
}
phi::KernelKey GetLayerNormExpectedKernelType(
const framework::ExecutionContext& ctx,
const framework::OperatorWithKernel* op_ptr) {
auto input_data_type =
op_ptr->OperatorWithKernel::IndicateVarDataType(ctx, "X");
// NOTE(jiahongyu): The code below was originally enclosed by PADDLE_WITH_MKLDNN
int begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
if (begin_norm_axis != ctx.Input<phi::DenseTensor>("X")->dims().size() - 1) {
op_ptr->SetDnnFallback(true);
}
// NOTE(jiahongyu): The code above was originally enclosed by PADDLE_WITH_MKLDNN
return phi::KernelKey(input_data_type, ctx.GetPlace());
}
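// NOTE (editorial): this helper reproduces the GetExpectedKernelType logic that
// used to live inside LayerNormOp in layer_norm_op.cc (removed below); the
// auto-generated operator is wired to it through the get_expected_kernel_type
// entry added in the op-compat YAML hunk later in this diff.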
} // namespace operators
} // namespace paddle
......@@ -64,5 +64,9 @@ phi::KernelKey GetYoloLossExpectedKernelType(
const framework::ExecutionContext& ctx,
const framework::OperatorWithKernel* op_ptr);
phi::KernelKey GetLayerNormExpectedKernelType(
const framework::ExecutionContext& ctx,
const framework::OperatorWithKernel* op_ptr);
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/prim/api/composite_backward/composite_backward_api.h"
#include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h"
#include "paddle/fluid/prim/utils/static/desc_tensor.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/ternary.h"
namespace paddle {
namespace operators {
using DataLayout = phi::DataLayout;
class LayerNormOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "LayerNorm");
OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "LayerNorm");
OP_INOUT_CHECK(ctx->HasOutput("Mean"), "Output", "Mean", "LayerNorm");
OP_INOUT_CHECK(
ctx->HasOutput("Variance"), "Output", "Variance", "LayerNorm");
auto x_dim = ctx->GetInputDim("X");
auto begin_norm_axis = ctx->Attrs().Get<int>("begin_norm_axis");
PADDLE_ENFORCE_LT(
begin_norm_axis,
x_dim.size(),
platform::errors::InvalidArgument(
"'begin_norm_axis' must be less than the dimensions of X,"
"But received 'begin_norm_axis' is [%d],"
"received the dimensions of X is [%d].",
begin_norm_axis,
x_dim.size()));
auto matrix_dim = phi::flatten_to_2d(x_dim, begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]);
if (ctx->HasInput("Scale")) {
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(),
1,
platform::errors::InvalidArgument(
"The dimensions of Input(Scale) must be 1, but "
"received dimensions of"
"Input(Scale) is [%d]",
ctx->GetInputDim("Scale").size()));
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_EQ(
ctx->GetInputDim("Scale")[0],
right,
platform::errors::InvalidArgument(
"The first dimension value of Input(Scale) must equal to be the"
"second dimension value of the flattened 2D matrix of Input(X),"
"But received the first dimension value of Input(Scale) is"
"[%d], the second dimension value of the flattened 2D matrix of"
" Input(Scale) is [%d].",
ctx->GetInputDim("Scale")[0],
right));
}
}
if (ctx->HasInput("Bias")) {
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(),
1,
platform::errors::InvalidArgument(
"The dimensions of Input(Bias) must be 1, but "
"received dimensions of"
"Input(Bias) is [%d]",
ctx->GetInputDim("Bias").size()));
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_EQ(
ctx->GetInputDim("Bias")[0],
right,
platform::errors::InvalidArgument(
"The first dimension value of Input(Bias) must equal to be the"
"second dimension value of the flattened 2D matrix of Input(X),"
"But received the first dimension value of Input(Bias) is"
"[%d], the second dimension value of the flattened 2D matrix of"
" Input(Bias) is [%d].",
ctx->GetInputDim("Scale")[0],
right));
}
}
ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
ctx->SetOutputDim("Mean", {left});
ctx->SetOutputDim("Variance", {left});
ctx->ShareLoD("X", "Y");
}
protected:
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
// NOTE(jiahongyu): The code below was originally enclosed by PADDLE_WITH_MKLDNN
int begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
if (begin_norm_axis !=
ctx.Input<phi::DenseTensor>("X")->dims().size() - 1) {
this->SetDnnFallback(true);
}
// NOTE(jiahongyu): The code above was originally enclosed by PADDLE_WITH_MKLDNN
return phi::KernelKey(input_data_type, ctx.GetPlace());
}
};
class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "The input tensor.");
AddInput("Scale",
"(optional) Scale is a 1-dimensional tensor of size "
"H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
"It is applied to the output.")
.AsDispensable();
AddInput("Bias",
"(optional) Bias is a 1-dimensional tensor of size "
"H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
"It is applied to the output.")
.AsDispensable();
AddOutput("Y", "Result after normalization.");
AddOutput("Mean", "Mean of the current mini batch.").AsIntermediate();
AddOutput("Variance", "Variance of the current mini batch.")
.AsIntermediate();
AddAttr<float>("epsilon",
"Constant for numerical stability [default 1e-5].")
.SetDefault(1e-5)
.AddCustomChecker([](const float &epsilon) {
PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f,
true,
platform::errors::InvalidArgument(
"'epsilon' in Op(LayerNorm) should be between"
"0.0 and 0.001, But received [%s].",
epsilon));
});
AddAttr<int>("begin_norm_axis",
"the axis of `begin_norm_axis ... Rank(X) - 1` will be "
"normalized. `begin_norm_axis` splits the tensor(`X`) to a "
"matrix [N,H]. [default 1].")
.SetDefault(1)
.AddCustomChecker([](const int &begin_norm_axis) {
PADDLE_ENFORCE_GT(begin_norm_axis,
0,
platform::errors::InvalidArgument(
"'begin_norm_axis' in Op(LayerNorm) should be"
"greater than zero. But received [%d].",
begin_norm_axis));
});
AddComment(R"DOC(
Assume feature vectors exist on dimensions
:attr:`begin_norm_axis ... rank(input)` and calculate the moment statistics
along these dimensions for each feature vector :math:`a` with size
:math:`H`, then normalize each feature vector using the corresponding
statistics. After that, apply learnable gain and bias on the normalized
tensor to scale and shift if :attr:`scale` and :attr:`shift` are set.
Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_
)DOC");
}
};
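// Reference (editorial, not part of the original source): with X flattened to
// an [N, H] matrix at `begin_norm_axis`, each row i is normalized as
//   mean_i  = (1 / H) * sum_j X[i][j]
//   var_i   = (1 / H) * sum_j (X[i][j] - mean_i)^2
//   Y[i][j] = Scale[j] * (X[i][j] - mean_i) / sqrt(var_i + epsilon) + Bias[j]
// and the Mean/Variance outputs hold mean_i and var_i for the N rows.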
class LayerNormGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
// check input
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "LayerNormGrad");
OP_INOUT_CHECK(ctx->HasInput("Mean"), "Input", "Mean", "LayerNormGrad");
OP_INOUT_CHECK(
ctx->HasInput("Variance"), "Input", "Variance", "LayerNormGrad");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")),
"Input",
framework::GradVarName("Y"),
"LayerNormGrad");
// check output
if (ctx->HasOutput(framework::GradVarName("X"))) {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
if (ctx->HasOutput(framework::GradVarName("Scale"))) {
ctx->SetOutputDim(framework::GradVarName("Scale"),
ctx->GetInputDim("Scale"));
}
if (ctx->HasOutput(framework::GradVarName("Bias"))) {
ctx->SetOutputDim(framework::GradVarName("Bias"),
ctx->GetInputDim("Bias"));
}
}
protected:
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
const auto *var = ctx.InputVar(framework::GradVarName("Y"));
PADDLE_ENFORCE_NOT_NULL(
var,
platform::errors::NotFound("Y@GRAD of LayerNorm Op is not found."));
const phi::DenseTensor *t = nullptr;
if (var->IsType<phi::DenseTensor>()) {
t = &var->Get<phi::DenseTensor>();
}
PADDLE_ENFORCE_NOT_NULL(
t, platform::errors::NotFound("Y@GRAD of LayerNorm Op is not found."));
return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"),
ctx.GetPlace());
}
};
template <typename T>
class LayerNormGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("layer_norm_grad");
op->SetInput("X", this->Input("X"));
op->SetInput("Mean", this->Output("Mean"));
op->SetInput("Variance", this->Output("Variance"));
if (this->HasInput("Scale")) {
op->SetInput("Scale", this->Input("Scale"));
op->SetOutput(framework::GradVarName("Scale"), this->InputGrad("Scale"));
}
if (this->HasInput("Bias")) {
op->SetInput("Bias", this->Input("Bias"));
op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias"));
}
op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
op->SetAttrMap(this->Attrs());
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(LayerNormGradNoNeedBufferVarInferer,
"Bias");
class LayerNormCompositeGradOpMaker : public prim::CompositeGradOpMakerBase {
using prim::CompositeGradOpMakerBase::CompositeGradOpMakerBase;
public:
void Apply() override {
// get inputs
paddle::Tensor x = this->GetSingleForwardInput("X");
paddle::Tensor mean = this->GetSingleForwardOutput("Mean");
paddle::Tensor var = this->GetSingleForwardOutput("Variance");
paddle::Tensor y_grad = this->GetSingleOutputGrad("Y");
paddle::optional<paddle::Tensor> scale =
this->GetOptionalSingleForwardInput("Scale");
paddle::optional<paddle::Tensor> bias =
this->GetOptionalSingleForwardInput("Bias");
// get Attrs
auto epsilon = this->Attr<float>("epsilon");
auto begin_norm_axis = this->Attr<int>("begin_norm_axis");
// get outputs
paddle::Tensor x_grad = this->GetSingleInputGrad("X");
paddle::Tensor scale_grad = this->GetSingleInputGrad("Scale");
paddle::Tensor bias_grad = this->GetSingleInputGrad("Bias");
auto dx_ptr = this->GetOutputPtr(&x_grad);
std::string dx_name = this->GetOutputName(x_grad);
auto dscale_ptr = this->GetOutputPtr(&scale_grad);
std::string dscale_name = this->GetOutputName(scale_grad);
auto dbias_ptr = this->GetOutputPtr(&bias_grad);
std::string dbias_name = this->GetOutputName(bias_grad);
VLOG(6) << "Runing layer_norm_grad composite func";
prim::layer_norm_grad<prim::DescTensor>(x,
scale,
bias,
mean,
var,
y_grad,
epsilon,
begin_norm_axis,
dx_ptr,
dscale_ptr,
dbias_ptr);
this->RecoverOutputName(x_grad, dx_name);
this->RecoverOutputName(scale_grad, dscale_name);
this->RecoverOutputName(bias_grad, dbias_name);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(layer_norm,
LayerNormInferShapeFunctor,
PD_INFER_META(phi::LayerNormInferMeta));
REGISTER_OPERATOR(layer_norm,
ops::LayerNormOp,
ops::LayerNormOpMaker,
ops::LayerNormGradOpMaker<paddle::framework::OpDesc>,
ops::LayerNormGradOpMaker<paddle::imperative::OpBase>,
ops::LayerNormCompositeGradOpMaker,
LayerNormInferShapeFunctor);
DECLARE_INFER_SHAPE_FUNCTOR(layer_norm_grad,
LayerNormGradInferShapeFunctor,
PD_INFER_META(phi::LayerNormGradInferMeta));
REGISTER_OPERATOR(layer_norm_grad,
ops::LayerNormGradOp,
ops::LayerNormGradNoNeedBufferVarInferer,
LayerNormGradInferShapeFunctor);
......@@ -135,7 +135,7 @@ register_unity_group(
kron_op.cc
l1_norm_op.cc
label_smooth_op.cc
layer_norm_op.cc
generated_op
mkldnn/layer_norm_mkldnn_op.cc
linspace_op.cc
......
......@@ -973,6 +973,20 @@
kernel :
func : label_smooth_grad
- backward_op : layer_norm_grad
forward : layer_norm (Tensor x, Tensor scale, Tensor bias, float epsilon = 1e-5, int begin_norm_axis = 1) -> Tensor(out), Tensor(mean), Tensor(variance)
args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, Tensor out_grad, float epsilon = 1e-5, int begin_norm_axis = 1)
output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad)
infer_meta :
func : LayerNormGradInferMeta
param : [x, scale, bias]
kernel :
func : layer_norm_grad
data_type : x
composite : layer_norm_grad(x, scale, bias, mean, variance, out_grad, epsilon, begin_norm_axis, x_grad, scale_grad, bias_grad)
no_need_buffer : bias
optional : scale, bias
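# NOTE (editorial): the composite entry above maps to prim::layer_norm_grad,
# the same primitive that the handwritten LayerNormCompositeGradOpMaker
# (removed from layer_norm_op.cc above) invoked directly.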
- backward_op : leaky_relu_double_grad
forward : leaky_relu_grad (Tensor x, Tensor grad_out, float negative_slope) -> Tensor(grad_x)
args : (Tensor x, Tensor grad_x_grad, float negative_slope)
......
......@@ -454,20 +454,6 @@
kernel :
func : hsigmoid_loss_grad
- backward_op : layer_norm_grad
forward : layer_norm (Tensor x, Tensor scale, Tensor bias, float epsilon, int begin_norm_axis) -> Tensor(out), Tensor(mean), Tensor(variance)
args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, Tensor out_grad, float epsilon, int begin_norm_axis)
output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad)
infer_meta :
func : LayerNormGradInferMeta
param : [x, scale, bias]
kernel :
func : layer_norm_grad
data_type : out_grad
composite : layer_norm_grad(x, scale, bias, mean, variance, out_grad, epsilon, begin_norm_axis, x_grad, scale_grad, bias_grad)
no_need_buffer : bias
optional : scale, bias
- backward_op : logsumexp_grad
forward : logsumexp(Tensor x, int64_t[] axis, bool keepdim, bool reduce_all) -> Tensor(out)
args : (Tensor x, Tensor out, Tensor out_grad, int64_t[] axis, bool keepdim, bool reduce_all)
......
......@@ -588,17 +588,6 @@
func : increment
inplace : (x -> out)
- op : layer_norm
args : (Tensor x, Tensor scale, Tensor bias, float epsilon, int begin_norm_axis)
output : Tensor(out), Tensor(mean), Tensor(variance)
infer_meta :
func : LayerNormInferMeta
kernel :
func : layer_norm
data_type : x
backward : layer_norm_grad
optional : scale, bias
- op : less_equal
args : (Tensor x, Tensor y)
output : Tensor(out)
......
......@@ -1264,6 +1264,8 @@
variance : Variance
extra :
attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool is_test = false]
get_expected_kernel_type :
layer_norm : GetLayerNormExpectedKernelType
- op : leaky_relu
backward : leaky_relu_grad, leaky_relu_double_grad (leaky_relu_grad_grad)
......
......@@ -1135,6 +1135,18 @@
optional : master_param, skip_update, beta1_pow_out, beta2_pow_out, master_param_outs
inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_outs)
- op : layer_norm
args : (Tensor x, Tensor scale, Tensor bias, float epsilon = 1e-5, int begin_norm_axis = 1)
output : Tensor(out), Tensor(mean), Tensor(variance)
infer_meta :
func : LayerNormInferMeta
kernel :
func : layer_norm
data_type : x
backward : layer_norm_grad
intermediate : mean, variance
optional : scale, bias
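# NOTE (editorial): with mean and variance marked as intermediate, callers get
# only `out` from the op; the Python functional layer_norm change further below
# drops the extra outputs accordingly.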
- op : leaky_relu
args : (Tensor x, float negative_slope = 0.02f)
output : Tensor
......
......@@ -529,6 +529,12 @@ void LayerNormInferMeta(const MetaTensor& x,
MetaTensor* variance,
MetaConfig config) {
auto x_dim = x.dims();
PADDLE_ENFORCE_GT(begin_norm_axis,
0,
phi::errors::InvalidArgument(
"'begin_norm_axis' in Op(LayerNorm) should be"
"greater than zero. But received [%d].",
begin_norm_axis));
PADDLE_ENFORCE_LT(
begin_norm_axis,
x_dim.size(),
......@@ -588,6 +594,13 @@ void LayerNormInferMeta(const MetaTensor& x,
right));
}
PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f,
true,
phi::errors::InvalidArgument(
"'epsilon' in Op(LayerNorm) should be between"
"0.0 and 0.001, But received [%s].",
epsilon));
phi::DataType x_dtype = x.dtype();
out->set_dims(x_dim);
out->set_dtype(x_dtype);
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature LayerNormOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("layer_norm",
{"X", "Scale", "Bias"},
{"epsilon", "begin_norm_axis"},
{"Y", "Mean", "Variance"});
}
KernelSignature LayerNormGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("layer_norm_grad",
{"X", "Scale", "Bias", "Mean", "Variance", "Y@GRAD"},
{"epsilon", "begin_norm_axis"},
{"X@GRAD", "Scale@GRAD", "Bias@GRAD"});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(layer_norm, phi::LayerNormOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(layer_norm_grad,
phi::LayerNormGradOpArgumentMapping);
......@@ -332,7 +332,7 @@ def layer_norm(
)
if in_dygraph_mode():
out, _, _ = _C_ops.layer_norm(x, weight, bias, epsilon, begin_norm_axis)
out = _C_ops.layer_norm(x, weight, bias, epsilon, begin_norm_axis)
return out
else:
......
......@@ -13,7 +13,7 @@ if(WITH_GPU OR WITH_ROCM)
DEPS tensor
op_registry
dropout_op
layer_norm_op
generated_op
device_context
generator
memory)
......@@ -23,7 +23,7 @@ if(WITH_GPU OR WITH_ROCM)
DEPS tensor
op_registry
dropout_op
layer_norm_op
generated_op
device_context
generator
memory)
......@@ -33,7 +33,7 @@ if(WITH_GPU OR WITH_ROCM)
DEPS tensor
op_registry
dropout_op
layer_norm_op
generated_op
device_context
generator
memory)
......
......@@ -234,27 +234,27 @@ class TestCompositelayer_norm(unittest.TestCase):
b_p = paddle.to_tensor(b)
expect = expect_forward(x_p, n_shape, w_p, b_p)
actual = self.cal_composite(x, n_shape, w, b)
assert expect[0].numpy().dtype == actual[0].dtype
for i in range(3):
np.testing.assert_allclose(
expect[i].numpy(),
actual[i],
rtol=attrs.get_rtol("forward"),
atol=attrs.get_atol("forward"),
)
actual, _a_mean, _a_var = self.cal_composite(x, n_shape, w, b)
assert expect.numpy().dtype == actual.dtype
np.testing.assert_allclose(
expect.numpy(),
actual,
rtol=attrs.get_rtol("forward"),
atol=attrs.get_atol("forward"),
)
expect_2 = expect_forward(x_p, n_shape, None, None)
actual_2 = self.cal2_composite(x, n_shape, None, None)
assert expect_2[0].numpy().dtype == actual_2[0].dtype
for i in range(3):
np.testing.assert_allclose(
expect_2[i].numpy(),
actual_2[i],
rtol=attrs.get_rtol("forward"),
atol=attrs.get_atol("forward"),
)
actual_2, _a_mean_2, _a_var_2 = self.cal2_composite(
x, n_shape, None, None
)
assert expect_2.numpy().dtype == actual_2.dtype
np.testing.assert_allclose(
expect_2.numpy(),
actual_2,
rtol=attrs.get_rtol("forward"),
atol=attrs.get_atol("forward"),
)
def test_forward(self):
for j in self.dtypes:
......
......@@ -51,7 +51,6 @@ if [ "$1" != "" ]; then
fi
FILE_WHITE_LIST="\
layer_norm_op.cc \
box_clip_op.cc \
box_clip_op.h \
random_crop_op.h \
......