[Phi] migrate sync_batch_norm to phi (#44369)

c99c70cb · lyq · GitHub · b8d106e1 · c99c70cb · b8d106e1
11 changed files
--- a/paddle/fluid/operators/inplace_abn_op.cu
+++ b/paddle/fluid/operators/inplace_abn_op.cu
@@ -13,17 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/inplace_abn_op.h"
+#include <iostream>
 #include "paddle/fluid/operators/batch_norm_op.h"
-#include "paddle/fluid/operators/sync_batch_norm_op.cu.h"
 #include "paddle/phi/kernels/batch_norm_grad_kernel.h"
 #include "paddle/phi/kernels/batch_norm_kernel.h"
+#include "paddle/phi/kernels/gpu/sync_batch_norm_utils.h"
+#include "paddle/phi/kernels/sync_batch_norm_grad_kernel.h"
+#include "paddle/phi/kernels/sync_batch_norm_kernel.h"

 namespace paddle {
 namespace operators {

 template <typename DeviceContext, typename T>
-class InplaceABNKernel
-    : public paddle::operators::SyncBatchNormKernel<DeviceContext, T> {
+class InplaceABNKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* y = ctx.Output<Tensor>("Y");
@@ -36,10 +38,6 @@ class InplaceABNKernel
        GetInplaceABNActivationType(ctx.Attr<std::string>("activation"));
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();

-    if (ctx.Attr<bool>("use_sync_bn")) {
-      SyncBatchNormKernel<DeviceContext, T>::Compute(ctx);
-    } else {
-      // BatchNormKernel<DeviceContext, T>::Compute(ctx);
    auto* scale = ctx.Input<Tensor>("Scale");
    auto* bias = ctx.Input<Tensor>("Bias");
    auto* mean = ctx.Input<Tensor>("Mean");
@@ -59,6 +57,30 @@ class InplaceABNKernel
    auto* saved_variance = ctx.Output<Tensor>("SavedVariance");
    auto* reserve_space = ctx.Output<Tensor>("ReserveSpace");

+    if (ctx.Attr<bool>("use_sync_bn")) {
+      auto& dev_ctx = ctx.device_context<DeviceContext>();
+      phi::SyncBatchNormKernel<T>(
+          static_cast<const typename framework::ConvertToPhiContext<
+              DeviceContext>::TYPE&>(dev_ctx),
+          *x,
+          *scale,
+          *bias,
+          *mean,
+          *variance,
+          momentum,
+          epsilon,
+          data_layout,
+          is_test,
+          use_global_stats,
+          trainable_statistics,
+          fuse_with_relu,
+          y,
+          mean_out,
+          variance_out,
+          saved_mean,
+          saved_variance,
+          reserve_space);
+    } else {
      auto& dev_ctx = ctx.device_context<DeviceContext>();
      phi::BatchNormKernel<T>(
          static_cast<const typename framework::ConvertToPhiContext<
@@ -92,8 +114,7 @@ class InplaceABNKernel
 // Deriving the Gradient for the Backward Pass of Batch Normalization
 // https://kevinzakka.github.io/2016/09/14/batch_normalization/
 template <typename DeviceContext, typename T>
-class InplaceABNGradKernel
-    : public paddle::operators::SyncBatchNormGradKernel<DeviceContext, T> {
+class InplaceABNGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const auto* y = ctx.Input<Tensor>("Y");
@@ -115,9 +136,6 @@ class InplaceABNGradKernel
    InplaceABNActivation<DeviceContext, T> functor;
    functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy);

-    if (ctx.Attr<bool>("use_sync_bn")) {
-      SyncBatchNormGradKernel<DeviceContext, T>::Compute(ctx);
-    } else {
    auto* scale = ctx.Input<Tensor>("Scale");
    auto* bias = ctx.Input<Tensor>("Bias");
    auto* saved_mean = ctx.Input<Tensor>("SavedMean");
@@ -138,6 +156,24 @@ class InplaceABNGradKernel
    auto* mean = ctx.Input<Tensor>("ReserveSpace");
    auto* variance = ctx.Input<Tensor>("ReserveSpace");

+    if (ctx.Attr<bool>("use_sync_bn")) {
+      auto& dev_ctx = ctx.device_context<DeviceContext>();
+      phi::SyncBatchNormGradFunctor<T>(
+          static_cast<const typename framework::ConvertToPhiContext<
+              DeviceContext>::TYPE&>(dev_ctx),
+          nullptr,
+          y,
+          *scale,
+          *bias,
+          *saved_mean,
+          *saved_variance,
+          *d_y,
+          epsilon,
+          data_layout,
+          d_x,
+          scale_grad,
+          bias_grad);
+    } else {
      paddle::optional<Tensor> space_opt;
      paddle::optional<Tensor> mean_opt;
      paddle::optional<Tensor> variance_opt;

--- a/paddle/fluid/operators/sync_batch_norm_op.cu
+++ b/paddle/fluid/operators/sync_batch_norm_op.cu
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sync_batch_norm_op.cu.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class SyncBatchNormKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
-    const float momentum = ctx.Attr<float>("momentum");
-    const bool is_test = ctx.Attr<bool>("is_test");
-    const std::string layout_str = ctx.Attr<std::string>("data_layout");
-    const DataLayout layout = framework::StringToDataLayout(layout_str);
-    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
-    const bool trainable_stats = ctx.Attr<bool>("trainable_statistics");
-    PADDLE_ENFORCE_EQ(use_global_stats,
-                      false,
-                      platform::errors::InvalidArgument(
-                          "sync_batch_norm doesn't support "
-                          "to set use_global_stats True. Please use batch_norm "
-                          "in this case."));
-
-    const auto *x = ctx.Input<Tensor>("X");
-    auto *y = ctx.Output<Tensor>("Y");
-
-    const auto *est_mean = ctx.Input<Tensor>("Mean");
-    const auto *est_var = ctx.Input<Tensor>("Variance");
-
-    // moving mean/variance
-    auto *mean_out = ctx.Output<Tensor>("MeanOut");
-    auto *variance_out = ctx.Output<Tensor>("VarianceOut");
-
-    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
-    auto *saved_inv_variance = ctx.Output<Tensor>("SavedVariance");
-
-    bool test_mode = is_test && (!trainable_stats);
-    SyncBatchNormFunctor<platform::CUDADeviceContext, T>(ctx,
-                                                         layout,
-                                                         x,
-                                                         y,
-                                                         est_mean,
-                                                         est_var,
-                                                         mean_out,
-                                                         variance_out,
-                                                         saved_mean,
-                                                         saved_inv_variance,
-                                                         epsilon,
-                                                         momentum,
-                                                         test_mode,
-                                                         use_global_stats);
-  }
-};
-
-template <typename T>
-class SyncBatchNormGradKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(ctx.GetPlace()),
-        true,
-        platform::errors::InvalidArgument("It must use CUDAPlace."));
-    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
-    const std::string layout_str = ctx.Attr<std::string>("data_layout");
-
-    const DataLayout layout = framework::StringToDataLayout(layout_str);
-    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    const auto *scale = ctx.Input<Tensor>("Scale");
-    const auto *bias = ctx.Input<Tensor>("Bias");
-
-    // init output
-    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
-    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-
-    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
-    const auto *saved_inv_var = ctx.Input<Tensor>("SavedVariance");
-
-    SyncBatchNormGradFunctor<platform::CUDADeviceContext, T>(ctx,
-                                                             layout,
-                                                             scale,
-                                                             bias,
-                                                             d_x,
-                                                             d_y,
-                                                             d_scale,
-                                                             d_bias,
-                                                             saved_mean,
-                                                             saved_inv_var,
-                                                             epsilon);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-#ifdef PADDLE_WITH_HIP
-// MIOPEN do not support double
-REGISTER_OP_CUDA_KERNEL(
-    sync_batch_norm,
-    ops::SyncBatchNormKernel<plat::CUDADeviceContext, float>,
-    ops::SyncBatchNormKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    sync_batch_norm_grad,
-    ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, float>,
-    ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
-#else
-REGISTER_OP_CUDA_KERNEL(
-    sync_batch_norm,
-    ops::SyncBatchNormKernel<plat::CUDADeviceContext, float>,
-    ops::SyncBatchNormKernel<plat::CUDADeviceContext, double>,
-    ops::SyncBatchNormKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    sync_batch_norm_grad,
-    ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, float>,
-    ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, double>,
-    ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
-#endif
-
-// clang-format on
--- a/paddle/phi/api/yaml/legacy_api.yaml
+++ b/paddle/phi/api/yaml/legacy_api.yaml
@@ -2075,6 +2075,16 @@
    func : swish
  backward : swish_grad

+# sync_batch_norm
+- api : sync_batch_norm
+  args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu)
+  output : Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space)
+  infer_meta :
+    func : BatchNormInferMeta
+  kernel :
+    func : sync_batch_norm
+  backward : sync_batch_norm_grad
+
 # take_along_axis
 - api : take_along_axis
  args : (Tensor x, Tensor index, int axis)

--- a/paddle/phi/api/yaml/legacy_backward.yaml
+++ b/paddle/phi/api/yaml/legacy_backward.yaml
@@ -2085,6 +2085,18 @@
    func : swish_grad
  inplace : (out_grad -> x_grad)

+- backward_api : sync_batch_norm_grad
+  forward : sync_batch_norm (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space)
+  args : (Tensor x, Tensor scale, Tensor bias, Tensor mean_out, Tensor variance_out, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor out_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu)
+  output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad)
+  infer_meta :
+    func : GeneralTernaryGradInferMeta
+    param : [x, scale, bias]
+  kernel :
+    func : sync_batch_norm_grad
+    data_type : out_grad
+  optional : mean_out, variance_out, reserve_space
+
 - backward_api : take_along_axis_grad
  forward : take_along_axis (Tensor x, Tensor index, int axis) -> Tensor(out)
  args : (Tensor x, Tensor index, Tensor out_grad, int axis)

--- a/paddle/phi/kernels/gpu/sync_batch_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/sync_batch_norm_grad_kernel.cu
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/sync_batch_norm_grad_kernel.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/gpu/sync_batch_norm_utils.h"
+
+namespace phi {
+
+// GPU entry point for the gradient of sync_batch_norm.
+//
+// This is a thin wrapper: it forwards to SyncBatchNormGradFunctor, passing
+// the address of `x` as the functor's second argument and nullptr as its
+// third, followed by scale, bias, saved_mean, saved_variance, y_grad,
+// epsilon_f, data_layout_str and the three output pointers.
+//
+// `mean`, `variance`, `reserve_space`, `momentum`, `is_test`,
+// `use_global_stats`, `trainable_statistics` and `fuse_with_relu` are
+// accepted but not read in this body.
+//
+// NOTE(review): the functor's third argument looks like the forward output
+// `y` (the inplace_abn caller passes nullptr for the second argument and `y`
+// for the third) -- confirm against sync_batch_norm_utils.h.
+template <typename T, typename Context>
+void SyncBatchNormGradKernel(const Context& ctx,
+                             const DenseTensor& x,
+                             const DenseTensor& scale,
+                             const DenseTensor& bias,
+                             const paddle::optional<DenseTensor>& mean,
+                             const paddle::optional<DenseTensor>& variance,
+                             const DenseTensor& saved_mean,
+                             const DenseTensor& saved_variance,
+                             const paddle::optional<DenseTensor>& reserve_space,
+                             const DenseTensor& y_grad,
+                             float momentum,
+                             float epsilon_f,
+                             const std::string& data_layout_str,
+                             bool is_test,
+                             bool use_global_stats,
+                             bool trainable_statistics,
+                             bool fuse_with_relu,
+                             DenseTensor* x_grad,
+                             DenseTensor* scale_grad,
+                             DenseTensor* bias_grad) {
+  // Only x is supplied as the data source; the y slot is left null.
+  SyncBatchNormGradFunctor<T, Context>(ctx,
+                                       &x,
+                                       nullptr,
+                                       scale,
+                                       bias,
+                                       saved_mean,
+                                       saved_variance,
+                                       y_grad,
+                                       epsilon_f,
+                                       data_layout_str,
+                                       x_grad,
+                                       scale_grad,
+                                       bias_grad);
+}
+
+}  // namespace phi
+
+// Register the GPU sync_batch_norm_grad kernel. The legacy fluid
+// registration noted that MIOPEN does not support double, so the HIP build
+// registers float and float16 only; other builds also register double.
+#ifdef PADDLE_WITH_HIP
+PD_REGISTER_KERNEL(sync_batch_norm_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SyncBatchNormGradKernel,
+                   float,
+                   phi::dtype::float16) {}
+#else
+PD_REGISTER_KERNEL(sync_batch_norm_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SyncBatchNormGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+#endif
--- a/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/sync_batch_norm_kernel.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/gpu/sync_batch_norm_utils.h"
+
+namespace phi {
+
+// GPU forward kernel for sync_batch_norm.
+//
+// Rejects use_global_stats == true and inputs whose rank is outside [2, 5].
+// When `is_test && !trainable_statistics`, the provided `mean` / `variance`
+// are used directly for normalization. Otherwise local statistics are
+// gathered with KeLocalStats into a temporary buffer of 2 * C + 1 elements,
+// summed in place with ncclAllReduce when a communicator is available, and
+// handed to KeSyncAndMovingStats together with `saved_mean`,
+// `saved_variance`, `mean_out` and `variance_out`. Finally KeNormAffine is
+// launched to fill `y`.
+//
+// `fuse_with_relu` and `reserve_space` are accepted but not read here.
+template <typename T, typename Context>
+void SyncBatchNormKernel(const Context &ctx,
+                         const DenseTensor &x,
+                         const DenseTensor &scale,
+                         const DenseTensor &bias,
+                         const DenseTensor &mean,
+                         const DenseTensor &variance,
+                         float momentum,
+                         float epsilon_f,
+                         const std::string &data_layout_str,
+                         bool is_test,
+                         bool use_global_stats,
+                         bool trainable_statistics,
+                         bool fuse_with_relu,
+                         DenseTensor *y,
+                         DenseTensor *mean_out,
+                         DenseTensor *variance_out,
+                         DenseTensor *saved_mean,
+                         DenseTensor *saved_variance,
+                         DenseTensor *reserve_space) {
+  PADDLE_ENFORCE_EQ(use_global_stats,
+                    false,
+                    phi::errors::InvalidArgument(
+                        "sync_batch_norm doesn't support "
+                        "to set use_global_stats True. Please use batch_norm "
+                        "in this case."));
+
+  // The device kernels below take epsilon as a double.
+  double epsilon = epsilon_f;
+  const DataLayout layout =
+      paddle::framework::StringToDataLayout(data_layout_str);
+  // Inference statistics are only used when they are not being trained.
+  const bool test_mode = is_test && (!trainable_statistics);
+  const auto &x_dims = x.dims();
+  PADDLE_ENFORCE_GE(x_dims.size(),
+                    2,
+                    phi::errors::InvalidArgument(
+                        "The Input dim size should be larger than 1."));
+  PADDLE_ENFORCE_LE(x_dims.size(),
+                    5,
+                    phi::errors::InvalidArgument(
+                        "The Input dim size should be less than 6."));
+  int N, C, H, W, D;
+  funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D);
+  int x_numel = x.numel();
+
+  const T *x_d = x.template data<T>();
+  const auto *s_d = scale.template data<BatchNormParamType<T>>();
+  const auto *b_d = bias.template data<BatchNormParamType<T>>();
+
+  T *y_d = ctx.template Alloc<T>(y);
+
+  // Set by whichever branch below runs; consumed by KeNormAffine.
+  const BatchNormParamType<T> *mean_data = nullptr;
+  const BatchNormParamType<T> *var_data = nullptr;
+
+  auto stream = ctx.stream();
+  const int block = 512;
+  int max_threads = ctx.GetMaxPhysicalThreadCount();
+
+  // Declared outside the branch so the temporary statistics buffer stays
+  // alive until after the KeNormAffine launch that reads from it.
+  paddle::memory::AllocationPtr alloc_ptr{nullptr};
+
+  if (test_mode) {
+    mean_data = mean.template data<BatchNormParamType<T>>();
+    var_data = variance.template data<BatchNormParamType<T>>();
+  } else {
+    // x, x^2, 1, here 1 is used to calc device num
+    // device num also can be got from platform::DeviceContextPool
+    const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType<T>);
+    alloc_ptr = paddle::memory::Alloc(ctx, bytes);
+
+    auto *stats = reinterpret_cast<BatchNormParamType<T> *>(alloc_ptr->ptr());
+    const int threads = 256;
+    int grid = std::min(C, (max_threads + threads - 1) / threads);
+    if (layout == paddle::framework::DataLayout::kNCHW) {
+      KeLocalStats<T, threads, paddle::framework::DataLayout::kNCHW>
+          <<<grid, threads, 0, stream>>>(x_d, N, H * W * D, C, stats);
+    } else {
+      KeLocalStats<T, threads, paddle::framework::DataLayout::kNHWC>
+          <<<grid, threads, 0, stream>>>(x_d, N, H * W * D, C, stats);
+    }
+
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+    // Without a communicator the local statistics are used as-is.
+    auto *comm = ctx.nccl_comm();
+    if (comm) {
+      int dtype = paddle::platform::ToNCCLDataType(
+          paddle::framework::TransToProtoVarType(mean_out->dtype()));
+      // In-place operation
+      PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce(
+          stats,
+          stats,
+          2 * C + 1,
+          static_cast<ncclDataType_t>(dtype),
+          ncclSum,
+          comm,
+          stream));
+    }
+#endif
+
+    auto *est_mean_data = ctx.template Alloc<BatchNormParamType<T>>(mean_out);
+    auto *est_var_data =
+        ctx.template Alloc<BatchNormParamType<T>>(variance_out);
+
+    auto *sv_mean_data = ctx.template Alloc<BatchNormParamType<T>>(saved_mean);
+    auto *sv_inv_var_data =
+        ctx.template Alloc<BatchNormParamType<T>>(saved_variance);
+
+    // Note, Input('Mean')/Input('Variance') share variable with
+    // Output('MeanOut')/Output('VarianceOut')
+    KeSyncAndMovingStats<T>
+        <<<(C + block - 1) / block, block, 0, stream>>>(stats,
+                                                        stats + C,
+                                                        stats + 2 * C,
+                                                        C,
+                                                        momentum,
+                                                        epsilon,
+                                                        sv_mean_data,
+                                                        sv_inv_var_data,
+                                                        est_mean_data,
+                                                        est_var_data);
+
+    // Normalize with the batch statistics: the mean comes from saved_mean and
+    // the variance from the second C-sized segment of the temporary buffer.
+    mean_data = sv_mean_data;
+    var_data = stats + C;
+  }
+
+  int grid2 = (std::min(x_numel, max_threads) + block - 1) / block;
+  if (layout == paddle::framework::DataLayout::kNCHW) {
+    KeNormAffine<T, paddle::framework::DataLayout::kNCHW>
+        <<<grid2, block, 0, stream>>>(x_d,
+                                      s_d,
+                                      b_d,
+                                      mean_data,
+                                      var_data,
+                                      epsilon,
+                                      C,
+                                      H * W * D,
+                                      x_numel,
+                                      y_d);
+  } else {
+    KeNormAffine<T, paddle::framework::DataLayout::kNHWC>
+        <<<grid2, block, 0, stream>>>(x_d,
+                                      s_d,
+                                      b_d,
+                                      mean_data,
+                                      var_data,
+                                      epsilon,
+                                      C,
+                                      H * W * D,
+                                      x_numel,
+                                      y_d);
+  }
+}
+
+}  // namespace phi
+
+// Register the GPU sync_batch_norm kernel. The legacy fluid registration
+// noted that MIOPEN does not support double, so the HIP build registers
+// float and float16 only; other builds also register double.
+#ifdef PADDLE_WITH_HIP
+PD_REGISTER_KERNEL(sync_batch_norm,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SyncBatchNormKernel,
+                   float,
+                   phi::dtype::float16) {}
+#else
+PD_REGISTER_KERNEL(sync_batch_norm,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SyncBatchNormKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+#endif
--- a/paddle/fluid/operators/sync_batch_norm_op.cu.h
+++ b/paddle/fluid/operators/sync_batch_norm_op.cu.h
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -27,25 +27,20 @@ limitations under the License. */
 namespace cub = hipcub;
 #endif
 #include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/operators/batch_norm_op.h"
-#include "paddle/fluid/operators/norm_utils.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/common/layout.h"
+#include "paddle/phi/kernels/funcs/norm_utils.h"

-namespace paddle {
-namespace operators {
+namespace phi {

-using Tensor = framework::Tensor;
-using DataLayout = framework::DataLayout;
 template <typename T>
-using CudnnDataType = platform::CudnnDataType<T>;
+using CudnnDataType = paddle::platform::CudnnDataType<T>;
 template <typename T>
 using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;

-template <typename T, int BlockDim, framework::DataLayout layout>
+template <typename T, int BlockDim, DataLayout layout>
 __global__ void KeLocalStats(
    const T *x, int N, int M, int C, BatchNormParamType<T> *mean_var) {
  typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
@@ -54,8 +49,7 @@ __global__ void KeLocalStats(
    BatchNormParamType<T> x_sum = 0.;
    BatchNormParamType<T> x2_sum = 0.;
    for (int i = threadIdx.x; i < N * M; i += BlockDim) {
-      int id = layout == framework::DataLayout::kNCHW
-                   ? (i / M) * C * M + k * M + i % M
+      int id = layout == DataLayout::kNCHW ? (i / M) * C * M + k * M + i % M
                                           : i * C + k;
      auto x_in = static_cast<BatchNormParamType<T>>(x[id]);
      x_sum += x_in;
@@ -109,7 +103,7 @@ __global__ void KeSyncAndMovingStats(BatchNormParamType<T> *means,
  }
 }

-template <typename T, framework::DataLayout layout>
+template <typename T, DataLayout layout>
 static __global__ void KeNormAffine(const T *x,
                                    const BatchNormParamType<T> *scale,
                                    const BatchNormParamType<T> *bias,
@@ -123,7 +117,7 @@ static __global__ void KeNormAffine(const T *x,
  int gid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  for (int i = gid; i < num; i += stride) {
-    const int c = layout == framework::DataLayout::kNCHW ? (i / M) % C : i % C;
+    const int c = layout == DataLayout::kNCHW ? (i / M) % C : i % C;
    auto x_i = static_cast<BatchNormParamType<T>>(x[i]);
    auto y_i =
        (x_i - mean[c]) / sqrt(variance[c] + epsilon) * scale[c] + bias[c];
@@ -131,146 +125,7 @@ static __global__ void KeNormAffine(const T *x,
  }
 }

-template <typename DeviceContext, typename T>
-void SyncBatchNormFunctor(const framework::ExecutionContext &ctx,
-                          const DataLayout layout,
-                          const framework::Tensor *x,
-                          framework::Tensor *y,
-                          const framework::Tensor *mean,
-                          const framework::Tensor *variance,
-                          framework::Tensor *mean_out,
-                          framework::Tensor *variance_out,
-                          framework::Tensor *saved_mean,
-                          framework::Tensor *saved_variance,
-                          double epsilon,
-                          const float momentum,
-                          const bool is_test,
-                          const bool use_global_stats
-
-) {
-  const auto &x_dims = x->dims();
-  PADDLE_ENFORCE_GE(x_dims.size(),
-                    2,
-                    platform::errors::InvalidArgument(
-                        "The Input dim size should be larger than 1."));
-  PADDLE_ENFORCE_LE(x_dims.size(),
-                    5,
-                    platform::errors::InvalidArgument(
-                        "The Input dim size should be less than 6."));
-  int N, C, H, W, D;
-  ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D);
-  int x_numel = x->numel();
-
-  const T *x_d = x->data<T>();
-  const auto *s_d = ctx.Input<Tensor>("Scale")->data<BatchNormParamType<T>>();
-  const auto *b_d = ctx.Input<Tensor>("Bias")->data<BatchNormParamType<T>>();
-
-  T *y_d = y->mutable_data<T>(ctx.GetPlace());
-
-  const BatchNormParamType<T> *mean_data = nullptr;
-  const BatchNormParamType<T> *var_data = nullptr;
-
-  auto &dev_ctx = ctx.cuda_device_context();
-  auto stream = dev_ctx.stream();
-  const int block = 512;
-  int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
-
-  paddle::memory::AllocationPtr alloc_ptr{nullptr};
-
-  if (is_test) {
-    mean_data = mean->data<BatchNormParamType<T>>();
-    var_data = variance->data<BatchNormParamType<T>>();
-  } else {
-    // x, x^2, 1, here 1 is used to calc device num
-    // device num also can be got from platform::DeviceContextPool
-    const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType<T>);
-    alloc_ptr = memory::Alloc(dev_ctx, bytes);
-
-    auto *stats = reinterpret_cast<BatchNormParamType<T> *>(alloc_ptr->ptr());
-    const int threads = 256;
-    int grid = std::min(C, (max_threads + threads - 1) / threads);
-    if (layout == framework::DataLayout::kNCHW) {
-      KeLocalStats<T, threads, framework::DataLayout::kNCHW>
-          <<<grid, threads, 0, stream>>>(x_d, N, H * W * D, C, stats);
-    } else {
-      KeLocalStats<T, threads, framework::DataLayout::kNHWC>
-          <<<grid, threads, 0, stream>>>(x_d, N, H * W * D, C, stats);
-    }
-
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-    auto *comm = dev_ctx.nccl_comm();
-    if (comm) {
-      int dtype = platform::ToNCCLDataType(
-          framework::TransToProtoVarType(mean_out->dtype()));
-      // In-place operation
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::ncclAllReduce(stats,
-                                           stats,
-                                           2 * C + 1,
-                                           static_cast<ncclDataType_t>(dtype),
-                                           ncclSum,
-                                           comm,
-                                           stream));
-    }
-#endif
-
-    auto *est_mean_data =
-        mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    auto *est_var_data =
-        variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-
-    auto *sv_mean_data =
-        saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    auto *sv_inv_var_data =
-        saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-
-    // Note, Input('Mean')/Input('Variance') share variable with
-    // Output('MeanOut')/Output('VarianceOut')
-    KeSyncAndMovingStats<T>
-        <<<(C + block - 1) / block, block, 0, stream>>>(stats,
-                                                        stats + C,
-                                                        stats + 2 * C,
-                                                        C,
-                                                        momentum,
-                                                        epsilon,
-                                                        sv_mean_data,
-                                                        sv_inv_var_data,
-                                                        est_mean_data,
-                                                        est_var_data);
-
-    mean_data = sv_mean_data;
-    var_data = stats + C;
-  }
-
-  int grid2 = (std::min(x_numel, max_threads) + block - 1) / block;
-  if (layout == framework::DataLayout::kNCHW) {
-    KeNormAffine<T, framework::DataLayout::kNCHW>
-        <<<grid2, block, 0, stream>>>(x_d,
-                                      s_d,
-                                      b_d,
-                                      mean_data,
-                                      var_data,
-                                      epsilon,
-                                      C,
-                                      H * W * D,
-                                      x_numel,
-                                      y_d);
-  } else {
-    KeNormAffine<T, framework::DataLayout::kNHWC>
-        <<<grid2, block, 0, stream>>>(x_d,
-                                      s_d,
-                                      b_d,
-                                      mean_data,
-                                      var_data,
-                                      epsilon,
-                                      C,
-                                      H * W * D,
-                                      x_numel,
-                                      y_d);
-  }
-}
-
-template <typename T, const int BlockDim, framework::DataLayout layout>
+template <typename T, const int BlockDim, DataLayout layout>
 __global__ void KeBackwardLocalStats(const T *dy,
                                     const T *x,
                                     const BatchNormParamType<T> *means,
@@ -285,8 +140,7 @@ __global__ void KeBackwardLocalStats(const T *dy,
    BatchNormParamType<T> sum2 = 0.;
    auto mean = means[k];
    for (int i = threadIdx.x; i < N * M; i += blockDim.x) {
-      int id = layout == framework::DataLayout::kNCHW
-                   ? (i / M) * C * M + k * M + i % M
+      int id = layout == DataLayout::kNCHW ? (i / M) * C * M + k * M + i % M
                                           : i * C + k;
      auto g = static_cast<BatchNormParamType<T>>(dy[id]);
      sum1 += g;
@@ -311,7 +165,7 @@ __global__ void KeBackwardLocalStats(const T *dy,
  }
 }

-template <typename T, int BlockDim, framework::DataLayout layout>
+template <typename T, int BlockDim, DataLayout layout>
 static __global__ void KeBNBackwardScaleBias(
    const T *dy,
    const T *x,
@@ -335,7 +189,7 @@ static __global__ void KeBNBackwardScaleBias(
    auto inv_var_i = inv_variance[i];
    auto mean_i = mean[i];
    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
-      const int id = layout == framework::DataLayout::kNCHW
+      const int id = layout == DataLayout::kNCHW
                         ? ((j / HxW) * C + i) * HxW + (j % HxW)
                         : j * outer_size + i;
      auto x_i = static_cast<BatchNormParamType<T>>(x[id]);
@@ -356,7 +210,7 @@ static __global__ void KeBNBackwardScaleBias(
  }
 }

-template <typename T, framework::DataLayout layout>
+template <typename T, DataLayout layout>
 static __global__ void KeBNRestoreData(T *x,
                                       const BatchNormParamType<T> *scale,
                                       const BatchNormParamType<T> *bias,
@@ -370,14 +224,14 @@ static __global__ void KeBNRestoreData(T *x,
  int gid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  for (int i = gid; i < num; i += stride) {
-    const int c = layout == framework::DataLayout::kNCHW ? (i / M) % C : i % C;
+    const int c = layout == DataLayout::kNCHW ? (i / M) % C : i % C;
    auto y_i = static_cast<BatchNormParamType<T>>(y[i]);
    auto x_i = (y_i - bias[c]) / scale[c] / sv_inv[c] + mean[c];
    x[i] = static_cast<T>(x_i);
  }
 }

-template <typename T, framework::DataLayout layout>
+template <typename T, DataLayout layout>
 static __global__ void KeBNBackwardData(
    const T *dy,
    const T *x,
@@ -397,7 +251,7 @@ static __global__ void KeBNBackwardData(
  auto scale = static_cast<BatchNormParamType<T>>(C) / num;
  auto dev_num = num_dev[0];
  for (int i = gid; i < num; i += stride) {
-    const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
+    const int c = layout == DataLayout::kNCHW ? i / HxW % C : i % C;
    auto inv_var = inv_variance[c];
    auto s_d = gamma[c];
    auto gvar =
@@ -412,64 +266,80 @@ static __global__ void KeBNBackwardData(
  }
 }

-template <typename DeviceContext, typename T>
-void SyncBatchNormGradFunctor(const framework::ExecutionContext &ctx,
-                              const DataLayout layout,
-                              const framework::Tensor *scale,
-                              const framework::Tensor *bias,
-                              framework::Tensor *d_x,
-                              const framework::Tensor *d_y,
-                              framework::Tensor *d_scale,
-                              framework::Tensor *d_bias,
-                              const framework::Tensor *mean,
-                              const framework::Tensor *variance,
-                              const double epsilon) {
-  // sync_batch_norm with inplace as false will take X as grad input, which
-  // is same as cuDNN batch_norm backward calculation, batch_norm
-  // with inplace as true only take Y as input and X should be calculate
-  // by inverse operation of batch_norm on Y
-  const Tensor *x;
-  bool is_inplace;
-  if (ctx.HasInput("Y")) {
-    x = ctx.Input<Tensor>("Y");
+template <typename T, typename Context>
+void SyncBatchNormGradFunctor(
+    const Context &ctx,
+    const DenseTensor *input_x,
+    const DenseTensor *input_y,
+    const DenseTensor &scale,
+    const DenseTensor &bias,
+    // const paddle::optional<DenseTensor>& mean,
+    // const paddle::optional<DenseTensor>& variance,
+    const DenseTensor &saved_mean,
+    const DenseTensor &saved_variance,
+    // const paddle::optional<DenseTensor>& reserve_space,
+    const DenseTensor &y_grad,
+    // float momentum,
+    float epsilon_f,
+    const std::string &data_layout_str,
+    // bool is_test,
+    // bool use_global_stats,
+    // bool trainable_statistics,
+    // bool fuse_with_relu,
+    DenseTensor *x_grad,
+    DenseTensor *scale_grad,
+    DenseTensor *bias_grad) {
+  double epsilon = static_cast<double>(epsilon_f);
+
+  const DataLayout layout =
+      paddle::framework::StringToDataLayout(data_layout_str);
+
+  const auto *d_y = &y_grad;
+
+  auto *d_x = x_grad;
+  auto *d_scale = scale_grad;
+  auto *d_bias = bias_grad;
+
+  const DenseTensor *x;
+  bool is_inplace = false;
+  if (input_y) {
    is_inplace = true;
+    x = input_y;
  } else {
-    x = ctx.Input<Tensor>("X");
-    is_inplace = false;
+    x = input_x;
  }
-
  const auto &x_dims = x->dims();

  PADDLE_ENFORCE_GE(x_dims.size(),
                    2,
-                    platform::errors::InvalidArgument(
+                    phi::errors::InvalidArgument(
                        "The Input X dim size should be larger than 1."));
  PADDLE_ENFORCE_LE(x_dims.size(),
                    5,
-                    platform::errors::InvalidArgument(
+                    phi::errors::InvalidArgument(
                        "The Input X dim size should be less than 6."));

  int N, C, H, W, D;
-  ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D);
-  PADDLE_ENFORCE_EQ(scale->dims()[0],
+  funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D);
+  PADDLE_ENFORCE_EQ(scale.dims()[0],
                    C,
-                    platform::errors::InvalidArgument(
+                    phi::errors::InvalidArgument(
                        "Expected first dim for input parameter(scale) of "
                        "OP(sync_batch_norm) be (%d), but given (%d).",
                        C,
-                        scale->dims()[0]));
+                        scale.dims()[0]));

-  d_x->mutable_data<T>(ctx.GetPlace());
+  ctx.template Alloc<T>(d_x);
  if (d_scale && d_bias) {
-    d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    ctx.template Alloc<BatchNormParamType<T>>(d_scale);
+    ctx.template Alloc<BatchNormParamType<T>>(d_bias);
  }
-  PADDLE_ENFORCE_EQ(scale->dims().size(),
+  PADDLE_ENFORCE_EQ(scale.dims().size(),
                    1UL,
-                    platform::errors::InvalidArgument(
+                    phi::errors::InvalidArgument(
                        "Expected rank for input parameter(scale) of "
                        "OP(sync_batch_norm) be (1), but given (%d).",
-                        scale->dims().size()));
+                        scale.dims().size()));

  std::vector<int> dims;
  std::vector<int> strides;
@@ -484,30 +354,31 @@ void SyncBatchNormGradFunctor(const framework::ExecutionContext &ctx,
  auto px = *x;
  const T *dy_d = d_y->data<T>();

-  auto &dev_ctx = ctx.cuda_device_context();
-  auto stream = dev_ctx.stream();
+  auto stream = ctx.stream();

-  const auto *saved_mean = mean->data<BatchNormParamType<T>>();
-  const auto *saved_inv_var = variance->data<BatchNormParamType<T>>();
+  const auto *saved_mean_ptr =
+      saved_mean.template data<BatchNormParamType<T>>();
+  const auto *saved_inv_var =
+      saved_variance.template data<BatchNormParamType<T>>();
  const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType<T>);
-  auto alloc_ptr = memory::Alloc(dev_ctx, bytes);
+  auto alloc_ptr = paddle::memory::Alloc(ctx, bytes);
  auto *stats = reinterpret_cast<BatchNormParamType<T> *>(alloc_ptr->ptr());

  const int block = 512;
  const int threads = 256;
  int x_numel = x->numel();
  int fsize = H * W * D;
-  int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+  int max_threads = ctx.GetMaxPhysicalThreadCount();
  int grid = std::min(C, (max_threads + threads - 1) / threads);
  int grid2 = (std::min(x_numel, max_threads) + block - 1) / block;

  if (is_inplace) {
-    if (layout == framework::DataLayout::kNCHW) {
-      KeBNRestoreData<T, framework::DataLayout::kNCHW>
-          <<<grid2, block, 0, stream>>>(px.mutable_data<T>(ctx.GetPlace()),
-                                        scale->data<BatchNormParamType<T>>(),
-                                        bias->data<BatchNormParamType<T>>(),
-                                        saved_mean,
+    if (layout == DataLayout::kNCHW) {
+      KeBNRestoreData<T, DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
+          ctx.template Alloc<T>(&px),
+          scale.template data<BatchNormParamType<T>>(),
+          bias.template data<BatchNormParamType<T>>(),
+          saved_mean_ptr,
          saved_inv_var,
          epsilon,
          C,
@@ -515,11 +386,11 @@ void SyncBatchNormGradFunctor(const framework::ExecutionContext &ctx,
          x_numel,
          x->data<T>());
    } else {
-      KeBNRestoreData<T, framework::DataLayout::kNHWC>
-          <<<grid2, block, 0, stream>>>(px.mutable_data<T>(ctx.GetPlace()),
-                                        scale->data<BatchNormParamType<T>>(),
-                                        bias->data<BatchNormParamType<T>>(),
-                                        saved_mean,
+      KeBNRestoreData<T, DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
+          ctx.template Alloc<T>(&px),
+          scale.template data<BatchNormParamType<T>>(),
+          bias.template data<BatchNormParamType<T>>(),
+          saved_mean_ptr,
          saved_inv_var,
          epsilon,
          C,
@@ -529,24 +400,24 @@ void SyncBatchNormGradFunctor(const framework::ExecutionContext &ctx,
    }
  }

-  if (layout == framework::DataLayout::kNCHW) {
-    KeBackwardLocalStats<T, threads, framework::DataLayout::kNCHW>
+  if (layout == DataLayout::kNCHW) {
+    KeBackwardLocalStats<T, threads, DataLayout::kNCHW>
        <<<grid, threads, 0, stream>>>(
-            dy_d, x_d, saved_mean, N, fsize, C, stats);
+            dy_d, x_d, saved_mean_ptr, N, fsize, C, stats);
  } else {
-    KeBackwardLocalStats<T, threads, framework::DataLayout::kNHWC>
+    KeBackwardLocalStats<T, threads, DataLayout::kNHWC>
        <<<grid, threads, 0, stream>>>(
-            dy_d, x_d, saved_mean, N, fsize, C, stats);
+            dy_d, x_d, saved_mean_ptr, N, fsize, C, stats);
  }

 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-  auto *comm = dev_ctx.nccl_comm();
+  auto *comm = ctx.nccl_comm();
  if (comm) {
-    int dtype = platform::ToNCCLDataType(
-        framework::TransToProtoVarType(scale->dtype()));
+    int dtype = paddle::platform::ToNCCLDataType(
+        paddle::framework::TransToProtoVarType(scale.dtype()));
    // In-place operation
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        platform::dynload::ncclAllReduce(stats,
+    PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce(
+        stats,
        stats,
        2 * C + 1,
        static_cast<ncclDataType_t>(dtype),
@@ -556,12 +427,12 @@ void SyncBatchNormGradFunctor(const framework::ExecutionContext &ctx,
  }
 #endif

-  if (layout == framework::DataLayout::kNCHW) {
+  if (layout == DataLayout::kNCHW) {
    if (d_scale && d_bias) {
-      KeBNBackwardScaleBias<T, threads, framework::DataLayout::kNCHW>
+      KeBNBackwardScaleBias<T, threads, DataLayout::kNCHW>
          <<<grid, threads, 0, stream>>>(dy_d,
                                         x_d,
-                                         saved_mean,
+                                         saved_mean_ptr,
                                         saved_inv_var,
                                         epsilon,
                                         N,
@@ -571,11 +442,11 @@ void SyncBatchNormGradFunctor(const framework::ExecutionContext &ctx,
                                         d_bias->data<BatchNormParamType<T>>());
    }
    if (d_x) {
-      KeBNBackwardData<T, framework::DataLayout::kNCHW>
-          <<<grid2, block, 0, stream>>>(dy_d,
+      KeBNBackwardData<T, DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
+          dy_d,
          x_d,
-                                        scale->data<BatchNormParamType<T>>(),
-                                        saved_mean,
+          scale.template data<BatchNormParamType<T>>(),
+          saved_mean_ptr,
          saved_inv_var,
          stats,
          stats + C,
@@ -588,10 +459,10 @@ void SyncBatchNormGradFunctor(const framework::ExecutionContext &ctx,
    }
  } else {
    if (d_scale && d_bias) {
-      KeBNBackwardScaleBias<T, threads, framework::DataLayout::kNHWC>
+      KeBNBackwardScaleBias<T, threads, DataLayout::kNHWC>
          <<<grid, threads, 0, stream>>>(dy_d,
                                         x_d,
-                                         saved_mean,
+                                         saved_mean_ptr,
                                         saved_inv_var,
                                         epsilon,
                                         N,
@@ -601,11 +472,11 @@ void SyncBatchNormGradFunctor(const framework::ExecutionContext &ctx,
                                         d_bias->data<BatchNormParamType<T>>());
    }
    if (d_x) {
-      KeBNBackwardData<T, framework::DataLayout::kNHWC>
-          <<<grid2, block, 0, stream>>>(dy_d,
+      KeBNBackwardData<T, DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
+          dy_d,
          x_d,
-                                        scale->data<BatchNormParamType<T>>(),
-                                        saved_mean,
+          scale.template data<BatchNormParamType<T>>(),
+          saved_mean_ptr,
          saved_inv_var,
          stats,
          stats + C,
@@ -619,19 +490,4 @@ void SyncBatchNormGradFunctor(const framework::ExecutionContext &ctx,
  }
 }

-template <typename DeviceContext, typename T>
-class SyncBatchNormKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override;
-};
-
-// Deriving the Gradient for the Backward Pass of Batch Normalization
-// https://kevinzakka.github.io/2016/09/14/batch_normalization/
-template <typename DeviceContext, typename T>
-class SyncBatchNormGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override;
-};
-
-}  // namespace operators
-}  // namespace paddle
+}  // namespace phi
--- a/paddle/phi/kernels/sync_batch_norm_grad_kernel.h
+++ b/paddle/phi/kernels/sync_batch_norm_grad_kernel.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>  // declaration only; no body in this header
+void SyncBatchNormGradKernel(const Context& dev_ctx,
+                             const DenseTensor& x,  // first of 9 tensor inputs (through y_grad)
+                             const DenseTensor& scale,
+                             const DenseTensor& bias,
+                             const paddle::optional<DenseTensor>& mean,  // NOTE(review): only <string> and dense_tensor.h are included above; confirm paddle::optional is reachable
+                             const paddle::optional<DenseTensor>& variance,
+                             const DenseTensor& saved_mean,
+                             const DenseTensor& saved_variance,
+                             const paddle::optional<DenseTensor>& reserve_space,
+                             const DenseTensor& y_grad,  // listed as "Y@GRAD" in sync_batch_norm_sig.cc
+                             float momentum,  // first of 7 attributes (through fuse_with_relu)
+                             float epsilon,
+                             const std::string& data_layout,
+                             bool is_test,
+                             bool use_global_stats,
+                             bool trainable_statistics,
+                             bool fuse_with_relu,
+                             DenseTensor* x_grad,  // 3 outputs; same order as "X@GRAD", "Scale@GRAD", "Bias@GRAD" in sync_batch_norm_sig.cc
+                             DenseTensor* scale_grad,
+                             DenseTensor* bias_grad);
+
+}  // namespace phi
--- a/paddle/phi/kernels/sync_batch_norm_kernel.h
+++ b/paddle/phi/kernels/sync_batch_norm_kernel.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>  // declaration only; no body in this header
+void SyncBatchNormKernel(const Context& dev_ctx,
+                         const DenseTensor& x,  // first of 5 tensor inputs (through variance)
+                         const DenseTensor& scale,
+                         const DenseTensor& bias,
+                         const DenseTensor& mean,
+                         const DenseTensor& variance,
+                         float momentum,  // first of 7 attributes (through fuse_with_relu)
+                         float epsilon,
+                         const std::string& data_layout,
+                         bool is_test,
+                         bool use_global_stats,
+                         bool trainable_statistics,
+                         bool fuse_with_relu,
+                         DenseTensor* y,  // 6 outputs; same order as "Y" ... "ReserveSpace" in sync_batch_norm_sig.cc
+                         DenseTensor* mean_out,
+                         DenseTensor* variance_out,
+                         DenseTensor* saved_mean,
+                         DenseTensor* saved_variance,
+                         DenseTensor* reserve_space);
+}  // namespace phi
--- a/paddle/phi/ops/compat/sync_batch_norm_sig.cc
+++ b/paddle/phi/ops/compat/sync_batch_norm_sig.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature SyncBatchNormOpArgumentMapping(  // returns a fixed signature for "sync_batch_norm"
+    const ArgumentMappingContext& ctx) {  // ctx is not read in this function
+  return KernelSignature("sync_batch_norm",  // kernel name
+                         {"X", "Scale", "Bias", "Mean", "Variance"},  // input names
+                         {"momentum",  // attribute names (7)
+                          "epsilon",
+                          "data_layout",
+                          "is_test",
+                          "use_global_stats",
+                          "trainable_statistics",
+                          "fuse_with_relu"},
+                         {"Y",  // output names (6)
+                          "MeanOut",
+                          "VarianceOut",
+                          "SavedMean",
+                          "SavedVariance",
+                          "ReserveSpace"});
+}
+
+KernelSignature SyncBatchNormGradOpArgumentMapping(  // returns a fixed signature for "sync_batch_norm_grad"
+    const ArgumentMappingContext& ctx) {  // ctx is not read in this function
+  return KernelSignature("sync_batch_norm_grad",  // kernel name
+                         {  // input names (9)
+                             "X",
+                             "Scale",
+                             "Bias",
+                             "Mean",
+                             "Variance",
+                             "SavedMean",
+                             "SavedVariance",
+                             "ReserveSpace",
+                             "Y@GRAD",
+                         },
+                         {"momentum",  // attribute names (7), same list as the forward signature
+                          "epsilon",
+                          "data_layout",
+                          "is_test",
+                          "use_global_stats",
+                          "trainable_statistics",
+                          "fuse_with_relu"},
+                         {"X@GRAD", "Scale@GRAD", "Bias@GRAD"});  // output names (3)
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(sync_batch_norm,  // registers the forward mapping function under this name
+                           phi::SyncBatchNormOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(sync_batch_norm_grad,  // registers the grad mapping function under this name
+                           phi::SyncBatchNormGradOpArgumentMapping);
--- a/python/paddle/nn/layer/norm.py
+++ b/python/paddle/nn/layer/norm.py
@@ -49,6 +49,7 @@ from .. import functional as F
 from paddle import _C_ops
 from .. import Layer
 from paddle import in_dynamic_mode
+from paddle.fluid.framework import in_dygraph_mode

 __all__ = []

@@ -1100,7 +1101,14 @@ class SyncBatchNorm(_BatchNormBase):

        ### train mode: use mini-batch stats, eval mode: use global stats
        ### use_global_stats only support False in sync_batch_norm
-        if in_dynamic_mode():
+        if in_dygraph_mode():
+            sync_batch_norm_out, _, _, _, _, _ = _C_ops.final_state_sync_batch_norm(
+                x, self.weight, self.bias, self._mean, self._variance,
+                self._momentum, self._epsilon, self._data_format,
+                not self.training, False, False, False)
+            return sync_batch_norm_out
+
+        elif in_dynamic_mode():
            attrs = ("momentum", self._momentum, "epsilon", self._epsilon,
                     "is_test", not self.training, "data_layout",
                     self._data_format, "use_mkldnn", False, "fuse_with_relu",
@@ -1109,7 +1117,6 @@ class SyncBatchNorm(_BatchNormBase):
            sync_batch_norm_out, _, _, _, _, _ = _C_ops.sync_batch_norm(
                x, self.weight, self.bias, self._mean, self._variance, mean_out,
                variance_out, *attrs)
-
            return sync_batch_norm_out

        check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],