[Phi] migrate sync_batch_norm to phi (#44369)

c99c70cb · lyq · GitHub · b8d106e1 · c99c70cb · b8d106e1
11 changed files
--- a/paddle/fluid/operators/inplace_abn_op.cu
+++ b/paddle/fluid/operators/inplace_abn_op.cu
@@ -13,17 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/inplace_abn_op.h"
+#include <iostream>
 #include "paddle/fluid/operators/batch_norm_op.h"
-#include "paddle/fluid/operators/sync_batch_norm_op.cu.h"
 #include "paddle/phi/kernels/batch_norm_grad_kernel.h"
 #include "paddle/phi/kernels/batch_norm_kernel.h"
+#include "paddle/phi/kernels/gpu/sync_batch_norm_utils.h"
+#include "paddle/phi/kernels/sync_batch_norm_grad_kernel.h"
+#include "paddle/phi/kernels/sync_batch_norm_kernel.h"
 namespace paddle {
 namespace operators {
 template <typename DeviceContext, typename T>
-class InplaceABNKernel
+class InplaceABNKernel : public framework::OpKernel<T> {
-    : public paddle::operators::SyncBatchNormKernel<DeviceContext, T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* y = ctx.Output<Tensor>("Y");
@@ -36,29 +38,49 @@ class InplaceABNKernel
        GetInplaceABNActivationType(ctx.Attr<std::string>("activation"));
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    auto* scale = ctx.Input<Tensor>("Scale");
+    auto* bias = ctx.Input<Tensor>("Bias");
+    auto* mean = ctx.Input<Tensor>("Mean");
+    auto* variance = ctx.Input<Tensor>("Variance");
+    auto momentum = ctx.Attr<float>("momentum");
+    auto epsilon = ctx.Attr<float>("epsilon");
+    auto data_layout = ctx.Attr<std::string>("data_layout");
+    auto is_test = ctx.Attr<bool>("is_test");
+    auto use_global_stats = ctx.Attr<bool>("use_global_stats");
+    auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
+    auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
+    auto* mean_out = ctx.Output<Tensor>("MeanOut");
+    auto* variance_out = ctx.Output<Tensor>("VarianceOut");
+    auto* saved_mean = ctx.Output<Tensor>("SavedMean");
+    auto* saved_variance = ctx.Output<Tensor>("SavedVariance");
+    auto* reserve_space = ctx.Output<Tensor>("ReserveSpace");
    if (ctx.Attr<bool>("use_sync_bn")) {
-      SyncBatchNormKernel<DeviceContext, T>::Compute(ctx);
+      auto& dev_ctx = ctx.device_context<DeviceContext>();
+      phi::SyncBatchNormKernel<T>(
+          static_cast<const typename framework::ConvertToPhiContext<
+              DeviceContext>::TYPE&>(dev_ctx),
+          *x,
+          *scale,
+          *bias,
+          *mean,
+          *variance,
+          momentum,
+          epsilon,
+          data_layout,
+          is_test,
+          use_global_stats,
+          trainable_statistics,
+          fuse_with_relu,
+          y,
+          mean_out,
+          variance_out,
+          saved_mean,
+          saved_variance,
+          reserve_space);
    } else {
-      // BatchNormKernel<DeviceContext, T>::Compute(ctx);
-      auto* scale = ctx.Input<Tensor>("Scale");
-      auto* bias = ctx.Input<Tensor>("Bias");
-      auto* mean = ctx.Input<Tensor>("Mean");
-      auto* variance = ctx.Input<Tensor>("Variance");
-      auto momentum = ctx.Attr<float>("momentum");
-      auto epsilon = ctx.Attr<float>("epsilon");
-      auto data_layout = ctx.Attr<std::string>("data_layout");
-      auto is_test = ctx.Attr<bool>("is_test");
-      auto use_global_stats = ctx.Attr<bool>("use_global_stats");
-      auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
-      auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
-      auto* mean_out = ctx.Output<Tensor>("MeanOut");
-      auto* variance_out = ctx.Output<Tensor>("VarianceOut");
-      auto* saved_mean = ctx.Output<Tensor>("SavedMean");
-      auto* saved_variance = ctx.Output<Tensor>("SavedVariance");
-      auto* reserve_space = ctx.Output<Tensor>("ReserveSpace");
      auto& dev_ctx = ctx.device_context<DeviceContext>();
      phi::BatchNormKernel<T>(
          static_cast<const typename framework::ConvertToPhiContext<
@@ -92,8 +114,7 @@ class InplaceABNKernel
 // Deriving the Gradient for the Backward Pass of Batch Normalization
 // https://kevinzakka.github.io/2016/09/14/batch_normalization/
 template <typename DeviceContext, typename T>
-class InplaceABNGradKernel
+class InplaceABNGradKernel : public framework::OpKernel<T> {
-    : public paddle::operators::SyncBatchNormGradKernel<DeviceContext, T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const auto* y = ctx.Input<Tensor>("Y");
@@ -115,29 +136,44 @@ class InplaceABNGradKernel
    InplaceABNActivation<DeviceContext, T> functor;
    functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy);
+    auto* scale = ctx.Input<Tensor>("Scale");
+    auto* bias = ctx.Input<Tensor>("Bias");
+    auto* saved_mean = ctx.Input<Tensor>("SavedMean");
+    auto* saved_variance = ctx.Input<Tensor>("SavedVariance");
+    auto momentum = ctx.Attr<float>("momentum");
+    auto epsilon = ctx.Attr<float>("epsilon");
+    auto data_layout = ctx.Attr<std::string>("data_layout");
+    auto is_test = ctx.Attr<bool>("is_test");
+    auto use_global_stats = ctx.Attr<bool>("use_global_stats");
+    auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
+    auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
+    auto* scale_grad = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto* bias_grad = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+    auto* reserve_space = ctx.Input<Tensor>("ReserveSpace");
+    auto* mean = ctx.Input<Tensor>("ReserveSpace");
+    auto* variance = ctx.Input<Tensor>("ReserveSpace");
    if (ctx.Attr<bool>("use_sync_bn")) {
-      SyncBatchNormGradKernel<DeviceContext, T>::Compute(ctx);
+      auto& dev_ctx = ctx.device_context<DeviceContext>();
+      phi::SyncBatchNormGradFunctor<T>(
+          static_cast<const typename framework::ConvertToPhiContext<
+              DeviceContext>::TYPE&>(dev_ctx),
+          nullptr,
+          y,
+          *scale,
+          *bias,
+          *saved_mean,
+          *saved_variance,
+          *d_y,
+          epsilon,
+          data_layout,
+          d_x,
+          scale_grad,
+          bias_grad);
    } else {
-      auto* scale = ctx.Input<Tensor>("Scale");
-      auto* bias = ctx.Input<Tensor>("Bias");
-      auto* saved_mean = ctx.Input<Tensor>("SavedMean");
-      auto* saved_variance = ctx.Input<Tensor>("SavedVariance");
-      auto momentum = ctx.Attr<float>("momentum");
-      auto epsilon = ctx.Attr<float>("epsilon");
-      auto data_layout = ctx.Attr<std::string>("data_layout");
-      auto is_test = ctx.Attr<bool>("is_test");
-      auto use_global_stats = ctx.Attr<bool>("use_global_stats");
-      auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
-      auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
-      auto* scale_grad = ctx.Output<Tensor>(framework::GradVarName("Scale"));
-      auto* bias_grad = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-      auto* reserve_space = ctx.Input<Tensor>("ReserveSpace");
-      auto* mean = ctx.Input<Tensor>("ReserveSpace");
-      auto* variance = ctx.Input<Tensor>("ReserveSpace");
      paddle::optional<Tensor> space_opt;
      paddle::optional<Tensor> mean_opt;
      paddle::optional<Tensor> variance_opt;

--- a/paddle/fluid/operators/sync_batch_norm_op.cu
+++ b/paddle/fluid/operators/sync_batch_norm_op.cu
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/sync_batch_norm_op.cu.h"
-namespace paddle {
-namespace operators {
-template <typename T>
-class SyncBatchNormKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
-    const float momentum = ctx.Attr<float>("momentum");
-    const bool is_test = ctx.Attr<bool>("is_test");
-    const std::string layout_str = ctx.Attr<std::string>("data_layout");
-    const DataLayout layout = framework::StringToDataLayout(layout_str);
-    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
-    const bool trainable_stats = ctx.Attr<bool>("trainable_statistics");
-    PADDLE_ENFORCE_EQ(use_global_stats,
-                      false,
-                      platform::errors::InvalidArgument(
-                          "sync_batch_norm doesn't support "
-                          "to set use_global_stats True. Please use batch_norm "
-                          "in this case."));
-    const auto *x = ctx.Input<Tensor>("X");
-    auto *y = ctx.Output<Tensor>("Y");
-    const auto *est_mean = ctx.Input<Tensor>("Mean");
-    const auto *est_var = ctx.Input<Tensor>("Variance");
-    // moving mean/variance
-    auto *mean_out = ctx.Output<Tensor>("MeanOut");
-    auto *variance_out = ctx.Output<Tensor>("VarianceOut");
-    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
-    auto *saved_inv_variance = ctx.Output<Tensor>("SavedVariance");
-    bool test_mode = is_test && (!trainable_stats);
-    SyncBatchNormFunctor<platform::CUDADeviceContext, T>(ctx,
-                                                         layout,
-                                                         x,
-                                                         y,
-                                                         est_mean,
-                                                         est_var,
-                                                         mean_out,
-                                                         variance_out,
-                                                         saved_mean,
-                                                         saved_inv_variance,
-                                                         epsilon,
-                                                         momentum,
-                                                         test_mode,
-                                                         use_global_stats);
-  }
-};
-template <typename T>
-class SyncBatchNormGradKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(ctx.GetPlace()),
-        true,
-        platform::errors::InvalidArgument("It must use CUDAPlace."));
-    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
-    const std::string layout_str = ctx.Attr<std::string>("data_layout");
-    const DataLayout layout = framework::StringToDataLayout(layout_str);
-    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    const auto *scale = ctx.Input<Tensor>("Scale");
-    const auto *bias = ctx.Input<Tensor>("Bias");
-    // init output
-    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
-    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
-    const auto *saved_inv_var = ctx.Input<Tensor>("SavedVariance");
-    SyncBatchNormGradFunctor<platform::CUDADeviceContext, T>(ctx,
-                                                             layout,
-                                                             scale,
-                                                             bias,
-                                                             d_x,
-                                                             d_y,
-                                                             d_scale,
-                                                             d_bias,
-                                                             saved_mean,
-                                                             saved_inv_var,
-                                                             epsilon);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-#ifdef PADDLE_WITH_HIP
-// MIOPEN do not support double
-REGISTER_OP_CUDA_KERNEL(
-    sync_batch_norm,
-    ops::SyncBatchNormKernel<plat::CUDADeviceContext, float>,
-    ops::SyncBatchNormKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    sync_batch_norm_grad,
-    ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, float>,
-    ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
-#else
-REGISTER_OP_CUDA_KERNEL(
-    sync_batch_norm,
-    ops::SyncBatchNormKernel<plat::CUDADeviceContext, float>,
-    ops::SyncBatchNormKernel<plat::CUDADeviceContext, double>,
-    ops::SyncBatchNormKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    sync_batch_norm_grad,
-    ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, float>,
-    ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, double>,
-    ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
-#endif
-// clang-format on
--- a/paddle/phi/api/yaml/legacy_api.yaml
+++ b/paddle/phi/api/yaml/legacy_api.yaml
@@ -2075,6 +2075,16 @@
    func : swish
  backward : swish_grad
+# sync_batch_norm
+# Forward API entry for sync_batch_norm. Output metas are inferred with
+# BatchNormInferMeta, the call dispatches to the `sync_batch_norm` kernel, and
+# the gradient is described by the `sync_batch_norm_grad` backward entry.
+- api : sync_batch_norm
+  args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu)
+  output : Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space)
+  infer_meta :
+    func : BatchNormInferMeta
+  kernel :
+    func : sync_batch_norm
+  backward : sync_batch_norm_grad
 # take_along_axis
 - api : take_along_axis
  args : (Tensor x, Tensor index, int axis)

--- a/paddle/phi/api/yaml/legacy_backward.yaml
+++ b/paddle/phi/api/yaml/legacy_backward.yaml
@@ -2085,6 +2085,18 @@
    func : swish_grad
  inplace : (out_grad -> x_grad)
+# sync_batch_norm_grad
+# Backward API entry for sync_batch_norm. The metas of x_grad, scale_grad and
+# bias_grad are inferred from x, scale and bias; the kernel data type follows
+# out_grad. mean_out, variance_out and reserve_space are optional inputs.
+- backward_api : sync_batch_norm_grad
+  forward : sync_batch_norm (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space)
+  args : (Tensor x, Tensor scale, Tensor bias, Tensor mean_out, Tensor variance_out, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor out_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu)
+  output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad)
+  infer_meta :
+    func : GeneralTernaryGradInferMeta
+    param : [x, scale, bias]
+  kernel :
+    func : sync_batch_norm_grad
+    data_type : out_grad
+  optional : mean_out, variance_out, reserve_space
 - backward_api : take_along_axis_grad
  forward : take_along_axis (Tensor x, Tensor index, int axis) -> Tensor(out)
  args : (Tensor x, Tensor index, Tensor out_grad, int axis)

--- a/paddle/phi/kernels/gpu/sync_batch_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/sync_batch_norm_grad_kernel.cu
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/kernels/sync_batch_norm_grad_kernel.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/gpu/sync_batch_norm_utils.h"
+namespace phi {
+// GPU backward kernel of sync_batch_norm.
+//
+// Thin wrapper that forwards to SyncBatchNormGradFunctor. Only x, scale, bias,
+// saved_mean, saved_variance, y_grad, epsilon_f and data_layout_str are passed
+// on; mean, variance, reserve_space, momentum, is_test, use_global_stats,
+// trainable_statistics and fuse_with_relu are not referenced in this function.
+// The results are written through x_grad, scale_grad and bias_grad.
+template <typename T, typename Context>
+void SyncBatchNormGradKernel(const Context& ctx,
+                             const DenseTensor& x,
+                             const DenseTensor& scale,
+                             const DenseTensor& bias,
+                             const paddle::optional<DenseTensor>& mean,
+                             const paddle::optional<DenseTensor>& variance,
+                             const DenseTensor& saved_mean,
+                             const DenseTensor& saved_variance,
+                             const paddle::optional<DenseTensor>& reserve_space,
+                             const DenseTensor& y_grad,
+                             float momentum,
+                             float epsilon_f,
+                             const std::string& data_layout_str,
+                             bool is_test,
+                             bool use_global_stats,
+                             bool trainable_statistics,
+                             bool fuse_with_relu,
+                             DenseTensor* x_grad,
+                             DenseTensor* scale_grad,
+                             DenseTensor* bias_grad) {
+  // The functor takes two tensor pointers right after ctx: &x is supplied and
+  // the second one is left null.
+  // NOTE(review): the inplace_abn caller passes (nullptr, y) in the same two
+  // slots, so they look like an "x or y" pair -- confirm against the functor
+  // declaration in sync_batch_norm_utils.h.
+  SyncBatchNormGradFunctor<T, Context>(ctx,
+                                       &x,
+                                       nullptr,
+                                       scale,
+                                       bias,
+                                       saved_mean,
+                                       saved_variance,
+                                       y_grad,
+                                       epsilon_f,
+                                       data_layout_str,
+                                       x_grad,
+                                       scale_grad,
+                                       bias_grad);
+}
+}  // namespace phi
+// Register the GPU sync_batch_norm_grad kernel. HIP builds register float and
+// float16 only; the legacy fluid registration this replaces carried the note
+// "MIOPEN do not support double". Other builds additionally register double.
+#ifdef PADDLE_WITH_HIP
+PD_REGISTER_KERNEL(sync_batch_norm_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SyncBatchNormGradKernel,
+                   float,
+                   phi::dtype::float16) {}
+#else
+PD_REGISTER_KERNEL(sync_batch_norm_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SyncBatchNormGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+#endif
--- a/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/kernels/sync_batch_norm_kernel.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/gpu/sync_batch_norm_utils.h"
+namespace phi {
+// GPU forward kernel of sync_batch_norm.
+//
+// In test mode (is_test && !trainable_statistics) the input `mean` and
+// `variance` tensors are used directly for normalization. Otherwise:
+//   1. KeLocalStats fills a temporary buffer of 2 * C + 1 statistics
+//      (x, x^2 and a trailing 1 that is used to count devices).
+//   2. When built with NCCL/RCCL and a communicator is available, the buffer
+//      is summed in place across devices with ncclAllReduce.
+//   3. KeSyncAndMovingStats is launched with the reduced statistics and the
+//      saved_mean / saved_variance / mean_out / variance_out buffers.
+// In both modes KeNormAffine is finally launched to write `y`.
+//
+// Requirements enforced below: use_global_stats must be false and the rank of
+// x must be in [2, 5]. fuse_with_relu and reserve_space are not referenced in
+// this function.
+template <typename T, typename Context>
+void SyncBatchNormKernel(const Context &ctx,
+                         const DenseTensor &x,
+                         const DenseTensor &scale,
+                         const DenseTensor &bias,
+                         const DenseTensor &mean,
+                         const DenseTensor &variance,
+                         float momentum,
+                         float epsilon_f,
+                         const std::string &data_layout_str,
+                         bool is_test,
+                         bool use_global_stats,
+                         bool trainable_statistics,
+                         bool fuse_with_relu,
+                         DenseTensor *y,
+                         DenseTensor *mean_out,
+                         DenseTensor *variance_out,
+                         DenseTensor *saved_mean,
+                         DenseTensor *saved_variance,
+                         DenseTensor *reserve_space) {
+  PADDLE_ENFORCE_EQ(use_global_stats,
+                    false,
+                    phi::errors::InvalidArgument(
+                        "sync_batch_norm doesn't support "
+                        "to set use_global_stats True. Please use batch_norm "
+                        "in this case."));
+  // The device kernels below take epsilon as a double.
+  double epsilon = epsilon_f;
+  const DataLayout layout =
+      paddle::framework::StringToDataLayout(data_layout_str);
+  // Statistics are only recomputed when not in test mode.
+  bool test_mode = is_test && (!trainable_statistics);
+  const auto &x_dims = x.dims();
+  PADDLE_ENFORCE_GE(x_dims.size(),
+                    2,
+                    phi::errors::InvalidArgument(
+                        "The Input dim size should be larger than 1."));
+  PADDLE_ENFORCE_LE(x_dims.size(),
+                    5,
+                    phi::errors::InvalidArgument(
+                        "The Input dim size should be less than 6."));
+  int N, C, H, W, D;
+  funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D);
+  int x_numel = x.numel();
+  const T *x_d = x.template data<T>();
+  const auto *s_d = scale.template data<BatchNormParamType<T>>();
+  const auto *b_d = bias.template data<BatchNormParamType<T>>();
+  T *y_d = ctx.template Alloc<T>(y);
+  const BatchNormParamType<T> *mean_data = nullptr;
+  const BatchNormParamType<T> *var_data = nullptr;
+  auto stream = ctx.stream();
+  const int block = 512;
+  int max_threads = ctx.GetMaxPhysicalThreadCount();
+  // Declared at function scope so the temporary statistics buffer stays alive
+  // until the KeNormAffine launch below, which reads var_data from it.
+  paddle::memory::AllocationPtr alloc_ptr{nullptr};
+  if (test_mode) {
+    mean_data = mean.template data<BatchNormParamType<T>>();
+    var_data = variance.template data<BatchNormParamType<T>>();
+  } else {
+    // x, x^2, 1, here 1 is used to calc device num
+    // device num also can be got from platform::DeviceContextPool
+    const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType<T>);
+    alloc_ptr = paddle::memory::Alloc(ctx, bytes);
+    auto *stats = reinterpret_cast<BatchNormParamType<T> *>(alloc_ptr->ptr());
+    const int threads = 256;
+    int grid = std::min(C, (max_threads + threads - 1) / threads);
+    if (layout == paddle::framework::DataLayout::kNCHW) {
+      KeLocalStats<T, threads, paddle::framework::DataLayout::kNCHW>
+          <<<grid, threads, 0, stream>>>(x_d, N, H * W * D, C, stats);
+    } else {
+      KeLocalStats<T, threads, paddle::framework::DataLayout::kNHWC>
+          <<<grid, threads, 0, stream>>>(x_d, N, H * W * D, C, stats);
+    }
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+    auto *comm = ctx.nccl_comm();
+    // Without a communicator the local statistics are used as they are.
+    if (comm) {
+      int dtype = paddle::platform::ToNCCLDataType(
+          paddle::framework::TransToProtoVarType(mean_out->dtype()));
+      // In-place operation
+      PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce(
+          stats,
+          stats,
+          2 * C + 1,
+          static_cast<ncclDataType_t>(dtype),
+          ncclSum,
+          comm,
+          stream));
+    }
+#endif
+    auto *est_mean_data = ctx.template Alloc<BatchNormParamType<T>>(mean_out);
+    auto *est_var_data =
+        ctx.template Alloc<BatchNormParamType<T>>(variance_out);
+    auto *sv_mean_data = ctx.template Alloc<BatchNormParamType<T>>(saved_mean);
+    auto *sv_inv_var_data =
+        ctx.template Alloc<BatchNormParamType<T>>(saved_variance);
+    // Note, Input('Mean')/Input('Variance') share variable with
+    // Output('MeanOut')/Output('VarianceOut')
+    KeSyncAndMovingStats<T>
+        <<<(C + block - 1) / block, block, 0, stream>>>(stats,
+                                                        stats + C,
+                                                        stats + 2 * C,
+                                                        C,
+                                                        momentum,
+                                                        epsilon,
+                                                        sv_mean_data,
+                                                        sv_inv_var_data,
+                                                        est_mean_data,
+                                                        est_var_data);
+    mean_data = sv_mean_data;
+    // NOTE(review): the variance handed to KeNormAffine points into the
+    // temporary stats buffer (stats + C), not saved_variance -- presumably
+    // KeSyncAndMovingStats leaves the variance there; confirm.
+    var_data = stats + C;
+  }
+  int grid2 = (std::min(x_numel, max_threads) + block - 1) / block;
+  if (layout == paddle::framework::DataLayout::kNCHW) {
+    KeNormAffine<T, paddle::framework::DataLayout::kNCHW>
+        <<<grid2, block, 0, stream>>>(x_d,
+                                      s_d,
+                                      b_d,
+                                      mean_data,
+                                      var_data,
+                                      epsilon,
+                                      C,
+                                      H * W * D,
+                                      x_numel,
+                                      y_d);
+  } else {
+    KeNormAffine<T, paddle::framework::DataLayout::kNHWC>
+        <<<grid2, block, 0, stream>>>(x_d,
+                                      s_d,
+                                      b_d,
+                                      mean_data,
+                                      var_data,
+                                      epsilon,
+                                      C,
+                                      H * W * D,
+                                      x_numel,
+                                      y_d);
+  }
+}
+}  // namespace phi
+// Register the GPU sync_batch_norm kernel. HIP builds register float and
+// float16 only; the legacy fluid registration this replaces carried the note
+// "MIOPEN do not support double". Other builds additionally register double.
+#ifdef PADDLE_WITH_HIP
+PD_REGISTER_KERNEL(sync_batch_norm,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SyncBatchNormKernel,
+                   float,
+                   phi::dtype::float16) {}
+#else
+PD_REGISTER_KERNEL(sync_batch_norm,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SyncBatchNormKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+#endif
--- a/paddle/fluid/operators/sync_batch_norm_op.cu.h
+++ b/paddle/fluid/operators/sync_batch_norm_op.cu.h
--- a/paddle/phi/kernels/sync_batch_norm_grad_kernel.h
+++ b/paddle/phi/kernels/sync_batch_norm_grad_kernel.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <string>
+#include "paddle/phi/core/dense_tensor.h"
+namespace phi {
+// Backward kernel of sync_batch_norm.
+//
+// Inputs are the forward tensors (x, scale, bias), the optional running
+// mean/variance, the statistics saved by the forward pass (saved_mean,
+// saved_variance), the optional reserve_space and the upstream gradient
+// y_grad, followed by the same attributes as the forward kernel. Gradients
+// are written to x_grad, scale_grad and bias_grad.
+template <typename T, typename Context>
+void SyncBatchNormGradKernel(const Context& dev_ctx,
+                             const DenseTensor& x,
+                             const DenseTensor& scale,
+                             const DenseTensor& bias,
+                             const paddle::optional<DenseTensor>& mean,
+                             const paddle::optional<DenseTensor>& variance,
+                             const DenseTensor& saved_mean,
+                             const DenseTensor& saved_variance,
+                             const paddle::optional<DenseTensor>& reserve_space,
+                             const DenseTensor& y_grad,
+                             float momentum,
+                             float epsilon,
+                             const std::string& data_layout,
+                             bool is_test,
+                             bool use_global_stats,
+                             bool trainable_statistics,
+                             bool fuse_with_relu,
+                             DenseTensor* x_grad,
+                             DenseTensor* scale_grad,
+                             DenseTensor* bias_grad);
+}  // namespace phi
--- a/paddle/phi/kernels/sync_batch_norm_kernel.h
+++ b/paddle/phi/kernels/sync_batch_norm_kernel.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <string>
+#include "paddle/phi/core/dense_tensor.h"
+namespace phi {
+// Forward kernel of sync_batch_norm.
+//
+// Takes the input x, the affine parameters scale/bias and the running
+// mean/variance, plus the batch-norm attributes. Results are written to y,
+// mean_out, variance_out, saved_mean and saved_variance; reserve_space is
+// part of the signature as well. The GPU implementation rejects
+// use_global_stats == true.
+template <typename T, typename Context>
+void SyncBatchNormKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& scale,
+                         const DenseTensor& bias,
+                         const DenseTensor& mean,
+                         const DenseTensor& variance,
+                         float momentum,
+                         float epsilon,
+                         const std::string& data_layout,
+                         bool is_test,
+                         bool use_global_stats,
+                         bool trainable_statistics,
+                         bool fuse_with_relu,
+                         DenseTensor* y,
+                         DenseTensor* mean_out,
+                         DenseTensor* variance_out,
+                         DenseTensor* saved_mean,
+                         DenseTensor* saved_variance,
+                         DenseTensor* reserve_space);
+}  // namespace phi
--- a/paddle/phi/ops/compat/sync_batch_norm_sig.cc
+++ b/paddle/phi/ops/compat/sync_batch_norm_sig.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/core/compat/op_utils.h"
+namespace phi {
+// Maps the fluid sync_batch_norm op onto the phi `sync_batch_norm` kernel.
+// The three lists name, in kernel-parameter order, the op inputs, attributes
+// and outputs. `ctx` is not consulted; the mapping is unconditional.
+KernelSignature SyncBatchNormOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("sync_batch_norm",
+                         {"X", "Scale", "Bias", "Mean", "Variance"},
+                         {"momentum",
+                          "epsilon",
+                          "data_layout",
+                          "is_test",
+                          "use_global_stats",
+                          "trainable_statistics",
+                          "fuse_with_relu"},
+                         {"Y",
+                          "MeanOut",
+                          "VarianceOut",
+                          "SavedMean",
+                          "SavedVariance",
+                          "ReserveSpace"});
+}
+// Maps the fluid sync_batch_norm_grad op onto the phi `sync_batch_norm_grad`
+// kernel. The three lists name, in kernel-parameter order, the op inputs,
+// attributes and outputs. `ctx` is not consulted; the mapping is
+// unconditional.
+KernelSignature SyncBatchNormGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("sync_batch_norm_grad",
+                         {
+                             "X",
+                             "Scale",
+                             "Bias",
+                             "Mean",
+                             "Variance",
+                             "SavedMean",
+                             "SavedVariance",
+                             "ReserveSpace",
+                             "Y@GRAD",
+                         },
+                         {"momentum",
+                          "epsilon",
+                          "data_layout",
+                          "is_test",
+                          "use_global_stats",
+                          "trainable_statistics",
+                          "fuse_with_relu"},
+                         {"X@GRAD", "Scale@GRAD", "Bias@GRAD"});
+}
+}  // namespace phi
+// Register the argument-mapping functions for the sync_batch_norm and
+// sync_batch_norm_grad ops.
+PD_REGISTER_ARG_MAPPING_FN(sync_batch_norm,
+                           phi::SyncBatchNormOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(sync_batch_norm_grad,
+                           phi::SyncBatchNormGradOpArgumentMapping);
--- a/python/paddle/nn/layer/norm.py
+++ b/python/paddle/nn/layer/norm.py
@@ -49,6 +49,7 @@ from .. import functional as F
 from paddle import _C_ops
 from .. import Layer
 from paddle import in_dynamic_mode
+from paddle.fluid.framework import in_dygraph_mode
 __all__ = []
@@ -1100,7 +1101,14 @@ class SyncBatchNorm(_BatchNormBase):
        ### train mode: use mini-batch stats, eval mode: use global stats
        ### use_global_stats only support False in sync_batch_norm
-        if in_dynamic_mode():
+        if in_dygraph_mode():
+            sync_batch_norm_out, _, _, _, _, _ = _C_ops.final_state_sync_batch_norm(
+                x, self.weight, self.bias, self._mean, self._variance,
+                self._momentum, self._epsilon, self._data_format,
+                not self.training, False, False, False)
+            return sync_batch_norm_out
+        elif in_dynamic_mode():
            attrs = ("momentum", self._momentum, "epsilon", self._epsilon,
                     "is_test", not self.training, "data_layout",
                     self._data_format, "use_mkldnn", False, "fuse_with_relu",
@@ -1109,7 +1117,6 @@ class SyncBatchNorm(_BatchNormBase):
            sync_batch_norm_out, _, _, _, _, _ = _C_ops.sync_batch_norm(
                x, self.weight, self.bias, self._mean, self._variance, mean_out,
                variance_out, *attrs)
            return sync_batch_norm_out
        check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],