Commit 9e00395a authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into move_yolo_box_to_phi

@@ -7,9 +7,11 @@ paddle/fluid/op_use_default_grad_maker_DEV.spec
paddle/fluid/op_use_default_grad_maker_PR.spec
paddle/phi/api/backward/backward_api.h
paddle/phi/api/include/api.h
paddle/phi/api/include/sparse_api.h
paddle/phi/api/lib/api.cc
paddle/phi/api/lib/dygraph_api.*
paddle/phi/api/lib/backward_api.cc
paddle/phi/api/lib/sparse_api.cc
paddle/phi/extension.h
paddle/phi/include/*
paddle/phi/infermeta/generated.*
......
cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api)
if (WITH_DISTRIBUTE)
cc_library(processgroup_gloo SRCS ProcessGroupGloo.cc DEPS phi phi_api eager_api gloo_wrapper)
endif()
cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup)
if(WITH_NCCL)
......
@@ -117,6 +117,35 @@ class ProcessGroup {
        "ProcessGroup%s does not support receive", GetBackendName()));
  }
virtual std::shared_ptr<ProcessGroup::Task> AllGather(
std::vector<Tensor>& in_tensors /* tensors */, // NOLINT
std::vector<Tensor>& out_tensors /* tensors */) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support AllGather", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllToAll(
std::vector<Tensor>& in /* tensors */, // NOLINT
std::vector<Tensor>& out /* tensors */) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support AllToAll", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<Tensor>& tensors /* tensors */, // NOLINT
const ReduceOptions& opts) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support Reduce", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Scatter(
std::vector<Tensor>& in_tensors /* tensors */, // NOLINT
std::vector<Tensor>& out_tensors /* tensors */, // NOLINT
const ScatterOptions&) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support Scatter", GetBackendName()));
}
 protected:
  const int rank_;
  const int size_;
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#ifdef _WIN32
#include <gloo/common/win.h>
#include <winsock2.h>
#include <ws2tcpip.h>
#else
#include <netdb.h>
#include <sys/socket.h>
#include <unistd.h>
#endif
#include <gloo/broadcast.h>
#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace distributed {
#ifdef _WIN32
#define GENERATE_FUNC(type, func, ...) \
switch (type) { \
case experimental::DataType::FLOAT32: \
func<float>(__VA_ARGS__); \
break; \
case experimental::DataType::FLOAT64: \
func<double>(__VA_ARGS__); \
break; \
case experimental::DataType::FLOAT16: \
func<gloo::float16>(__VA_ARGS__); \
break; \
case experimental::DataType::INT32: \
func<int32_t>(__VA_ARGS__); \
break; \
case experimental::DataType::INT64: \
func<int64_t>(__VA_ARGS__); \
break; \
default: \
VLOG(0) << "Error: Unknown DataType."; \
exit(-1); \
}
#define HOST_NAME_MAX 256
#else
#define GENERATE_FUNC(type, func, args...) \
switch (type) { \
case experimental::DataType::FLOAT32: \
func<float>(args); \
break; \
case experimental::DataType::FLOAT64: \
func<double>(args); \
break; \
case experimental::DataType::FLOAT16: \
func<gloo::float16>(args); \
break; \
case experimental::DataType::INT32: \
func<int32_t>(args); \
break; \
case experimental::DataType::INT64: \
func<int64_t>(args); \
break; \
default: \
VLOG(0) << "Error: Unknown DataType."; \
exit(-1); \
}
#endif
typedef void (*reduce_func)(void*, const void*, const void*, size_t);
template <typename T>
reduce_func get_function(const ReduceOp& r) {
switch (r) {
case ReduceOp::SUM:
return reduce_func(&::gloo::sum<T>);
case ReduceOp::PRODUCT:
return reduce_func(&::gloo::product<T>);
case ReduceOp::MIN:
return reduce_func(&::gloo::min<T>);
case ReduceOp::MAX:
return reduce_func(&::gloo::max<T>);
case ReduceOp::AVG:
VLOG(0) << "Error: Unsupported ReduceOp::AVG.";
exit(-1);
}
VLOG(0) << "Error: Unknown ReduceOp.";
exit(-1);
}
bool CheckTensorsInCPUPlace(const std::vector<Tensor>& tensors) {
return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) {
return t.place() == PlaceType::kCPU;
});
}
template <typename T>
T* get_data(const Tensor& tensor) {
auto raw_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
return static_cast<T*>(raw_tensor->data());
}
template <typename T>
std::vector<T*> get_multi_data(const std::vector<Tensor>& tensors) {
std::vector<T*> ret(tensors.size());
for (size_t i = 0; i < tensors.size(); i++) {
ret[i] = get_data<T>(tensors[i]);
}
return ret;
}
template <typename T, typename P>
void set_output(P& opts, const Tensor& tensor) { // NOLINT
opts.setOutput(get_data<T>(tensor), tensor.numel());
}
template <typename T, typename P>
void set_input(P& opts, const Tensor& tensor) { // NOLINT
opts.setInput(get_data<T>(tensor), tensor.numel());
}
template <typename T, typename P>
void set_outputs(P& opts, const std::vector<Tensor>& tensors) { // NOLINT
opts.setOutputs(get_multi_data<T>(tensors), tensors[0].numel());
}
template <typename T, typename P>
void set_inputs(P& opts, const std::vector<Tensor>& tensors) { // NOLINT
opts.setInputs(get_multi_data<T>(tensors), tensors[0].numel());
}
ProcessGroupGloo::GlooTask::GlooTask(int rank,
const std::vector<Tensor>& inputs,
CommType comm_type)
: ProcessGroup::Task(rank, inputs, comm_type) {
PADDLE_ENFORCE_EQ(CheckTensorsInCPUPlace(inputs), true,
platform::errors::Fatal(
"Only CPU place is supported for ProcessGroupGloo."));
}
ProcessGroupGloo::ProcessGroupGloo(const std::shared_ptr<GlooStore>& store,
int rank, int world_size,
const std::shared_ptr<GlooOptions> options)
: ProcessGroup(rank, world_size), _tag(0), _store(store) {
_context = std::make_shared<gloo::rendezvous::Context>(rank, world_size);
auto prefix_store =
::gloo::rendezvous::PrefixStore(std::to_string(0), *_store);
_context->connectFullMesh(prefix_store, options->device);
}
class BroadcastGlooTask : public ProcessGroupGloo::GlooTask {
public:
BroadcastGlooTask(const std::shared_ptr<gloo::Context>& context,
const std::vector<Tensor>& inputs, int rank, int root,
uint32_t tag)
: ProcessGroupGloo::GlooTask(rank, inputs, CommType::BROADCAST),
_context(context),
_root(root),
_inputs(inputs),
_tag(tag) {}
void Run() override { _do_broadcast(_inputs[0]); }
private:
std::shared_ptr<gloo::Context> _context;
const int _root;
std::vector<Tensor> _inputs{};
const uint32_t _tag;
void _do_broadcast(const Tensor& tensor) {
gloo::BroadcastOptions opts(_context);
const auto& dtype = tensor.type();
GENERATE_FUNC(dtype, set_output, opts, tensor);
opts.setRoot(_root);
opts.setTag(_tag);
gloo::broadcast(opts);
}
};
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Broadcast(
std::vector<Tensor>& inputs, const BroadcastOptions& opts) {
auto root = opts.source_rank;
std::unique_ptr<BroadcastGlooTask> task;
auto tag = next_tag();
auto context = get_context();
task = std::make_unique<BroadcastGlooTask>(context, inputs, rank_, root, tag);
task->Run();
return task;
}
class AllreduceGlooTask : public ProcessGroupGloo::GlooTask {
public:
AllreduceGlooTask(int rank, const std::shared_ptr<gloo::Context>& context,
std::vector<Tensor>& inputs, ReduceOp reduce_op, // NOLINT
uint32_t tag)
: ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLREDUCE),
_context(context),
_inputs(inputs),
_reduce_op(reduce_op),
_tag(tag) {}
void Run() override { _do_allreduce(_inputs); }
private:
std::shared_ptr<gloo::Context> _context;
std::vector<Tensor> _inputs;
const ReduceOp _reduce_op;
uint32_t _tag;
gloo::AllreduceOptions::Func _get_function(const experimental::DataType type,
const ReduceOp op) {
gloo::AllreduceOptions::Func fn;
GENERATE_FUNC(type, _get_function_impl, fn, op);
return fn;
}
template <typename T>
void _get_function_impl(gloo::AllreduceOptions::Func& fn, // NOLINT
const ReduceOp op) {
fn = get_function<T>(op);
}
void _do_allreduce(std::vector<Tensor>& tensors) { // NOLINT
const auto& dtype = tensors[0].type();
gloo::AllreduceOptions opts(_context);
GENERATE_FUNC(dtype, set_inputs, opts, tensors);
GENERATE_FUNC(dtype, set_outputs, opts, tensors);
opts.setReduceFunction(_get_function(dtype, _reduce_op));
opts.setTag(_tag);
gloo::allreduce(opts);
}
};
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::AllReduce(
std::vector<Tensor>& inputs, const AllreduceOptions& opts) {
auto tag = next_tag();
std::shared_ptr<GlooTask> task;
auto context = get_context();
task = std::make_shared<AllreduceGlooTask>(rank_, context, inputs,
opts.reduce_op, tag);
task->Run();
return task;
}
std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDeviceForInterface(const std::string& ifname) {
::gloo::transport::tcp::attr attr;
attr.iface = ifname;
return ::gloo::transport::tcp::CreateDevice(attr);
}
std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDeviceForHostname(const std::string& hostname) {
::gloo::transport::tcp::attr attr;
attr.hostname = hostname;
return ::gloo::transport::tcp::CreateDevice(attr);
}
std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDefaultDevice() {
std::array<char, HOST_NAME_MAX> hostname{};
auto ret = ::gethostname(hostname.data(), HOST_NAME_MAX);
PADDLE_ENFORCE_EQ(ret, 0, platform::errors::Fatal(
"Get hostname error for createDefaultDevice."));
::addrinfo* result;
result = tcputils::get_addr_info(hostname.data(), "", 0, AF_UNSPEC);
::addrinfo* cur;
for (cur = result; cur != nullptr; cur = cur->ai_next) {
SocketType socket =
::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol);
if (socket == -1) {
continue;
}
ret = ::bind(socket, cur->ai_addr, cur->ai_addrlen);
#ifdef _WIN32
closesocket(socket);
#else
close(socket);
#endif
if (ret == -1) {
continue;
}
break;
}
freeaddrinfo(result);
if (cur != nullptr) {
return createDeviceForHostname(hostname.data());
}
return createDeviceForHostname("127.0.0.1");
}
} // namespace distributed
} // namespace paddle
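The following is a minimal usage sketch, not part of this commit, of how the Gloo backend above can be driven end to end: a TCPStore rendezvous wrapped in a GlooStore, a default TCP device, and the Broadcast/AllReduce entry points. The helper name RunGlooCollectives, the port, and the way the CPU tensors are produced are illustrative assumptions only.

// Hypothetical usage sketch (not part of the commit). Assumes a Paddle build
// with PADDLE_WITH_GLOO and that `tensors` already holds CPU
// paddle::experimental::Tensor objects of identical shape on every rank.
#include <chrono>
#include <memory>
#include <vector>

#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
#include "paddle/fluid/distributed/store/tcp_store.h"

void RunGlooCollectives(int rank, int world_size,
                        std::vector<paddle::experimental::Tensor>& tensors) {
  using namespace paddle::distributed;

  // Rank 0 hosts the TCP rendezvous; every rank connects as a client.
  auto tcp_store = std::make_shared<TCPStore>(
      "127.0.0.1", /*port=*/6170, /*is_master=*/rank == 0,
      /*num_workers=*/static_cast<size_t>(world_size),
      std::chrono::seconds(300));

  auto store = std::make_shared<ProcessGroupGloo::GlooStore>(tcp_store);
  auto options = ProcessGroupGloo::GlooOptions::create();
  options->device = ProcessGroupGloo::createDefaultDevice();
  auto pg =
      std::make_shared<ProcessGroupGloo>(store, rank, world_size, options);

  // Broadcast from rank 0, then sum-allreduce in place. Both tasks run
  // synchronously in this backend, so GlooTask::Wait() simply returns true.
  BroadcastOptions bcast_opts;
  bcast_opts.source_rank = 0;
  pg->Broadcast(tensors, bcast_opts)->Wait(std::chrono::seconds(60));

  AllreduceOptions ar_opts;
  ar_opts.reduce_op = ReduceOp::SUM;
  pg->AllReduce(tensors, ar_opts)->Wait(std::chrono::seconds(60));
}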
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <future>
#include <mutex>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#ifdef PADDLE_WITH_GLOO
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/distributed/store/tcp_store.h"
constexpr const char* GLOO_BACKEND_NAME = "GLOO";
namespace paddle {
namespace distributed {
class ProcessGroupGloo : public ProcessGroup {
public:
class GlooTask : public ProcessGroup::Task,
public std::enable_shared_from_this<GlooTask> {
public:
explicit GlooTask(int rank, const std::vector<Tensor>& input_tensors,
CommType comm_type);
~GlooTask() = default;
virtual void Run() = 0;
bool Wait(std::chrono::milliseconds timeout) override { return true; }
bool IsCompleted() override { return true; }
void Synchronize() override {}
protected:
friend class ProcessGroupGloo;
};
class GlooStore : public ::gloo::rendezvous::Store {
public:
explicit GlooStore(
const std::shared_ptr<paddle::distributed::TCPStore>& store)
: _store(store) {}
~GlooStore() = default;
std::vector<char> get(const std::string& key) override {
VLOG(3) << "GlooStore::get";
auto value = _store->get(key);
return std::vector<char>(value.begin(), value.end());
}
void wait(const std::vector<std::string>& keys) override {
VLOG(3) << "GlooStore::wait";
for (auto& key : keys) {
_store->wait(key);
}
}
void set(const std::string& key, const std::vector<char>& value) override {
VLOG(3) << "GlooStore::set";
std::vector<uint8_t> tmp(value.begin(), value.end());
_store->set(key, tmp);
}
void wait(const std::vector<std::string>& keys,
const std::chrono::milliseconds& timeout) override {
VLOG(3) << "GlooStore::wait";
for (auto& key : keys) {
_store->wait(key);
}
// wait(keys);
}
protected:
std::shared_ptr<paddle::distributed::TCPStore> _store;
};
class GlooOptions {
public:
GlooOptions() = default;
~GlooOptions() = default;
static std::shared_ptr<GlooOptions> create() {
return std::make_shared<GlooOptions>();
}
std::shared_ptr<::gloo::transport::Device> device;
};
explicit ProcessGroupGloo(const std::shared_ptr<GlooStore>& store, int rank,
int world_size,
std::shared_ptr<GlooOptions> options);
~ProcessGroupGloo() = default;
std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<Tensor>& inputs,
const BroadcastOptions& = BroadcastOptions()) override;
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<Tensor>& inputs,
const AllreduceOptions& opts = AllreduceOptions()) override;
std::shared_ptr<::gloo::Context> get_context() { return _context; }
uint64_t next_tag() { return _tag++; }
const std::string GetBackendName() const override {
return GLOO_BACKEND_NAME;
}
// Helper functions for Gloo.
static std::shared_ptr<::gloo::transport::Device> createDeviceForHostname(
const std::string& hostname);
static std::shared_ptr<::gloo::transport::Device> createDeviceForInterface(
const std::string& ifname);
static std::shared_ptr<::gloo::transport::Device> createDefaultDevice();
protected:
uint32_t _tag;
std::shared_ptr<gloo::rendezvous::Context> _context;
std::shared_ptr<GlooStore> _store;
};
} // namespace distributed
} // namespace paddle
@@ -473,5 +473,148 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv(
  return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllGather(
std::vector<Tensor>& in_tensors, std::vector<Tensor>& out_tensors) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors), true,
platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
return Collective(
in_tensors, out_tensors,
[&](const Tensor& input, Tensor& output, ncclComm_t comm,
const gpuStream_t& stream) {
auto input_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
auto output_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
return platform::dynload::ncclAllGather(
input_tensor->data(), output_tensor->data(), input_tensor->numel(),
platform::ToNCCLDataType(input.type()), comm, stream);
},
CommType::ALLGATHER);
}
void* GetPointerByOffset(void* raw_pointer, size_t offset,
experimental::DataType type) {
if (type == experimental::DataType::FLOAT32) {
return reinterpret_cast<void*>(reinterpret_cast<float*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::FLOAT64) {
return reinterpret_cast<void*>(reinterpret_cast<double*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::INT32) {
return reinterpret_cast<void*>(reinterpret_cast<int32_t*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::INT64) {
return reinterpret_cast<void*>(reinterpret_cast<int64_t*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::FLOAT16) {
return reinterpret_cast<void*>(reinterpret_cast<int16_t*>(raw_pointer) +
offset);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"This datatype in nccl is not supported."));
}
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllToAll(
std::vector<Tensor>& in_tensors, std::vector<Tensor>& out_tensors) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
in_tensors, out_tensors,
[&](const Tensor& input, Tensor& output, ncclComm_t comm,
const gpuStream_t& stream) {
auto input_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
auto output_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
size_t offset = 0;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
for (auto i = 0; i < size_; i++) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
GetPointerByOffset(input_tensor->data(), offset, input.type()),
input_tensor->numel() / size_,
platform::ToNCCLDataType(input.type()), i, comm, stream));
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
GetPointerByOffset(output_tensor->data(), offset, input.type()),
input_tensor->numel() / size_,
platform::ToNCCLDataType(input.type()), i, comm, stream));
offset += input_tensor->numel() / size_;
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
},
CommType::ALLREDUCE);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Reduce(
std::vector<Tensor>& tensors, const ReduceOptions& opts) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
tensors, tensors,
[&](const Tensor& input, Tensor& output, ncclComm_t comm,
const gpuStream_t& stream) {
auto input_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
auto output_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce(
input_tensor->data(), output_tensor->data(), input.numel(),
platform::ToNCCLDataType(input.type()),
ToNCCLRedType(opts.reduce_op), opts.root_rank, comm, stream));
},
CommType::REDUCE);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Scatter(
std::vector<Tensor>& in_tensors, std::vector<Tensor>& out_tensors,
const ScatterOptions& opts) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
in_tensors, out_tensors,
[&](const Tensor& input, Tensor& output, ncclComm_t comm,
const gpuStream_t& stream) {
auto input_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
auto output_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
size_t offset = 0;
if (rank_ == opts.root_rank) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
for (auto i = 0; i < size_; i++) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
GetPointerByOffset(input_tensor->data(), offset, input.type()),
input_tensor->numel() / size_,
platform::ToNCCLDataType(input.type()), i, comm, stream));
offset += input_tensor->numel() / size_;
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
output_tensor->data(), input_tensor->numel() / size_,
platform::ToNCCLDataType(input.type()), opts.root_rank, comm,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
} else {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
output_tensor->data(), input_tensor->numel() / size_,
platform::ToNCCLDataType(input.type()), opts.root_rank, comm,
stream));
}
},
CommType::SCATTER);
}
}  // namespace distributed
}  // namespace paddle
@@ -98,6 +98,20 @@ class ProcessGroupNCCL : public ProcessGroup {
  std::shared_ptr<ProcessGroup::Task> Recv(std::vector<Tensor>& tensors,
                                           int src_rank) override;
std::shared_ptr<ProcessGroup::Task> AllGather(
std::vector<Tensor>& in_tensors,
std::vector<Tensor>& out_tensors) override;
std::shared_ptr<ProcessGroup::Task> AllToAll(
std::vector<Tensor>& in, std::vector<Tensor>& out) override;
std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<Tensor>& tensors, const ReduceOptions& opts) override;
std::shared_ptr<ProcessGroup::Task> Scatter(std::vector<Tensor>& in_tensors,
std::vector<Tensor>& out_tensors,
const ScatterOptions&) override;
 protected:
  virtual std::shared_ptr<ProcessGroupNCCL::NCCLTask> CreateTask(
      std::vector<Place> places, int rank, CommType opType,
......
@@ -36,5 +36,14 @@ struct BarrierOptions {
  std::vector<int> place_ids;
};
struct ReduceOptions {
ReduceOp reduce_op = ReduceOp::SUM;
int root_rank = 0;
};
struct ScatterOptions {
int root_rank = 0;
};
}  // namespace distributed
}  // namespace paddle
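As a hedged illustration (not from the commit), the ReduceOptions and ScatterOptions structs above are consumed by the matching ProcessGroup overrides, for example the NCCL ones added in this change. The helper name and the surrounding setup of the process group and GPU tensors are assumptions.

// Hypothetical call sites for the new Reduce/Scatter options; `pg`, the
// tensor vectors and their GPU placement are assumed to be prepared elsewhere.
#include <vector>

#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"

void ReduceThenScatter(paddle::distributed::ProcessGroupNCCL& pg,
                       std::vector<paddle::experimental::Tensor>& tensors,
                       std::vector<paddle::experimental::Tensor>& in_tensors,
                       std::vector<paddle::experimental::Tensor>& out_tensors) {
  paddle::distributed::ReduceOptions reduce_opts;
  reduce_opts.reduce_op = paddle::distributed::ReduceOp::MAX;  // default is SUM
  reduce_opts.root_rank = 0;
  pg.Reduce(tensors, reduce_opts);  // reduced result lands on root_rank only

  paddle::distributed::ScatterOptions scatter_opts;
  scatter_opts.root_rank = 0;
  // The root splits each input tensor into world-size equal chunks, one per rank.
  pg.Scatter(in_tensors, out_tensors, scatter_opts);
}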
@@ -32,6 +32,8 @@ class Store {
  virtual int64_t add(const std::string& key, int64_t value) = 0;
  virtual std::vector<uint8_t> get(const std::string& key) = 0;
  virtual void wait(const std::string& key) = 0;
virtual void set(const std::string& key,
const std::vector<uint8_t>& value) = 0;
  virtual const std::chrono::seconds& timeout() const { return _timeout; }
......
@@ -27,11 +27,13 @@ namespace detail {
constexpr int INFTIME = -1;
std::unique_ptr<MasterDaemon> MasterDaemon::start(SocketType socket,
                                                  int nranks) {
  return std::make_unique<MasterDaemon>(socket, nranks);
}
MasterDaemon::MasterDaemon(SocketType socket, int nranks)
    : _listen_socket(socket), _nranks(nranks) {
  _background_thread = std::thread{&MasterDaemon::run, this};
}
@@ -64,6 +66,13 @@ void MasterDaemon::_do_add(SocketType socket) {
  tcputils::send_value<int64_t>(socket, new_value);
}
void MasterDaemon::_do_set(SocketType socket) {
VLOG(3) << "MasterDaemon::_do_set";
std::string key = tcputils::receive_string(socket);
auto value = tcputils::receive_vector<uint8_t>(socket);
_store[key] = value;
}
void MasterDaemon::_do_get(SocketType socket) {
  std::string key = tcputils::receive_string(socket);
  auto iter = _store.find(key);
@@ -71,16 +80,15 @@ void MasterDaemon::_do_get(SocketType socket) {
      iter, _store.end(),
      platform::errors::InvalidArgument("Key %s not found in TCPStore.", key));
  std::vector<uint8_t> value = iter->second;
VLOG(3) << "TCPStore: value ("
<< std::stoll(std::string(reinterpret_cast<char*>(value.data()),
value.size()))
<< ") for key (" << key << ").";
  tcputils::send_vector<uint8_t>(socket, value);
}
void MasterDaemon::_do_stop(SocketType socket) {
  VLOG(3) << "MasterDaemon::_do_stop";
  ReplyType value = ReplyType::STOP_WAIT;
  if (--_nranks == 0) {
    _stop = true;
  }
  tcputils::send_value<ReplyType>(socket, value);
}
@@ -140,21 +148,27 @@ void MasterDaemon::run() {
      case Command::GET:
        _do_get(fds[i].fd);
        break;
case Command::SET:
_do_set(fds[i].fd);
break;
      case Command::WAIT:
        _do_wait(fds[i].fd);
        break;
      case Command::STOP:
        _do_stop(fds[i].fd);
        break;
      default:
        VLOG(0) << "Unknow command: " << static_cast<int>(command);
        exit(-1);
      }
    }
  }
}
std::unique_ptr<TCPServer> TCPServer::create(uint16_t port, int nranks) {
  int socket = tcputils::tcp_listen("", std::to_string(port), AF_INET);
  auto server = std::make_unique<TCPServer>();
  server->_master_daemon = MasterDaemon::start(socket, nranks);
  return server;
}
@@ -200,7 +214,7 @@ TCPStore::TCPStore(std::string host, uint16_t port, bool is_master,
                   size_t num_workers, std::chrono::seconds timeout)
    : Store(timeout), _is_master(is_master), _num_workers(num_workers) {
  if (_is_master) {
    _server = detail::TCPServer::create(port, num_workers);
  }
  _client = detail::TCPClient::connect(host, port);
@@ -213,7 +227,6 @@ void TCPStore::waitWorkers() {
  }
  add(_init_key, 1);
  if (_server) {
  auto begin = std::chrono::steady_clock::now();
  do {
    auto value = get(_init_key);
@@ -233,16 +246,22 @@ void TCPStore::waitWorkers() {
          "TCPStore timeouted and not all workers got ready."));
    }
  } while (true);
  }
  VLOG(3) << "TCPStore initialized.";
}
int64_t TCPStore::add(const std::string& key, int64_t value) {
  VLOG(3) << "TCPStore add.";
  _client->send_command_for_key(Command::ADD, _key_prefix + key);
  _client->send_value<std::int64_t>(value);
  return _client->receive_value<std::int64_t>();
}
void TCPStore::set(const std::string& key, const std::vector<uint8_t>& value) {
VLOG(3) << "TCPStore set.";
_client->send_command_for_key(Command::SET, _key_prefix + key);
_client->send_vector<std::uint8_t>(value);
}
std::vector<uint8_t> TCPStore::get(const std::string& key) {
  wait(key);
  _client->send_command_for_key(Command::GET, _key_prefix + key);
@@ -252,6 +271,7 @@ std::vector<uint8_t> TCPStore::get(const std::string& key) {
void TCPStore::wait(const std::string& key) {
  ReplyType reply;
  VLOG(3) << "TCPStore wait.";
  do {
    _client->send_command_for_key(Command::WAIT, _key_prefix + key);
@@ -262,6 +282,7 @@ void TCPStore::wait(const std::string& key) {
TCPStore::~TCPStore() {
  _client->send_command_for_key(Command::STOP, "");
  VLOG(3) << "~TCPStore";
  ReplyType ret = _client->receive_value<ReplyType>();
  PADDLE_ENFORCE_EQ(ret, ReplyType::STOP_WAIT,
                    platform::errors::InvalidArgument(
......
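A brief sketch, not part of the commit, of how the extended TCPStore behaves: rank 0 publishes a value with the new set(), every rank blocks in get() (which calls wait() internally) until the key exists, and the destructor's STOP handshake now only shuts the daemon down once all num_workers ranks have checked out. The port, key, and helper name are placeholders.

// Hypothetical TCPStore usage; one process per rank is assumed.
#include <chrono>
#include <cstdint>
#include <string>
#include <vector>

#include "paddle/fluid/distributed/store/tcp_store.h"

void PublishAndFetch(int rank, size_t num_workers) {
  paddle::distributed::TCPStore store("127.0.0.1", /*port=*/6170,
                                      /*is_master=*/rank == 0, num_workers,
                                      std::chrono::seconds(300));
  if (rank == 0) {
    std::string payload = "42";
    store.set("my_key", std::vector<uint8_t>(payload.begin(), payload.end()));
  }
  // Blocks until "my_key" has been set on the master daemon.
  std::vector<uint8_t> value = store.get("my_key");
  (void)value;  // consume the bytes as needed
}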
@@ -27,15 +27,16 @@ namespace paddle {
namespace distributed {
enum class ReplyType { WAITING, STOP_WAIT };
enum class Command { ADD, GET, SET, WAIT, STOP };
namespace detail {
class MasterDaemon {
 public:
  static std::unique_ptr<MasterDaemon> start(SocketType listen_socket,
                                             int nranks);
  MasterDaemon() = delete;
  explicit MasterDaemon(SocketType listen_socket, int nranks);
  ~MasterDaemon();
 private:
@@ -43,18 +44,20 @@ class MasterDaemon {
  void _do_add(SocketType socket);
  void _do_wait(SocketType socket);
  void _do_get(SocketType socket);
  void _do_set(SocketType socket);
  void _do_stop(SocketType socket);
  SocketType _listen_socket;
  std::vector<SocketType> _sockets;
  std::unordered_map<std::string, std::vector<uint8_t>> _store;
  std::thread _background_thread{};
  int _nranks;
  bool _stop = false;
};
class TCPServer {
 public:
  TCPServer() = default;
  static std::unique_ptr<TCPServer> create(std::uint16_t port, int nranks);
 private:
  std::unique_ptr<MasterDaemon> _master_daemon;
@@ -97,6 +100,7 @@ class TCPStore : public Store {
  int64_t add(const std::string& key, int64_t value) override;
  std::vector<uint8_t> get(const std::string& key) override;
  void wait(const std::string& key) override;
  void set(const std::string& key, const std::vector<uint8_t>& value) override;
 private:
  void waitWorkers();
......
@@ -46,9 +46,10 @@ void close_socket(SocketType socket) {
  hints.ai_socktype = SOCK_STREAM;
  const char* node = host.empty() ? nullptr : host.c_str();
  const char* port_cstr = port.empty() ? nullptr : port.c_str();
  int n;
  n = ::getaddrinfo(node, port_cstr, &hints, &res);
  const char* gai_err = ::gai_strerror(n);
  const char* proto =
      (family == AF_INET ? "IPv4" : family == AF_INET6 ? "IPv6" : "");
......
@@ -24,11 +24,14 @@ class GradNodeAccumulation : public GradNodeBase {
 public:
  // Constructor: configure fwd input tensors to grad node
  explicit GradNodeAccumulation(AutogradMeta* meta) : GradNodeBase(1, 1) {
    VLOG(6) << "Construct GradNodeAccumulation";
    weak_grad_ = meta->WeakGrad();
    SetDefaultGradInOutMeta();
  }
  ~GradNodeAccumulation() override {
    VLOG(6) << "Destruct GradNodeAccumulation";
  }
  // Functor: perform backward computations
  virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
......
@@ -46,7 +46,7 @@ class GradNodeScale : public GradNodeBase {
      const std::vector<paddle::experimental::Tensor>& tensors);
  void SetAttributes_scale(float scale);
  std::string name() override { return ""; }
  // Members: define fwd input tensors
  // For Scale there is no fwd input tensor needed
 private:
......
@@ -996,6 +996,29 @@ static std::string GenerateGradNodeCreationContent(
  // then generate: "egr::AutogradMeta* p_autograd_out =
  // egr::EagerUtils::autograd_meta("op_proto->outputs()[0].name()")"
  std::string get_autograd_meta_str = " // Prepare Autograd Meta \n";
// If single output slotname and not duplicable,
// then generate: "egr::AutogradMeta* p_autograd_out =
// egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")"
for (const proto::OpProto::Var& output : out_vars) {
const std::string& output_name = output.name();
const std::string& output_autograd_name = "p_autograd_" + output_name;
if (output.duplicable()) {
const char* GET_MULTI_AUTOGRAD_META_TEMPLATE =
" std::vector<egr::AutogradMeta*> %s = "
"egr::EagerUtils::autograd_meta(&%s);\n";
get_autograd_meta_str += paddle::string::Sprintf(
GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
} else {
const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
" egr::AutogradMeta* %s = "
"egr::EagerUtils::autograd_meta(&%s);\n";
get_autograd_meta_str += paddle::string::Sprintf(
GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
}
}
VLOG(6) << "Generated outputs autograd_meta";
  for (const proto::OpProto::Var& input : in_vars) {
    const std::string& input_name = input.name();
    const std::string& input_autograd_name = "p_autograd_" + input_name;
@@ -1024,31 +1047,6 @@ static std::string GenerateGradNodeCreationContent(
  }
  VLOG(6) << "Generated inputs autograd_meta";
// If single output slotname and not duplicable,
// then generate: "egr::AutogradMeta* p_autograd_out =
// egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")"
for (const proto::OpProto::Var& output : out_vars) {
const std::string& output_name = output.name();
const std::string& output_autograd_name = "p_autograd_" + output_name;
// Skip Intermediate Tensor
if (output.duplicable()) {
const char* GET_MULTI_AUTOGRAD_META_TEMPLATE =
" std::vector<egr::AutogradMeta*> %s = "
"egr::EagerUtils::autograd_meta(&%s);\n";
get_autograd_meta_str += paddle::string::Sprintf(
GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
} else {
const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
" egr::AutogradMeta* %s = "
"egr::EagerUtils::autograd_meta(&%s);\n";
get_autograd_meta_str += paddle::string::Sprintf(
GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
}
}
VLOG(6) << "Generated outputs autograd_meta";
std::string prepare_autograd_meta_str = ""; std::string prepare_autograd_meta_str = "";
prepare_autograd_meta_str += get_autograd_meta_str; prepare_autograd_meta_str += get_autograd_meta_str;
prepare_autograd_meta_str += "\n"; prepare_autograd_meta_str += "\n";
...@@ -1204,11 +1202,12 @@ static std::string GenerateGradNodeCreationContent( ...@@ -1204,11 +1202,12 @@ static std::string GenerateGradNodeCreationContent(
" %s" " %s"
" bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(%s);\n" " bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(%s);\n"
" if(require_any_grad) {\n" " if(require_any_grad) {\n"
" VLOG(6) << \" Construct Grad for %s \"; \n"
" egr::EagerUtils::PassStopGradient(%s);\n" " egr::EagerUtils::PassStopGradient(%s);\n"
"%s\n }"; "%s\n }";
std::string grad_node_creation_body_str = paddle::string::Sprintf( std::string grad_node_creation_body_str = paddle::string::Sprintf(
GRAD_NODE_CREATION_TEMPLATE, prepare_autograd_meta_str, GRAD_NODE_CREATION_TEMPLATE, prepare_autograd_meta_str,
compute_require_grad_args, pass_stop_gradient_args, compute_require_grad_args, op_type, pass_stop_gradient_args,
grad_node_creation_str); grad_node_creation_str);
return grad_node_creation_body_str; return grad_node_creation_body_str;
@@ -2083,22 +2082,24 @@ static std::string GenerateGradNodeHeaderContents(
  const char* GRAD_NODE_TEMPLATE =
      "class GradNode%s : public egr::GradNodeBase {\n"
      " public:\n"
      " GradNode%s() : egr::GradNodeBase() { VLOG(7) << \" Construct "
      "GradNode%s \"; }\n"
      " GradNode%s(size_t bwd_in_slot_num, size_t bwd_out_slot_num) : "
      "egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) { VLOG(7) << \" "
      "Construct GradNode%s \"; }\n"
      " ~GradNode%s() override { VLOG(6) << \" Destruct GradNode%s \"; }\n"
      "\n"
      " virtual std::vector<std::vector<paddle::experimental::Tensor>> "
      "operator()(const "
      "std::vector<std::vector<paddle::experimental::Tensor>>& grads) "
      "override;\n"
      "\n"
      " std::string name() override { return \" GradNode%s \"; } \n "
      "\n"
      " // SetX, SetY, ...\n"
      "%s\n"
      " // SetAttrMap\n"
      "%s\n"
      " std::string name() { return \"GradNode%s\"; }\n"
      "\n"
      " private:\n"
      " // TensorWrappers\n"
      "%s\n"
@@ -2195,8 +2196,8 @@ static std::string GenerateGradNodeHeaderContents(
  VLOG(6) << "Generated TensorWrapper";
  std::string grad_node_str = paddle::string::Sprintf(
      GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, op_type, op_type,
      op_type, op_type, set_tensor_wrappers_str, set_attr_map_str,
      tensor_wrapper_members_str, attr_members_str);
  return grad_node_str;
......
@@ -213,8 +213,12 @@ def ParseYamlReturns(string):
    returns = [x.strip() for x in string.strip().split(",")]
    for i in range(len(returns)):
        ret_type = returns[i]
        returns_list.append(["", ret, i])
        assert ret_type in yaml_types_mapping.keys()
        ret_type = yaml_types_mapping[ret_type]
        returns_list.append(["", ret_type, i])
    return returns_list
@@ -534,7 +538,7 @@ class {} : public egr::GradNodeBase {{
  virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
      const std::vector<std::vector<paddle::experimental::Tensor>>& grads) override;
  std::string name() override {{ return \" {} \"; }}
  // SetTensorWrapperX, SetTensorWrapperY, ...
  {}
  // SetAttributes
@@ -549,8 +553,9 @@ class {} : public egr::GradNodeBase {{
"""
    node_declaration_str = NODE_DECLARATION_TEMPLATE.format(
        grad_node_name, grad_node_name, grad_node_name, grad_node_name,
        grad_node_name, set_tensor_wrapper_methods_str,
        set_attribute_methods_str, tensor_wrapper_members_str,
        attribute_members_str)
    return node_declaration_str
......
@@ -48,12 +48,16 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
    }
    visited.insert(node);
PADDLE_ENFORCE_NOT_NULL(
node,
paddle::platform::errors::Fatal(
"We got null node when we traverse the backward graph, and this "
"should not happened please check your code and contact us."));
    // Find and append next nodes
    const std::vector<std::vector<Edge>>& edges = node->GetEdges();
    for (const auto& edge_list : edges) {
      for (const Edge& edge : edge_list) {
        GradNodeBase* next_node = edge.GetMutableGradNode().get();
        // Next node could be nullptr if it is leaf tensor with no
        // AccumulationNode attached
        // Or it could also originated from dispensable inputs
@@ -67,7 +71,6 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
      }
    }
  }
  return node_in_degree_map;
}
......
@@ -30,6 +30,7 @@
namespace egr {
GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) {
  VLOG(6) << "Construct GradNodeBase";
  bwd_in_meta_.resize(bwd_in_slot_num);
  bwd_out_meta_.resize(bwd_out_slot_num);
  // adj_edges has the same num as backward outputs
@@ -49,11 +50,15 @@ void GradNodeBase::AddEdges(std::vector<AutogradMeta*>* metas, size_t slot_id) {
    // its pre-ops
    if (meta && !meta->StopGradient()) {
      auto node = meta->GetMutableGradNode();
      if (node && node.get()) {
        VLOG(6) << "Add Edges for slot: " << slot_id
                << " which is: " << meta->GetMutableGradNode()->name();
        adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
                                         meta->OutRankInfo());
      } else {
        meta->SetGradNode(std::make_shared<egr::GradNodeAccumulation>(meta));
        VLOG(6) << "Add Edges for slot: " << slot_id
                << " which is: " << meta->GetMutableGradNode()->name();
        adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
                                         meta->OutRankInfo());
      }
@@ -70,7 +75,7 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) {
                        "inputs's slot num."));
  if (meta && !meta->StopGradient()) {
    auto node = meta->GetMutableGradNode();
    if (node && node.get()) {
      VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from "
              << this->name() << " to " << meta->GetMutableGradNode()->name();
      adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
......
@@ -76,10 +76,10 @@ class GradSlotMeta {
class GradNodeBase {
 public:
  GradNodeBase() { VLOG(6) << "Construct GradNodeBase"; }
  GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num);
  // TODO(jiabin): Should we have other constructor here?
  virtual ~GradNodeBase() { VLOG(6) << "Destruct GradNodeBase"; }
  /**
   * operator() designed to contian the real backward execution logic, it should
......
@@ -30,6 +30,7 @@ class GradTestNode : public egr::GradNodeBase {
  GradTestNode(float val, int in_num, int out_num)
      : GradNodeBase(in_num, out_num), val_(val) {}
  GradTestNode() : GradNodeBase() { val_ = 1.0; }
  std::string name() override { return "GradTestNode"; }
  std::vector<std::vector<paddle::experimental::Tensor>> operator()(
      const std::vector<std::vector<paddle::experimental::Tensor>>& grads)
      override {
......
@@ -122,9 +122,10 @@ paddle::experimental::Tensor* EagerUtils::mutable_grad(
void EagerUtils::SetHistory(std::vector<AutogradMeta*>* autograd_metas,
                            const std::shared_ptr<GradNodeBase>& grad_node) {
  for (const auto& autograd_meta : *autograd_metas) {
    if (autograd_meta->GradNode()) {
      VLOG(7) << "Should not set grad node twice, original node is:"
              << autograd_meta->GradNode()->name()
              << "current is: " << grad_node->name();
    }
    autograd_meta->SetGradNode(grad_node);
  }
@@ -132,11 +133,11 @@ void EagerUtils::SetHistory(std::vector<AutogradMeta*>* autograd_metas,
void EagerUtils::SetHistory(AutogradMeta* autograd_meta,
                            const std::shared_ptr<GradNodeBase>& grad_node) {
  if (autograd_meta->GradNode()) {
    VLOG(7) << "Should not set grad node twice, original node is:"
            << autograd_meta->GradNode()->name()
            << "current is: " << grad_node->name();
  }
  autograd_meta->SetGradNode(grad_node);
}
......
@@ -12,12 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <random>
#include <string>
#include <unordered_set>
#include <gtest/gtest.h>
#include <boost/logic/tribool.hpp>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
@@ -25,7 +26,7 @@
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/place.h"
USE_OP_ITSELF(batch_norm);
USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN);
USE_OP(conv2d_transpose);
USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN);
......
@@ -409,7 +409,7 @@ class ThreadPoolTempl {
      return false;
    }
    platform::RecordEvent("SleepWaitForWork",
                          platform::TracerEventType::UserDefined, 10);
    ec_.CommitWait(waiter);
    blocked_--;
    return true;
......
@@ -2106,6 +2106,8 @@ void OperatorWithKernel::BuildPhiKernelContext(
    for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
      phi::TensorBase* tensor_out = nullptr;
      auto* var = outs_vector[offset];
      if (var) {
        if (var->template IsType<framework::LoDTensor>()) {
          tensor_out = var->template GetMutable<framework::LoDTensor>();
        } else if (var->template IsType<phi::SelectedRows>()) {
@@ -2115,6 +2117,8 @@ void OperatorWithKernel::BuildPhiKernelContext(
              "Unsupported output `%s` type when call pt kernel.",
              framework::ToTypeName(var->Type())));
        }
      }
      pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
    }
@@ -2215,8 +2219,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
                                  vector_int_attr.end());
        pt_kernel_context->EmplaceBackAttr(vector_int64_attr);
      }
      // TODO(YuanRisheng) Need support vector<int64_t> attr
    } else if (attr_defs[i].type_index ==
               std::type_index(typeid(std::vector<int32_t>))) {
      const auto& vector_int_attr = BOOST_GET_CONST(std::vector<int>, attr);
......
@@ -314,6 +314,7 @@ void BuildDygraphPhiKernelContext(
      phi::TensorBase* tensor_out = nullptr;
      auto* var = outs_vector[offset]->MutableVar();
      if (var) {
        if (var->template IsType<phi::DenseTensor>()) {
          tensor_out = var->template GetMutable<phi::DenseTensor>();
        } else if (var->template IsType<phi::SelectedRows>()) {
@@ -323,6 +324,8 @@ void BuildDygraphPhiKernelContext(
              "Unsupported output `%s` type when call pt kernel.",
              framework::ToTypeName(var->Type())));
        }
      }
      kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out);
    }
    kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
......
@@ -1289,15 +1289,3 @@ REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp,
                  ops::BatchNormDoubleGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(batch_norm_grad_grad, ops::BatchNormDoubleGradOp,
                  ops::BatchNormDoubleGradOpInplaceInferer);
REGISTER_OP_CPU_KERNEL(
batch_norm, ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>,
ops::BatchNormKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
batch_norm_grad,
ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
batch_norm_grad_grad,
ops::BatchNormDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::BatchNormDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
@@ -41,1327 +41,5 @@ using CudnnDataType = platform::CudnnDataType<T>;
template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
template <typename T, framework::DataLayout layout>
static __global__ void BNForwardInference(
const T *x, const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance, const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias, const int C, const int N, const int HxW,
const double epsilon, T *y) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
int num = N * C * HxW;
for (int i = gid; i < num; i += stride) {
const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
BatchNormParamType<T> x_sub_mean =
static_cast<BatchNormParamType<T>>(x[i]) - mean[c];
BatchNormParamType<T> inv_var = 1 / sqrt(variance[c] + epsilon);
y[i] = static_cast<T>(scale[c] * x_sub_mean * inv_var + bias[c]);
}
}
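// For reference, BNForwardInference above evaluates, per element and per
// channel c (c derived from the flat index according to NCHW or NHWC layout):
//   y = scale[c] * (x - mean[c]) / sqrt(variance[c] + epsilon) + bias[c]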
template <typename T, int BlockDim, framework::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining(
const T *x, const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias, const int C, const int N, const int HxW,
const double epsilon, double exponentialAverageFactor, T *y,
BatchNormParamType<T> *mean, BatchNormParamType<T> *variance,
BatchNormParamType<T> *save_mean,
BatchNormParamType<T> *save_inv_variance) {
int outer_size = C;
int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage mean_storage;
__shared__ typename BlockReduce::TempStorage variance_storeage;
__shared__ BatchNormParamType<T> mean_val;
__shared__ BatchNormParamType<T> variance_val;
__shared__ BatchNormParamType<T> inv_var_val;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> x_square_sum = static_cast<BatchNormParamType<T>>(0);
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> x_i = static_cast<BatchNormParamType<T>>(x[index]);
x_sum += x_i;
x_square_sum += x_i * x_i;
}
x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum());
x_square_sum =
BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum());
if (threadIdx.x == 0) {
mean_val = x_sum / inner_size;
variance_val = x_square_sum / inner_size - mean_val * mean_val;
inv_var_val = 1 / sqrt(variance_val + epsilon);
if (save_mean && save_inv_variance) {
save_mean[i] = mean_val;
save_inv_variance[i] = inv_var_val;
}
mean[i] = (1 - exponentialAverageFactor) * mean_val +
exponentialAverageFactor * mean[i];
variance[i] = (1 - exponentialAverageFactor) * variance_val +
exponentialAverageFactor * variance[i];
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> x_sub_mean =
static_cast<BatchNormParamType<T>>(x[index]) - mean_val;
y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i];
}
}
}
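// For reference, BNForwardTraining above computes per-channel batch statistics
// with one CUDA block per channel and cub::BlockReduce:
//   mean_c     = sum(x) / (N * HxW)
//   variance_c = sum(x^2) / (N * HxW) - mean_c^2        (biased estimate)
//   inv_std_c  = 1 / sqrt(variance_c + epsilon)
// folds them into the running statistics as
//   running = (1 - exponentialAverageFactor) * batch
//             + exponentialAverageFactor * running
// and then writes y = scale[c] * (x - mean_c) * inv_std_c + bias[c].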
template <typename T>
class BatchNormKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::InvalidArgument("It must use CUDAPlace."));
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool trainable_stats = ctx.Attr<bool>("trainable_statistics");
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
bool test_mode = is_test && (!trainable_stats);
// Get the size for each dimension.
// NCHW [batch_size, in_channels, in_height, in_width]
const auto *x = ctx.Input<Tensor>("X");
const auto &x_dims = x->dims();
PADDLE_ENFORCE_EQ(
x_dims.size() >= 2 && x_dims.size() <= 5, true,
platform::errors::InvalidArgument(
"The size of input's dimensions should be between 2 and 5"
"But received: the size of input's dimensions is [%d]",
x_dims.size()));
auto *y = ctx.Output<Tensor>("Y");
y->mutable_data<T>(ctx.GetPlace());
int N, C, H, W, D;
ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
auto dtype = platform::CudnnDataType<T>::type;
#ifdef PADDLE_WITH_HIP
auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC
: DataLayout::kNCHW;
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// HIP do not support compute format of NHWC
// auto compute_format = DataLayout::kNCHW;
#else
const bool fast_nhwc_batch_norm =
test_mode ||
(dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent);
auto compute_format =
fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC
? DataLayout::kNHWC
: DataLayout::kNCHW;
#endif
Tensor transformed_x(x->type());
Tensor transformed_y(y->type());
if (data_layout == DataLayout::kNHWC &&
compute_format == DataLayout::kNCHW && x_dims.size() > 2) {
VLOG(3) << "Transform input tensor from NHWC to NCHW.";
ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, x,
&transformed_x);
TransToChannelFirst<platform::CUDADeviceContext, T>(ctx, x,
&transformed_x);
ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, y,
&transformed_y);
} else {
transformed_x.ShareDataWith(*x);
transformed_y.ShareDataWith(*y);
}
// ------------------- cudnn descriptors ---------------------
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// miopenTensorDescriptor_t data_desc_;
// miopenTensorDescriptor_t bn_param_desc_;
// miopenBatchNormMode_t mode_;
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t bn_param_desc_;
cudnnBatchNormMode_t mode_;
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
#endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// mode_ = miopenBNSpatial;
#elif CUDNN_VERSION_MIN(7, 0, 1)
if (FLAGS_cudnn_batchnorm_spatial_persistent) {
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
} else if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#else
if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#endif // CUDNN_VERSION_MIN(7, 0, 1)
VLOG(3) << "Setting descriptors.";
std::vector<int> dims;
std::vector<int> strides;
if (compute_format == DataLayout::kNCHW) {
dims = {N, C, H, W, D};
strides = {C * H * W * D, H * W * D, W * D, D, 1};
} else {
dims = {N, C, H, W, D};
strides = {H * W * D * C, 1, W * D * C, D * C, C};
}
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
// data_desc_, CudnnDataType<T>::type,
// x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
// const_cast<int *>(strides.data())));
// Note: PERSISTENT not implemented for inference
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDeriveBNTensorDescriptor(
// bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
// Note: PERSISTENT not implemented for inference
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor(
bn_param_desc_, data_desc_,
test_mode ? CUDNN_BATCHNORM_SPATIAL : mode_));
#endif
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *bias = ctx.Input<Tensor>("Bias");
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
// Now, depending on whether we are running test or not, we have two paths.
// It is training mode when it's not reference AND not using pre-trained
// model.
bool training = !test_mode && !use_global_stats;
if (!training) {
// only when test we use input to do computation.
const auto *est_mean = ctx.Input<Tensor>("Mean");
const auto *est_var = ctx.Input<Tensor>("Variance");
// Run inference mode.
PADDLE_ENFORCE_EQ(
est_mean->dims().size(), 1UL,
platform::errors::InvalidArgument(
"The size of mean's dimensions must equal to 1."
"But received: the size of mean's dimensions mean is [%d],"
"the dimensions of mean is [%s].",
est_mean->dims().size(), est_mean->dims()));
PADDLE_ENFORCE_EQ(
est_var->dims().size(), 1UL,
platform::errors::InvalidArgument(
"The size of variance's dimensions must equal to 1."
"But received: the size of variance's dimensions is [%d],"
"the dimensions of variance is [%s].",
est_var->dims().size(), est_var->dims()));
PADDLE_ENFORCE_EQ(
est_mean->dims()[0], C,
platform::errors::InvalidArgument(
"The first dimension of mean must equal to the number of "
"Channels, which is [%d]. But received: the first dimension"
"of mean is [%d], the dimensions of mean is [%s].",
C, est_mean->dims()[0], est_mean->dims()));
PADDLE_ENFORCE_EQ(
est_var->dims()[0], C,
platform::errors::InvalidArgument(
"The first dimension of variance must equal to the number"
"of Channels, which is [%d]. But received: the first dimension of"
"variance is [%d], the dimensions of variance is [%s].",
C, est_var->dims()[0], est_var->dims()));
#ifdef PADDLE_WITH_HIP
const int block_size = 256;
const int grid_size = (N * C * H * W * D + block_size - 1) / block_size;
if (compute_format == DataLayout::kNCHW) {
BNForwardInference<
T,
DataLayout::kNCHW><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
transformed_x.template data<T>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(),
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), C, N, H * W * D,
epsilon, transformed_y.template data<T>());
} else {
BNForwardInference<
T,
DataLayout::kNHWC><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
transformed_x.template data<T>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(),
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), C, N, H * W * D,
epsilon, transformed_y.template data<T>());
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationForwardInference(
// handle, miopenBNSpatial,
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kOne())),
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kZero())),
// data_desc_,
// static_cast<const void *>(transformed_x.template data<T>()),
// data_desc_,
// static_cast<void *>(
// transformed_y.template mutable_data<T>(ctx.GetPlace())),
// bn_param_desc_,
// const_cast<void *>(static_cast<const void *>(
// scale->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// bias->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// est_mean->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// est_var->template data<BatchNormParamType<T>>())),
// epsilon));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnBatchNormalizationForwardInference(
handle,
// Note: PERSISTENT not implemented for inference
CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_,
transformed_x.template data<T>(), data_desc_,
transformed_y.template mutable_data<T>(ctx.GetPlace()),
bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(), epsilon));
#endif
} else {
// if MomentumTensor is set, use MomentumTensor value, momentum
// is only used in this training branch
if (ctx.HasInput("MomentumTensor")) {
const auto *mom_tensor = ctx.Input<Tensor>("MomentumTensor");
Tensor mom_cpu;
paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(),
&mom_cpu);
momentum = mom_cpu.data<float>()[0];
}
// Run training mode.
// obtain running mean and running inv var, and there is no need
// to initialize them.
auto *mean_out = ctx.Output<Tensor>("MeanOut");
auto *variance_out = ctx.Output<Tensor>("VarianceOut");
mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
auto *saved_mean = ctx.Output<Tensor>("SavedMean");
auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
if ((N * H * W * D) == 1) {
// Only 1 element in normalization dimension,
// skip the batch norm calculation, let y = x.
framework::TensorCopy(*x, ctx.GetPlace(), y);
} else {
double this_factor = 1. - momentum;
bool called = false;
#if CUDNN_VERSION_MIN(7, 4, 1)
called = true;
size_t workspace_size = 0;
size_t reserve_space_size = 0;
void *reserve_space_ptr = nullptr;
void *workspace_ptr = nullptr;
Tensor workspace_tensor;
// Create reserve space and workspace for batch norm.
// Create tensor for each batchnorm op, it will be used in the
// backward. Thus this tensor shouldn't be temp.
auto *reserve_space = ctx.Output<Tensor>("ReserveSpace");
PADDLE_ENFORCE_NOT_NULL(
reserve_space,
platform::errors::NotFound(
"The argument ReserveSpace of batch_norm op is not found."));
// --------------- cudnn batchnorm workspace ---------------
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::
cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
/*handle=*/handle,
/*mode=*/mode_,
/*bnIps=*/CUDNN_BATCHNORM_OPS_BN,
/*xDesc=*/data_desc_,
/*zDesc=*/nullptr,
/*yDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/nullptr,
/*sizeInBytes=*/&workspace_size));
// -------------- cudnn batchnorm reserve space --------------
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::
cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
/*handle=*/handle,
/*mode=*/mode_,
/*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*activationDesc=*/nullptr,
/*xDesc=*/data_desc_,
/*sizeInBytes=*/&reserve_space_size));
reserve_space_ptr = reserve_space->mutable_data(
ctx.GetPlace(), transformed_x.type(), reserve_space_size);
workspace_ptr = workspace_tensor.mutable_data(
ctx.GetPlace(), transformed_x.type(), workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
handle, mode_, CUDNN_BATCHNORM_OPS_BN, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_,
transformed_x.template data<T>(), nullptr, nullptr, data_desc_,
transformed_y.template data<T>(), bn_param_desc_,
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), this_factor,
mean_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
variance_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon,
saved_mean->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
saved_variance->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
nullptr, workspace_ptr, workspace_size, reserve_space_ptr,
reserve_space_size));
#endif // CUDNN_VERSION_MIN(7, 4, 1)
if (!called) {
#ifdef PADDLE_WITH_HIP
const int num = transformed_x.numel();
const int block = 256;
const int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
const int grid = std::min(C, max_blocks);
if (compute_format == DataLayout::kNCHW) {
BNForwardTraining<
T, block,
DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>(
transformed_x.template data<T>(),
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), C, N, H * W * D,
epsilon, this_factor, transformed_y.template data<T>(),
mean_out->template data<BatchNormParamType<T>>(),
variance_out->template data<BatchNormParamType<T>>(),
saved_mean->template data<BatchNormParamType<T>>(),
saved_variance->template data<BatchNormParamType<T>>());
} else {
BNForwardTraining<
T, block,
DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>(
transformed_x.template data<T>(),
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), C, N, H * W * D,
epsilon, this_factor, transformed_y.template data<T>(),
mean_out->template data<BatchNormParamType<T>>(),
variance_out->template data<BatchNormParamType<T>>(),
saved_mean->template data<BatchNormParamType<T>>(),
saved_variance->template data<BatchNormParamType<T>>());
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationForwardTraining(
// handle, mode_, const_cast<void *>(static_cast<const void *>(
// CudnnDataType<T>::kOne())),
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kZero())),
// data_desc_,
// static_cast<const void *>(transformed_x.template data<T>()),
// data_desc_,
// static_cast<void *>(
// transformed_y.template mutable_data<T>(ctx.GetPlace())),
// bn_param_desc_,
// const_cast<void *>(static_cast<const void *>(
// scale->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// bias->template data<BatchNormParamType<T>>())),
// this_factor,
// static_cast<void *>(
// mean_out->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace())),
// static_cast<void *>(variance_out->template mutable_data<
// BatchNormParamType<T>>(ctx.GetPlace())),
// epsilon,
// static_cast<void *>(
// saved_mean->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace())),
// static_cast<void *>(saved_variance->template mutable_data<
// BatchNormParamType<T>>(ctx.GetPlace()))));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnBatchNormalizationForwardTraining(
handle, mode_, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_,
transformed_x.template data<T>(), data_desc_,
transformed_y.template mutable_data<T>(ctx.GetPlace()),
bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), this_factor,
mean_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
variance_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon,
saved_mean->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
saved_variance->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace())));
#endif
}
}
}
if (data_layout == DataLayout::kNHWC &&
compute_format == DataLayout::kNCHW && x_dims.size() > 2) {
VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
ctx, &transformed_y, y);
}
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// clean when exit.
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
#else
// clean when exit.
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
#endif
}
};
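// Forward path summary for the kernel above:
//  * inference / use_global_stats: cudnnBatchNormalizationForwardInference,
//    or the hand-written BNForwardInference kernel on the HIP build;
//  * training with CUDNN_VERSION >= 7.4.1: cudnnBatchNormalizationForwardTrainingEx
//    with an explicit workspace and a ReserveSpace output;
//  * otherwise: cudnnBatchNormalizationForwardTraining, or BNForwardTraining on HIP.
// A single-element normalization dimension (N * H * W * D == 1) simply copies x to y.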
template <typename T, int BlockDim, framework::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias(
const T *dy, const T *x, const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance, const double epsilon, const int N,
const int C, const int HxW, BatchNormParamType<T> *dscale,
BatchNormParamType<T> *dbias) {
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ds_storage;
__shared__ typename BlockReduce::TempStorage db_storage;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> inv_var_i = 1.0 / sqrt(variance[i] + epsilon);
BatchNormParamType<T> mean_i = mean[i];
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
ds_sum += static_cast<BatchNormParamType<T>>(dy[index]) *
(static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
db_sum += static_cast<BatchNormParamType<T>>(dy[index]);
}
ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
if (threadIdx.x == 0) {
dscale[i] = ds_sum * inv_var_i;
dbias[i] = db_sum;
}
__syncthreads();
}
}
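// For reference, KeBNBackwardScaleBias above reduces, per channel c:
//   dscale[c] = sum_i dy_i * (x_i - mean[c]) / sqrt(variance[c] + epsilon)
//   dbias[c]  = sum_i dy_i
// with one CUDA block per channel and cub::BlockReduce for both sums.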
template <typename T, framework::DataLayout layout>
static __global__ void KeBNBackwardData(const T *dy,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *variance,
const double epsilon, const int C,
const int HxW, const int num, T *dx) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < num; i += stride) {
const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
BatchNormParamType<T> inv_var = 1.0 / sqrt(variance[c] + epsilon);
dx[i] = static_cast<T>(static_cast<BatchNormParamType<T>>(dy[i]) *
scale[c] * inv_var);
}
}
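// For reference, KeBNBackwardData above is the use_global_stats form of the
// input gradient, where the statistics are treated as constants:
//   dx_i = dy_i * scale[c] / sqrt(variance[c] + epsilon)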
template <typename T>
static __global__ void KeBNRestoreData(const framework::DataLayout layout, T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance,
double epsilon, int C, int M,
const int num, const T *y) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < num; i += stride) {
const int c = layout == framework::DataLayout::kNCHW ? (i / M) % C : i % C;
auto y_i = static_cast<BatchNormParamType<T>>(y[i]);
auto x_i = (y_i - bias[c]) / scale[c] / variance[c] + mean[c];
x[i] = static_cast<T>(x_i);
}
}
template <typename T>
class InplaceHelper {
public:
void operator()(const framework::DataLayout layout, T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance, double epsilon, int C,
int M, const int num, const T *y, int grid2, const int block,
const gpuStream_t &stream) {
PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument(
"X and Y should be inplaced in inplace mode"));
KeBNRestoreData<<<grid2, block, 0, stream>>>(
layout, x, scale, bias, mean, variance, epsilon, C, M, num, y);
}
};
template <typename T, int BlockDim, framework::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward(
const T *dy, const T *x, const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *saved_mean,
const BatchNormParamType<T> *saved_inv_variance, const int C, const int N,
const int HxW, const double epsilon, T *dx, BatchNormParamType<T> *dscale,
BatchNormParamType<T> *dbias) {
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ds_storage;
__shared__ typename BlockReduce::TempStorage db_storage;
__shared__ typename BlockReduce::TempStorage mean_storage;
__shared__ typename BlockReduce::TempStorage variance_storeage;
__shared__ BatchNormParamType<T> inv_var_val;
__shared__ BatchNormParamType<T> mean_val;
__shared__ BatchNormParamType<T> dscale_val;
__shared__ BatchNormParamType<T> dbias_val;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
if (saved_mean && saved_inv_variance) {
if (threadIdx.x == 0) {
inv_var_val = saved_inv_variance[i];
mean_val = saved_mean[i];
}
} else {
BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> x_square_sum =
static_cast<BatchNormParamType<T>>(0);
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> x_i =
static_cast<BatchNormParamType<T>>(x[index]);
x_sum += x_i;
x_square_sum += x_i * x_i;
}
x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum());
x_square_sum =
BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum());
if (threadIdx.x == 0) {
mean_val = x_sum / inner_size;
inv_var_val =
1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon);
}
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> dy_i =
static_cast<BatchNormParamType<T>>(dy[index]);
ds_sum +=
dy_i * (static_cast<BatchNormParamType<T>>(x[index]) - mean_val);
db_sum += dy_i;
}
ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
if (threadIdx.x == 0) {
dscale_val = ds_sum * inv_var_val;
dbias_val = db_sum;
dscale[i] = dscale_val;
dbias[i] = dbias_val;
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
dx[index] = scale[i] * inv_var_val *
(static_cast<BatchNormParamType<T>>(dy[index]) -
dbias_val / static_cast<BatchNormParamType<T>>(inner_size) -
(static_cast<BatchNormParamType<T>>(x[index]) - mean_val) *
inv_var_val * dscale_val / inner_size);
}
}
}
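// For reference, BNBackward above implements the full training-mode input
// gradient. With m = N * HxW, mean/inv_std either read from the saved
// statistics or recomputed in-kernel, and dscale/dbias as stored above:
//   dx_i = scale[c] * inv_std_c *
//          (dy_i - dbias[c] / m - (x_i - mean_c) * inv_std_c * dscale[c] / m)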
template <typename T, int BlockDim, framework::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData(
const T *dy, const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *mean, const T *x,
const BatchNormParamType<T> *variance, const int C, const int N,
const int HxW, T *dx) {
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage dy_storage;
__shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage;
__shared__ BatchNormParamType<T> dy_sum_val;
__shared__ BatchNormParamType<T> dy_x_sub_mean_sum_val;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> inv_var_i = variance[i];
BatchNormParamType<T> mean_i = mean[i];
BatchNormParamType<T> dy_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> dy_x_sub_mean_sum =
static_cast<BatchNormParamType<T>>(0);
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> dy_i =
static_cast<BatchNormParamType<T>>(dy[index]);
dy_sum += dy_i;
dy_x_sub_mean_sum +=
dy_i * (static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
}
dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
dy_x_sub_mean_sum = BlockReduce(dy_x_sub_mean_storage)
.Reduce(dy_x_sub_mean_sum, cub::Sum());
if (threadIdx.x == 0) {
dy_sum_val = dy_sum;
dy_x_sub_mean_sum_val = dy_x_sub_mean_sum;
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
dx[index] =
(static_cast<BatchNormParamType<T>>(dy[index]) -
dy_sum_val / static_cast<BatchNormParamType<T>>(inner_size) -
(static_cast<BatchNormParamType<T>>(x[index]) - mean_i) *
dy_x_sub_mean_sum_val * inv_var_i * inv_var_i / inner_size) *
scale[i] * inv_var_i;
}
}
}
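// For reference, BNBackwardData above computes only dx (no parameter
// gradients); its `variance` argument is consumed directly as the saved
// inverse standard deviation (inv_var_i = variance[i]). With m = N * HxW:
//   dx_i = scale[c] * inv_var_c *
//          (dy_i - sum(dy) / m
//                - (x_i - mean_c) * inv_var_c^2 * sum(dy * (x - mean_c)) / m)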
template <typename T>
class BatchNormGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::InvalidArgument("It must use CUDAPlace."));
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *bias = ctx.Input<Tensor>("Bias");
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
// batch_norm with inplace as false will take X as grad input, which
// is same as cuDNN batch_norm backward calculation, batch_norm
// with inplace as true only take Y as input and X should be calculate
// by inverse operation of batch_norm on Y
const Tensor *x;
bool is_inplace;
if (ctx.HasInput("Y")) {
x = ctx.Input<Tensor>("Y");
is_inplace = true;
if (d_x) {
PADDLE_ENFORCE_EQ(d_x, d_y,
platform::errors::InvalidArgument(
"X@GRAD and Y@GRAD not inplace in inplace mode"));
}
} else {
x = ctx.Input<Tensor>("X");
is_inplace = false;
if (d_x) {
PADDLE_ENFORCE_NE(
d_x, d_y, platform::errors::InvalidArgument(
"X@GRAD and Y@GRAD inplaced in non-inplace mode"));
}
}
const bool is_test = ctx.Attr<bool>("is_test");
use_global_stats = is_test || use_global_stats;
const auto &x_dims = x->dims();
PADDLE_ENFORCE_EQ(
x_dims.size() >= 2 && x_dims.size() <= 5, true,
platform::errors::InvalidArgument(
"The size of input's dimensions should be between 2 and 5."
"But received: the size of input's dimensions is [%d],"
"the dimensions of input is [%s]",
x_dims.size(), x_dims));
int N, C, H, W, D;
ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
// init output
if (d_x) {
d_x->mutable_data<T>(ctx.GetPlace());
}
if (d_scale && d_bias) {
d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
}
PADDLE_ENFORCE_EQ(
scale->dims().size(), 1UL,
platform::errors::InvalidArgument(
"The size of scale's dimensions must equal to 1. But received: "
"the size of scale's dimensions is [%d], the dimensions of scale "
"is [%s].",
scale->dims().size(), scale->dims()));
PADDLE_ENFORCE_EQ(
scale->dims()[0], C,
platform::errors::InvalidArgument(
"The first dimension of scale must equal to Channels[%d]. But "
"received: the first dimension of scale is [%d]",
C, scale->dims()[0]));
auto dtype = platform::CudnnDataType<T>::type;
const auto *reserve_space = ctx.Input<Tensor>("ReserveSpace");
#ifdef PADDLE_WITH_HIP
auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC
: DataLayout::kNCHW;
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// HIP do not support compute format of NHWC
// auto compute_format = DataLayout::kNCHW;
#else
const bool fast_nhwc_batch_norm =
dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent &&
reserve_space != nullptr;
auto compute_format =
fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC
? DataLayout::kNHWC
: DataLayout::kNCHW;
#endif
Tensor transformed_x(x->type());
Tensor transformed_d_y(d_y->type());
Tensor transformed_d_x;
if (data_layout == DataLayout::kNHWC &&
compute_format == DataLayout::kNCHW && x_dims.size() > 2) {
VLOG(3) << "Transform input tensor from NHWC to NCHW.";
ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, x,
&transformed_x);
TransToChannelFirst<platform::CUDADeviceContext, T>(ctx, x,
&transformed_x);
ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, d_y,
&transformed_d_y);
TransToChannelFirst<platform::CUDADeviceContext, T>(ctx, d_y,
&transformed_d_y);
if (d_x) {
ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, d_x,
&transformed_d_x);
}
} else {
transformed_x.ShareDataWith(*x);
transformed_d_y.ShareDataWith(*d_y);
if (d_x) {
transformed_d_x.ShareDataWith(*d_x);
}
}
std::vector<int> dims;
std::vector<int> strides;
if (compute_format == DataLayout::kNCHW) {
dims = {N, C, H, W, D};
strides = {C * H * W * D, H * W * D, W * D, D, 1};
} else {
dims = {N, C, H, W, D};
strides = {H * W * C * D, 1, W * D * C, D * C, C};
}
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
const int num = transformed_x.numel();
#ifdef HIPCC
const int block = 256;
#else
const int block = 512;
#endif
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
int grid1 = (num + block - 1) / block;
int grid2 = std::min(C, max_blocks);
auto stream = dev_ctx.stream();
InplaceHelper<T> inplace_functor;
if (!use_global_stats) {
if ((N * H * W * D) == 1) {
if (d_x) {
framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
}
phi::funcs::SetConstant<platform::CUDADeviceContext,
BatchNormParamType<T>>
functor;
functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
functor(dev_ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
return;
}
// ------------------- cudnn descriptors ---------------------
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// miopenTensorDescriptor_t data_desc_;
// miopenTensorDescriptor_t bn_param_desc_;
// miopenBatchNormMode_t mode_;
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t bn_param_desc_;
cudnnBatchNormMode_t mode_;
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
#endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// mode_ = miopenBNSpatial;
#elif CUDNN_VERSION_MIN(7, 0, 1)
if (FLAGS_cudnn_batchnorm_spatial_persistent) {
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
} else if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#else
if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#endif // CUDNN_VERSION_MIN(7, 0, 1)
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
// data_desc_, CudnnDataType<T>::type,
// x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
// const_cast<int *>(strides.data())));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_,
// data_desc_, mode_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
data_desc_, mode_));
#endif
const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
const auto *saved_mean_data =
saved_mean->template data<BatchNormParamType<T>>();
const auto *saved_var_data =
saved_var->template data<BatchNormParamType<T>>();
if (is_inplace) {
inplace_functor(compute_format, transformed_x.data<T>(),
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(),
saved_mean_data, saved_var_data, epsilon, C, H * W * D,
num, transformed_x.data<T>(), grid2, block, stream);
}
// This branch calls CUDNN APIs
if (d_x && d_scale && d_bias) {
bool called = false;
#if CUDNN_VERSION_MIN(7, 4, 1)
called = true;
size_t workspace_size = 0;
void *workspace_ptr = nullptr;
Tensor workspace_tensor;
auto reserve_space_size = reserve_space->memory_size();
// --------------- cudnn batchnorm workspace ---------------
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::
cudnnGetBatchNormalizationBackwardExWorkspaceSize(
/*handle=*/dev_ctx.cudnn_handle(),
/*mode=*/mode_,
/*bnIps=*/CUDNN_BATCHNORM_OPS_BN,
/*xDesc=*/data_desc_,
/*yDesc=*/data_desc_,
/*dyDesc=*/data_desc_,
/*dzDesc=*/nullptr,
/*dxDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/nullptr,
/*sizeInBytes=*/&workspace_size));
workspace_ptr = workspace_tensor.mutable_data(
ctx.GetPlace(), transformed_x.type(), workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnBatchNormalizationBackwardEx(
/*handle=*/dev_ctx.cudnn_handle(),
/*mode=*/mode_,
/*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*alphaDataDiff=*/CudnnDataType<T>::kOne(),
/*betaDataDiff=*/CudnnDataType<T>::kZero(),
/*alphaParamDiff=*/CudnnDataType<T>::kOne(),
/*betaParamDiff=*/CudnnDataType<T>::kZero(),
/*xDesc=*/data_desc_,
/*xData=*/transformed_x.template data<T>(),
/*yDesc=*/nullptr,
/*yData=*/nullptr,
/*dyDesc=*/data_desc_,
/*dyData=*/transformed_d_y.template data<T>(),
/*dzDesc=*/nullptr,
/*dzData=*/nullptr,
/*dxDesc=*/data_desc_,
/*dxData=*/transformed_d_x.template mutable_data<T>(
ctx.GetPlace()),
/*dBnScaleBiasDesc=*/bn_param_desc_,
/*bnScaleData=*/scale->template data<BatchNormParamType<T>>(),
/*bnBiasData=*/nullptr,
/*dBnScaleData=*/d_scale
->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
/*dBnBiasData=*/d_bias
->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
/*epsilon=*/epsilon,
/*savedMean=*/saved_mean_data,
/*savedInvVariance=*/saved_var_data,
/*activationDesc=*/nullptr,
/*workspace=*/workspace_ptr,
/*workSpaceSizeInBytes=*/workspace_size,
/*reserveSpace=*/const_cast<T *>(
reserve_space->template data<T>()),
/*reserveSpaceSizeInBytes=*/reserve_space_size));
#endif // CUDNN_VERSION_MIN(7, 4, 1)
if (!called) {
#ifdef PADDLE_WITH_HIP
if (compute_format == DataLayout::kNCHW) {
BNBackward<
T, block,
DataLayout::kNCHW><<<grid2, block, 0, dev_ctx.stream()>>>(
transformed_d_y.template data<T>(),
transformed_x.template data<T>(),
scale->template data<BatchNormParamType<T>>(), saved_mean_data,
saved_var_data, C, N, H * W * D, epsilon,
transformed_d_x.template data<T>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()));
} else {
BNBackward<
T, block,
DataLayout::kNHWC><<<grid2, block, 0, dev_ctx.stream()>>>(
transformed_d_y.template data<T>(),
transformed_x.template data<T>(),
scale->template data<BatchNormParamType<T>>(), saved_mean_data,
saved_var_data, C, N, H * W * D, epsilon,
transformed_d_x.template data<T>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()));
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationBackward(
// dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
// CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
// CudnnDataType<T>::kZero(), data_desc_,
// transformed_x.template data<T>(), data_desc_,
// transformed_d_y.template data<T>(), data_desc_,
// transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
// bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
// d_scale->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace()),
// d_bias->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace()),
// epsilon, saved_mean_data, saved_var_data));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnBatchNormalizationBackward(
dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_,
transformed_x.template data<T>(), data_desc_,
transformed_d_y.template data<T>(), data_desc_,
transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon, saved_mean_data, saved_var_data));
#endif
}
if (data_layout == DataLayout::kNHWC &&
compute_format == DataLayout::kNCHW) {
VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
ctx, &transformed_d_x, d_x);
}
} else {
// This branch call CUDA kernels
if (compute_format == DataLayout::kNCHW) {
if (d_x) {
BNBackwardData<T, block, framework::DataLayout::kNCHW><<<
grid2, block, 0, dev_ctx.stream()>>>(
d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
saved_mean_data, x->data<T>(), saved_var_data, C, N, H * W * D,
d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T, block,
framework::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
d_y->data<T>(), x->data<T>(), saved_mean_data, saved_var_data,
epsilon, N, C, H * W * D,
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
} else {
if (d_x) {
BNBackwardData<T, block, framework::DataLayout::kNHWC><<<
grid2, block, 0, dev_ctx.stream()>>>(
d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
saved_mean_data, x->data<T>(), saved_var_data, C, N, H * W * D,
d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T, block,
framework::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
d_y->data<T>(), x->data<T>(), saved_mean_data, saved_var_data,
epsilon, N, C, H * W * D,
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
}
}
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// clean when exit.
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
#else
// clean when exit.
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
#endif
} else {
const auto *running_mean = ctx.Input<Tensor>("Mean");
const auto *running_var = ctx.Input<Tensor>("Variance");
const auto *running_mean_data =
running_mean->template data<BatchNormParamType<T>>();
const auto *running_var_data =
running_var->template data<BatchNormParamType<T>>();
if (is_inplace) {
auto px = *x;
inplace_functor(data_layout, px.mutable_data<T>(ctx.GetPlace()),
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(),
running_mean_data, running_var_data, epsilon, C,
H * W * D, num, x->data<T>(), grid2, block, stream);
}
if (compute_format == DataLayout::kNCHW) {
if (d_x) {
KeBNBackwardData<
T, framework::DataLayout::kNCHW><<<grid1, block, 0, stream>>>(
d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
running_var_data, epsilon, C, H * W, num, d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T, block,
framework::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
epsilon, N, C, H * W * D, d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
} else {
if (d_x) {
KeBNBackwardData<
T, framework::DataLayout::kNHWC><<<grid1, block, 0, stream>>>(
d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
running_var_data, epsilon, C, H * W, num, d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T, block,
framework::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
epsilon, N, C, H * W * D, d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
}
}
}
};
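// Backward path summary for the kernel above:
//  * batch statistics (!use_global_stats) with all of dX/dScale/dBias requested:
//    cudnnBatchNormalizationBackwardEx (CUDNN_VERSION >= 7.4.1), the plain
//    cudnnBatchNormalizationBackward otherwise, or the BNBackward kernel on HIP;
//  * batch statistics with only some gradients requested: the BNBackwardData
//    and/or KeBNBackwardScaleBias CUDA kernels driven by the saved statistics;
//  * use_global_stats (or is_test): KeBNBackwardData / KeBNBackwardScaleBias
//    driven by the running mean and variance.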
template <typename T>
class BatchNormDoubleGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *X = ctx.Input<Tensor>("X");
const auto *Scale = ctx.Input<Tensor>("Scale");
const auto *dY = ctx.Input<Tensor>("DY");
const auto *Saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *Saved_variance = ctx.Input<Tensor>("SavedVariance");
const double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool is_test = ctx.Attr<bool>("is_test");
PADDLE_ENFORCE_EQ(
is_test, false,
platform::errors::InvalidArgument(
"`is_test = True` CANNOT be used in train program. If "
"you want to use global status in pre_train model, "
"please set `use_global_stats = True`"));
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const auto *ddX = ctx.Input<Tensor>("DDX");
const auto *ddScale = ctx.Input<Tensor>("DDScale");
const auto *ddBias = ctx.Input<Tensor>("DDBias");
auto *dX = ctx.Output<Tensor>("DX");
auto *dScale = ctx.Output<Tensor>("DScale");
auto *ddY = ctx.Output<Tensor>("DDY");
NormDoubleGradFunctor<platform::CUDADeviceContext, T>(
ctx, data_layout, X, Scale, dY, Saved_mean, Saved_variance, epsilon,
use_global_stats, ddX, ddScale, ddBias, dX, dScale, ddY);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
#ifdef PADDLE_WITH_HIP
// MIOPEN do not support double
REGISTER_OP_CUDA_KERNEL(
batch_norm, ops::BatchNormKernel<plat::CUDADeviceContext, float>,
ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>,
ops::BatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
batch_norm_grad_grad,
ops::BatchNormDoubleGradKernel<plat::CUDADeviceContext, float>);
#else
REGISTER_OP_CUDA_KERNEL(
batch_norm, ops::BatchNormKernel<plat::CUDADeviceContext, float>,
ops::BatchNormKernel<plat::CUDADeviceContext, double>,
ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>,
ops::BatchNormGradKernel<plat::CUDADeviceContext, double>,
ops::BatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
batch_norm_grad_grad,
ops::BatchNormDoubleGradKernel<plat::CUDADeviceContext, float>,
ops::BatchNormDoubleGradKernel<plat::CUDADeviceContext, double>);
#endif
...@@ -25,10 +25,10 @@ limitations under the License. */
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#endif
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/operators/math/padding.h"
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/kernels/funcs/padding.h"
DECLARE_bool(cudnn_deterministic);
DECLARE_uint64(conv_workspace_size_limit);
...@@ -148,7 +148,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim);
bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_input;
std::vector<int> padding_common(data_dim, 0);
...@@ -196,13 +196,13 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
T pad_value(0.0);
switch (rank) {
case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_input_channel, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input);
} break;
case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_input_channel, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input);
} break;
default:
...@@ -488,7 +488,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
// cuDNN only supports padding the same amount on every dimension.
// So we create a new padded input tensor.
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim);
bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_input(input->type());
Tensor transformed_input_grad(input->type());
std::vector<int> padding_common(data_dim, 0);
...@@ -544,13 +544,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
T pad_value(0.0);
switch (rank) {
case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_input_channel, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input);
} break;
case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_input_channel, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input);
} break;
default:
...@@ -956,7 +956,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim);
bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_X(X->type());
Tensor transformed_ddX(X->type());
...@@ -1004,20 +1004,22 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
T pad_value(0.0);
switch (rank) {
case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_X_channel, pad_value, &transformed_X);
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_X_channel, pad_value,
&transformed_X);
if (ddX) {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_ddX_channel, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_ddX_channel, pad_value,
&transformed_ddX);
}
} break;
case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_X_channel, pad_value, &transformed_X);
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_X_channel, pad_value,
&transformed_X);
if (ddX) {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_ddX_channel, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_ddX_channel, pad_value,
&transformed_ddX);
}
} break;
...
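Every padding hunk in this file follows the same mechanical substitution: `math::IsSymmetricPadding` / `math::PadFunction` from `paddle/fluid/operators/math/padding.h` are replaced by their `phi::funcs` counterparts from `paddle/phi/kernels/funcs/padding.h`, and the phi variants take the device context (`dev_ctx`) instead of the framework `ExecutionContext` (`ctx`). The fragment below assembles the new call shape from the hunks above; it is an illustrative excerpt, not a standalone compilable unit:

// New-style call site (identifiers as they appear in the hunks above).
// Note: dev_ctx is the CUDADeviceContext already available in these kernels.
bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
T pad_value(0.0);
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
    dev_ctx, input_pad, transformed_input_channel, pad_value,
    &transformed_input);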
...@@ -21,8 +21,8 @@ limitations under the License. */ ...@@ -21,8 +21,8 @@ limitations under the License. */
#include "paddle/fluid/operators/conv_cudnn_helper.h" #include "paddle/fluid/operators/conv_cudnn_helper.h"
#endif #endif
#include "paddle/fluid/operators/conv_transpose_op.h" #include "paddle/fluid/operators/conv_transpose_op.h"
#include "paddle/fluid/operators/math/padding.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/padding.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -108,7 +108,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> { ...@@ -108,7 +108,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
in_data_dims, strides, ksize); in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
std::vector<int> input_pad(input_transpose.dims().size() * 2, 0); std::vector<int> input_pad(input_transpose.dims().size() * 2, 0);
Tensor transformed_input; Tensor transformed_input;
...@@ -139,12 +139,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> { ...@@ -139,12 +139,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
T pad_value(0.0); T pad_value(0.0);
switch (rank) { switch (rank) {
case 4: { case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, input_transpose, pad_value, &transformed_input); dev_ctx, input_pad, input_transpose, pad_value,
&transformed_input);
} break; } break;
case 5: { case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, input_transpose, pad_value, &transformed_input); dev_ctx, input_pad, input_transpose, pad_value,
&transformed_input);
} break; } break;
default: default:
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
...@@ -375,7 +377,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -375,7 +377,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
in_data_dims, strides, ksize); in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
std::vector<int> input_pad(input_transpose.dims().size() * 2, 0); std::vector<int> input_pad(input_transpose.dims().size() * 2, 0);
Tensor transformed_output_grad; Tensor transformed_output_grad;
...@@ -407,13 +409,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -407,13 +409,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
T pad_value(0.0); T pad_value(0.0);
switch (rank) { switch (rank) {
case 4: { case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, output_grad_transpose, pad_value, dev_ctx, input_pad, output_grad_transpose, pad_value,
&transformed_output_grad); &transformed_output_grad);
} break; } break;
case 5: { case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, output_grad_transpose, pad_value, dev_ctx, input_pad, output_grad_transpose, pad_value,
&transformed_output_grad); &transformed_output_grad);
} break; } break;
default: default:
...@@ -735,7 +737,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> { ...@@ -735,7 +737,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
in_data_dims, strides, ksize); in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_X(X->type()); Tensor transformed_X(X->type());
Tensor transformed_ddX(X->type()); Tensor transformed_ddX(X->type());
...@@ -794,26 +796,28 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> { ...@@ -794,26 +796,28 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
T pad_value(0.0); T pad_value(0.0);
switch (rank) { switch (rank) {
case 4: { case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); dev_ctx, input_pad, transformed_X_channel, pad_value,
&transformed_X);
if (dO) { if (dO) {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_dO_channel, pad_value, dev_ctx, input_pad, transformed_dO_channel, pad_value,
&transformed_dO); &transformed_dO);
} }
if (ddX) { if (ddX) {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_ddX_channel, pad_value, dev_ctx, input_pad, transformed_ddX_channel, pad_value,
&transformed_ddX); &transformed_ddX);
} }
} break; } break;
case 5: { case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); dev_ctx, input_pad, transformed_X_channel, pad_value,
&transformed_X);
if (ddX) { if (ddX) {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_ddX_channel, pad_value, dev_ctx, input_pad, transformed_ddX_channel, pad_value,
&transformed_ddX); &transformed_ddX);
} }
} break; } break;
......
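The hunks above only swap the math:: helpers for their phi::funcs counterparts and pass dev_ctx instead of the execution context; the padding logic itself is unchanged: when the explicit paddings are not symmetric, the input is pre-padded before cuDNN sees it. As a rough standalone sketch of the symmetry check being relied on (assumed semantics only — the real phi::funcs::IsSymmetricPadding in paddle/phi/kernels/funcs/padding.h may differ in detail):

#include <cassert>
#include <vector>

// Paddings are stored as {before_0, after_0, before_1, after_1, ...}.
// They are "symmetric" when before_i == after_i for every spatial dim,
// in which case cuDNN can take them directly and no explicit pre-pad
// of the input tensor is needed.
static bool IsSymmetricPaddingSketch(const std::vector<int>& pads,
                                     int data_dim) {
  assert(static_cast<int>(pads.size()) == 2 * data_dim);
  for (int i = 0; i < data_dim; ++i) {
    if (pads[2 * i] != pads[2 * i + 1]) return false;
  }
  return true;
}

int main() {
  assert(IsSymmetricPaddingSketch({1, 1, 2, 2}, 2));   // symmetric
  assert(!IsSymmetricPaddingSketch({1, 0, 2, 2}, 2));  // asymmetric
  return 0;
}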
...@@ -184,15 +184,15 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, ...@@ -184,15 +184,15 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
bool is_fix_seed, int seed_val, const Tensor& x, bool is_fix_seed, int seed_val, const Tensor& x,
const Tensor* seed, Tensor* mask, Tensor* y) { const Tensor* seed, Tensor* mask, Tensor* y) {
auto& place = *dev_ctx.eigen_device(); auto& place = *dev_ctx.eigen_device();
if (!is_test) {
int64_t x_numel = x.numel(); int64_t x_numel = x.numel();
auto stream = dev_ctx.stream(); auto stream = dev_ctx.stream();
auto* x_data = x.data<T>();
auto* y_data = y->data<T>();
if (!is_test) {
auto* mask_data = mask->data<uint8_t>(); auto* mask_data = mask->data<uint8_t>();
size_t size = phi::product(mask->dims()); size_t size = phi::product(mask->dims());
auto* x_data = x.data<T>();
auto* y_data = y->data<T>();
if (dropout_prob == 1.0f) { if (dropout_prob == 1.0f) {
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
...@@ -254,12 +254,24 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, ...@@ -254,12 +254,24 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
} }
#endif #endif
} else { } else {
auto X = EigenMatrix<T>::Reshape(x, 1);
auto Y = EigenMatrix<T>::Reshape(*y, 1);
if (upscale_in_train) { if (upscale_in_train) {
Y.device(place) = X; // todo: can y share data with x directly?
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
hipMemcpyAsync(y_data, x_data, sizeof(T) * x_numel,
hipMemcpyDeviceToDevice, stream));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemcpyAsync(y_data, x_data, sizeof(T) * x_numel,
cudaMemcpyDeviceToDevice, stream));
#endif
} else { } else {
Y.device(place) = X * static_cast<T>(1.0f - dropout_prob); T factor = static_cast<T>(1.0f - dropout_prob);
std::vector<const framework::Tensor*> ins = {&x};
std::vector<framework::Tensor*> outs = {y};
auto functor = phi::funcs::ScaleFunctor<T>(factor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
} }
} }
} }
......
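This dropout change replaces the Eigen assignment Y = X with an explicit device-to-device memcpy for the is_test + upscale_in_train path, and the Eigen scale with a ScaleFunctor launched through LaunchSameDimsElementwiseCudaKernel. The inference-time semantics are unchanged; a minimal CPU-side sketch of just that math (not the CUDA path, names below are illustrative):

#include <cassert>
#include <vector>

// Inference-time dropout:
//  * upscale_in_train == true  -> y = x          (training already rescaled)
//  * upscale_in_train == false -> y = x * (1-p)  (downscale at inference)
std::vector<float> DropoutInfer(const std::vector<float>& x, float p,
                                bool upscale_in_train) {
  std::vector<float> y(x.size());
  const float factor = upscale_in_train ? 1.0f : (1.0f - p);
  for (size_t i = 0; i < x.size(); ++i) y[i] = x[i] * factor;
  return y;
}

int main() {
  std::vector<float> x = {2.f, 4.f};
  auto a = DropoutInfer(x, 0.5f, true);   // {2, 4}: plain copy, like the memcpy
  auto b = DropoutInfer(x, 0.5f, false);  // {1, 2}: scaled by (1 - p)
  assert(a[0] == 2.f && b[0] == 1.f);
  return 0;
}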
...@@ -17,8 +17,8 @@ limitations under the License. */ ...@@ -17,8 +17,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/operators/math/padding.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/kernels/funcs/padding.h"
DECLARE_int64(cudnn_exhaustive_search_times); DECLARE_int64(cudnn_exhaustive_search_times);
...@@ -86,7 +86,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -86,7 +86,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
in_data_dims, strides, ksize); in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_input; Tensor transformed_input;
std::vector<int> padding_common(data_dim, 0); std::vector<int> padding_common(data_dim, 0);
...@@ -118,13 +118,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -118,13 +118,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
T pad_value(0.0); T pad_value(0.0);
switch (rank) { switch (rank) {
case 4: { case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_input_channel, pad_value, dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input); &transformed_input);
} break; } break;
case 5: { case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_input_channel, pad_value, dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input); &transformed_input);
} break; } break;
default: default:
......
...@@ -32,7 +32,7 @@ namespace platform = paddle::platform; ...@@ -32,7 +32,7 @@ namespace platform = paddle::platform;
namespace op = paddle::operators; namespace op = paddle::operators;
using Tensor = paddle::framework::Tensor; using Tensor = paddle::framework::Tensor;
USE_OP(batch_norm); USE_OP_ITSELF(batch_norm);
USE_CUDA_ONLY_OP(fused_bn_add_activation); USE_CUDA_ONLY_OP(fused_bn_add_activation);
USE_CUDA_ONLY_OP(fused_bn_add_activation_grad); USE_CUDA_ONLY_OP(fused_bn_add_activation_grad);
......
...@@ -45,6 +45,8 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> { ...@@ -45,6 +45,8 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
axis = static_cast<int>(cpu_axis.data<int32_t>()[0]); axis = static_cast<int>(cpu_axis.data<int32_t>()[0]);
} else if (axis_type == framework::proto::VarType::INT64) { } else if (axis_type == framework::proto::VarType::INT64) {
axis = static_cast<int>(cpu_axis.data<int64_t>()[0]); axis = static_cast<int>(cpu_axis.data<int64_t>()[0]);
} else if (axis_type == framework::proto::VarType::INT16) {
axis = static_cast<int>(cpu_axis.data<int16_t>()[0]);
} }
} }
const auto &place = ctx.GetPlace(); const auto &place = ctx.GetPlace();
...@@ -57,6 +59,9 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> { ...@@ -57,6 +59,9 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
} else if (index_type == framework::proto::VarType::INT64) { } else if (index_type == framework::proto::VarType::INT64) {
phi::funcs::GatherV2CUDAFunction<T, int64_t>(x, index, axis, output, phi::funcs::GatherV2CUDAFunction<T, int64_t>(x, index, axis, output,
dev_ctx); dev_ctx);
} else if (index_type == framework::proto::VarType::INT16) {
phi::funcs::GatherV2CUDAFunction<T, int16_t>(x, index, axis, output,
dev_ctx);
} }
return; return;
} }
...@@ -67,6 +72,8 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> { ...@@ -67,6 +72,8 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
phi::funcs::GPUGather<T, int>(dev_ctx, *x, *index, output); phi::funcs::GPUGather<T, int>(dev_ctx, *x, *index, output);
} else if (index_type == framework::proto::VarType::INT64) { } else if (index_type == framework::proto::VarType::INT64) {
phi::funcs::GPUGather<T, int64_t>(dev_ctx, *x, *index, output); phi::funcs::GPUGather<T, int64_t>(dev_ctx, *x, *index, output);
} else if (index_type == framework::proto::VarType::INT16) {
phi::funcs::GPUGather<T, int16_t>(dev_ctx, *x, *index, output);
} }
} }
}; };
...@@ -134,6 +141,7 @@ REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>, ...@@ -134,6 +141,7 @@ REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>,
ops::GatherOpCUDAKernel<double>, ops::GatherOpCUDAKernel<double>,
ops::GatherOpCUDAKernel<int64_t>, ops::GatherOpCUDAKernel<int64_t>,
ops::GatherOpCUDAKernel<int>, ops::GatherOpCUDAKernel<int>,
ops::GatherOpCUDAKernel<int16_t>,
ops::GatherOpCUDAKernel<plat::float16>, ops::GatherOpCUDAKernel<plat::float16>,
ops::GatherOpCUDAKernel<plat::bfloat16>); ops::GatherOpCUDAKernel<plat::bfloat16>);
REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>, REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>,
......
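The gather hunks add an INT16 branch for both the axis tensor and the index tensor, plus an int16_t kernel registration. A standalone sketch of the flat gather that the index-type dispatch ultimately drives (assumption: 1-D case only; the real GPUGather / GatherV2CUDAFunction handle arbitrary ranks on the GPU):

#include <cassert>
#include <cstdint>
#include <vector>

// Gather elements of `src` at positions given by `index`.
// Templating on IndexT is what the operator's dispatch on
// INT16/INT32/INT64 boils down to.
template <typename T, typename IndexT>
std::vector<T> Gather(const std::vector<T>& src,
                      const std::vector<IndexT>& index) {
  std::vector<T> out(index.size());
  for (size_t i = 0; i < index.size(); ++i) {
    out[i] = src[static_cast<size_t>(index[i])];
  }
  return out;
}

int main() {
  std::vector<float> src = {10.f, 20.f, 30.f};
  std::vector<int16_t> idx = {2, 0};  // int16 indices now supported
  auto out = Gather(src, idx);
  assert(out[0] == 30.f && out[1] == 10.f);
  return 0;
}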
...@@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -21,20 +24,6 @@ class GatherTreeOp : public framework::OperatorWithKernel { ...@@ -21,20 +24,6 @@ class GatherTreeOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Ids"), "Input", "Ids", "GatherTree");
OP_INOUT_CHECK(ctx->HasInput("Parents"), "Input", "Parents", "GatherTree");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GatherTree");
auto ids_dims = ctx->GetInputDim("Ids");
auto parents_dims = ctx->GetInputDim("Parents");
PADDLE_ENFORCE_EQ(ids_dims == parents_dims, true,
platform::errors::InvalidArgument(
"The shape of Input(Parents) must be same with the "
"shape of Input(Ids)."));
ctx->SetOutputDim("Out", ids_dims);
}
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
...@@ -72,4 +61,8 @@ selected ids. ...@@ -72,4 +61,8 @@ selected ids.
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker); DELCARE_INFER_SHAPE_FUNCTOR(gather_tree, GatherTreeInferShapeFunctor,
PT_INFER_META(phi::GatherTreeMeta));
REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker,
GatherTreeInferShapeFunctor);
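The hand-written InferShape above is deleted and replaced by an infer-shape functor registered with the operator, so the shape check now lives in phi::GatherTreeMeta. A shape-only sketch of the logic that moves (assumption: it reproduces just the removed dimension check, not the phi InferMeta API):

#include <cassert>
#include <cstdint>
#include <stdexcept>
#include <vector>

// The removed InferShape enforced shape(Parents) == shape(Ids),
// and Out takes the shape of Ids.
std::vector<int64_t> GatherTreeInferShapeSketch(
    const std::vector<int64_t>& ids_dims,
    const std::vector<int64_t>& parents_dims) {
  if (ids_dims != parents_dims) {
    throw std::invalid_argument(
        "The shape of Input(Parents) must be same with the shape of "
        "Input(Ids).");
  }
  return ids_dims;  // shape of Out
}

int main() {
  auto out = GatherTreeInferShapeSketch({5, 2, 3}, {5, 2, 3});
  assert(out.size() == 3 && out[0] == 5);
  return 0;
}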
...@@ -26,27 +26,6 @@ namespace paddle { ...@@ -26,27 +26,6 @@ namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
template <typename T>
class CPUGaussianRandomKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
float mean = context.Attr<float>("mean");
float std = context.Attr<float>("std");
auto* tensor = context.Output<framework::Tensor>("Out");
std::normal_distribution<T> dist(mean, std);
auto shape = GetShape(context);
tensor->Resize(shape);
int64_t size = tensor->numel();
T* data = tensor->mutable_data<T>(context.GetPlace());
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
auto engine = framework::GetCPURandomEngine(seed);
for (int64_t i = 0; i < size; ++i) {
data[i] = dist(*engine);
}
}
}; // namespace operators
template <typename T> template <typename T>
class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> { class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
...@@ -194,8 +173,6 @@ Used to initialize tensors with gaussian random generator. ...@@ -194,8 +173,6 @@ Used to initialize tensors with gaussian random generator.
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp, REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp,
ops::GaussianRandomOpMaker); ops::GaussianRandomOpMaker);
REGISTER_OP_CPU_KERNEL(gaussian_random, ops::CPUGaussianRandomKernel<float>,
ops::CPUGaussianRandomKernel<double>);
REGISTER_OP_CPU_KERNEL(gaussian_random_batch_size_like, REGISTER_OP_CPU_KERNEL(gaussian_random_batch_size_like,
ops::CPUGaussianRandomBatchSizeLikeKernel<float>, ops::CPUGaussianRandomBatchSizeLikeKernel<float>,
ops::CPUGaussianRandomBatchSizeLikeKernel<double>); ops::CPUGaussianRandomBatchSizeLikeKernel<double>);
......
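The CPU gaussian_random kernel and its registration are dropped here because the kernel has moved to phi; only the batch_size_like variant stays in fluid. For reference, the removed kernel's core was a seeded std::normal_distribution fill, roughly as in this standalone sketch (GetCPURandomEngine and the attribute plumbing are Paddle-specific and omitted):

#include <cassert>
#include <random>
#include <vector>

// Fill `n` values with N(mean, stddev) samples from a seeded engine,
// mirroring what the deleted CPUGaussianRandomKernel did per element.
std::vector<float> GaussianFill(size_t n, float mean, float stddev,
                                unsigned int seed) {
  std::mt19937_64 engine(seed);
  std::normal_distribution<float> dist(mean, stddev);
  std::vector<float> data(n);
  for (auto& v : data) v = dist(engine);
  return data;
}

int main() {
  auto a = GaussianFill(4, 0.f, 1.f, 42);
  auto b = GaussianFill(4, 0.f, 1.f, 42);
  assert(a == b);  // same seed -> same samples
  return 0;
}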
...@@ -52,53 +52,6 @@ struct GaussianGenerator { ...@@ -52,53 +52,6 @@ struct GaussianGenerator {
} }
}; };
template <typename T>
class GPUGaussianRandomKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* tensor = context.Output<framework::Tensor>("Out");
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
bool seed_flag = false;
if (seed == 0) {
std::random_device rd;
seed = rd();
seed_flag = true;
}
T mean = static_cast<T>(context.Attr<float>("mean"));
T std = static_cast<T>(context.Attr<float>("std"));
auto shape = GetShape(context);
tensor->Resize(shape);
auto& dev_cxt =
context.template device_context<platform::CUDADeviceContext>();
T* data = tensor->mutable_data<T>(dev_cxt.GetPlace());
int64_t size = tensor->numel();
int device_id = context.GetPlace().GetDeviceId();
auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
if (gen_cuda->GetIsInitPy() && seed_flag) {
if (FLAGS_use_curand) {
using MT = typename details::MPTypeTrait<T>::Type;
distribution::normal_distribution<MT> dist;
distribution::normal_transform<MT> trans(mean, std);
distribution::distribution_and_transform<T>(dev_cxt, tensor, dist,
trans);
} else {
auto seed_offset = gen_cuda->IncrementOffset(1);
int64_t gen_offset = size * seed_offset.second;
auto func =
GaussianGenerator<T>(mean, std, seed_offset.first, gen_offset);
IndexKernel<T, GaussianGenerator<T>>(dev_cxt, tensor, func);
}
} else {
auto func = GaussianGenerator<T>(mean, std, seed);
IndexKernel<T, GaussianGenerator<T>>(dev_cxt, tensor, func);
}
}
};
template <typename T> template <typename T>
class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> { class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
public: public:
...@@ -136,11 +89,6 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> { ...@@ -136,11 +89,6 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OP_CUDA_KERNEL(
gaussian_random,
paddle::operators::GPUGaussianRandomKernel<paddle::platform::float16>,
paddle::operators::GPUGaussianRandomKernel<float>,
paddle::operators::GPUGaussianRandomKernel<double>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
gaussian_random_batch_size_like, gaussian_random_batch_size_like,
paddle::operators::GPUGaussianRandomBatchSizeLikeKernel< paddle::operators::GPUGaussianRandomBatchSizeLikeKernel<
......
...@@ -17,6 +17,8 @@ ...@@ -17,6 +17,8 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/phi/kernels/batch_norm_grad_kernel.h"
#include "paddle/phi/kernels/batch_norm_kernel.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -202,8 +204,7 @@ class InplaceABNOpGradMaker : public framework::SingleGradOpMaker<T> { ...@@ -202,8 +204,7 @@ class InplaceABNOpGradMaker : public framework::SingleGradOpMaker<T> {
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class InplaceABNKernel class InplaceABNKernel : public framework::OpKernel<T> {
: public paddle::operators::BatchNormKernel<DeviceContext, T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<Tensor>("X"); auto* x = ctx.Input<Tensor>("X");
...@@ -213,7 +214,33 @@ class InplaceABNKernel ...@@ -213,7 +214,33 @@ class InplaceABNKernel
auto activation = auto activation =
GetInplaceABNActivationType(ctx.Attr<std::string>("activation")); GetInplaceABNActivationType(ctx.Attr<std::string>("activation"));
auto& place = *ctx.template device_context<DeviceContext>().eigen_device(); auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
BatchNormKernel<DeviceContext, T>::Compute(ctx);
auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto* mean = ctx.Input<Tensor>("Mean");
auto* variance = ctx.Input<Tensor>("Variance");
auto momentum = ctx.Attr<float>("momentum");
auto epsilon = ctx.Attr<float>("epsilon");
auto data_layout = ctx.Attr<std::string>("data_layout");
auto is_test = ctx.Attr<bool>("is_test");
auto use_global_stats = ctx.Attr<bool>("use_global_stats");
auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
auto* mean_out = ctx.Output<Tensor>("MeanOut");
auto* variance_out = ctx.Output<Tensor>("VarianceOut");
auto* saved_mean = ctx.Output<Tensor>("SavedMean");
auto* saved_variance = ctx.Output<Tensor>("SavedVariance");
auto* reserve_space = ctx.Output<Tensor>("ReserveSpace");
auto& dev_ctx = ctx.device_context<DeviceContext>();
phi::BatchNormKernel<T>(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout,
is_test, use_global_stats, trainable_statistics, fuse_with_relu, y,
mean_out, variance_out, saved_mean, saved_variance, reserve_space);
auto cur_y = EigenVector<T>::Flatten(*y); auto cur_y = EigenVector<T>::Flatten(*y);
InplaceABNActivation<DeviceContext, T> functor; InplaceABNActivation<DeviceContext, T> functor;
...@@ -222,8 +249,7 @@ class InplaceABNKernel ...@@ -222,8 +249,7 @@ class InplaceABNKernel
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class InplaceABNGradKernel class InplaceABNGradKernel : public framework::OpKernel<T> {
: public paddle::operators::BatchNormGradKernel<DeviceContext, T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* y = ctx.Input<Tensor>("Y"); auto* y = ctx.Input<Tensor>("Y");
...@@ -244,7 +270,52 @@ class InplaceABNGradKernel ...@@ -244,7 +270,52 @@ class InplaceABNGradKernel
InplaceABNActivation<DeviceContext, T> functor; InplaceABNActivation<DeviceContext, T> functor;
functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy); functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy);
BatchNormGradKernel<DeviceContext, T>::Compute(ctx); // BatchNormGradKernel<DeviceContext, T>::Compute(ctx);
auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto* saved_mean = ctx.Input<Tensor>("SavedMean");
auto* saved_variance = ctx.Input<Tensor>("SavedVariance");
auto momentum = ctx.Attr<float>("momentum");
auto epsilon = ctx.Attr<float>("epsilon");
auto data_layout = ctx.Attr<std::string>("data_layout");
auto is_test = ctx.Attr<bool>("is_test");
auto use_global_stats = ctx.Attr<bool>("use_global_stats");
auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
auto* scale_grad = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto* bias_grad = ctx.Output<Tensor>(framework::GradVarName("Bias"));
auto* reserve_space = ctx.Input<Tensor>("ReserveSpace");
auto* mean = ctx.Input<Tensor>("ReserveSpace");
auto* variance = ctx.Input<Tensor>("ReserveSpace");
paddle::optional<const Tensor&> space_opt = paddle::none;
paddle::optional<const Tensor&> mean_opt = paddle::none;
paddle::optional<const Tensor&> variance_opt = paddle::none;
if (reserve_space != nullptr) {
space_opt = *reserve_space;
}
if (mean != nullptr) {
mean_opt = *mean;
}
if (variance != nullptr) {
variance_opt = *variance;
}
auto& dev_ctx = ctx.device_context<DeviceContext>();
phi::BatchNormGradRawKernel<T>(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*d_y, *y, *scale, *bias, *saved_mean, *saved_variance, space_opt,
mean_opt, variance_opt, momentum, epsilon, data_layout, is_test,
use_global_stats, trainable_statistics, fuse_with_relu, true, d_x,
scale_grad, bias_grad);
} }
}; };
......
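InplaceABNKernel and InplaceABNGradKernel no longer inherit the fluid BatchNorm kernels; they gather the inputs and attributes themselves and call phi::BatchNormKernel / phi::BatchNormGradRawKernel directly, then apply the activation on the same buffer. A loose standalone sketch of the in-place forward composition only (assumption: single channel, leaky-relu as the activation; the operator supports several activations and full NCHW/NHWC layouts):

#include <cassert>
#include <cmath>
#include <vector>

// In-place activated batch norm, forward only, for one channel:
// the normalized values are written back into the input buffer and the
// activation is applied to the same buffer, which is the memory-saving
// idea behind inplace_abn.
void InplaceABNForward(std::vector<float>* x, float epsilon,
                       float leaky_slope) {
  // 1) batch-norm style normalization over the vector.
  float mean = 0.f, var = 0.f;
  for (float v : *x) mean += v;
  mean /= x->size();
  for (float v : *x) var += (v - mean) * (v - mean);
  var /= x->size();
  const float inv_std = 1.f / std::sqrt(var + epsilon);
  // 2) write normalization and activation into the same buffer.
  for (float& v : *x) {
    v = (v - mean) * inv_std;           // normalize in place
    v = v > 0.f ? v : leaky_slope * v;  // leaky-relu in place
  }
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f, 4.f};
  InplaceABNForward(&x, 1e-5f, 0.01f);
  assert(x.size() == 4);
  return 0;
}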
...@@ -15,14 +15,15 @@ limitations under the License. */ ...@@ -15,14 +15,15 @@ limitations under the License. */
#include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/operators/inplace_abn_op.h" #include "paddle/fluid/operators/inplace_abn_op.h"
#include "paddle/fluid/operators/sync_batch_norm_op.cu.h" #include "paddle/fluid/operators/sync_batch_norm_op.cu.h"
#include "paddle/phi/kernels/batch_norm_grad_kernel.h"
#include "paddle/phi/kernels/batch_norm_kernel.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class InplaceABNKernel class InplaceABNKernel
: public paddle::operators::SyncBatchNormKernel<DeviceContext, T>, : public paddle::operators::SyncBatchNormKernel<DeviceContext, T> {
public paddle::operators::BatchNormKernel<DeviceContext, T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* y = ctx.Output<Tensor>("Y"); auto* y = ctx.Output<Tensor>("Y");
...@@ -36,7 +37,33 @@ class InplaceABNKernel ...@@ -36,7 +37,33 @@ class InplaceABNKernel
if (ctx.Attr<bool>("use_sync_bn")) { if (ctx.Attr<bool>("use_sync_bn")) {
SyncBatchNormKernel<DeviceContext, T>::Compute(ctx); SyncBatchNormKernel<DeviceContext, T>::Compute(ctx);
} else { } else {
BatchNormKernel<DeviceContext, T>::Compute(ctx); // BatchNormKernel<DeviceContext, T>::Compute(ctx);
auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto* mean = ctx.Input<Tensor>("Mean");
auto* variance = ctx.Input<Tensor>("Variance");
auto momentum = ctx.Attr<float>("momentum");
auto epsilon = ctx.Attr<float>("epsilon");
auto data_layout = ctx.Attr<std::string>("data_layout");
auto is_test = ctx.Attr<bool>("is_test");
auto use_global_stats = ctx.Attr<bool>("use_global_stats");
auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
auto* mean_out = ctx.Output<Tensor>("MeanOut");
auto* variance_out = ctx.Output<Tensor>("VarianceOut");
auto* saved_mean = ctx.Output<Tensor>("SavedMean");
auto* saved_variance = ctx.Output<Tensor>("SavedVariance");
auto* reserve_space = ctx.Output<Tensor>("ReserveSpace");
auto& dev_ctx = ctx.device_context<DeviceContext>();
phi::BatchNormKernel<T>(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout,
is_test, use_global_stats, trainable_statistics, fuse_with_relu, y,
mean_out, variance_out, saved_mean, saved_variance, reserve_space);
} }
auto cur_y = EigenVector<T>::Flatten(*y); auto cur_y = EigenVector<T>::Flatten(*y);
...@@ -49,8 +76,7 @@ class InplaceABNKernel ...@@ -49,8 +76,7 @@ class InplaceABNKernel
// https://kevinzakka.github.io/2016/09/14/batch_normalization/ // https://kevinzakka.github.io/2016/09/14/batch_normalization/
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class InplaceABNGradKernel class InplaceABNGradKernel
: public paddle::operators::SyncBatchNormGradKernel<DeviceContext, T>, : public paddle::operators::SyncBatchNormGradKernel<DeviceContext, T> {
public paddle::operators::BatchNormGradKernel<DeviceContext, T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const auto* y = ctx.Input<Tensor>("Y"); const auto* y = ctx.Input<Tensor>("Y");
...@@ -74,7 +100,50 @@ class InplaceABNGradKernel ...@@ -74,7 +100,50 @@ class InplaceABNGradKernel
if (ctx.Attr<bool>("use_sync_bn")) { if (ctx.Attr<bool>("use_sync_bn")) {
SyncBatchNormGradKernel<DeviceContext, T>::Compute(ctx); SyncBatchNormGradKernel<DeviceContext, T>::Compute(ctx);
} else { } else {
BatchNormGradKernel<DeviceContext, T>::Compute(ctx); auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto* saved_mean = ctx.Input<Tensor>("SavedMean");
auto* saved_variance = ctx.Input<Tensor>("SavedVariance");
auto momentum = ctx.Attr<float>("momentum");
auto epsilon = ctx.Attr<float>("epsilon");
auto data_layout = ctx.Attr<std::string>("data_layout");
auto is_test = ctx.Attr<bool>("is_test");
auto use_global_stats = ctx.Attr<bool>("use_global_stats");
auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
auto* scale_grad = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto* bias_grad = ctx.Output<Tensor>(framework::GradVarName("Bias"));
auto* reserve_space = ctx.Input<Tensor>("ReserveSpace");
auto* mean = ctx.Input<Tensor>("ReserveSpace");
auto* variance = ctx.Input<Tensor>("ReserveSpace");
paddle::optional<const Tensor&> space_opt = paddle::none;
paddle::optional<const Tensor&> mean_opt = paddle::none;
paddle::optional<const Tensor&> variance_opt = paddle::none;
if (reserve_space != nullptr) {
space_opt = *reserve_space;
}
if (mean != nullptr) {
mean_opt = *mean;
}
if (variance != nullptr) {
variance_opt = *variance;
}
auto& dev_ctx = ctx.device_context<DeviceContext>();
phi::BatchNormGradRawKernel<T>(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*d_y, *y, *scale, *bias, *saved_mean, *saved_variance, space_opt,
mean_opt, variance_opt, momentum, epsilon, data_layout, is_test,
use_global_stats, trainable_statistics, fuse_with_relu, true, d_x,
scale_grad, bias_grad);
} }
} }
}; };
......
...@@ -389,11 +389,12 @@ __global__ void DoubleGradComputeDDYWithGlobal( ...@@ -389,11 +389,12 @@ __global__ void DoubleGradComputeDDYWithGlobal(
} }
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, void NormDoubleGradFunctor(const DeviceContext &ctx,
const DataLayout data_layout, const Tensor *X, const DataLayout data_layout, const Tensor *X,
const Tensor *Scale, const Tensor *dY, const Tensor *Scale, const Tensor *dY,
const Tensor *Saved_mean, const Tensor *Saved_mean,
const Tensor *Saved_variance, const double epsilon, const Tensor *Saved_variance, const Tensor *Mean,
const Tensor *Variance, const double epsilon,
const bool use_global_stats, const Tensor *ddX, const bool use_global_stats, const Tensor *ddX,
const Tensor *ddScale, const Tensor *ddBias, const Tensor *ddScale, const Tensor *ddBias,
Tensor *dX, Tensor *dScale, Tensor *ddY) { Tensor *dX, Tensor *dScale, Tensor *ddY) {
...@@ -404,8 +405,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, ...@@ -404,8 +405,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data<T>()); const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data<T>());
const T *ddbias_data = (ddBias == nullptr ? nullptr : ddBias->data<T>()); const T *ddbias_data = (ddBias == nullptr ? nullptr : ddBias->data<T>());
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); phi::funcs::SetConstant<DeviceContext, T> set_constant;
phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_constant;
auto &x_dims = X->dims(); auto &x_dims = X->dims();
const int C = (data_layout == DataLayout::kNCHW ? x_dims[1] const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
...@@ -416,7 +416,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, ...@@ -416,7 +416,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
Tensor scale_tmp; Tensor scale_tmp;
if (!Scale) { if (!Scale) {
scale_tmp.mutable_data<T>({C}, ctx.GetPlace()); scale_tmp.mutable_data<T>({C}, ctx.GetPlace());
set_constant(dev_ctx, &scale_tmp, static_cast<T>(1)); set_constant(ctx, &scale_tmp, static_cast<T>(1));
} }
const T *scale_data = Scale ? Scale->data<T>() : scale_tmp.data<T>(); const T *scale_data = Scale ? Scale->data<T>() : scale_tmp.data<T>();
#ifdef __HIPCC__ #ifdef __HIPCC__
...@@ -424,15 +424,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, ...@@ -424,15 +424,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
#else #else
const int block = 512; const int block = 512;
#endif #endif
int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); int max_threads = ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1); const int max_blocks = std::max(max_threads / block, 1);
int grid = std::min(C, max_blocks); int grid = std::min(C, max_blocks);
int grid1 = (num + block - 1) / block; int grid1 = (num + block - 1) / block;
const T *mean_data, *variance_data; const T *mean_data, *variance_data;
if (use_global_stats) { if (use_global_stats) {
const auto *running_mean = ctx.Input<Tensor>("Mean"); const auto *running_mean = Mean;
const auto *running_var = ctx.Input<Tensor>("Variance"); const auto *running_var = Variance;
const auto *running_mean_data = running_mean->template data<T>(); const auto *running_mean_data = running_mean->template data<T>();
const auto *running_var_data = running_var->template data<T>(); const auto *running_var_data = running_var->template data<T>();
mean_data = running_mean_data; mean_data = running_mean_data;
...@@ -440,34 +440,35 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, ...@@ -440,34 +440,35 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
} else { } else {
const T *smean_data = Saved_mean->data<T>(); const T *smean_data = Saved_mean->data<T>();
const T *svariance_data = Saved_variance->data<T>(); const T *svariance_data = Saved_variance->data<T>();
mean_data = smean_data; mean_data = smean_data;
variance_data = svariance_data; variance_data = svariance_data;
} }
if (dX) { if (dX) {
T *dx_data = dX->mutable_data<T>(ctx.GetPlace()); T *dx_data = dX->mutable_data<T>(ctx.GetPlace());
set_constant(dev_ctx, dX, static_cast<T>(0)); set_constant(ctx, dX, static_cast<T>(0));
if (use_global_stats) { if (use_global_stats) {
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDXWithGlobal< DoubleGradComputeDXWithGlobal<
T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>( T, DataLayout::kNHWC><<<grid1, block, 0, ctx.stream()>>>(
dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num,
dx_data); dx_data);
} else { } else {
DoubleGradComputeDXWithGlobal< DoubleGradComputeDXWithGlobal<
T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>( T, DataLayout::kNCHW><<<grid1, block, 0, ctx.stream()>>>(
dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num,
dx_data); dx_data);
} }
} else { } else {
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDX< DoubleGradComputeDX<
T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>( T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, x_data, mean_data, variance_data, ddx_data, dy_data, scale_data,
ddscale_data, N, C, sample_size, epsilon, dx_data); ddscale_data, N, C, sample_size, epsilon, dx_data);
} else { } else {
DoubleGradComputeDX< DoubleGradComputeDX<
T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>( T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, x_data, mean_data, variance_data, ddx_data, dy_data, scale_data,
ddscale_data, N, C, sample_size, epsilon, dx_data); ddscale_data, N, C, sample_size, epsilon, dx_data);
} }
...@@ -475,28 +476,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, ...@@ -475,28 +476,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
} }
if (dScale) { if (dScale) {
T *dscale_data = dScale->mutable_data<T>(ctx.GetPlace()); T *dscale_data = dScale->mutable_data<T>(ctx.GetPlace());
set_constant(dev_ctx, dScale, static_cast<T>(0)); set_constant(ctx, dScale, static_cast<T>(0));
if (use_global_stats) { if (use_global_stats) {
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDScaleWithGlobal< DoubleGradComputeDScaleWithGlobal<
T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>( T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
ddx_data, variance_data, dy_data, epsilon, N, C, sample_size, ddx_data, variance_data, dy_data, epsilon, N, C, sample_size,
dscale_data); dscale_data);
} else { } else {
DoubleGradComputeDScaleWithGlobal< DoubleGradComputeDScaleWithGlobal<
T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>( T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
ddx_data, variance_data, dy_data, epsilon, N, C, sample_size, ddx_data, variance_data, dy_data, epsilon, N, C, sample_size,
dscale_data); dscale_data);
} }
} else { } else {
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDScale< DoubleGradComputeDScale<
T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>( T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
x_data, mean_data, variance_data, ddx_data, dy_data, N, C, x_data, mean_data, variance_data, ddx_data, dy_data, N, C,
sample_size, epsilon, dscale_data); sample_size, epsilon, dscale_data);
} else { } else {
DoubleGradComputeDScale< DoubleGradComputeDScale<
T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>( T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
x_data, mean_data, variance_data, ddx_data, dy_data, N, C, x_data, mean_data, variance_data, ddx_data, dy_data, N, C,
sample_size, epsilon, dscale_data); sample_size, epsilon, dscale_data);
} }
...@@ -504,28 +505,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, ...@@ -504,28 +505,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
} }
if (ddY) { if (ddY) {
T *ddy_data = ddY->mutable_data<T>(ctx.GetPlace()); T *ddy_data = ddY->mutable_data<T>(ctx.GetPlace());
set_constant(dev_ctx, ddY, static_cast<T>(0)); set_constant(ctx, ddY, static_cast<T>(0));
if (use_global_stats) { if (use_global_stats) {
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDDYWithGlobal< DoubleGradComputeDDYWithGlobal<
T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>( T, DataLayout::kNHWC><<<grid1, block, 0, ctx.stream()>>>(
ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data,
ddscale_data, epsilon, C, sample_size, num, ddy_data); ddscale_data, epsilon, C, sample_size, num, ddy_data);
} else { } else {
DoubleGradComputeDDYWithGlobal< DoubleGradComputeDDYWithGlobal<
T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>( T, DataLayout::kNCHW><<<grid1, block, 0, ctx.stream()>>>(
ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data,
ddscale_data, epsilon, C, sample_size, num, ddy_data); ddscale_data, epsilon, C, sample_size, num, ddy_data);
} }
} else { } else {
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDDY< DoubleGradComputeDDY<
T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>( T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
x_data, mean_data, variance_data, ddscale_data, ddbias_data, x_data, mean_data, variance_data, ddscale_data, ddbias_data,
ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data);
} else { } else {
DoubleGradComputeDDY< DoubleGradComputeDDY<
T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>( T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
x_data, mean_data, variance_data, ddscale_data, ddbias_data, x_data, mean_data, variance_data, ddscale_data, ddbias_data,
ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data);
} }
......
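NormDoubleGradFunctor now receives the device context (plus explicit Mean/Variance tensors) instead of the ExecutionContext, so set_constant and every kernel launch use ctx directly; the launch-shape arithmetic is untouched. A quick worked sketch of that arithmetic (standalone numbers, chosen for illustration):

#include <algorithm>
#include <cassert>

// Launch-shape arithmetic used in the double-grad functor:
//   grid  : capped by max_threads / block, at least 1, at most C
//   grid1 : ceiling division so every one of `num` elements gets a thread
int main() {
  const int block = 512;
  const int max_threads = 4096;
  const int C = 32;
  const int num = 1000;

  const int max_blocks = std::max(max_threads / block, 1);
  const int grid = std::min(C, max_blocks);
  const int grid1 = (num + block - 1) / block;

  assert(max_blocks == 8);
  assert(grid == 8);   // limited by max_blocks, not by C
  assert(grid1 == 2);  // ceil(1000 / 512)
  return 0;
}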
...@@ -20,7 +20,7 @@ limitations under the License. */ ...@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/padding.h" #include "paddle/phi/kernels/funcs/padding.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -50,7 +50,8 @@ class PadConstantLikeKernel : public framework::OpKernel<T> { ...@@ -50,7 +50,8 @@ class PadConstantLikeKernel : public framework::OpKernel<T> {
pads[j * 2 + 1] = static_cast<int>(in_x->dims()[j] - in_y->dims()[j]); pads[j * 2 + 1] = static_cast<int>(in_x->dims()[j] - in_y->dims()[j]);
} }
math::PaddingFunctor<DeviceContext, T>(rank, context, pads, pad_value, phi::funcs::PaddingFunctor<DeviceContext, T>(
rank, context.template device_context<DeviceContext>(), pads, pad_value,
*in_y, out); *in_y, out);
} }
}; };
...@@ -82,7 +83,8 @@ class PadConstantLikeGradKernel : public framework::OpKernel<T> { ...@@ -82,7 +83,8 @@ class PadConstantLikeGradKernel : public framework::OpKernel<T> {
pads[j * 2 + 1] = static_cast<int>(in_dout->dims()[j] - in_y->dims()[j]); pads[j * 2 + 1] = static_cast<int>(in_dout->dims()[j] - in_y->dims()[j]);
} }
math::PaddingGradFunctor<DeviceContext, T>(rank, context, pads, *in_dout, phi::funcs::PaddingGradFunctor<DeviceContext, T>(
rank, context.template device_context<DeviceContext>(), pads, *in_dout,
d_y); d_y);
} }
}; };
......
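pad_constant_like now calls phi::funcs::PaddingFunctor / PaddingGradFunctor with the device context pulled out of the execution context; the pad amounts are still the trailing differences between the two shapes. A standalone sketch of that computation in the 1-D case (assumption: back-padding only, as in pads[2*j+1]):

#include <cassert>
#include <vector>

// pad_constant_like pads `y` up to the shape of `x`:
// for every dim j, pads[2*j] = 0 and pads[2*j+1] = x_dim[j] - y_dim[j].
// Shown here for one dimension with a constant pad value.
std::vector<float> PadConstantLike1D(const std::vector<float>& x,
                                     const std::vector<float>& y,
                                     float pad_value) {
  assert(y.size() <= x.size());
  std::vector<float> out(y);
  out.resize(x.size(), pad_value);  // pad only at the back, like pads[2j+1]
  return out;
}

int main() {
  std::vector<float> x(5, 0.f);
  std::vector<float> y = {1.f, 2.f, 3.f};
  auto out = PadConstantLike1D(x, y, 9.f);
  assert(out.size() == 5 && out[3] == 9.f && out[4] == 9.f);
  return 0;
}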
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/pad_op.h"
#include <memory> #include <memory>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/complex.h"
namespace paddle { namespace paddle {
...@@ -167,40 +167,3 @@ REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker, ...@@ -167,40 +167,3 @@ REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker,
REGISTER_OPERATOR(pad_grad, ops::PadOpGrad, REGISTER_OPERATOR(pad_grad, ops::PadOpGrad,
ops::PadOpDoubleGradMaker<paddle::framework::OpDesc>, ops::PadOpDoubleGradMaker<paddle::framework::OpDesc>,
ops::PadOpDoubleGradMaker<paddle::imperative::OpBase>); ops::PadOpDoubleGradMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
pad, ops::PadKernel<paddle::platform::CPUDeviceContext, float>,
ops::PadKernel<paddle::platform::CPUDeviceContext, double>,
ops::PadKernel<paddle::platform::CPUDeviceContext, int>,
ops::PadKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::PadKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::PadKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CPU_KERNEL(
pad_grad, ops::PadGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::PadGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::PadGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::PadGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
pad, ops::PadKernel<paddle::platform::CUDADeviceContext, double>,
ops::PadKernel<paddle::platform::CUDADeviceContext, float>,
ops::PadKernel<paddle::platform::CUDADeviceContext, int>,
ops::PadKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::PadKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::PadKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::PadKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
pad_grad, ops::PadGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::PadGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::PadGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::PadGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::PadGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/padding.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class PadKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto pads = context.Attr<std::vector<int>>("paddings");
float pad_value = context.Attr<float>("pad_value");
auto* x = context.Input<Tensor>("X");
auto* out = context.Output<Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
int rank = x->dims().size();
math::PaddingFunctor<DeviceContext, T>(rank, context, pads,
static_cast<T>(pad_value), *x, out);
}
};
template <typename DeviceContext, typename T>
class PadGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto pads = context.Attr<std::vector<int>>("paddings");
auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
if (d_x == nullptr) {
return;
}
d_x->mutable_data<T>(context.GetPlace());
int rank = d_out->dims().size();
math::PaddingGradFunctor<DeviceContext, T>(rank, context, pads, *d_out,
d_x);
}
};
} // namespace operators
} // namespace paddle
...@@ -20,9 +20,11 @@ namespace cub = hipcub; ...@@ -20,9 +20,11 @@ namespace cub = hipcub;
#endif #endif
#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/math.h" #include "paddle/fluid/operators/math.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -42,71 +44,86 @@ static inline int NumBlocks(const int N) { ...@@ -42,71 +44,86 @@ static inline int NumBlocks(const int N) {
} }
template <typename T> template <typename T>
__global__ void GPUSigmoidForward(const T *x_data, const T *label_data, struct NonzeroFunctor {
const int ignore_index, const int limit, HOSTDEVICE explicit inline NonzeroFunctor() {}
T *out_data, T *counts) { HOSTDEVICE inline T operator()(const T x) const {
CUDA_KERNEL_LOOP(i, limit) { return static_cast<T>(static_cast<double>(x) != 0);
T x = x_data[i]; }
T label = label_data[i]; };
template <typename T>
struct SigmoidFwdFunctor {
T ignore_index_;
T eps = static_cast<T>(1e-5); T eps = static_cast<T>(1e-5);
T diff = label - static_cast<T>(ignore_index);
HOSTDEVICE inline SigmoidFwdFunctor(const T ignore_index)
: ignore_index_(ignore_index) {}
HOSTDEVICE inline phi::Array<T, 2> operator()(const T x, const T label) {
T counts;
T out_data;
T diff = label - static_cast<T>(ignore_index_);
if ((diff > -eps) && (diff < eps)) { if ((diff > -eps) && (diff < eps)) {
out_data[i] = static_cast<T>(0.); out_data = static_cast<T>(0.);
counts[i] = 0; counts = 0;
} else { } else {
T term1 = (x > 0) ? x : 0; T term1 = (x > 0) ? x : 0;
T term2 = x * label; T term2 = x * label;
T term3 = real_log(static_cast<T>(1) + real_exp(static_cast<T>(-abs(x)))); T term3 = real_log(static_cast<T>(1) + real_exp(static_cast<T>(-abs(x))));
out_data[i] = term1 - term2 + term3;
counts[i] = 1;
}
}
}
template <typename T, int BlockDim> out_data = term1 - term2 + term3;
__global__ void Sum(const T *counts, int num, const T eps, T *sum) { counts = 1;
typedef cub::BlockReduce<double, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
T in = 0;
for (int i = threadIdx.x; i < num; i += BlockDim) {
in += counts[i];
}
__syncthreads();
auto out =
BlockReduce(temp_storage).Reduce(static_cast<double>(in), cub::Sum());
__syncthreads();
if (threadIdx.x == 0) {
T a = out > eps ? out : eps;
sum[0] = a;
} }
} phi::Array<T, 2> outs;
template <typename T> outs[0] = out_data;
__global__ void Div(T *loss, const int num, const T *norm) { outs[1] = counts;
CUDA_KERNEL_LOOP(i, num) { loss[i] /= norm[0]; } return outs;
} }
};
template <typename T> template <typename T>
__global__ void GPUSigmoidBackward(const T *x_data, const T *label_data, struct SigmoidBwdFunctor {
const int ignore_index, const T *dout_data, T ignore_index_;
const int limit, T *dx_data, T *counts) {
CUDA_KERNEL_LOOP(i, limit) {
T x = x_data[i];
T label = label_data[i];
T dout = dout_data[i];
T eps = static_cast<T>(1e-5); T eps = static_cast<T>(1e-5);
T diff = label - static_cast<T>(ignore_index);
HOSTDEVICE inline SigmoidBwdFunctor(const T ignore_index)
: ignore_index_(ignore_index) {}
HOSTDEVICE inline phi::Array<T, 2> operator()(const T x, const T label,
const T dout) {
T counts;
T dx_data;
T diff = label - static_cast<T>(ignore_index_);
if ((diff > -eps) && (diff < eps)) { if ((diff > -eps) && (diff < eps)) {
dx_data[i] = static_cast<T>(0.); dx_data = static_cast<T>(0.);
counts[i] = 0; counts = 0;
} else { } else {
T simoid_x = static_cast<T>(1) / (static_cast<T>(1) + real_exp(-x)); T simoid_x = static_cast<T>(1) / (static_cast<T>(1) + real_exp(-x));
T diff = simoid_x - label; T diff = simoid_x - label;
dx_data[i] = dout * diff; dx_data = dout * diff;
counts[i] = 1; counts = 1;
} }
phi::Array<T, 2> outs;
outs[0] = dx_data;
outs[1] = counts;
return outs;
} }
} };
template <typename T>
struct DivFunctor {
const T norm_;
HOSTDEVICE inline DivFunctor(const T norm) : norm_(norm) {}
HOSTDEVICE inline T operator()(T loss) {
loss /= norm_;
return loss;
}
};
// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
...@@ -123,20 +140,48 @@ class GPUSigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> { ...@@ -123,20 +140,48 @@ class GPUSigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
bool normalize = context.Attr<bool>("normalize"); bool normalize = context.Attr<bool>("normalize");
// Temporary memory // Temporary memory
auto cnt_ptr = memory::Alloc(dev_ctx, Labels->numel() * sizeof(T)); Tensor *counts_tensor = new Tensor();
T *counts = reinterpret_cast<T *>(cnt_ptr->ptr()); counts_tensor->mutable_data<T>(context.GetPlace(),
Labels->numel() * sizeof(T));
counts_tensor->Resize(Out->dims());
int limit = Out->numel(); int limit = Out->numel();
int blocks = NumBlocks(limit); int blocks = NumBlocks(limit);
int threads = kNumCUDAThreads; int threads = kNumCUDAThreads;
GPUSigmoidForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>( std::vector<const framework::Tensor *> ins = {X, Labels};
X->data<T>(), Labels->data<T>(), ignore_index, limit, out_data, counts); std::vector<framework::Tensor *> outs = {Out, counts_tensor};
auto functor = SigmoidFwdFunctor<T>(ignore_index);
constexpr int Size = 2;
phi::funcs::ElementwiseKernel<T, decltype(functor), Size>(dev_ctx, ins,
&outs, functor);
if (normalize) { if (normalize) {
auto norm_ptr = memory::Alloc(dev_ctx, sizeof(T)); T *counts = counts_tensor->mutable_data<T>(context.GetPlace());
T *norm = reinterpret_cast<T *>(norm_ptr->ptr()); Tensor *norm_tensor = new Tensor();
Sum<T, kNumCUDAThreads><<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( norm_tensor->mutable_data<T>(context.GetPlace(), sizeof(T));
counts, limit, static_cast<T>(1e-5), norm); auto dims = phi::vectorize(counts_tensor->dims());
Div<T><<<blocks, threads, 0, dev_ctx.stream()>>>(out_data, limit, norm); std::vector<int> reduce_dim = {};
for (int i = 0; i < dims.size(); i++) {
reduce_dim.push_back(i);
}
TensorReduceImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
context.cuda_device_context(), *counts_tensor, norm_tensor,
NonzeroFunctor<T>(), reduce_dim, dev_ctx.stream());
T *norm = norm_tensor->mutable_data<T>(context.GetPlace());
auto norm_cpu_mem = memory::Alloc(platform::CPUPlace(), sizeof(T));
T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
memory::Copy(platform::CPUPlace(), norm_cpu_ptr, dev_ctx.GetPlace(), norm,
sizeof(T), dev_ctx.stream());
auto eps = static_cast<T>(1e-5);
*norm_cpu_ptr = *norm_cpu_ptr > eps ? *norm_cpu_ptr : eps;
std::vector<const framework::Tensor *> div_ins = {Out};
std::vector<framework::Tensor *> div_outs = {Out};
auto div_functor = DivFunctor<T>(*norm_cpu_ptr);
phi::funcs::ElementwiseKernel<T>(dev_ctx, div_ins, &div_outs,
div_functor);
delete norm_tensor;
delete counts_tensor;
} }
} }
}; };
...@@ -157,22 +202,48 @@ class GPUSigmoidCrossEntropyWithLogitsGradKernel ...@@ -157,22 +202,48 @@ class GPUSigmoidCrossEntropyWithLogitsGradKernel
auto &dev_ctx = context.cuda_device_context(); auto &dev_ctx = context.cuda_device_context();
// Temporary memory // Temporary memory
auto cnt_ptr = memory::Alloc(dev_ctx, X->numel() * sizeof(T)); Tensor *counts_tensor = new Tensor();
T *counts = reinterpret_cast<T *>(cnt_ptr->ptr()); counts_tensor->mutable_data<T>(context.GetPlace(),
Labels->numel() * sizeof(T));
counts_tensor->Resize(dX->dims());
int limit = dX->numel(); int limit = dX->numel();
int blocks = NumBlocks(limit); int blocks = NumBlocks(limit);
int threads = kNumCUDAThreads; int threads = kNumCUDAThreads;
GPUSigmoidBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>( std::vector<const framework::Tensor *> ins = {X, Labels, dOut};
X->data<T>(), Labels->data<T>(), ignore_index, dOut->data<T>(), limit, std::vector<framework::Tensor *> outs = {dX, counts_tensor};
dx_data, counts); auto functor = SigmoidBwdFunctor<T>(ignore_index);
constexpr int Size = 2;
phi::funcs::ElementwiseKernel<T, decltype(functor), Size>(dev_ctx, ins,
&outs, functor);
bool normalize = context.Attr<bool>("normalize"); bool normalize = context.Attr<bool>("normalize");
if (normalize) { if (normalize) {
auto norm_ptr = memory::Alloc(dev_ctx, sizeof(T)); T *counts = counts_tensor->mutable_data<T>(context.GetPlace());
T *norm = reinterpret_cast<T *>(norm_ptr->ptr()); Tensor *norm_tensor = new Tensor();
Sum<T, kNumCUDAThreads><<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( norm_tensor->mutable_data<T>(context.GetPlace(), sizeof(T));
counts, limit, static_cast<T>(1e-5), norm); auto dims = phi::vectorize(counts_tensor->dims());
Div<T><<<blocks, threads, 0, dev_ctx.stream()>>>(dx_data, limit, norm); std::vector<int> reduce_dim = {};
for (int i = 0; i < dims.size(); i++) {
reduce_dim.push_back(i);
}
TensorReduceImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
context.cuda_device_context(), *counts_tensor, norm_tensor,
NonzeroFunctor<T>(), reduce_dim, dev_ctx.stream());
T *norm = norm_tensor->mutable_data<T>(context.GetPlace());
auto norm_cpu_mem = memory::Alloc(platform::CPUPlace(), sizeof(T));
T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
memory::Copy(platform::CPUPlace(), norm_cpu_ptr, dev_ctx.GetPlace(), norm,
sizeof(T), dev_ctx.stream());
auto eps = static_cast<T>(1e-5);
*norm_cpu_ptr = *norm_cpu_ptr > eps ? *norm_cpu_ptr : eps;
std::vector<const framework::Tensor *> div_ins = {dX};
std::vector<framework::Tensor *> div_outs = {dX};
auto div_functor = DivFunctor<T>(*norm_cpu_ptr);
phi::funcs::ElementwiseKernel<T>(dev_ctx, div_ins, &div_outs,
div_functor);
delete norm_tensor;
} }
} }
}; };
......
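The dedicated GPUSigmoidForward/GPUSigmoidBackward, Sum and Div kernels are replaced by elementwise functors returning a phi::Array<T, 2> (loss or grad plus a counts mask), a TensorReduceImpl over the counts, and a DivFunctor for normalization. The per-element forward formula is unchanged; a scalar worked sketch of it (standalone, double precision, ignore_index handling omitted):

#include <algorithm>
#include <cassert>
#include <cmath>

// Numerically stable sigmoid cross-entropy with logits for one element,
// matching the comment above the kernel:
//   out = max(x, 0) - x * label + log(1 + exp(-|x|))
double SigmoidCrossEntropyWithLogits(double x, double label) {
  return std::max(x, 0.0) - x * label + std::log1p(std::exp(-std::fabs(x)));
}

int main() {
  // Reference value: -log(sigmoid(2)) = log(1 + exp(-2)) ~= 0.126928
  double loss = SigmoidCrossEntropyWithLogits(2.0, 1.0);
  assert(std::fabs(loss - 0.1269280110429725) < 1e-9);
  return 0;
}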
...@@ -23,9 +23,9 @@ ...@@ -23,9 +23,9 @@
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/conj_op.h" #include "paddle/fluid/operators/conj_op.h"
#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/operators/math/padding.h"
#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/kernels/funcs/padding.h"
#if defined(__NVCC__) || defined(__HIPCC__) #if defined(__NVCC__) || defined(__HIPCC__)
#include "thrust/device_vector.h" #include "thrust/device_vector.h"
#endif #endif
...@@ -389,8 +389,9 @@ class FFTR2CGradKernel : public framework::OpKernel<T> { ...@@ -389,8 +389,9 @@ class FFTR2CGradKernel : public framework::OpKernel<T> {
std::vector<int> pads(rank * 2, 0); std::vector<int> pads(rank * 2, 0);
pads[axes.back() * 2 + 1] = zero_length; pads[axes.back() * 2 + 1] = zero_length;
paddle::operators::math::PaddingFunctor<DeviceContext, C>( phi::funcs::PaddingFunctor<DeviceContext, C>(
rank, ctx, pads, static_cast<C>(0), *dy, &full_dy); rank, ctx.template device_context<DeviceContext>(), pads,
static_cast<C>(0), *dy, &full_dy);
fft_c2c_func(dev_ctx, &full_dy, &complex_dx, axes, normalization, fft_c2c_func(dev_ctx, &full_dy, &complex_dx, axes, normalization,
!forward); !forward);
} }
......
...@@ -23,12 +23,9 @@ namespace paddle { ...@@ -23,12 +23,9 @@ namespace paddle {
namespace platform { namespace platform {
bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) { bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) {
auto& ops = get_kl1_ops();
auto v = get_xpu_version(type.place_.device); auto v = get_xpu_version(type.place_.device);
if (v == phi::backends::xpu::XPUVersion::XPU2) { auto& ops = (v == phi::backends::xpu::XPUVersion::XPU1) ? get_kl1_ops()
ops = get_kl2_ops(); : get_kl2_ops();
}
if (ops.find(op_name) != ops.end() && if (ops.find(op_name) != ops.end() &&
ops[op_name].find(type) != ops[op_name].end()) { ops[op_name].find(type) != ops[op_name].end()) {
return true; return true;
...@@ -78,12 +75,9 @@ bool is_in_xpu_black_list(const std::string& op_name) { ...@@ -78,12 +75,9 @@ bool is_in_xpu_black_list(const std::string& op_name) {
#ifdef PADDLE_WITH_XPU_KP #ifdef PADDLE_WITH_XPU_KP
bool is_xpu_kp_support_op(const std::string& op_name, bool is_xpu_kp_support_op(const std::string& op_name,
const pOpKernelType& type) { const pOpKernelType& type) {
auto& ops = get_kl1_ops();
auto v = get_xpu_version(type.place_.device); auto v = get_xpu_version(type.place_.device);
if (v == phi::backends::xpu::XPUVersion::XPU2) { auto& ops = (v == phi::backends::xpu::XPUVersion::XPU1) ? get_kl1_ops()
ops = get_kp_ops(); : get_kp_ops();
}
if (ops.find(op_name) != ops.end() && if (ops.find(op_name) != ops.end() &&
ops[op_name].find(type) != ops[op_name].end()) { ops[op_name].find(type) != ops[op_name].end()) {
return true; return true;
......
...@@ -28,6 +28,7 @@ limitations under the License. */ ...@@ -28,6 +28,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
...@@ -161,6 +162,8 @@ void LoadCustomDevice(const std::string &library_dir) { ...@@ -161,6 +162,8 @@ void LoadCustomDevice(const std::string &library_dir) {
#endif #endif
void InitDevices() { void InitDevices() {
// set name at the entry point of Paddle
platform::SetCurrentThreadName("MainThread");
// CUPTI attribute should be set before any CUDA context is created (see CUPTI // CUPTI attribute should be set before any CUDA context is created (see CUPTI
// documentation about CUpti_ActivityAttribute). // documentation about CUpti_ActivityAttribute).
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
......
...@@ -30,8 +30,7 @@ TEST(ThreadInfo, TestThreadNameUtils) { ...@@ -30,8 +30,7 @@ TEST(ThreadInfo, TestThreadNameUtils) {
using paddle::platform::GetCurrentThreadName; using paddle::platform::GetCurrentThreadName;
using paddle::platform::SetCurrentThreadName; using paddle::platform::SetCurrentThreadName;
using paddle::platform::GetAllThreadNames; using paddle::platform::GetAllThreadNames;
EXPECT_EQ("unset", GetCurrentThreadName()); SetCurrentThreadName("MainThread");
EXPECT_TRUE(SetCurrentThreadName("MainThread"));
EXPECT_FALSE(SetCurrentThreadName("MainThread")); EXPECT_FALSE(SetCurrentThreadName("MainThread"));
auto names = GetAllThreadNames(); auto names = GetAllThreadNames();
EXPECT_TRUE(names.find(GetCurrentThreadStdId()) != names.end()); EXPECT_TRUE(names.find(GetCurrentThreadStdId()) != names.end());
......
...@@ -189,7 +189,10 @@ struct ThreadEventSection { ...@@ -189,7 +189,10 @@ struct ThreadEventSection {
class ThreadEventRecorder { class ThreadEventRecorder {
public: public:
ThreadEventRecorder() { thread_id_ = GetCurrentThreadSysId(); } ThreadEventRecorder() {
thread_id_ = GetCurrentThreadSysId();
thread_name_ = GetCurrentThreadName();
}
DISABLE_COPY_AND_ASSIGN(ThreadEventRecorder); DISABLE_COPY_AND_ASSIGN(ThreadEventRecorder);
...@@ -202,7 +205,7 @@ class ThreadEventRecorder { ...@@ -202,7 +205,7 @@ class ThreadEventRecorder {
ThreadEventSection GatherEvents() { ThreadEventSection GatherEvents() {
ThreadEventSection thr_sec; ThreadEventSection thr_sec;
thr_sec.thread_name = GetCurrentThreadName(); thr_sec.thread_name = thread_name_;
thr_sec.thread_id = thread_id_; thr_sec.thread_id = thread_id_;
thr_sec.events = std::move(base_evt_cntr_.Reduce()); thr_sec.events = std::move(base_evt_cntr_.Reduce());
return thr_sec; return thr_sec;
...@@ -210,6 +213,7 @@ class ThreadEventRecorder { ...@@ -210,6 +213,7 @@ class ThreadEventRecorder {
private: private:
uint64_t thread_id_; uint64_t thread_id_;
std::string thread_name_;
EventContainer<CommonEvent> base_evt_cntr_; EventContainer<CommonEvent> base_evt_cntr_;
}; };
......
...@@ -85,6 +85,9 @@ if(NOT ON_INFER) ...@@ -85,6 +85,9 @@ if(NOT ON_INFER)
if (WITH_NCCL) if (WITH_NCCL)
set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl)
endif() endif()
if (WITH_GLOO)
set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_gloo)
endif()
set(PYBIND_SRCS ${PYBIND_SRCS} distributed_py.cc) set(PYBIND_SRCS ${PYBIND_SRCS} distributed_py.cc)
endif() endif()
......
...@@ -31,9 +31,15 @@ namespace pybind { ...@@ -31,9 +31,15 @@ namespace pybind {
using TCPStore = paddle::distributed::TCPStore; using TCPStore = paddle::distributed::TCPStore;
void BindTCPStore(py::module* m) { void BindTCPStore(py::module* m) {
py::class_<TCPStore>(*m, "TCPStore") py::class_<TCPStore, std::shared_ptr<TCPStore>>(*m, "TCPStore")
.def( .def(py::init([](std::string hostname, uint16_t port, bool is_master,
py::init<std::string, uint16_t, bool, size_t, std::chrono::seconds>()) size_t world_size, std::chrono::seconds timeout) {
return std::make_shared<TCPStore>(hostname, port, is_master,
world_size, timeout);
}),
py::arg("hostname"), py::arg("port"), py::arg("is_master"),
py::arg("world_size"), py::arg("timeout"),
py::call_guard<py::gil_scoped_release>())
.def("add", &TCPStore::add) .def("add", &TCPStore::add)
.def("get", &TCPStore::get); .def("get", &TCPStore::get);
} }
......
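The binding above now constructs TCPStore through a factory lambda with named arguments and releases the GIL during construction. A hedged sketch of how the binding might be exercised from Python; the module path (paddle.fluid.core) and the concrete host/port values are assumptions, and the exact return type of get depends on the C++ signature:

import datetime
from paddle.fluid import core  # assumed import path for the extension module

# hostname, port, is_master, world_size, timeout (converted to std::chrono::seconds)
store = core.TCPStore("127.0.0.1", 6170, True, 2,
                      datetime.timedelta(seconds=900))
store.add("init_counter", 1)        # bound to TCPStore::add
value = store.get("init_counter")   # bound to TCPStore::get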
...@@ -35,6 +35,11 @@ limitations under the License. */ ...@@ -35,6 +35,11 @@ limitations under the License. */
#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
#endif #endif
#if defined(PADDLE_WITH_GLOO)
#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
#include "paddle/fluid/distributed/store/tcp_store.h"
#endif
namespace py = pybind11; namespace py = pybind11;
namespace paddle { namespace paddle {
...@@ -42,6 +47,14 @@ namespace pybind { ...@@ -42,6 +47,14 @@ namespace pybind {
using Tensor = paddle::experimental::Tensor; using Tensor = paddle::experimental::Tensor;
#if defined(PADDLE_WITH_GLOO)
using ProcessGroupGloo = paddle::distributed::ProcessGroupGloo;
using GlooStore = paddle::distributed::ProcessGroupGloo::GlooStore;
using GlooOptions = paddle::distributed::ProcessGroupGloo::GlooOptions;
#endif
static std::string GLOO_SOCKET_IFNAME_ENV = "GLOO_SOCKET_IFNAME"; // NOLINT
void BindDistributed(py::module *m) { void BindDistributed(py::module *m) {
py::enum_<distributed::ReduceOp>(*m, "ReduceOp") py::enum_<distributed::ReduceOp>(*m, "ReduceOp")
.value("SUM", distributed::ReduceOp::SUM) .value("SUM", distributed::ReduceOp::SUM)
...@@ -64,6 +77,11 @@ void BindDistributed(py::module *m) { ...@@ -64,6 +77,11 @@ void BindDistributed(py::module *m) {
.def(py::init<>()) .def(py::init<>())
.def_readwrite("place_ids", &distributed::BarrierOptions::place_ids); .def_readwrite("place_ids", &distributed::BarrierOptions::place_ids);
py::class_<distributed::ReduceOptions>(*m, "ReduceOptions")
.def(py::init<>())
.def_readwrite("reduce_op", &distributed::ReduceOptions::reduce_op)
.def_readwrite("source_root", &distributed::ReduceOptions::root_rank);
auto ProcessGroup = auto ProcessGroup =
py::class_<distributed::ProcessGroup, py::class_<distributed::ProcessGroup,
std::shared_ptr<distributed::ProcessGroup>>(*m, "ProcessGroup") std::shared_ptr<distributed::ProcessGroup>>(*m, "ProcessGroup")
...@@ -121,6 +139,58 @@ void BindDistributed(py::module *m) { ...@@ -121,6 +139,58 @@ void BindDistributed(py::module *m) {
return self.Recv(tensors, src); return self.Recv(tensors, src);
}, },
py::arg("tensor"), py::arg("src"), py::arg("tensor"), py::arg("src"),
py::call_guard<py::gil_scoped_release>())
.def("all_gather",
[](distributed::ProcessGroup &self, py::handle py_in_tensor,
py::handle py_out_tensor) {
auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
std::vector<Tensor> in_tensors = {in_tensor};
std::vector<Tensor> out_tensors = {out_tensor};
return self.AllGather(in_tensors, out_tensors);
},
py::arg("in"), py::arg("out"),
py::call_guard<py::gil_scoped_release>())
.def("alltoall",
[](distributed::ProcessGroup &self, py::handle py_in_tensor,
py::handle py_out_tensor) {
auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
std::vector<Tensor> in_tensors = {in_tensor};
std::vector<Tensor> out_tensors = {out_tensor};
return self.AllToAll(in_tensors, out_tensors);
},
py::arg("in"), py::arg("out"),
py::call_guard<py::gil_scoped_release>())
.def("reduce",
[](distributed::ProcessGroup &self, py::handle py_in_tensor,
int dst, distributed::ReduceOp op) {
auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
distributed::ReduceOptions opts;
opts.reduce_op = op;
opts.root_rank = dst;
std::vector<Tensor> tensors = {in_tensor};
return self.Reduce(tensors, opts);
},
py::arg("tensor"), py::arg("dst"),
py::arg("op") = distributed::ReduceOp::SUM,
py::call_guard<py::gil_scoped_release>())
.def("scatter",
[](distributed::ProcessGroup &self, py::handle py_in_tensor,
py::handle py_out_tensor, int src) {
auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
distributed::ScatterOptions opts;
opts.root_rank = src;
std::vector<Tensor> in_tensors = {in_tensor};
std::vector<Tensor> out_tensors = {out_tensor};
return self.Scatter(in_tensors, out_tensors, opts);
},
py::arg("in"), py::arg("out"), py::arg("src"),
py::call_guard<py::gil_scoped_release>()); py::call_guard<py::gil_scoped_release>());
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL)
...@@ -129,6 +199,7 @@ void BindDistributed(py::module *m) { ...@@ -129,6 +199,7 @@ void BindDistributed(py::module *m) {
*m, "ProcessGroupNCCL", ProcessGroup) *m, "ProcessGroupNCCL", ProcessGroup)
.def(py::init<const distributed::ProcessGroupStrategy &, int, int>(), .def(py::init<const distributed::ProcessGroupStrategy &, int, int>(),
py::call_guard<py::gil_scoped_release>()); py::call_guard<py::gil_scoped_release>());
#endif
py::class_<distributed::ProcessGroup::Task, py::class_<distributed::ProcessGroup::Task,
std::shared_ptr<distributed::ProcessGroup::Task>>(*m, "task") std::shared_ptr<distributed::ProcessGroup::Task>>(*m, "task")
...@@ -138,7 +209,6 @@ void BindDistributed(py::module *m) { ...@@ -138,7 +209,6 @@ void BindDistributed(py::module *m) {
py::call_guard<py::gil_scoped_release>()) py::call_guard<py::gil_scoped_release>())
.def("synchronize", &distributed::ProcessGroup::Task::Synchronize, .def("synchronize", &distributed::ProcessGroup::Task::Synchronize,
py::call_guard<py::gil_scoped_release>()); py::call_guard<py::gil_scoped_release>());
#endif
// define parallel strategy, it will be removed // define parallel strategy, it will be removed
py::class_<distributed::ProcessGroupStrategy> pg_strategy( py::class_<distributed::ProcessGroupStrategy> pg_strategy(
...@@ -178,6 +248,45 @@ void BindDistributed(py::module *m) { ...@@ -178,6 +248,45 @@ void BindDistributed(py::module *m) {
self.nrings_ = nrings; self.nrings_ = nrings;
}); });
#if defined(PADDLE_WITH_GLOO)
py::class_<GlooOptions>(*m, "GlooOptions")
.def(py::init<>())
.def_readwrite("_device", &GlooOptions::device)
.def_static("create", &GlooOptions::create);
py::class_<GlooStore, std::shared_ptr<GlooStore>>(*m, "GlooStore")
.def(py::init(
[](const std::shared_ptr<paddle::distributed::TCPStore> &store) {
return std::make_shared<GlooStore>(store);
}),
py::call_guard<py::gil_scoped_release>());
py::class_<ProcessGroupGloo, std::shared_ptr<ProcessGroupGloo>>(
*m, "ProcessGroupGloo", ProcessGroup)
.def(py::init<const std::shared_ptr<GlooStore> &, int, int,
std::shared_ptr<GlooOptions> &>(),
py::call_guard<py::gil_scoped_release>())
.def(py::init([](const std::shared_ptr<GlooStore> &store, int rank,
int world_size) {
auto opts = GlooOptions::create();
char *ifname = getenv(GLOO_SOCKET_IFNAME_ENV.c_str());
if (ifname && strlen(ifname) > 1) {
opts->device = ProcessGroupGloo::createDeviceForInterface(
std::string(ifname));
} else {
opts->device = ProcessGroupGloo::createDefaultDevice();
}
return std::make_shared<ProcessGroupGloo>(store, rank, world_size,
opts);
}),
py::arg("store"), py::arg("rank"),
py::arg("world_size"), // py::arg("timeout") =
// kProcessGroupDefaultTimeout,
py::call_guard<py::gil_scoped_release>())
.def_static("create_default_device",
&ProcessGroupGloo::createDefaultDevice);
#endif
m->def("eager_assign_group_by_size", m->def("eager_assign_group_by_size",
[](py::handle py_tensors, std::vector<bool> is_sparse_gradient, [](py::handle py_tensors, std::vector<bool> is_sparse_gradient,
std::vector<size_t> group_size_limits, std::vector<size_t> group_size_limits,
......
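Taken together, the additions above expose all_gather, alltoall, reduce and scatter on ProcessGroup, plus a Gloo-backed group built from a TCPStore. A hedged usage sketch from Python; module paths, ranks, ports and the tensor setup are assumptions (eager-mode Tensors are assumed), and each collective only succeeds if the chosen backend actually implements it:

import datetime
import paddle
from paddle.fluid import core  # assumed import path

rank, world_size = 0, 1
store = core.TCPStore("127.0.0.1", 6170, rank == 0, world_size,
                      datetime.timedelta(seconds=900))
gloo_store = core.GlooStore(store)
# picks the device from GLOO_SOCKET_IFNAME when set, otherwise a default device
pg = core.ProcessGroupGloo(gloo_store, rank, world_size)

x = paddle.to_tensor([float(rank)])
out = paddle.zeros([world_size])

pg.all_gather(x, out).synchronize()               # in/out tensor pair
pg.alltoall(out, out).synchronize()
pg.reduce(x, 0, core.ReduceOp.SUM).synchronize()  # tensor, dst, op (op defaults to SUM)
pg.scatter(out, x, 0).synchronize()               # in, out, src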
...@@ -19,6 +19,7 @@ limitations under the License. */ ...@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/accumulation/accumulation_node.h"
#include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/api/all.h"
#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h"
#include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/grad_node_info.h"
#include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/hooks.h"
...@@ -30,10 +31,12 @@ limitations under the License. */ ...@@ -30,10 +31,12 @@ limitations under the License. */
#include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/eager_utils.h"
#include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/slice_utils.h"
#include "paddle/phi/api/include/api.h" #include "paddle/phi/api/include/api.h"
#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
...@@ -119,6 +122,29 @@ extern void InitTensorWithNumpyValue(TensorObject* self, ...@@ -119,6 +122,29 @@ extern void InitTensorWithNumpyValue(TensorObject* self,
extern PyTypeObject* p_tensor_type; extern PyTypeObject* p_tensor_type;
Py_ssize_t GetSliceIndexFromPyObject(PyObject* obj) {
if (PyObject_IsInstance(obj, reinterpret_cast<PyObject*>(p_tensor_type))) {
VLOG(6) << "Call GetSliceIndexFromTensor in Eager";
paddle::experimental::Tensor tensor = CastPyArg2Tensor(obj, 0);
PADDLE_ENFORCE_EQ(
tensor.initialized(), true,
paddle::platform::errors::InvalidArgument(
"We can only support initialized tensor in slice, however we got "
"uninitialized tensor %s, please check your code.",
tensor.name()));
return GetSliceIndexFromTensor((*static_cast<phi::DenseTensor*>(
CastPyArg2Tensor(obj, 0).impl().get())));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"We should only get paddle::experimental::Tensor or VarBase in this "
"method, when you reach this means we got another type index."));
}
}
bool PyCheckTensor(PyObject* obj) {
return PyObject_IsInstance(obj, reinterpret_cast<PyObject*>(p_tensor_type));
}
static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args, static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args,
PyObject* kwargs) { PyObject* kwargs) {
EAGER_TRY EAGER_TRY
...@@ -468,16 +494,111 @@ static PyObject* tensor_method_get_underline_tensor(TensorObject* self, ...@@ -468,16 +494,111 @@ static PyObject* tensor_method_get_underline_tensor(TensorObject* self,
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
} }
// NOTE(wuweilong): Set value and not change self's original place static PyObject* tensor__getitem_index_not_tensor(TensorObject* self,
static PyObject* tensor_method_set_value(TensorObject* self, PyObject* args, PyObject* args,
PyObject* kwargs) { PyObject* kwargs) {
EAGER_TRY EAGER_TRY
VLOG(4) << "Value " << self->tensor.name(); PyObject* _index = PyTuple_GET_ITEM(args, 0);
pybind11::object numpy_value = VLOG(4) << "Call _getitem_index_not_tensor";
pybind11::object(pybind11::handle(PyTuple_GET_ITEM(args, 0)), true); std::vector<int> slice_axes, slice_starts, slice_ends, slice_strides,
InitTensorWithNumpyValue(self, numpy_value, false); decrease_axis, none_axes, infer_flags, list_select_idxs;
Py_INCREF(Py_None); // if index is a list, list_select_flag will be true
return Py_None; bool list_select_flag = false;
PADDLE_ENFORCE_EQ(
self->tensor.is_initialized(), true,
platform::errors::InvalidArgument(
"tensor %s has not been initialized, we can only slice initialized "
"tensor please init it first with numpy or other tensor.",
self->tensor.name()));
auto tensor = static_cast<phi::DenseTensor*>(self->tensor.impl().get());
ParseIndexingSlice(tensor, _index, &slice_axes, &slice_starts, &slice_ends,
&slice_strides, &decrease_axis, &none_axes, &infer_flags,
&list_select_idxs, &list_select_flag);
auto out = slice_axes.empty() && !list_select_flag
? self->tensor
: paddle::experimental::Tensor(
egr::Controller::Instance().GenerateUniqueName());
if (!slice_axes.empty()) {
framework::AttributeMap attrs = {{"axes", slice_axes},
{"starts", slice_starts},
{"ends", slice_ends},
{"infer_flags", infer_flags},
{"decrease_axis", decrease_axis}};
std::string op_type = "slice";
for (auto stride : slice_strides) {
if (stride != 1) {
op_type = "strided_slice";
attrs.insert({"strides", slice_strides});
attrs.erase("decrease_axis");
break;
}
}
if (op_type == "slice") {
out = slice_dygraph_function(self->tensor, paddle::experimental::Tensor(),
paddle::experimental::Tensor(),
std::move(attrs));
} else if (op_type == "strided_slice") {
out = strided_slice_dygraph_function(self->tensor, attrs);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Slice is only support slice and strided_slice, but we got %s which "
"is impossible, please check your code first or contact us by "
"issue. ",
op_type));
}
}
if (!none_axes.empty()) {
// Deal with cases when all axes are decreased.
// After slice, the shape of out is [1], which should have been
// [], but Paddle doesn't support scalar.
// In order to ensure the correctness of the final shape of out,
// one dimension of out needs to be decreased.
// For example:
// # x.shape: (2,3,4)
// out = x[0, 1, 1, None] # out.shape : (1)
if (static_cast<int>(decrease_axis.size()) == tensor->dims().size()) {
none_axes.pop_back();
}
if (!none_axes.empty()) {
// Deal with cases that decrease_axes is not empty
// For example:
// # x.shape: (2,3,4)
// out = x[0, 0:2, None] # out.shape : (2, 1, 4)
for (auto& axis : none_axes) {
int len = 0;
for (int da : decrease_axis) {
if (da < axis) {
len++;
}
}
axis -= len;
}
paddle::experimental::Tensor new_out;
framework::AttributeMap attrs = {{"axes", none_axes}};
new_out = std::get<0>(unsqueeze2_dygraph_function(out, std::move(attrs)));
return ToPyObject(new_out);
}
}
// the index is a list
if (list_select_flag) {
auto select_index = paddle::experimental::Tensor(
egr::Controller::Instance().GenerateUniqueName());
auto idx_tensor = std::make_shared<phi::DenseTensor>();
auto* dev_ctx = platform::DeviceContextPool::Instance().Get(
egr::Controller::Instance().GetExpectedPlace());
paddle::framework::TensorFromVector(list_select_idxs, *dev_ctx,
idx_tensor.get());
    // attach the gathered indices to select_index before index_select uses it
    select_index.set_impl(idx_tensor);
    framework::AttributeMap attrs = {{"dim", 0}};
out = index_select_dygraph_function(self->tensor, select_index,
std::move(attrs));
}
return ToPyObject(out);
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
} }
...@@ -602,7 +723,8 @@ PyMethodDef variable_methods[] = { ...@@ -602,7 +723,8 @@ PyMethodDef variable_methods[] = {
{"get_tensor", {"get_tensor",
(PyCFunction)(void (*)(void))tensor_method_get_underline_tensor, (PyCFunction)(void (*)(void))tensor_method_get_underline_tensor,
METH_VARARGS | METH_KEYWORDS, NULL}, METH_VARARGS | METH_KEYWORDS, NULL},
{"_set_value", (PyCFunction)(void (*)(void))tensor_method_set_value, {"_getitem_index_not_tensor",
(PyCFunction)(void (*)(void))tensor__getitem_index_not_tensor,
METH_VARARGS | METH_KEYWORDS, NULL}, METH_VARARGS | METH_KEYWORDS, NULL},
{"_register_grad_hook", {"_register_grad_hook",
(PyCFunction)(void (*)(void))tensor_register_grad_hook, (PyCFunction)(void (*)(void))tensor_register_grad_hook,
......
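For reference, a sketch of the indexing behaviour the new _getitem_index_not_tensor path implements, written against Tensor.__getitem__ and assuming eager mode dispatches non-tensor indices to this method (the method name suggests it); the expected shapes follow the comments in the C++ above:

import paddle

x = paddle.rand([2, 3, 4])

y = x[0, 1:3]         # slice op; axis 0 decreased -> shape [2, 4]
z = x[0, 0:2, None]   # slice + unsqueeze2 for the None axis -> shape [2, 1, 4]
w = x[0, 1, 1, None]  # every real axis decreased -> shape [1]
s = x[::2]            # step != 1 -> strided_slice -> shape [1, 3, 4]
b = x[[True, False]]  # bool list -> index_select along dim 0 -> shape [1, 3, 4]
i = x[[0, 1]]         # int list  -> index_select along dim 0 -> shape [2, 3, 4]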
...@@ -16,8 +16,11 @@ limitations under the License. */ ...@@ -16,8 +16,11 @@ limitations under the License. */
#include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/api/all.h"
#include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/scope_guard.h"
#include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/eager_utils.h"
...@@ -184,6 +187,11 @@ paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos) { ...@@ -184,6 +187,11 @@ paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos) {
} }
} }
std::shared_ptr<imperative::VarBase> CastPyArg2VarBase(PyObject* obj,
ssize_t arg_pos) {
return py::cast<std::shared_ptr<imperative::VarBase>>(obj);
}
std::vector<paddle::experimental::Tensor> CastPyArg2VectorOfTensor( std::vector<paddle::experimental::Tensor> CastPyArg2VectorOfTensor(
PyObject* obj, ssize_t arg_pos) { PyObject* obj, ssize_t arg_pos) {
std::vector<paddle::experimental::Tensor> result; std::vector<paddle::experimental::Tensor> result;
...@@ -737,5 +745,6 @@ std::vector<paddle::experimental::Tensor*> GetTensorPtrListFromArgs( ...@@ -737,5 +745,6 @@ std::vector<paddle::experimental::Tensor*> GetTensorPtrListFromArgs(
return result; return result;
} }
} // namespace pybind } // namespace pybind
} // namespace paddle } // namespace paddle
...@@ -14,7 +14,6 @@ limitations under the License. */ ...@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "pybind11/pybind11.h" #include "pybind11/pybind11.h"
#include "pybind11/stl.h" #include "pybind11/stl.h"
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
...@@ -33,6 +32,8 @@ int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos); ...@@ -33,6 +32,8 @@ int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos);
float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos); float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos);
std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos); std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos);
paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos); paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos);
std::shared_ptr<imperative::VarBase> CastPyArg2VarBase(PyObject* obj,
ssize_t arg_pos);
std::vector<paddle::experimental::Tensor> CastPyArg2VectorOfTensor( std::vector<paddle::experimental::Tensor> CastPyArg2VectorOfTensor(
PyObject* obj, ssize_t arg_pos); PyObject* obj, ssize_t arg_pos);
platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos); platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos);
...@@ -112,5 +113,7 @@ std::vector<paddle::experimental::Tensor*> GetTensorPtrListFromArgs( ...@@ -112,5 +113,7 @@ std::vector<paddle::experimental::Tensor*> GetTensorPtrListFromArgs(
const std::string& op_type, const std::string& arg_name, PyObject* args, const std::string& op_type, const std::string& arg_name, PyObject* args,
ssize_t arg_idx, bool dispensable = false); ssize_t arg_idx, bool dispensable = false);
// end of Slice related methods
} // namespace pybind } // namespace pybind
} // namespace paddle } // namespace paddle
...@@ -54,6 +54,7 @@ limitations under the License. */ ...@@ -54,6 +54,7 @@ limitations under the License. */
#include "paddle/fluid/operators/utils.h" #include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/pybind/op_function.h" #include "paddle/fluid/pybind/op_function.h"
#include "paddle/fluid/pybind/pybind_boost_headers.h" #include "paddle/fluid/pybind/pybind_boost_headers.h"
#include "paddle/fluid/pybind/slice_utils.h"
#include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/pybind/tensor_py.h"
namespace paddle { namespace paddle {
...@@ -319,6 +320,23 @@ static std::string GetTypeName(const imperative::VarBase &var) { ...@@ -319,6 +320,23 @@ static std::string GetTypeName(const imperative::VarBase &var) {
} }
} }
Py_ssize_t GetSliceIndexFromPyObject(PyObject *obj) {
if (py::isinstance<imperative::VarBase>(obj)) {
VLOG(6) << "Call GetSliceIndexFromTensor in Imperative";
return GetSliceIndexFromTensor(
py::cast<std::shared_ptr<imperative::VarBase>>(obj)
->Var()
.Get<framework::LoDTensor>());
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"We should only get paddle::experimental::Tensor or VarBase in this "
"method, when you reach this means we got another type index."));
}
}
bool PyCheckTensor(PyObject *obj) {
return py::isinstance<imperative::VarBase>(obj);
}
using PyNameVarBaseMap = std::unordered_map<std::string, py::handle>; using PyNameVarBaseMap = std::unordered_map<std::string, py::handle>;
// NOTE(zjl): py::handle is a very light wrapper of PyObject *. // NOTE(zjl): py::handle is a very light wrapper of PyObject *.
...@@ -360,18 +378,6 @@ GetVarBaseListFromPyHandle(const py::handle &handle) { ...@@ -360,18 +378,6 @@ GetVarBaseListFromPyHandle(const py::handle &handle) {
return result; return result;
} }
static bool IsNumpyType(PyObject *obj) {
// It is not a good way to judge the type of obj by its type'name. Maybe using
// `PyArray_IsScalar` will be better. However, this interface cannot be used
// by including pybind11, and it needs to compile with numpy.
auto type_name = std::string(Py_TYPE(obj)->tp_name);
return type_name == "numpy.int64" || type_name == "numpy.longlong" ||
type_name == "numpy.int32" || type_name == "numpy.int16";
}
static bool PyCheckTensor(PyObject *obj) {
return py::isinstance<imperative::VarBase>(obj);
}
// cast numpy type form S to T, this may allocate new memory // cast numpy type form S to T, this may allocate new memory
template <class T, class S> template <class T, class S>
...@@ -429,260 +435,6 @@ static imperative::NameVarBaseMap ConvertToNameVarBaseMap( ...@@ -429,260 +435,6 @@ static imperative::NameVarBaseMap ConvertToNameVarBaseMap(
return result; return result;
} }
static bool PyCheckInteger(PyObject *obj) {
#if PY_VERSION_HEX < 0x03000000
return (PyLong_Check(obj) || PyInt_Check(obj)) && !PyBool_Check(obj);
#else
return PyLong_Check(obj) && !PyBool_Check(obj);
#endif
}
static Py_ssize_t GetSliceIndexFromTensor(
const std::shared_ptr<imperative::VarBase> &tensor_index) {
const auto &tensor = tensor_index->Var().Get<framework::LoDTensor>();
if (tensor.numel() == 1) {
if (framework::TransToProtoVarType(tensor.dtype()) ==
framework::proto::VarType::INT32) {
return static_cast<Py_ssize_t>(operators::GetValue<int32_t>(&tensor));
} else if (framework::TransToProtoVarType(tensor.dtype()) ==
framework::proto::VarType::INT64) {
return static_cast<Py_ssize_t>(operators::GetValue<int64_t>(&tensor));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, the type of tensor in slice indices only allows "
"int32 and int64, please check the type of index tensor."));
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, tensor in slice indices only allows 1 element, "
"but received %d.",
tensor.numel()));
}
}
// NOTE(zhiqiu): Revised version of PySlice_GetIndices. From:
// https://github.com/python/cpython/blob/8d21aa21f2cbc6d50aab3f420bb23be1d081dac4/Objects/sliceobject.c#L103
// Original PySlice_GetIndices return wrong result when
// slice_item contains long int, such as arr[:180L].
// NOT sure why this happens !!!
// Besides, PySlice_GetIndices cannot raise error when float in slice item.
// So, I make a revised version of PySlice_GetIndices, named to
// _PySlice_GetIndices. Try to use _PySlice_Unpack which is more robust than
// PySlice_GetIndices in the future.
static int _PySlice_GetIndices(PySliceObject *r, Py_ssize_t length,
Py_ssize_t *start, Py_ssize_t *stop,
Py_ssize_t *step) {
/* XXX support long ints */
if (r->step == Py_None) {
*step = 1;
} else {
if (PyCheckInteger(r->step) || IsNumpyType(r->step)) {
*step = PyLong_AsLong(r->step);
} else if (PyCheckTensor(r->step)) {
*step = GetSliceIndexFromTensor(
py::cast<std::shared_ptr<imperative::VarBase>>(r->step));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, slice indices only allows None, integers, "
"tensor(int) and numpy(int) in slice item, but received %s.",
std::string(Py_TYPE(r->step)->tp_name)));
}
}
if (r->start == Py_None) {
*start = *step < 0 ? length - 1 : 0;
} else {
if (PyCheckInteger(r->start) || IsNumpyType(r->start)) {
*start = PyLong_AsLong(r->start);
} else if (PyCheckTensor(r->start)) {
*start = GetSliceIndexFromTensor(
py::cast<std::shared_ptr<imperative::VarBase>>(r->start));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, slice indices only allows None, integers, "
"tensor(int) and numpy(int) in slice item, but received %s.",
std::string(Py_TYPE(r->start)->tp_name)));
}
if (*start < 0) *start += length;
*start = std::max(*start, static_cast<Py_ssize_t>(0));
}
if (r->stop == Py_None) {
*stop = *step < 0 ? -1 : length;
} else {
if (PyCheckInteger(r->stop) || IsNumpyType(r->stop)) {
*stop = PyLong_AsLong(r->stop);
} else if (PyCheckTensor(r->stop)) {
*stop = GetSliceIndexFromTensor(
py::cast<std::shared_ptr<imperative::VarBase>>(r->stop));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, slice indices only allows None, integers, "
"tensor(int) and numpy(int) in slice item, but received %s.",
std::string(Py_TYPE(r->stop)->tp_name)));
}
if (0 < *step && *stop < 0) *stop += length;
*stop = std::min(*stop, length);
}
if (*stop > length) return -1;
if (*start >= length) return -1;
if (*step == 0) return -1;
return 0;
}
static void ParseIndexingSlice(
framework::LoDTensor *tensor, PyObject *_index,
std::vector<int> *slice_axes, std::vector<int> *slice_starts,
std::vector<int> *slice_ends, std::vector<int> *slice_strides,
std::vector<int> *decrease_axis, std::vector<int> *none_axes,
std::vector<int> *infer_flags, std::vector<int> *list_select_idxs,
bool *list_select_flag) {
// We allow indexing by Integers, Slices, Ellipsis, None, tuples of those
// types, and list of Bool and Integers.
// wrap to tuple
// NOTE(zhiqiu): PyTuple_Pack increases refcount.
PyObject *index = !PyTuple_Check(_index) ? PyTuple_Pack(1, _index) : _index;
DEFINE_PADDLE_SCOPE_GUARD([index, _index]() {
if (!PyTuple_Check(_index)) {
Py_DECREF(index);
VLOG(4) << "Call Py_DECREF";
}
});
PADDLE_ENFORCE_EQ(
tensor->IsInitialized(), true,
platform::errors::InvalidArgument("tensor has not been initialized"));
const auto &shape = tensor->dims();
const int rank = shape.size();
const int size = PyTuple_GET_SIZE(index);
// specified_dims is the number of dimensions which indexed by Interger,
// Slices.
int specified_dims = 0;
int ell_count = 0;
for (int dim = 0; dim < size; ++dim) {
PyObject *slice_item = PyTuple_GetItem(index, dim);
if (PyCheckInteger(slice_item) || PySlice_Check(slice_item)) {
specified_dims++;
} else if (slice_item == Py_Ellipsis) {
ell_count++;
}
}
PADDLE_ENFORCE_LE(ell_count, 1,
platform::errors::InvalidArgument(
"An index can only have a single ellipsis ('...')"));
int none_count = 0;
for (int i = 0, dim = 0; i < size; ++i) {
PyObject *slice_item = PyTuple_GetItem(index, i);
infer_flags->push_back(1);
int dim_len = shape[dim];
if (PyCheckInteger(slice_item) || IsNumpyType(slice_item)) {
// integer, PyLong_AsLong supports both int and long
int start = static_cast<int>(PyLong_AsLong(slice_item));
auto s_t = start;
start = start < 0 ? start + dim_len : start;
if (start >= dim_len || start < 0) {
std::string str_error_message =
"The starting index " + std::to_string(s_t) +
" of slice is out of bounds in tensor " + std::to_string(dim) +
"-th axis, it shound be in the range of [" +
std::to_string(-dim_len) + ", " + std::to_string(dim_len) + ")";
// py::index_error is corresponding to IndexError in Python
// Used to indicate out of bounds access in __getitem__, __setitem__
throw py::index_error(str_error_message);
}
slice_axes->push_back(dim);
slice_starts->push_back(start);
slice_ends->push_back(start + 1);
slice_strides->push_back(1);
decrease_axis->push_back(dim);
dim++;
} else if (PySlice_Check(slice_item)) {
// slice item
Py_ssize_t start, end, step;
PySliceObject *p = reinterpret_cast<PySliceObject *>(slice_item);
_PySlice_GetIndices(p, dim_len, &start, &end, &step);
// :: or : or 0:dim_len:1
if (start == 0 && end == dim_len && step == 1) {
dim++;
continue;
}
slice_axes->push_back(dim);
slice_starts->push_back(start);
slice_ends->push_back(end);
slice_strides->push_back(step);
dim++;
} else if (slice_item == Py_Ellipsis) {
dim += rank - specified_dims;
} else if (slice_item == Py_None) {
none_axes->push_back(dim + none_count);
none_count++;
} else if (PyList_Check(slice_item)) {
*list_select_flag = true;
PADDLE_ENFORCE_EQ(
size, 1,
platform::errors::InvalidArgument(
"When index contains a list, its length is excepted to 1, "
"but received %d",
size));
bool all_bool = true;
int list_size = PyList_GET_SIZE(slice_item);
for (int j = 0; j < list_size; ++j) {
PyObject *list_item = PyList_GetItem(slice_item, j);
if (PyCheckInteger(list_item)) {
all_bool = false;
} else if (!PyBool_Check(list_item)) {
PADDLE_THROW(platform::errors::InvalidArgument(
"Only support int or bool in index list."));
}
}
if (all_bool) {
PADDLE_ENFORCE_EQ(
list_size, shape[0],
platform::errors::InvalidArgument(
"The dimension of bool index doesn't match indexed array along "
"dimension 0, the target dimension is %d, but received %d.",
shape[0], list_size));
for (int j = 0; j < list_size; ++j) {
PyObject *list_item = PyList_GetItem(slice_item, j);
if (list_item == Py_True) {
list_select_idxs->push_back(j);
}
}
} else {
for (int j = 0; j < list_size; ++j) {
PyObject *list_item = PyList_GetItem(slice_item, j);
if (PyCheckInteger(list_item)) {
list_select_idxs->push_back(
static_cast<int>(PyLong_AsLong(list_item)));
} else if (list_item == Py_True) {
list_select_idxs->push_back(1);
} else {
list_select_idxs->push_back(0);
}
}
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, Tensor.__indices__() only allows indexing "
"by Integers, Slices, Ellipsis, None, tuples of these types "
"and list of Bool and Integers, but received "
"%s in %dth slice item",
std::string(Py_TYPE(slice_item)->tp_name), i + 1));
}
}
// valid_index is the number of dimensions exclude None index
const int valid_indexs = size - none_axes->size() - ell_count;
PADDLE_ENFORCE_EQ(valid_indexs <= rank, true,
platform::errors::InvalidArgument(
"Too many indices (%d) for tensor of dimension %d.",
valid_indexs, rank));
}
template <typename P> template <typename P>
static void VarBaseCopy(std::shared_ptr<imperative::VarBase> &src, // NOLINT static void VarBaseCopy(std::shared_ptr<imperative::VarBase> &src, // NOLINT
imperative::VarBase &dst, // NOLINT imperative::VarBase &dst, // NOLINT
......
...@@ -80,6 +80,7 @@ limitations under the License. */ ...@@ -80,6 +80,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/cuda_streams_py.h" #include "paddle/fluid/pybind/cuda_streams_py.h"
#include "paddle/fluid/pybind/distributed_py.h" #include "paddle/fluid/pybind/distributed_py.h"
#include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/io.h" #include "paddle/fluid/pybind/io.h"
#include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/lod_utils.h"
...@@ -101,7 +102,6 @@ limitations under the License. */ ...@@ -101,7 +102,6 @@ limitations under the License. */
#include "paddle/fluid/pybind/gloo_context_py.h" #include "paddle/fluid/pybind/gloo_context_py.h"
#include "paddle/fluid/pybind/gloo_wrapper_py.h" #include "paddle/fluid/pybind/gloo_wrapper_py.h"
#include "paddle/fluid/pybind/heter_wrapper_py.h" #include "paddle/fluid/pybind/heter_wrapper_py.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/inference_api.h" #include "paddle/fluid/pybind/inference_api.h"
#include "paddle/fluid/pybind/ir.h" #include "paddle/fluid/pybind/ir.h"
#include "paddle/fluid/pybind/metrics_py.h" #include "paddle/fluid/pybind/metrics_py.h"
...@@ -527,6 +527,7 @@ PYBIND11_MODULE(core_avx, m) { ...@@ -527,6 +527,7 @@ PYBIND11_MODULE(core_avx, m) {
PYBIND11_MODULE(core_noavx, m) { PYBIND11_MODULE(core_noavx, m) {
#endif #endif
BindImperative(&m);
BindEager(&m); BindEager(&m);
BindCudaStream(&m); BindCudaStream(&m);
...@@ -741,8 +742,6 @@ PYBIND11_MODULE(core_noavx, m) { ...@@ -741,8 +742,6 @@ PYBIND11_MODULE(core_noavx, m) {
m.def("_promote_types_if_complex_exists", m.def("_promote_types_if_complex_exists",
&paddle::framework::PromoteTypesIfComplexExists); &paddle::framework::PromoteTypesIfComplexExists);
BindImperative(&m);
py::class_<framework::Tensor> framework_tensor(m, "Tensor", py::class_<framework::Tensor> framework_tensor(m, "Tensor",
py::buffer_protocol()); py::buffer_protocol());
g_framework_tensor_pytype = g_framework_tensor_pytype =
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <Python.h>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/scope_guard.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
namespace py = pybind11;
namespace paddle {
namespace pybind {
static bool PyCheckTensor(PyObject* obj);
static Py_ssize_t GetSliceIndexFromPyObject(PyObject* obj);
// Slice related methods
static bool PyCheckInteger(PyObject* obj) {
#if PY_VERSION_HEX < 0x03000000
return (PyLong_Check(obj) || PyInt_Check(obj)) && !PyBool_Check(obj);
#else
return PyLong_Check(obj) && !PyBool_Check(obj);
#endif
}
static bool IsNumpyType(PyObject* obj) {
// It is not a good way to judge the type of obj by its type's name. Maybe using
// `PyArray_IsScalar` will be better. However, this interface cannot be used
// by including pybind11, and it needs to compile with numpy.
auto type_name = std::string(Py_TYPE(obj)->tp_name);
return type_name == "numpy.int64" || type_name == "numpy.longlong" ||
type_name == "numpy.int32" || type_name == "numpy.int16";
}
static Py_ssize_t GetSliceIndexFromTensor(const phi::DenseTensor& tensor) {
if (tensor.numel() == 1) {
if (framework::TransToProtoVarType(tensor.type()) ==
framework::proto::VarType::INT32) {
return static_cast<Py_ssize_t>(operators::GetValue<int32_t>(&tensor));
} else if (framework::TransToProtoVarType(tensor.type()) ==
framework::proto::VarType::INT64) {
return static_cast<Py_ssize_t>(operators::GetValue<int64_t>(&tensor));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, the type of tensor in slice indices only allows "
"int32 and int64, please check the type of index tensor."));
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, tensor in slice indices only allows 1 element, "
"but received %d.",
tensor.numel()));
}
}
// NOTE(zhiqiu): Revised version of PySlice_GetIndices. From:
// https://github.com/python/cpython/blob/8d21aa21f2cbc6d50aab3f420bb23be1d081dac4/Objects/sliceobject.c#L103
// Original PySlice_GetIndices return wrong result when
// slice_item contains long int, such as arr[:180L].
// NOT sure why this happens !!!
// Besides, PySlice_GetIndices cannot raise error when float in slice item.
// So, I make a revised version of PySlice_GetIndices, named to
// _PySlice_GetIndices. Try to use _PySlice_Unpack which is more robust than
// PySlice_GetIndices in the future.
static int _PySlice_GetIndices(PySliceObject* r, Py_ssize_t length,
Py_ssize_t* start, Py_ssize_t* stop,
Py_ssize_t* step) {
/* XXX support long ints */
if (r->step == Py_None) {
*step = 1;
} else {
if (PyCheckInteger(r->step) || IsNumpyType(r->step)) {
*step = PyLong_AsLong(r->step);
} else if (PyCheckTensor(r->step)) {
*step = GetSliceIndexFromPyObject(r->step);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, slice indices only allows None, integers, "
"tensor(int) and numpy(int) in slice item, but received %s.",
std::string(Py_TYPE(r->step)->tp_name)));
}
}
if (r->start == Py_None) {
*start = *step < 0 ? length - 1 : 0;
} else {
if (PyCheckInteger(r->start) || IsNumpyType(r->start)) {
*start = PyLong_AsLong(r->start);
} else if (PyCheckTensor(r->start)) {
*start = GetSliceIndexFromPyObject(r->start);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, slice indices only allows None, integers, "
"tensor(int) and numpy(int) in slice item, but received %s.",
std::string(Py_TYPE(r->start)->tp_name)));
}
if (*start < 0) *start += length;
*start = std::max(*start, static_cast<Py_ssize_t>(0));
}
if (r->stop == Py_None) {
*stop = *step < 0 ? -1 : length;
} else {
if (PyCheckInteger(r->stop) || IsNumpyType(r->stop)) {
*stop = PyLong_AsLong(r->stop);
} else if (PyCheckTensor(r->stop)) {
*stop = GetSliceIndexFromPyObject(r->stop);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, slice indices only allows None, integers, "
"tensor(int) and numpy(int) in slice item, but received %s.",
std::string(Py_TYPE(r->stop)->tp_name)));
}
if (0 < *step && *stop < 0) *stop += length;
*stop = std::min(*stop, length);
}
if (*stop > length) return -1;
if (*start >= length) return -1;
if (*step == 0) return -1;
return 0;
}
static void ParseIndexingSlice(
framework::LoDTensor* tensor, PyObject* _index,
std::vector<int>* slice_axes, std::vector<int>* slice_starts,
std::vector<int>* slice_ends, std::vector<int>* slice_strides,
std::vector<int>* decrease_axis, std::vector<int>* none_axes,
std::vector<int>* infer_flags, std::vector<int>* list_select_idxs,
bool* list_select_flag) {
// We allow indexing by Integers, Slices, Ellipsis, None, tuples of those
// types, and list of Bool and Integers.
// wrap to tuple
// NOTE(zhiqiu): PyTuple_Pack increases refcount.
PyObject* index = !PyTuple_Check(_index) ? PyTuple_Pack(1, _index) : _index;
DEFINE_PADDLE_SCOPE_GUARD([index, _index]() {
if (!PyTuple_Check(_index)) {
Py_DECREF(index);
VLOG(4) << "Call Py_DECREF";
}
});
PADDLE_ENFORCE_EQ(
tensor->IsInitialized(), true,
platform::errors::InvalidArgument("tensor has not been initialized"));
const auto& shape = tensor->dims();
const int rank = shape.size();
const int size = PyTuple_GET_SIZE(index);
  // specified_dims is the number of dimensions which are indexed by Integers and
// Slices.
int specified_dims = 0;
int ell_count = 0;
for (int dim = 0; dim < size; ++dim) {
PyObject* slice_item = PyTuple_GetItem(index, dim);
if (PyCheckInteger(slice_item) || PySlice_Check(slice_item)) {
specified_dims++;
} else if (slice_item == Py_Ellipsis) {
ell_count++;
}
}
PADDLE_ENFORCE_LE(ell_count, 1,
platform::errors::InvalidArgument(
"An index can only have a single ellipsis ('...')"));
int none_count = 0;
for (int i = 0, dim = 0; i < size; ++i) {
PyObject* slice_item = PyTuple_GetItem(index, i);
infer_flags->push_back(1);
int dim_len = shape[dim];
if (PyCheckInteger(slice_item) || IsNumpyType(slice_item)) {
// integer, PyLong_AsLong supports both int and long
int start = static_cast<int>(PyLong_AsLong(slice_item));
auto s_t = start;
start = start < 0 ? start + dim_len : start;
if (start >= dim_len || start < 0) {
std::string str_error_message =
"The starting index " + std::to_string(s_t) +
" of slice is out of bounds in tensor " + std::to_string(dim) +
"-th axis, it shound be in the range of [" +
std::to_string(-dim_len) + ", " + std::to_string(dim_len) + ")";
      // py::index_error corresponds to IndexError in Python
// Used to indicate out of bounds access in __getitem__, __setitem__
throw py::index_error(str_error_message);
}
slice_axes->push_back(dim);
slice_starts->push_back(start);
slice_ends->push_back(start + 1);
slice_strides->push_back(1);
decrease_axis->push_back(dim);
dim++;
} else if (PySlice_Check(slice_item)) {
// slice item
Py_ssize_t start, end, step;
PySliceObject* p = reinterpret_cast<PySliceObject*>(slice_item);
_PySlice_GetIndices(p, dim_len, &start, &end, &step);
// :: or : or 0:dim_len:1
if (start == 0 && end == dim_len && step == 1) {
dim++;
continue;
}
slice_axes->push_back(dim);
slice_starts->push_back(start);
slice_ends->push_back(end);
slice_strides->push_back(step);
dim++;
} else if (slice_item == Py_Ellipsis) {
dim += rank - specified_dims;
} else if (slice_item == Py_None) {
none_axes->push_back(dim + none_count);
none_count++;
} else if (PyList_Check(slice_item)) {
*list_select_flag = true;
PADDLE_ENFORCE_EQ(
size, 1,
platform::errors::InvalidArgument(
"When index contains a list, its length is excepted to 1, "
"but received %d",
size));
bool all_bool = true;
int list_size = PyList_GET_SIZE(slice_item);
for (int j = 0; j < list_size; ++j) {
PyObject* list_item = PyList_GetItem(slice_item, j);
if (PyCheckInteger(list_item)) {
all_bool = false;
} else if (!PyBool_Check(list_item)) {
PADDLE_THROW(platform::errors::InvalidArgument(
"Only support int or bool in index list."));
}
}
if (all_bool) {
PADDLE_ENFORCE_EQ(
list_size, shape[0],
platform::errors::InvalidArgument(
"The dimension of bool index doesn't match indexed array along "
"dimension 0, the target dimension is %d, but received %d.",
shape[0], list_size));
for (int j = 0; j < list_size; ++j) {
PyObject* list_item = PyList_GetItem(slice_item, j);
if (list_item == Py_True) {
list_select_idxs->push_back(j);
}
}
} else {
for (int j = 0; j < list_size; ++j) {
PyObject* list_item = PyList_GetItem(slice_item, j);
if (PyCheckInteger(list_item)) {
list_select_idxs->push_back(
static_cast<int>(PyLong_AsLong(list_item)));
} else if (list_item == Py_True) {
list_select_idxs->push_back(1);
} else {
list_select_idxs->push_back(0);
}
}
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, Tensor.__indices__() only allows indexing "
"by Integers, Slices, Ellipsis, None, tuples of these types "
"and list of Bool and Integers, but received "
"%s in %dth slice item",
std::string(Py_TYPE(slice_item)->tp_name), i + 1));
}
}
  // valid_indexs is the number of dimensions excluding None indices
const int valid_indexs = size - none_axes->size() - ell_count;
PADDLE_ENFORCE_EQ(valid_indexs <= rank, true,
platform::errors::InvalidArgument(
"Too many indices (%d) for tensor of dimension %d.",
valid_indexs, rank));
}
} // namespace pybind
} // namespace paddle
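The helpers above accept plain Python integers, numpy integer scalars, and 1-element int32/int64 tensors as slice items, allow a single Ellipsis, and treat None as a new axis; anything else, or an out-of-range start, raises. A small illustration against the public indexing API (a sketch, not an exhaustive test; it assumes these helpers back Tensor.__getitem__):

import numpy as np
import paddle

x = paddle.arange(12).reshape([3, 4])

a = x[1]               # plain Python int
b = x[np.int64(1)]     # numpy integer scalar, accepted by IsNumpyType
c = x[0:3:2, None]     # slice with a step plus a None (new axis)
d = x[..., 1]          # a single Ellipsis is allowed

# Rejected forms, per the error paths above:
#   x[5]            start outside [-3, 3) on axis 0 -> IndexError
#   x[1.5]          float slice items are not allowed
#   x[..., ..., 0]  more than one Ellipsis in one index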
...@@ -32,6 +32,14 @@ set(bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/backward_api.cc) ...@@ -32,6 +32,14 @@ set(bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/backward_api.cc)
set(bw_api_header_file_tmp ${bw_api_header_file}.tmp) set(bw_api_header_file_tmp ${bw_api_header_file}.tmp)
set(bw_api_source_file_tmp ${bw_api_source_file}.tmp) set(bw_api_source_file_tmp ${bw_api_source_file}.tmp)
# sparse api file
set(sparse_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api_gen.py)
set(sparse_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml)
set(sparse_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/sparse_api.h)
set(sparse_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_api.cc)
set(sparse_api_header_file_tmp ${sparse_api_header_file}.tmp)
set(sparse_api_source_file_tmp ${sparse_api_source_file}.tmp)
# wrapped infermeta file # wrapped infermeta file
set(wrapped_infermeta_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py) set(wrapped_infermeta_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py)
set(api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml) set(api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml)
...@@ -73,6 +81,19 @@ add_custom_command( ...@@ -73,6 +81,19 @@ add_custom_command(
DEPENDS ${bw_api_yaml_file} ${bw_api_gen_file} ${api_gen_base} DEPENDS ${bw_api_yaml_file} ${bw_api_gen_file} ${api_gen_base}
VERBATIM) VERBATIM)
# generate sparse api
add_custom_command(
OUTPUT ${sparse_api_header_file} ${sparse_api_source_file}
COMMAND ${PYTHON_EXECUTABLE} ${sparse_api_gen_file}
--api_yaml_path ${sparse_api_yaml_file}
--api_header_path ${sparse_api_header_file_tmp}
--api_source_path ${sparse_api_source_file_tmp}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_header_file_tmp} ${sparse_api_header_file}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_source_file_tmp} ${sparse_api_source_file}
COMMENT "copy_if_different ${sparse_api_header_file} ${sparse_sparse_api_source_file}"
DEPENDS ${sparse_api_yaml_file} ${sparse_api_gen_file} ${api_gen_base}
VERBATIM)
# generate wrapped infermeta # generate wrapped infermeta
add_custom_command( add_custom_command(
OUTPUT ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file} OUTPUT ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file}
...@@ -87,12 +108,14 @@ cc_library(op_meta_info SRCS op_meta_info.cc DEPS phi_tensor_raw) ...@@ -87,12 +108,14 @@ cc_library(op_meta_info SRCS op_meta_info.cc DEPS phi_tensor_raw)
cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS phi) cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS phi)
cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_context kernel_factory) cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_context kernel_factory)
cc_library(api_gen_utils SRCS api_gen_utils.cc DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor)
cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform) cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform)
cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform) cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform)
cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform)
cc_library(sparse_api SRCS sparse_api.cc DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform) cc_library(sparse_api SRCS sparse_api.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl)
cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform api_custom_impl) cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform api_custom_impl)
cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform) cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform)
cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch backward_infermeta phi_data_transform phi_function_api api_custom_impl) cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform phi_function_api api_custom_impl)
cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api) cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api)
...@@ -14,8 +14,8 @@ limitations under the License. */ ...@@ -14,8 +14,8 @@ limitations under the License. */
#include "paddle/phi/api/lib/api_custom_impl.h" #include "paddle/phi/api/lib/api_custom_impl.h"
#include "paddle/phi/api/lib/api_gen_utils.h"
#include "paddle/phi/api/lib/api_registry.h" #include "paddle/phi/api/lib/api_registry.h"
#include "paddle/phi/api/lib/api_utils.h"
#include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/data_transform.h"
#include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/api/lib/utils/storage.h"
......
...@@ -12,26 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,26 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/phi/api/lib/api_gen_utils.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/api/lib/utils/storage.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/core/selected_rows.h"
namespace paddle {
namespace experimental {
/* ------------------ for input ----------------------- */
std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(const Tensor& tensor) {
  return std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
}
std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(
    const paddle::optional<Tensor>& tensor) {
  if (tensor) {
    return std::dynamic_pointer_cast<phi::DenseTensor>(tensor->impl());
...@@ -39,7 +31,7 @@ inline std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(
  return nullptr;
}
std::unique_ptr<std::vector<phi::DenseTensor>> TensorToDenseTensor(
    const std::vector<Tensor>& tensors) {
  auto pt_tensors = std::make_unique<std::vector<phi::DenseTensor>>();
  pt_tensors->reserve(tensors.size());
...@@ -52,12 +44,11 @@ inline std::unique_ptr<std::vector<phi::DenseTensor>> TensorToDenseTensor(
  return std::move(pt_tensors);
}
std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(const Tensor& tensor) {
  return std::dynamic_pointer_cast<phi::SelectedRows>(tensor.impl());
}
std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(
    const paddle::optional<Tensor>& tensor) {
  if (tensor) {
    return std::dynamic_pointer_cast<phi::SelectedRows>(tensor->impl());
...@@ -67,11 +58,11 @@ inline std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(
/* ----------------- for infer_meta --------------------- */
phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor) {
  return phi::MetaTensor(tensor);
}
paddle::optional<phi::MetaTensor> MakeMetaTensor(
    const paddle::optional<const phi::DenseTensor&>& tensor) {
  if (tensor) {
    return {phi::MetaTensor(*tensor)};
...@@ -79,7 +70,7 @@ inline paddle::optional<phi::MetaTensor> MakeMetaTensor(
  return {paddle::none};
}
std::vector<phi::MetaTensor> MakeMetaTensor(
    const std::vector<phi::DenseTensor>& tensors) {
  std::vector<phi::MetaTensor> meta_tensors;
  meta_tensors.reserve(tensors.size());
...@@ -89,11 +80,11 @@ inline std::vector<phi::MetaTensor> MakeMetaTensor(
  return meta_tensors;
}
phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor) {
  return phi::MetaTensor(tensor);
}
paddle::optional<phi::MetaTensor> MakeMetaTensor(
    const paddle::optional<const phi::SelectedRows&>& tensor) {
  if (tensor) {
    return {phi::MetaTensor(*tensor)};
...@@ -103,7 +94,7 @@ inline paddle::optional<phi::MetaTensor> MakeMetaTensor(
/* ------------------ for output ----------------------- */
phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) {
  if (!out->initialized()) {
    auto dense_tensor = std::make_shared<phi::DenseTensor>(
        phi::make_intrusive<SharedStorage>(phi::TransToPhiPlace(backend)),
...@@ -114,8 +105,9 @@ inline phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) {
  return static_cast<phi::DenseTensor*>(out->impl().get());
}
std::vector<phi::DenseTensor*> SetKernelOutput(size_t out_size,
                                               Backend backend,
                                               std::vector<Tensor>* out) {
  out->reserve(out_size);
  std::vector<phi::DenseTensor*> results(out_size);
  for (size_t i = 0; i < out_size; ++i) {
...@@ -129,8 +121,7 @@ inline std::vector<phi::DenseTensor*> SetKernelOutput(
  return results;
}
phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend, Tensor* out) {
  if (!out->initialized()) {
    auto select_rows = std::make_shared<phi::SelectedRows>();
    out->set_impl(select_rows);
...@@ -139,5 +130,29 @@ inline phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend,
  return static_cast<phi::SelectedRows*>(out->impl().get());
}
phi::TensorBase* SetSparseKernelOutput(Tensor* out, TensorType type) {
if (!out->initialized()) {
if (type == TensorType::SPARSE_COO) {
auto sparse_tensor = std::make_shared<phi::SparseCooTensor>(
phi::DenseTensor(), phi::DenseTensor(), phi::DDim{-1});
out->set_impl(sparse_tensor);
return sparse_tensor.get();
} else if (type == TensorType::SPARSE_CSR) {
auto sparse_tensor =
std::make_shared<phi::SparseCsrTensor>(phi::DenseTensor(),
phi::DenseTensor(),
phi::DenseTensor(),
phi::DDim{-1});
out->set_impl(sparse_tensor);
return sparse_tensor.get();
} else {
auto dense_tensor = std::make_shared<phi::DenseTensor>();
out->set_impl(dense_tensor);
return dense_tensor.get();
}
}
return out->impl().get();
}
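// A minimal usage sketch (hypothetical caller; `out` is a placeholder name):
// the sparse C++ API binds an output Tensor to a concrete sparse impl before
// launching a kernel, e.g.
//   Tensor out;
//   phi::TensorBase* kernel_out =
//       SetSparseKernelOutput(&out, TensorType::SPARSE_COO);
//   // kernel_out now aliases the empty SparseCooTensor held by `out`.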
} // namespace experimental
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/api/lib/utils/storage.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/core/selected_rows.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
namespace paddle {
namespace experimental {
enum class TensorType { DENSE_TENSOR, SPARSE_CSR, SPARSE_COO };
/* ------------------ for input ----------------------- */
std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(const Tensor& tensor);
std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(
const paddle::optional<Tensor>& tensor);
std::unique_ptr<std::vector<phi::DenseTensor>> TensorToDenseTensor(
const std::vector<Tensor>& tensors);
std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(const Tensor& tensor);
std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(
const paddle::optional<Tensor>& tensor);
/* ----------------- for infer_meta --------------------- */
phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor);
paddle::optional<phi::MetaTensor> MakeMetaTensor(
const paddle::optional<const phi::DenseTensor&>& tensor);
std::vector<phi::MetaTensor> MakeMetaTensor(
const std::vector<phi::DenseTensor>& tensors);
phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor);
paddle::optional<phi::MetaTensor> MakeMetaTensor(
const paddle::optional<const phi::SelectedRows&>& tensor);
/* ------------------ for output ----------------------- */
phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out);
std::vector<phi::DenseTensor*> SetKernelOutput(size_t out_size,
Backend backend,
std::vector<Tensor>* out);
phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend, Tensor* out);
phi::TensorBase* SetSparseKernelOutput(Tensor* out, TensorType type);
} // namespace experimental
} // namespace paddle
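// How these helpers are expected to compose in a generated dense API function
// (a rough sketch; `SomeInferMeta`, `x` and `backend` are placeholder names):
//   auto dense_x = TensorToDenseTensor(x);              // unwrap the input
//   Tensor out;
//   auto* dense_out = SetKernelOutput(backend, &out);   // bind the output
//   phi::MetaTensor meta_out(dense_out);
//   SomeInferMeta(MakeMetaTensor(*dense_x), &meta_out); // fill dims/dtype
//   // ...then fetch the matching phi kernel and run it on dense_x/dense_out.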
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/api/include/sparse_api.h" #include "paddle/phi/api/lib/sparse_api_custom_impl.h"
#include <memory> #include <memory>
#include "glog/logging.h" #include "glog/logging.h"
...@@ -20,29 +20,12 @@ limitations under the License. */ ...@@ -20,29 +20,12 @@ limitations under the License. */
#include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/api/lib/utils/storage.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/infermeta/unary.h"
PD_DECLARE_KERNEL(dense_to_sparse_coo, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sparse_csr_to_coo, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(dense_to_sparse_csr, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sparse_coo_to_csr, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sparse_coo_to_dense, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sparse_csr_to_dense, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(dense_to_sparse_coo, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sparse_csr_to_coo, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(dense_to_sparse_csr, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sparse_coo_to_csr, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sparse_coo_to_dense, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sparse_csr_to_dense, GPU, ALL_LAYOUT);
#endif
namespace paddle {
namespace experimental {
namespace sparse {
Tensor to_sparse_coo_impl(const Tensor& x,
                          Backend backend,
                          const int64_t sparse_dim) {
  if (x.layout() == phi::DataLayout::SPARSE_COO) {
...@@ -105,7 +88,7 @@ PADDLE_API Tensor to_sparse_coo(const Tensor& x,
  return out;
}
Tensor to_sparse_csr_impl(const Tensor& x, Backend backend) {
  if (x.layout() == phi::DataLayout::SPARSE_CSR) {
    return x;
  }
...@@ -171,7 +154,7 @@ PADDLE_API Tensor to_sparse_csr(const Tensor& x, Backend backend) {
  return out;
}
Tensor to_dense_impl(const Tensor& x, Backend backend) {
  if (x.layout() != phi::DataLayout::SPARSE_CSR &&
      x.layout() != phi::DataLayout::SPARSE_COO) {
    return x;
......
...@@ -21,13 +21,13 @@ namespace paddle {
namespace experimental {
namespace sparse {
Tensor to_dense_impl(const Tensor& x, Backend backend);
Tensor to_sparse_coo_impl(const Tensor& x,
                          Backend backend,
                          const int64_t sparse_dim);
Tensor to_sparse_csr_impl(const Tensor& x, Backend backend);
} // namespace sparse
} // namespace experimental
......
...@@ -145,6 +145,7 @@ class SparseCooTensor : public TensorBase,
  void* AllocateFrom(Allocator* allocator,
                     DataType dtype,
                     size_t requested_size = 0) override;
  void set_dims(const DDim& dims) { this->dims_ = dims; }
 private:
  // save the indices of non zero elements in original dense tensor
......
...@@ -348,4 +348,17 @@ void BCELossInferMeta(const MetaTensor& input,
  out->share_lod(input);
}
void GatherTreeMeta(const MetaTensor& ids,
const MetaTensor& parents,
MetaTensor* out) {
auto ids_dims = ids.dims();
auto parents_dims = parents.dims();
PADDLE_ENFORCE_EQ(ids_dims == parents_dims,
true,
phi::errors::InvalidArgument(
"The shape of Input(Parents) must be same with the "
"shape of Input(Ids)."));
out->set_dims(ids_dims);
}
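// Shape example for the check above: with ids of dims [max_time, batch_size,
// beam_size] = [10, 4, 3], parents must also have dims [10, 4, 3], and out is
// then given dims [10, 4, 3].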
} // namespace phi
...@@ -68,4 +68,8 @@ void BCELossInferMeta(const MetaTensor& input,
                      const MetaTensor& label,
                      MetaTensor* out,
                      MetaConfig config = MetaConfig());
void GatherTreeMeta(const MetaTensor& ids,
const MetaTensor& parents,
MetaTensor* out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void BatchNormGradRawKernel(const Context& dev_ctx,
const DenseTensor& y_grad,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
paddle::optional<const DenseTensor&> reserve_space,
paddle::optional<const DenseTensor&> mean,
paddle::optional<const DenseTensor&> variance,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
bool is_inplace,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad);
template <typename T, typename Context>
void BatchNormGradKernel(const Context& dev_ctx,
const DenseTensor& y_grad,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
paddle::optional<const DenseTensor&> reserve_space,
paddle::optional<const DenseTensor&> mean,
paddle::optional<const DenseTensor&> variance,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad);
template <typename T, typename Context>
void BatchNormDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& x_grad_grad,
const DenseTensor& scale_grad_grad,
const DenseTensor& bias_grad_grad,
const DenseTensor& y_grad,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
paddle::optional<const DenseTensor&> mean,
paddle::optional<const DenseTensor&> variance,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* y_grad_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void BatchNormKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& mean,
const DenseTensor& variance,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor* y,
DenseTensor* mean_out,
DenseTensor* variance_out,
DenseTensor* saved_mean,
DenseTensor* saved_variance,
DenseTensor* reserve_space);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/batch_norm_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/gpu/batch_norm_utils.h"
namespace phi {
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T, typename Context>
void BatchNormGradRawKernel(const Context& ctx,
const DenseTensor& y_grad,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
paddle::optional<const DenseTensor&> reserve_space,
paddle::optional<const DenseTensor&> mean,
paddle::optional<const DenseTensor&> variance,
float momentum,
float epsilon,
const std::string& data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
bool is_inplace,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad) {
const auto* d_y = &y_grad;
DataLayout data_layout =
paddle::framework::StringToDataLayout(data_layout_str);
auto* d_x = x_grad;
auto* d_scale = scale_grad;
auto* d_bias = bias_grad;
use_global_stats = is_test || use_global_stats;
// batch_norm with inplace as false will take X as grad input, which
// is same as cuDNN batch_norm backward calculation, batch_norm
// with inplace as true only take Y as input and X should be calculate
// by inverse operation of batch_norm on Y
if (is_inplace) {
if (d_x) {
PADDLE_ENFORCE_EQ(d_x,
d_y,
phi::errors::InvalidArgument(
"X@GRAD and Y@GRAD inplaced in non-inplace mode"));
}
} else {
if (d_x) {
PADDLE_ENFORCE_NE(d_x,
d_y,
phi::errors::InvalidArgument(
"X@GRAD and Y@GRAD inplaced in non-inplace mode"));
}
}
// Get the size for each dimension.
// NCHW [batch_size, in_channels, in_height, in_width]
const auto& x_dims = x.dims();
PADDLE_ENFORCE_GE(
x_dims.size(),
2,
phi::errors::InvalidArgument(
"The size of input X's dimensions should be larger than 1."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
PADDLE_ENFORCE_LE(
x_dims.size(),
5,
phi::errors::InvalidArgument(
"The size of input X's dimensions should be less than 6."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
const int N = x_dims[0];
const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
const int sample_size = x.numel() / N / C;
// input dimension is 2 and the format is NCHW. The input can be regarded as
// NHWC format
if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) {
data_layout = DataLayout::kNHWC;
}
// init output
if (d_x) {
ctx.template Alloc<T>(d_x);
}
const T* mean_data = saved_mean.data<T>();
const T* inv_var_data = saved_variance.data<T>();
DenseTensor inv_var_tensor;
if (use_global_stats) {
const auto* running_mean = mean.get_ptr();
const auto* running_variance = variance.get_ptr();
mean_data = running_mean->data<T>();
inv_var_tensor.Resize({C});
T* running_inv_var_data = ctx.template Alloc<T>(&inv_var_tensor);
EigenVectorArrayMap<T> inv_var_tmp(running_inv_var_data, C);
ConstEigenVectorArrayMap<T> var_arr(running_variance->data<T>(), C);
inv_var_tmp = (var_arr + epsilon).sqrt().inverse();
inv_var_data = running_inv_var_data;
}
ConstEigenVectorArrayMap<T> scale_arr(scale.data<T>(), C);
ConstEigenVectorArrayMap<T> bias_arr(bias.data<T>(), C);
ConstEigenVectorArrayMap<T> mean_arr(mean_data, C);
ConstEigenVectorArrayMap<T> inv_var_arr(inv_var_data, C);
T* d_bias_data = nullptr;
T* d_scale_data = nullptr;
if (d_scale && d_bias) {
d_bias_data = ctx.template Alloc<T>(d_bias);
d_scale_data = ctx.template Alloc<T>(d_scale);
}
// d_bias = np.sum(d_y, axis=0)
// d_scale = np.sum((X - mean) / inv_std * dy, axis=0)
// d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0)
// - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0))
EigenVectorArrayMap<T> d_bias_arr(d_bias_data, C);
EigenVectorArrayMap<T> d_scale_arr(d_scale_data, C);
if (d_scale && d_bias) {
d_bias_arr.setZero();
d_scale_arr.setZero();
}
if (d_x && (N * sample_size) == 1 && !use_global_stats) {
paddle::framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
return;
}
int scale_coefff = use_global_stats ? 1 : N * sample_size;
const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coefff;
DenseTensor dy_sum;
dy_sum.Resize({C});
auto dy_sum_data = ctx.template Alloc<T>(&dy_sum);
EigenVectorArrayMap<T> dy_sum_arr(dy_sum_data, C);
DenseTensor dy_mul_x_sub_mean_mul_invstd_sum;
dy_mul_x_sub_mean_mul_invstd_sum.Resize({C});
auto dy_mul_x_sub_mean_mul_invstd_sum_data =
ctx.template Alloc<T>(&dy_mul_x_sub_mean_mul_invstd_sum);
EigenVectorArrayMap<T> dy_mul_x_sub_mean_mul_invstd_sum_arr(
dy_mul_x_sub_mean_mul_invstd_sum_data, C);
dy_sum_arr.setZero();
dy_mul_x_sub_mean_mul_invstd_sum_arr.setZero();
// inplace calculation
// Y: ((x - est_mean) * (inv_var) * scale + bias
// formula transform ====>
// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
// X: (y - bias) / scale / (inv_var) + est_mean
// formula transform ====>
// (y - bias) / (scale * inv_var) + est_mean
switch (data_layout) {
case DataLayout::kNCHW: {
if (is_inplace) {
auto px = x;
EigenArrayMap<T> x_data(ctx.template Alloc<T>(&px), sample_size, N * C);
ConstEigenArrayMap<T> y_data(x.data<T>(), sample_size, N * C);
for (int nc = 0; nc < N * C; ++nc) {
x_data.col(nc) = (y_data.col(nc) - bias_arr(nc % C)) /
scale_inv_var_nhw(nc % C) / scale_coefff +
mean_arr(nc % C);
}
}
ConstEigenArrayMap<T> x_arr(x.data<T>(), sample_size, N * C);
ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), sample_size, N * C);
for (int nc = 0; nc < N * C; ++nc) {
int c = nc % C;
dy_sum_arr(c) += d_y_arr.col(nc).sum();
dy_mul_x_sub_mean_mul_invstd_sum_arr(c) +=
((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc))
.sum();
}
if (d_scale && d_bias) {
d_bias_arr = dy_sum_arr;
d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr;
}
if (d_x) {
EigenArrayMap<T> d_x_arr(
ctx.template Alloc<T>(d_x), sample_size, N * C);
if (!use_global_stats) {
for (int nc = 0; nc < N * C; ++nc) {
int c = nc % C;
d_x_arr.col(nc) =
scale_inv_var_nhw(c) *
(d_y_arr.col(nc) * N * sample_size - dy_sum_arr(c) -
(x_arr.col(nc) - mean_arr[c]) *
dy_mul_x_sub_mean_mul_invstd_sum_arr(c) * inv_var_arr(c));
}
} else {
for (int nc = 0; nc < N * C; ++nc) {
int c = nc % C;
d_x_arr.col(nc) = scale_inv_var_nhw(c) * d_y_arr.col(nc);
}
}
}
break;
}
case DataLayout::kNHWC: {
if (is_inplace) {
auto px = x;
EigenArrayMap<T> x_data(ctx.template Alloc<T>(&px), C, N * sample_size);
ConstEigenArrayMap<T> y_data(x.data<T>(), C, N * sample_size);
for (int nhw = 0; nhw < N * sample_size; nhw++) {
x_data.col(nhw) =
(y_data.col(nhw) - bias_arr) / scale_inv_var_nhw / scale_coefff +
mean_arr;
}
}
ConstEigenArrayMap<T> x_arr(x.data<T>(), C, N * sample_size);
ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), C, N * sample_size);
for (int nhw = 0; nhw < N * sample_size; ++nhw) {
dy_sum_arr += d_y_arr.col(nhw);
dy_mul_x_sub_mean_mul_invstd_sum_arr +=
(x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw);
}
if (d_scale && d_bias) {
d_bias_arr = dy_sum_arr;
d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr;
}
if (d_x) {
EigenArrayMap<T> d_x_arr(
ctx.template Alloc<T>(d_x), C, N * sample_size);
if (!use_global_stats) {
for (int nhw = 0; nhw < N * sample_size; ++nhw) {
d_x_arr.col(nhw) =
scale_inv_var_nhw *
(d_y_arr.col(nhw) * N * sample_size - dy_sum_arr -
(x_arr.col(nhw) - mean_arr) *
dy_mul_x_sub_mean_mul_invstd_sum_arr * inv_var_arr);
}
} else {
for (int nhw = 0; nhw < N * sample_size; ++nhw) {
d_x_arr.col(nhw) = scale_inv_var_nhw * d_y_arr.col(nhw);
}
}
}
break;
}
default:
PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %s",
data_layout_str));
}
}
template <typename T, typename Context>
void BatchNormGradKernel(const Context& dev_ctx,
const DenseTensor& y_grad,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
paddle::optional<const DenseTensor&> reserve_space,
paddle::optional<const DenseTensor&> mean,
paddle::optional<const DenseTensor&> variance,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad) {
BatchNormGradRawKernel<T, Context>(dev_ctx,
y_grad,
x,
scale,
bias,
saved_mean,
saved_variance,
reserve_space,
mean,
variance,
momentum,
epsilon,
data_layout,
is_test,
use_global_stats,
trainable_statistics,
fuse_with_relu,
false,
x_grad,
scale_grad,
bias_grad);
}
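// Note: this wrapper differs from BatchNormGradRawKernel only in fixing
// is_inplace to false, i.e. the gradient is computed from the saved X rather
// than by reconstructing X from Y (see the inplace comment in the raw kernel).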
template <typename T, typename Context>
void BatchNormDoubleGradKernel(const Context& ctx,
const DenseTensor& x_grad_grad,
const DenseTensor& scale_grad_grad,
const DenseTensor& bias_grad_grad,
const DenseTensor& y_grad,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
paddle::optional<const DenseTensor&> mean,
paddle::optional<const DenseTensor&> variance,
float momentum,
float epsilon,
const std::string& data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* y_grad_grad) {
const auto* X = &x;
const auto* Scale = &scale;
const auto* dY = &y_grad;
const auto* Saved_mean = &saved_mean;
const auto* Saved_variance = &saved_variance;
PADDLE_ENFORCE_EQ(is_test,
false,
phi::errors::InvalidArgument(
"`is_test = True` CANNOT be used in train program. If "
"you want to use global status in pre_train model, "
"please set `use_global_stats = True`"));
const auto data_layout =
paddle::framework::StringToDataLayout(data_layout_str);
const auto* ddX = &x_grad_grad;
const auto* ddScale = &scale_grad_grad;
const auto* ddBias = &bias_grad_grad;
auto* dX = x_grad;
auto* dScale = scale_grad;
auto* ddY = y_grad_grad;
ctx.template Alloc<T>(dX);
ctx.template Alloc<T>(ddY);
const auto& x_dims = X->dims();
const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
const int sample_size = X->numel() / C;
phi::funcs::SetConstant<Context, T> set_constant;
const T* mean_data = Saved_mean->data<T>();
const T* inv_var_data = Saved_variance->data<T>();
DenseTensor inv_var_tensor;
if (use_global_stats) {
const auto* running_mean = mean.get_ptr();
const auto* running_variance = variance.get_ptr();
mean_data = running_mean->data<T>();
inv_var_tensor.Resize({C});
T* running_inv_var_data = ctx.template Alloc<T>(&inv_var_tensor);
EigenVectorArrayMap<T> inv_var_tmp(running_inv_var_data, C);
ConstEigenVectorArrayMap<T> var_arr(running_variance->data<T>(), C);
inv_var_tmp = (var_arr + epsilon).sqrt().inverse();
inv_var_data = running_inv_var_data;
}
// transpose NCHW -> NHWC for easy calculate
DenseTensor transformed_x(X->type());
DenseTensor transformed_dy(dY->type());
DenseTensor transformed_ddx(ddX->type());
DenseTensor transformed_dx(dX->type());
DenseTensor transformed_ddy(ddY->type());
if (data_layout == DataLayout::kNCHW && x_dims.size() > 2) {
VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
// Input Tensor
ResizeToChannelLast<Context, T>(ctx, X, &transformed_x);
TransToChannelLast<Context, T>(ctx, X, &transformed_x);
ResizeToChannelLast<Context, T>(ctx, dY, &transformed_dy);
TransToChannelLast<Context, T>(ctx, dY, &transformed_dy);
ResizeToChannelLast<Context, T>(ctx, ddX, &transformed_ddx);
TransToChannelLast<Context, T>(ctx, ddX, &transformed_ddx);
// Output Tensor
ResizeToChannelLast<Context, T>(ctx, dX, &transformed_dx);
ResizeToChannelLast<Context, T>(ctx, ddY, &transformed_ddy);
} else {
transformed_x.ShareDataWith(*X);
transformed_dy.ShareDataWith(*dY);
transformed_ddx.ShareDataWith(*ddX);
transformed_dx.ShareDataWith(*dX);
transformed_ddy.ShareDataWith(*ddY);
}
ConstEigenArrayMap<T> x_arr(transformed_x.data<T>(), C, sample_size);
ConstEigenVectorArrayMap<T> mean_arr(mean_data, C);
ConstEigenVectorArrayMap<T> inv_var_arr(inv_var_data, C);
Tensor mean_tile;
mean_tile.Resize({C, sample_size});
EigenArrayMap<T> mean_tile_data(
ctx.template Alloc<T>(&mean_tile), C, sample_size);
DenseTensor inv_var_tile;
inv_var_tile.Resize({C, sample_size});
EigenArrayMap<T> inv_var_tile_data(
ctx.template Alloc<T>(&inv_var_tile), C, sample_size);
mean_tile_data = mean_arr.replicate(1, sample_size);
inv_var_tile_data = inv_var_arr.replicate(1, sample_size);
DenseTensor Scale_data;
if (!Scale) {
Scale_data.Resize({C});
ctx.template Alloc<T>(&Scale_data);
set_constant(ctx, &Scale_data, static_cast<T>(1));
}
ConstEigenVectorArrayMap<T> scale_arr(
Scale ? Scale->data<T>() : Scale_data.data<T>(), C);
Tensor scale_tile;
scale_tile.Resize({C, sample_size});
EigenArrayMap<T> scale_tile_data(
ctx.template Alloc<T>(&scale_tile), C, sample_size);
scale_tile_data = scale_arr.replicate(1, sample_size);
ConstEigenArrayMap<T> dy_arr(transformed_dy.data<T>(), C, sample_size);
ConstEigenArrayMap<T> ddx_arr(transformed_ddx.data<T>(), C, sample_size);
DenseTensor x_sub_mean_mul_invstd;
x_sub_mean_mul_invstd.Resize({C, sample_size});
EigenArrayMap<T> x_sub_mean_mul_invstd_arr(
ctx.template Alloc<T>(&x_sub_mean_mul_invstd), C, sample_size);
x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data;
if (dX) {
ctx.template Alloc<T>(dX);
EigenArrayMap<T> dx_arr(
ctx.template Alloc<T>(&transformed_dx), C, sample_size);
dx_arr.setZero();
if (use_global_stats) {
// math: dx = (ddscale * dy) * inv_var
if (ddScale) {
ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
Tensor ddscale_tile;
ddscale_tile.Resize({C, sample_size});
EigenArrayMap<T> ddscale_tile_data(
ctx.template Alloc<T>(&ddscale_tile), C, sample_size);
ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
dx_arr = dy_arr * ddscale_tile_data * inv_var_tile_data;
}
} else {
// math: dx = scale * ((x - mean) * inv_var / NxHxW * (np.mean(ddx,
// axis=(n,h,w)) *
// np.sum(dy, axis=(n,h,w)) -
// np.sum(dy * ddx, axis=(n,h,w)) + 3 * np.mean(dy * (x -
// mean),
// axis=(n,h,w)) * inv_var.pow(2) *
// np.sum(ddx * (x - mean), axis=(n,h,w))) + inv_var.pow(3) /
// NxHxW *
// np.sum(ddx * (x - mean)) *
// (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW *
// np.sum(dy,
// axis=(n,h,w)) * (x - mean) *
// (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var -
// inv_var
// *
// np.mean(dy, axis=(n,h,w)) -
// inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
// axis=(n,h,w)))
if (ddX) {
dx_arr +=
(x_sub_mean_mul_invstd_arr * inv_var_tile_data * inv_var_tile_data /
sample_size)
.colwise() *
(ddx_arr.rowwise().sum() * dy_arr.rowwise().sum() / sample_size -
(dy_arr * ddx_arr).rowwise().sum() +
3. * (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() *
(ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() /
sample_size);
dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() *
(ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() /
sample_size * (dy_arr.rowwise().sum() / sample_size - dy_arr);
dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() *
(dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() /
sample_size *
(ddx_arr.rowwise().sum() / sample_size - ddx_arr);
dx_arr = scale_tile_data * dx_arr;
}
if (ddScale) {
ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
Tensor ddscale_tile;
ddscale_tile.Resize({C, sample_size});
EigenArrayMap<T> ddscale_tile_data(
ctx.template Alloc<T>(&ddscale_tile), C, sample_size);
ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
dx_arr +=
(dy_arr * inv_var_tile_data -
(dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size) *
inv_var_tile_data -
x_sub_mean_mul_invstd_arr * inv_var_tile_data *
(dy_arr * x_sub_mean_mul_invstd_arr)
.rowwise()
.sum()
.replicate(1, sample_size) /
sample_size) *
ddscale_tile_data;
}
}
if (data_layout == DataLayout::kNCHW) {
VLOG(3) << "Transform batchnorm output from NHWC to NCHW";
TransToChannelFirst<Context, T>(ctx, &transformed_dx, dX);
}
}
if (dScale) {
EigenVectorArrayMap<T> dscale_arr(ctx.template Alloc<T>(dScale), C);
dscale_arr.setZero();
if (use_global_stats) {
// math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var
if (ddX) {
dscale_arr = (ddx_arr * dy_arr * inv_var_tile_data).rowwise().sum();
}
} else {
// math: dscale = inv_var * (dy - np.mean(dy, axis=(n,h,w) - (x-mean) *
// inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) *
// ddx
if (ddX) {
Tensor first_grad;
first_grad.Resize({C, sample_size});
EigenArrayMap<T> first_grad_arr(
ctx.template Alloc<T>(&first_grad), C, sample_size);
first_grad_arr.setZero();
first_grad_arr +=
inv_var_tile_data *
(dy_arr -
dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size -
x_sub_mean_mul_invstd_arr *
(dy_arr * x_sub_mean_mul_invstd_arr)
.rowwise()
.sum()
.replicate(1, sample_size) /
sample_size);
dscale_arr = (first_grad_arr * ddx_arr).rowwise().sum();
}
}
}
if (ddY) {
ctx.template Alloc<T>(ddY);
EigenArrayMap<T> ddy_arr(
ctx.template Alloc<T>(&transformed_ddy), C, sample_size);
ddy_arr.setZero();
if (use_global_stats) {
// math: ddy = r * ddx * inv_var + ddbias +
// ddscale * (x - mean) * inv_var
if (ddX) {
ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data;
}
} else {
// math: ddy = (x - mean) * inv_var * ddscale + ddbias +
// scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) *
// np.mean(ddx * (x - mean), axis=(n,h,w)))
if (ddX) {
ddy_arr +=
scale_tile_data * inv_var_tile_data *
(ddx_arr -
ddx_arr.rowwise().sum().replicate(1, sample_size) / sample_size -
x_sub_mean_mul_invstd_arr *
(ddx_arr * x_sub_mean_mul_invstd_arr)
.rowwise()
.sum()
.replicate(1, sample_size) /
sample_size);
}
}
if (ddScale) {
ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
Tensor ddscale_tile;
ddscale_tile.Resize({C, sample_size});
EigenArrayMap<T> ddscale_tile_data(
ctx.template Alloc<T>(&ddscale_tile), C, sample_size);
ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data;
}
if (ddBias) {
ConstEigenVectorArrayMap<T> ddbias_arr(ddBias->data<T>(), C);
Tensor ddbias_tile;
ddbias_tile.Resize({C, sample_size});
EigenArrayMap<T> ddbias_tile_data(
ctx.template Alloc<T>(&ddbias_tile), C, sample_size);
ddbias_tile_data = ddbias_arr.replicate(1, sample_size);
ddy_arr += ddbias_tile_data;
}
if (data_layout == DataLayout::kNCHW) {
VLOG(3) << "Transform batchnorm output from NHWC to NCHW";
TransToChannelFirst<Context, T>(ctx, &transformed_ddy, ddY);
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(
batch_norm_grad, CPU, ALL_LAYOUT, phi::BatchNormGradKernel, float, double) {
}
PD_REGISTER_KERNEL(batch_norm_grad_raw,
CPU,
ALL_LAYOUT,
phi::BatchNormGradRawKernel,
float,
double) {}
PD_REGISTER_KERNEL(batch_norm_grad_grad,
CPU,
ALL_LAYOUT,
phi::BatchNormDoubleGradKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/batch_norm_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/fluid/framework/tensor_util.h"
namespace phi {
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T, typename Context>
void BatchNormKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& mean,
const DenseTensor& variance,
float momentum,
float epsilon,
const std::string& data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor* y,
DenseTensor* mean_out,
DenseTensor* variance_out,
DenseTensor* saved_mean,
DenseTensor* saved_variance,
DenseTensor* reserve_space) {
bool test_mode = is_test && (!trainable_statistics);
bool global_stats = test_mode || use_global_stats;
auto data_layout = paddle::framework::StringToDataLayout(data_layout_str);
const auto& x_dims = x.dims();
PADDLE_ENFORCE_GE(
x_dims.size(),
2,
phi::errors::InvalidArgument(
"The size of input X's dimensions should be larger than 1."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
PADDLE_ENFORCE_LE(
x_dims.size(),
5,
phi::errors::InvalidArgument(
"The size of input X's dimensions should be less than 6."
"But received: the size of input X's dimensionss is [%d]",
x_dims.size()));
const int N = x_dims[0];
const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
const int sample_size = x.numel() / N / C;
// alloc memory
ctx.template Alloc<T>(y);
ctx.template Alloc<T>(mean_out);
ctx.template Alloc<T>(variance_out);
ctx.template Alloc<T>(saved_mean);
ctx.template Alloc<T>(saved_variance);
// input dimension is 2 and the format is NCHW. The input can be regarded
// as NHWC format
if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) {
data_layout = DataLayout::kNHWC;
}
if (!global_stats) {
// saved_xx is use just in this batch of data
EigenVectorArrayMap<T> saved_mean_e(ctx.template Alloc<T>(saved_mean), C);
EigenVectorArrayMap<T> saved_variance_e(
ctx.template Alloc<T>(saved_variance), C);
saved_mean_e.setZero();
saved_variance_e.setZero();
EigenVectorArrayMap<T> running_mean_arr(ctx.template Alloc<T>(mean_out), C);
EigenVectorArrayMap<T> running_var_arr(ctx.template Alloc<T>(variance_out),
C);
if ((N * sample_size) == 1) {
// Only 1 element in normalization dimension,
// we skip the batch norm calculation, let y = x.
paddle::framework::TensorCopy(x, ctx.GetPlace(), y);
return;
}
switch (data_layout) {
case DataLayout::kNCHW: {
ConstEigenArrayMap<T> x_arr(x.data<T>(), sample_size, N * C);
for (int nc = 0; nc < N * C; ++nc) {
saved_mean_e(nc % C) += x_arr.col(nc).sum();
}
saved_mean_e /= N * sample_size;
for (int nc = 0; nc < N * C; ++nc) {
saved_variance_e(nc % C) +=
(x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm();
}
saved_variance_e /= N * sample_size;
break;
}
case DataLayout::kNHWC: {
ConstEigenArrayMap<T> x_arr(x.data<T>(), C, N * sample_size);
for (int i = 0; i < N * sample_size; ++i) {
saved_mean_e += x_arr.col(i);
}
saved_mean_e /= N * sample_size;
for (int i = 0; i < N * sample_size; ++i) {
saved_variance_e +=
(x_arr.col(i) - saved_mean_e) * (x_arr.col(i) - saved_mean_e);
}
saved_variance_e /= N * sample_size;
break;
}
default:
PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %s",
data_layout_str));
}
// if MomentumTensor is set, use MomentumTensor value, momentum
// is only used in this training branch
running_mean_arr =
running_mean_arr * momentum + saved_mean_e * (1. - momentum);
running_var_arr =
running_var_arr * momentum + saved_variance_e * (1. - momentum);
}
// use SavedMean and SavedVariance to do normalize
Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
if (global_stats) {
ConstEigenVectorArrayMap<T> var_arr(variance.data<T>(), C);
inv_std = (var_arr + epsilon).sqrt().inverse();
} else {
EigenVectorArrayMap<T> saved_inv_std(saved_variance->data<T>(), C);
// inverse SavedVariance first, gradient will use it too.
saved_inv_std = (saved_inv_std + epsilon).inverse().sqrt();
inv_std = saved_inv_std;
}
ConstEigenVectorArrayMap<T> mean_arr(
global_stats ? mean.data<T>() : saved_mean->data<T>(), C);
// ((x - est_mean) * (inv_var) * scale + bias
// formula transform ====>
// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
ConstEigenVectorArrayMap<T> scale_arr(scale.data<T>(), C);
ConstEigenVectorArrayMap<T> bias_arr(bias.data<T>(), C);
Eigen::Array<T, Eigen::Dynamic, 1> new_scale = inv_std * scale_arr;
Eigen::Array<T, Eigen::Dynamic, 1> new_bias =
bias_arr - mean_arr * inv_std * scale_arr;
switch (data_layout) {
case DataLayout::kNCHW: {
EigenArrayMap<T> y_arr(ctx.template Alloc<T>(y), sample_size, N * C);
ConstEigenArrayMap<T> x_arr(x.data<T>(), sample_size, N * C);
for (int nc = 0; nc < N * C; ++nc) {
y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C);
}
break;
}
case DataLayout::kNHWC: {
EigenArrayMap<T>(ctx.template Alloc<T>(y), C, N * sample_size) =
(ConstEigenArrayMap<T>(x.data<T>(), C, N * sample_size).colwise() *
new_scale)
.colwise() +
new_bias;
break;
}
default:
PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %d",
data_layout));
}
}
} // namespace phi
PD_REGISTER_KERNEL(
batch_norm, CPU, ALL_LAYOUT, phi::BatchNormKernel, float, double) {}
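// Per-channel form of the transform applied above, using the precomputed
// new_scale / new_bias:
//   y = (x - mean[c]) * inv_std[c] * scale[c] + bias[c]
//     = x * new_scale[c] + new_bias[c]
// Illustrative numbers: mean = 1, var = 4, eps = 1e-5, scale = 2, bias = 0.5
// give new_scale ~ 1.0 and new_bias ~ -0.5, so x = 3 maps to y ~ 2.5.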
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/gaussian_random_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/framework/generator.h"
namespace phi {
template <typename T, typename Context>
void GaussianRandomKernel(const Context& dev_ctx,
const ScalarArray& shape,
float mean,
float std,
int seed,
DataType dtype,
DenseTensor* out) {
auto tensor = out;
std::normal_distribution<T> dist(mean, std);
tensor->Resize(phi::make_ddim(shape.GetData()));
int64_t size = tensor->numel();
T* data = dev_ctx.template Alloc<T>(tensor);
auto engine = paddle::framework::GetCPURandomEngine(seed);
for (int64_t i = 0; i < size; ++i) {
data[i] = dist(*engine);
}
}
} // namespace phi
PD_REGISTER_KERNEL(gaussian_random,
CPU,
ALL_LAYOUT,
phi::GaussianRandomKernel,
float,
double) {}
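// A direct-call sketch (hypothetical host code, assuming an already
// initialized phi::CPUContext `ctx`; real callers go through the kernel
// registry):
//   phi::DenseTensor out;
//   phi::GaussianRandomKernel<float, phi::CPUContext>(
//       ctx, phi::ScalarArray(std::vector<int64_t>{2, 3}), /*mean=*/0.f,
//       /*std=*/1.f, /*seed=*/0, phi::DataType::FLOAT32, &out);
//   // out is resized to {2, 3} and filled with draws from N(0, 1).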
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/pad_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/pad_grad_kernel_impl.h"
PD_REGISTER_KERNEL(pad_grad,
CPU,
ALL_LAYOUT,
phi::PadGradKernel,
float,
double,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/pad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/pad_kernel_impl.h"
PD_REGISTER_KERNEL(pad,
CPU,
ALL_LAYOUT,
phi::PadKernel,
float,
double,
int,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
...@@ -15,21 +15,26 @@ limitations under the License. */
#pragma once
#include <utility>
#include <vector>
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
namespace phi {
namespace funcs {
template <typename T,
          size_t D,
          int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenTensor = EigenTensor<T, D, MajorType, IndexType>;
template <typename DeviceContext, typename T, size_t D>
void PadFunction(const DeviceContext& context,
                 const std::vector<int>& pads,
                 const DenseTensor& src,
                 T pad_value,
                 DenseTensor* out) {
  std::array<std::pair<int64_t, int64_t>, D> paddings;
  for (size_t i = 0; i < paddings.size(); ++i) {
...@@ -40,16 +45,16 @@ void PadFunction(const framework::ExecutionContext& context,
  auto src_tensor = EigenTensor<T, D>::From(src);
  auto out_tensor = EigenTensor<T, D>::From(*out);
  auto& place = *(context.eigen_device());
  EigenPad<std::decay_t<decltype(place)>, T, D>::Eval(
      place, out_tensor, src_tensor, paddings, pad_value);
}
template <typename DeviceContext, typename T, size_t D>
void PadGradFunction(const DeviceContext& context,
                     const std::vector<int>& pads,
                     const DenseTensor& src,
                     DenseTensor* d_out) {
  std::array<std::pair<int64_t, int64_t>, D> paddings;
  for (size_t i = 0; i < paddings.size(); ++i) {
    paddings[i].first = -pads[i * 2];
...@@ -58,16 +63,18 @@ void PadGradFunction(const framework::ExecutionContext& context,
  auto d_out_tensor = EigenTensor<T, D>::From(*d_out);
  auto src_tensor = EigenTensor<T, D>::From(src);
  auto& place = *(context.eigen_device());
  EigenPad<std::decay_t<decltype(place)>, T, D>::Eval(
      place, d_out_tensor, src_tensor, paddings, static_cast<T>(0));
}
template <typename DeviceContext, typename T>
void PaddingFunctor(int rank,
                    const DeviceContext& context,
                    const std::vector<int>& pads,
                    T pad_value,
                    const DenseTensor& src,
                    DenseTensor* out) {
  switch (rank) {
    case 1:
      PadFunction<DeviceContext, T, 1>(context, pads, src, pad_value, out);
...@@ -88,16 +95,18 @@ void PaddingFunctor(int rank, const framework::ExecutionContext& context,
      PadFunction<DeviceContext, T, 6>(context, pads, src, pad_value, out);
      break;
    default:
      PADDLE_THROW(
          phi::errors::Unimplemented("PadOp only support tensors with no more"
                                     " than 6 dimensions currently."));
  }
}
template <typename DeviceContext, typename T>
void PaddingGradFunctor(int rank,
                        const DeviceContext& context,
                        const std::vector<int>& pads,
                        const DenseTensor& src,
                        DenseTensor* out) {
  switch (rank) {
    case 1:
      PadGradFunction<DeviceContext, T, 1>(context, pads, src, out);
...@@ -118,8 +127,8 @@ void PaddingGradFunctor(int rank, const framework::ExecutionContext& context,
      PadGradFunction<DeviceContext, T, 6>(context, pads, src, out);
      break;
    default:
      PADDLE_THROW(
          phi::errors::Unimplemented("PadOp only support tensors with no more"
                                     " than 6 dimensions currently."));
  }
}
...@@ -137,6 +146,5 @@ inline bool IsSymmetricPadding(const std::vector<int>& pads,
  }
  return is_sys_pad;
}
} // namespace funcs
} // namespace phi
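// Usage sketch (assumed caller, with `ctx` an initialized phi::CPUContext,
// `src` a 4-D float DenseTensor and `out` already resized/allocated to the
// padded shape): pads holds two ints per dimension in (low, high) order, so
// padding one element on each side of the last two dims is
//   std::vector<int> pads = {0, 0, 0, 0, 1, 1, 1, 1};
//   phi::funcs::PaddingFunctor<phi::CPUContext, float>(
//       /*rank=*/4, ctx, pads, /*pad_value=*/0.f, src, &out);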
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/common/scalar_array.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/device_context.h"
namespace phi {
template <typename T, typename Context>
void GaussianRandomKernel(const Context& ctx,
const ScalarArray& shape,
float mean,
float std,
int seed,
DataType dtype,
DenseTensor* out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/batch_norm_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/fluid/operators/norm_utils.cu.h"
#include "paddle/fluid/operators/norm_utils.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/layout_utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/flags.h"
#include "paddle/phi/kernels/gpu/batch_norm_utils.h"
#ifdef __HIPCC__
#define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim)
#else
#define LAUNCH_BOUNDS(BlockDim)
#endif
DECLARE_bool(cudnn_batchnorm_spatial_persistent);
namespace phi {
template <typename T>
using CudnnDataType = paddle::platform::CudnnDataType<T>;
template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
template <typename T, int BlockDim, phi::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias(
const T *dy,
const T *x,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance,
const double epsilon,
const int N,
const int C,
const int HxW,
BatchNormParamType<T> *dscale,
BatchNormParamType<T> *dbias) {
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ds_storage;
__shared__ typename BlockReduce::TempStorage db_storage;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> inv_var_i = 1.0 / sqrt(variance[i] + epsilon);
BatchNormParamType<T> mean_i = mean[i];
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
ds_sum += static_cast<BatchNormParamType<T>>(dy[index]) *
(static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
db_sum += static_cast<BatchNormParamType<T>>(dy[index]);
}
ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
if (threadIdx.x == 0) {
dscale[i] = ds_sum * inv_var_i;
dbias[i] = db_sum;
}
__syncthreads();
}
}
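// Computes the input gradient when the statistics are treated as constants
// (e.g. the global/running statistics path):
//   dx = dy * scale[c] / sqrt(variance[c] + epsilon)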
template <typename T, phi::DataLayout layout>
static __global__ void KeBNBackwardData(const T *dy,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *variance,
const double epsilon,
const int C,
const int HxW,
const int num,
T *dx) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < num; i += stride) {
const int c = layout == phi::DataLayout::kNCHW ? i / HxW % C : i % C;
BatchNormParamType<T> inv_var = 1.0 / sqrt(variance[c] + epsilon);
dx[i] = static_cast<T>(static_cast<BatchNormParamType<T>>(dy[i]) *
scale[c] * inv_var);
}
}
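// Recovers x from y for the inplace case by inverting
//   y = scale * (x - mean) * inv_std + bias.
// Note: the `variance` argument is used directly as the inverse standard
// deviation saved in the forward pass; `epsilon` does not enter the formula.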
template <typename T>
static __global__ void KeBNRestoreData(const phi::DataLayout layout,
T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance,
double epsilon,
int C,
int M,
const int num,
const T *y) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < num; i += stride) {
const int c = layout == phi::DataLayout::kNCHW ? (i / M) % C : i % C;
auto y_i = static_cast<BatchNormParamType<T>>(y[i]);
auto x_i = (y_i - bias[c]) / scale[c] / variance[c] + mean[c];
x[i] = static_cast<T>(x_i);
}
}
template <typename T>
class InplaceHelper {
public:
void operator()(const phi::DataLayout layout,
T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance,
double epsilon,
int C,
int M,
const int num,
const T *y,
int grid2,
const int block,
const gpuStream_t &stream) {
PADDLE_ENFORCE_EQ(x,
y,
phi::errors::InvalidArgument(
"X and Y should be inplaced in inplace mode"));
KeBNRestoreData<<<grid2, block, 0, stream>>>(
layout, x, scale, bias, mean, variance, epsilon, C, M, num, y);
}
};
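// Fused batch norm backward for training: reuses the saved per-channel mean
// and inverse std when available (otherwise recomputes them), then applies
//   dscale[c] = sum(dy * (x - mean)) * inv_std
//   dbias[c]  = sum(dy)
//   dx = scale * inv_std * (dy - dbias / m - (x - mean) * inv_std * dscale / m)
// with m = N * HxW, one block per channel (grid-strided) and cub::BlockReduce.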
template <typename T, int BlockDim, phi::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward(
const T *dy,
const T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *saved_mean,
const BatchNormParamType<T> *saved_inv_variance,
const int C,
const int N,
const int HxW,
const double epsilon,
T *dx,
BatchNormParamType<T> *dscale,
BatchNormParamType<T> *dbias) {
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ds_storage;
__shared__ typename BlockReduce::TempStorage db_storage;
__shared__ typename BlockReduce::TempStorage mean_storage;
  __shared__ typename BlockReduce::TempStorage variance_storage;
__shared__ BatchNormParamType<T> inv_var_val;
__shared__ BatchNormParamType<T> mean_val;
__shared__ BatchNormParamType<T> dscale_val;
__shared__ BatchNormParamType<T> dbias_val;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
if (saved_mean && saved_inv_variance) {
if (threadIdx.x == 0) {
inv_var_val = saved_inv_variance[i];
mean_val = saved_mean[i];
}
} else {
BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> x_square_sum =
static_cast<BatchNormParamType<T>>(0);
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> x_i =
static_cast<BatchNormParamType<T>>(x[index]);
x_sum += x_i;
x_square_sum += x_i * x_i;
}
x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum());
x_square_sum =
          BlockReduce(variance_storage).Reduce(x_square_sum, cub::Sum());
if (threadIdx.x == 0) {
mean_val = x_sum / inner_size;
inv_var_val =
1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon);
}
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> dy_i =
static_cast<BatchNormParamType<T>>(dy[index]);
ds_sum +=
dy_i * (static_cast<BatchNormParamType<T>>(x[index]) - mean_val);
db_sum += dy_i;
}
ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
if (threadIdx.x == 0) {
dscale_val = ds_sum * inv_var_val;
dbias_val = db_sum;
dscale[i] = dscale_val;
dbias[i] = dbias_val;
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
dx[index] = scale[i] * inv_var_val *
(static_cast<BatchNormParamType<T>>(dy[index]) -
dbias_val / static_cast<BatchNormParamType<T>>(inner_size) -
(static_cast<BatchNormParamType<T>>(x[index]) - mean_val) *
inv_var_val * dscale_val / inner_size);
}
}
}
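// Input-gradient-only backward using saved statistics; the `variance` argument
// is used as the saved inverse standard deviation:
//   dx = scale * inv_std *
//        (dy - mean(dy) - (x - mean) * inv_std^2 * mean(dy * (x - mean)))
// where the means are taken over the N * HxW elements of each channel.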
template <typename T, int BlockDim, phi::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData(
const T *dy,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *mean,
const T *x,
const BatchNormParamType<T> *variance,
const int C,
const int N,
const int HxW,
T *dx) {
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage dy_storage;
__shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage;
__shared__ BatchNormParamType<T> dy_sum_val;
__shared__ BatchNormParamType<T> dy_x_sub_mean_sum_val;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> inv_var_i = variance[i];
BatchNormParamType<T> mean_i = mean[i];
BatchNormParamType<T> dy_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> dy_x_sub_mean_sum =
static_cast<BatchNormParamType<T>>(0);
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> dy_i =
static_cast<BatchNormParamType<T>>(dy[index]);
dy_sum += dy_i;
dy_x_sub_mean_sum +=
dy_i * (static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
}
dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
dy_x_sub_mean_sum = BlockReduce(dy_x_sub_mean_storage)
.Reduce(dy_x_sub_mean_sum, cub::Sum());
if (threadIdx.x == 0) {
dy_sum_val = dy_sum;
dy_x_sub_mean_sum_val = dy_x_sub_mean_sum;
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
dx[index] =
(static_cast<BatchNormParamType<T>>(dy[index]) -
dy_sum_val / static_cast<BatchNormParamType<T>>(inner_size) -
(static_cast<BatchNormParamType<T>>(x[index]) - mean_i) *
dy_x_sub_mean_sum_val * inv_var_i * inv_var_i / inner_size) *
scale[i] * inv_var_i;
}
}
}
template <typename T, typename Context>
void BatchNormGradRawKernel(const Context &ctx,
const DenseTensor &y_grad,
const DenseTensor &x,
const DenseTensor &scale,
const DenseTensor &bias,
const DenseTensor &saved_mean,
const DenseTensor &saved_variance,
paddle::optional<const DenseTensor &> reserve_space,
paddle::optional<const DenseTensor &> mean,
paddle::optional<const DenseTensor &> variance,
float momentum,
float epsilon_f,
const std::string &data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
bool is_inplace,
DenseTensor *x_grad,
DenseTensor *scale_grad,
DenseTensor *bias_grad) {
double epsilon = static_cast<double>(epsilon_f);
const DataLayout data_layout =
paddle::framework::StringToDataLayout(data_layout_str);
const auto *d_y = &y_grad;
auto *d_x = x_grad;
auto *d_scale = scale_grad;
auto *d_bias = bias_grad;
use_global_stats = is_test || use_global_stats;
const auto &x_dims = x.dims();
PADDLE_ENFORCE_EQ(
x_dims.size() >= 2 && x_dims.size() <= 5,
true,
phi::errors::InvalidArgument(
"The size of input's dimensions should be between 2 and 5."
"But received: the size of input's dimensions is [%d],"
"the dimensions of input is [%s]",
x_dims.size(),
x_dims));
int N, C, H, W, D;
paddle::operators::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
// init output
if (d_x) {
ctx.template Alloc<T>(d_x);
}
if (d_scale && d_bias) {
d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
}
PADDLE_ENFORCE_EQ(
scale.dims().size(),
1UL,
phi::errors::InvalidArgument(
"The size of scale's dimensions must equal to 1. But received: "
"the size of scale's dimensions is [%d], the dimensions of scale "
"is [%s].",
scale.dims().size(),
scale.dims()));
PADDLE_ENFORCE_EQ(
scale.dims()[0],
C,
phi::errors::InvalidArgument(
"The first dimension of scale must equal to Channels[%d]. But "
"received: the first dimension of scale is [%d]",
C,
scale.dims()[0]));
auto dtype = paddle::platform::CudnnDataType<T>::type;
#ifdef PADDLE_WITH_HIP
auto compute_format =
data_layout == DataLayout::kNHWC ? DataLayout::kNHWC : DataLayout::kNCHW;
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// HIP do not support compute format of NHWC
// auto compute_format = DataLayout::kNCHW;
#else
const bool fast_nhwc_batch_norm = dtype == CUDNN_DATA_HALF &&
FLAGS_cudnn_batchnorm_spatial_persistent &&
(reserve_space.get_ptr() != nullptr);
auto compute_format = fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC
? DataLayout::kNHWC
: DataLayout::kNCHW;
#endif
DenseTensor transformed_x(x.type());
DenseTensor transformed_d_y(d_y->type());
DenseTensor transformed_d_x;
if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW &&
x_dims.size() > 2) {
VLOG(3) << "Transform input tensor from NHWC to NCHW.";
ResizeToChannelFirst<Context, T>(ctx, &x, &transformed_x);
TransToChannelFirst<Context, T>(ctx, &x, &transformed_x);
ResizeToChannelFirst<Context, T>(ctx, d_y, &transformed_d_y);
TransToChannelFirst<Context, T>(ctx, d_y, &transformed_d_y);
if (d_x) {
ResizeToChannelFirst<Context, T>(ctx, d_x, &transformed_d_x);
}
} else {
transformed_x.ShareDataWith(x);
transformed_d_y.ShareDataWith(*d_y);
if (d_x) {
transformed_d_x.ShareDataWith(*d_x);
}
}
std::vector<int> dims;
std::vector<int> strides;
if (compute_format == DataLayout::kNCHW) {
dims = {N, C, H, W, D};
strides = {C * H * W * D, H * W * D, W * D, D, 1};
} else {
dims = {N, C, H, W, D};
strides = {H * W * C * D, 1, W * D * C, D * C, C};
}
const int num = transformed_x.numel();
#ifdef PADDLE_WITH_HIP
const int block = 256;
#else
const int block = 512;
#endif
int max_threads = ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
int grid1 = (num + block - 1) / block;
int grid2 = std::min(C, max_blocks);
auto stream = ctx.stream();
InplaceHelper<T> inplace_functor;
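  // Two main paths: when batch statistics are used (!use_global_stats), the
  // gradients are computed with cuDNN (or the fused BNBackward kernel on HIP)
  // when dx, dscale and dbias are all requested, and with the custom
  // BNBackwardData / KeBNBackwardScaleBias kernels otherwise. When
  // use_global_stats is set, the running statistics are treated as constants
  // and the KeBNBackward* kernels in the else-branch below are used.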
if (!use_global_stats) {
if ((N * H * W * D) == 1) {
if (d_x) {
paddle::framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
}
phi::funcs::SetConstant<Context, BatchNormParamType<T>> functor;
functor(ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
functor(ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
return;
}
// ------------------- cudnn descriptors ---------------------
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// miopenTensorDescriptor_t data_desc_;
// miopenTensorDescriptor_t bn_param_desc_;
// miopenBatchNormMode_t mode_;
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t bn_param_desc_;
cudnnBatchNormMode_t mode_;
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(
&bn_param_desc_));
#endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// mode_ = miopenBNSpatial;
#elif CUDNN_VERSION_MIN(7, 0, 1)
if (FLAGS_cudnn_batchnorm_spatial_persistent) {
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
} else if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#else
if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#endif // CUDNN_VERSION_MIN(7, 0, 1)
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
// data_desc_, CudnnDataType<T>::type,
// x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
// const_cast<int *>(strides.data())));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_,
// data_desc_, mode_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
dims.data(),
strides.data()));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDeriveBNTensorDescriptor(
bn_param_desc_, data_desc_, mode_));
#endif
const auto *saved_mean_data =
saved_mean.template data<BatchNormParamType<T>>();
const auto *saved_var_data =
saved_variance.template data<BatchNormParamType<T>>();
if (is_inplace) {
inplace_functor(compute_format,
transformed_x.data<T>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
saved_mean_data,
saved_var_data,
epsilon,
C,
H * W * D,
num,
transformed_x.data<T>(),
grid2,
block,
stream);
}
// This branch calls CUDNN APIs
if (d_x && d_scale && d_bias) {
bool called = false;
#if CUDNN_VERSION_MIN(7, 4, 1)
called = true;
size_t workspace_size = 0;
void *workspace_ptr = nullptr;
DenseTensor workspace_tensor;
auto reserve_space_size = reserve_space->memory_size();
// --------------- cudnn batchnorm workspace ---------------
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::
cudnnGetBatchNormalizationBackwardExWorkspaceSize(
/*handle=*/ctx.cudnn_handle(),
/*mode=*/mode_,
                  /*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*xDesc=*/data_desc_,
/*yDesc=*/data_desc_,
/*dyDesc=*/data_desc_,
/*dzDesc=*/nullptr,
/*dxDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/nullptr,
/*sizeInBytes=*/&workspace_size));
workspace_ptr = workspace_tensor.mutable_data(
ctx.GetPlace(), transformed_x.type(), workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationBackwardEx(
/*handle=*/ctx.cudnn_handle(),
/*mode=*/mode_,
/*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*alphaDataDiff=*/CudnnDataType<T>::kOne(),
/*betaDataDiff=*/CudnnDataType<T>::kZero(),
/*alphaParamDiff=*/CudnnDataType<T>::kOne(),
/*betaParamDiff=*/CudnnDataType<T>::kZero(),
/*xDesc=*/data_desc_,
/*xData=*/transformed_x.template data<T>(),
/*yDesc=*/nullptr,
/*yData=*/nullptr,
/*dyDesc=*/data_desc_,
/*dyData=*/transformed_d_y.template data<T>(),
/*dzDesc=*/nullptr,
/*dzData=*/nullptr,
/*dxDesc=*/data_desc_,
/*dxData=*/ctx.template Alloc<T>(&transformed_d_x),
/*dBnScaleBiasDesc=*/bn_param_desc_,
/*bnScaleData=*/scale.template data<BatchNormParamType<T>>(),
/*bnBiasData=*/nullptr,
/*dBnScaleData=*/d_scale
->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
/*dBnBiasData=*/d_bias
->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
/*epsilon=*/epsilon,
/*savedMean=*/saved_mean_data,
/*savedInvVariance=*/saved_var_data,
/*activationDesc=*/nullptr,
/*workspace=*/workspace_ptr,
/*workSpaceSizeInBytes=*/workspace_size,
/*reserveSpace=*/const_cast<T *>(
reserve_space->template data<T>()),
/*reserveSpaceSizeInBytes=*/reserve_space_size));
#endif // CUDNN_VERSION_MIN(7, 4, 1)
if (!called) {
#ifdef PADDLE_WITH_HIP
if (compute_format == DataLayout::kNCHW) {
BNBackward<T,
block,
DataLayout::kNCHW><<<grid2, block, 0, ctx.stream()>>>(
transformed_d_y.template data<T>(),
transformed_x.template data<T>(),
scale.template data<BatchNormParamType<T>>(),
saved_mean_data,
saved_var_data,
C,
N,
H * W * D,
epsilon,
transformed_d_x.template data<T>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()));
} else {
BNBackward<T,
block,
DataLayout::kNHWC><<<grid2, block, 0, ctx.stream()>>>(
transformed_d_y.template data<T>(),
transformed_x.template data<T>(),
scale.template data<BatchNormParamType<T>>(),
saved_mean_data,
saved_var_data,
C,
N,
H * W * D,
epsilon,
transformed_d_x.template data<T>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()));
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationBackward(
// dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
// CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
// CudnnDataType<T>::kZero(), data_desc_,
// transformed_x.template data<T>(), data_desc_,
// transformed_d_y.template data<T>(), data_desc_,
// transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
// bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
// d_scale->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace()),
// d_bias->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace()),
// epsilon, saved_mean_data, saved_var_data));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationBackward(
ctx.cudnn_handle(),
mode_,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
transformed_x.template data<T>(),
data_desc_,
transformed_d_y.template data<T>(),
data_desc_,
ctx.template Alloc<T>(&transformed_d_x),
bn_param_desc_,
scale.template data<BatchNormParamType<T>>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon,
saved_mean_data,
saved_var_data));
#endif
}
if (data_layout == DataLayout::kNHWC &&
compute_format == DataLayout::kNCHW) {
VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
TransToChannelLast<Context, T>(ctx, &transformed_d_x, d_x);
}
} else {
      // This branch calls custom CUDA kernels.
if (compute_format == DataLayout::kNCHW) {
if (d_x) {
BNBackwardData<
T,
block,
phi::DataLayout::kNCHW><<<grid2, block, 0, ctx.stream()>>>(
d_y->data<T>(),
scale.data<BatchNormParamType<T>>(),
saved_mean_data,
x.data<T>(),
saved_var_data,
C,
N,
H * W * D,
d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T,
block,
phi::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
d_y->data<T>(),
x.data<T>(),
saved_mean_data,
saved_var_data,
epsilon,
N,
C,
H * W * D,
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
} else {
if (d_x) {
BNBackwardData<
T,
block,
phi::DataLayout::kNHWC><<<grid2, block, 0, ctx.stream()>>>(
d_y->data<T>(),
scale.data<BatchNormParamType<T>>(),
saved_mean_data,
x.data<T>(),
saved_var_data,
C,
N,
H * W * D,
d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T,
block,
phi::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
d_y->data<T>(),
x.data<T>(),
saved_mean_data,
saved_var_data,
epsilon,
N,
C,
H * W * D,
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
}
}
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// clean when exit.
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
#else
// clean when exit.
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(
bn_param_desc_));
#endif
} else {
const auto *running_mean = mean.get_ptr();
const auto *running_var = variance.get_ptr();
const auto *running_mean_data =
running_mean->template data<BatchNormParamType<T>>();
const auto *running_var_data =
running_var->template data<BatchNormParamType<T>>();
if (is_inplace) {
auto px = x;
inplace_functor(data_layout,
ctx.template Alloc<T>(&px),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
running_mean_data,
running_var_data,
epsilon,
C,
H * W * D,
num,
x.data<T>(),
grid2,
block,
stream);
}
if (compute_format == DataLayout::kNCHW) {
if (d_x) {
KeBNBackwardData<T,
phi::DataLayout::kNCHW><<<grid1, block, 0, stream>>>(
d_y->data<T>(),
scale.data<BatchNormParamType<T>>(),
running_var_data,
epsilon,
C,
H * W,
num,
d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T,
block,
phi::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
d_y->data<T>(),
x.data<T>(),
running_mean_data,
running_var_data,
epsilon,
N,
C,
H * W * D,
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
} else {
if (d_x) {
KeBNBackwardData<T,
phi::DataLayout::kNHWC><<<grid1, block, 0, stream>>>(
d_y->data<T>(),
scale.data<BatchNormParamType<T>>(),
running_var_data,
epsilon,
C,
H * W,
num,
d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T,
block,
phi::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
d_y->data<T>(),
x.data<T>(),
running_mean_data,
running_var_data,
epsilon,
N,
C,
H * W * D,
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
}
}
}
template <typename T, typename Context>
void BatchNormGradKernel(const Context &dev_ctx,
const DenseTensor &y_grad,
const DenseTensor &x,
const DenseTensor &scale,
const DenseTensor &bias,
const DenseTensor &saved_mean,
const DenseTensor &saved_variance,
paddle::optional<const DenseTensor &> reserve_space,
paddle::optional<const DenseTensor &> mean,
paddle::optional<const DenseTensor &> variance,
float momentum,
float epsilon,
const std::string &data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor *x_grad,
DenseTensor *scale_grad,
DenseTensor *bias_grad) {
BatchNormGradRawKernel<T, Context>(dev_ctx,
y_grad,
x,
scale,
bias,
saved_mean,
saved_variance,
reserve_space,
mean,
variance,
momentum,
epsilon,
data_layout,
is_test,
use_global_stats,
trainable_statistics,
fuse_with_relu,
false,
x_grad,
scale_grad,
bias_grad);
}
template <typename T, typename Context>
void BatchNormDoubleGradKernel(const Context &ctx,
const DenseTensor &x_grad_grad,
const DenseTensor &scale_grad_grad,
const DenseTensor &bias_grad_grad,
const DenseTensor &y_grad,
const DenseTensor &x,
const DenseTensor &scale,
const DenseTensor &saved_mean,
const DenseTensor &saved_variance,
paddle::optional<const DenseTensor &> mean,
paddle::optional<const DenseTensor &> variance,
float momentum,
float epsilon,
const std::string &data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor *x_grad,
DenseTensor *scale_grad,
DenseTensor *y_grad_grad) {
PADDLE_ENFORCE_EQ(is_test,
false,
phi::errors::InvalidArgument(
"`is_test = True` CANNOT be used in train program. If "
"you want to use global status in pre_train model, "
"please set `use_global_stats = True`"));
const DataLayout data_layout =
paddle::framework::StringToDataLayout(data_layout_str);
const DenseTensor *running_mean = nullptr;
const DenseTensor *running_variance = nullptr;
if (use_global_stats) {
running_mean = mean.get_ptr();
running_variance = variance.get_ptr();
}
paddle::operators::NormDoubleGradFunctor<Context, T>(ctx,
data_layout,
&x,
&scale,
&y_grad,
&saved_mean,
&saved_variance,
running_mean,
running_variance,
epsilon,
use_global_stats,
&x_grad_grad,
&scale_grad_grad,
&bias_grad_grad,
x_grad,
scale_grad,
y_grad_grad);
}
} // namespace phi
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(batch_norm_grad,
GPU,
ALL_LAYOUT,
phi::BatchNormGradKernel,
float,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(batch_norm_grad_raw,
GPU,
ALL_LAYOUT,
phi::BatchNormGradRawKernel,
float,
phi::dtype::float16) {}
#else
PD_REGISTER_KERNEL(batch_norm_grad,
GPU,
ALL_LAYOUT,
phi::BatchNormGradKernel,
float,
double,
phi::dtype::float16) {
if (kernel_key.dtype() == phi::DataType::FLOAT16) {
kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32);
}
}
PD_REGISTER_KERNEL(batch_norm_grad_raw,
GPU,
ALL_LAYOUT,
phi::BatchNormGradRawKernel,
float,
double,
phi::dtype::float16) {
if (kernel_key.dtype() == phi::DataType::FLOAT16) {
kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32);
}
}
#endif
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(batch_norm_grad_grad,
GPU,
ALL_LAYOUT,
phi::BatchNormDoubleGradKernel,
float,
double) {}
#else
PD_REGISTER_KERNEL(batch_norm_grad_grad,
GPU,
ALL_LAYOUT,
phi::BatchNormDoubleGradKernel,
float,
double) {}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef __NVCC__
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/batch_norm_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/fluid/operators/norm_utils.cu.h"
#include "paddle/fluid/operators/norm_utils.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/layout_utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/flags.h"
#include "paddle/phi/kernels/gpu/batch_norm_utils.h"
#ifdef __HIPCC__
#define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim)
#else
#define LAUNCH_BOUNDS(BlockDim)
#endif
DECLARE_bool(cudnn_batchnorm_spatial_persistent);
namespace phi {
template <typename T>
using CudnnDataType = paddle::platform::CudnnDataType<T>;
template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
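// Inference-time batch norm using the provided (running) statistics:
//   y = scale[c] * (x - mean[c]) / sqrt(variance[c] + epsilon) + bias[c]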
template <typename T, phi::DataLayout layout>
static __global__ void BNForwardInference(const T *x,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const int C,
const int N,
const int HxW,
const double epsilon,
T *y) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
int num = N * C * HxW;
for (int i = gid; i < num; i += stride) {
const int c = layout == phi::DataLayout::kNCHW ? i / HxW % C : i % C;
BatchNormParamType<T> x_sub_mean =
static_cast<BatchNormParamType<T>>(x[i]) - mean[c];
BatchNormParamType<T> inv_var = 1 / sqrt(variance[c] + epsilon);
y[i] = static_cast<T>(scale[c] * x_sub_mean * inv_var + bias[c]);
}
}
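// Training-time batch norm: each block (grid-striding over channels) computes
// the channel mean and variance with cub::BlockReduce, updates the running
// statistics with an exponential moving average, optionally stores the batch
// mean / inverse std for the backward pass, and then normalizes:
//   y = scale * (x - mean) * inv_std + bias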
template <typename T, int BlockDim, phi::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining(
const T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const int C,
const int N,
const int HxW,
const double epsilon,
double exponentialAverageFactor,
T *y,
BatchNormParamType<T> *mean,
BatchNormParamType<T> *variance,
BatchNormParamType<T> *save_mean,
BatchNormParamType<T> *save_inv_variance) {
int outer_size = C;
int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage mean_storage;
  __shared__ typename BlockReduce::TempStorage variance_storage;
__shared__ BatchNormParamType<T> mean_val;
__shared__ BatchNormParamType<T> variance_val;
__shared__ BatchNormParamType<T> inv_var_val;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> x_square_sum = static_cast<BatchNormParamType<T>>(0);
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> x_i = static_cast<BatchNormParamType<T>>(x[index]);
x_sum += x_i;
x_square_sum += x_i * x_i;
}
x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum());
x_square_sum =
      BlockReduce(variance_storage).Reduce(x_square_sum, cub::Sum());
if (threadIdx.x == 0) {
mean_val = x_sum / inner_size;
variance_val = x_square_sum / inner_size - mean_val * mean_val;
inv_var_val = 1 / sqrt(variance_val + epsilon);
if (save_mean && save_inv_variance) {
save_mean[i] = mean_val;
save_inv_variance[i] = inv_var_val;
}
mean[i] = (1 - exponentialAverageFactor) * mean_val +
exponentialAverageFactor * mean[i];
variance[i] = (1 - exponentialAverageFactor) * variance_val +
exponentialAverageFactor * variance[i];
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> x_sub_mean =
static_cast<BatchNormParamType<T>>(x[index]) - mean_val;
y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i];
}
}
}
template <typename T, typename Context>
void BatchNormKernel(const Context &ctx,
const DenseTensor &x,
const DenseTensor &scale,
const DenseTensor &bias,
const DenseTensor &mean,
const DenseTensor &variance,
float momentum,
float epsilon_f,
const std::string &data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor *y,
DenseTensor *mean_out,
DenseTensor *variance_out,
DenseTensor *saved_mean,
DenseTensor *saved_variance,
DenseTensor *reserve_space) {
double epsilon = epsilon_f;
const bool trainable_stats = trainable_statistics;
const DataLayout data_layout =
paddle::framework::StringToDataLayout(data_layout_str);
bool test_mode = is_test && (!trainable_stats);
// Get the size for each dimension.
// NCHW [batch_size, in_channels, in_height, in_width]
const auto &x_dims = x.dims();
PADDLE_ENFORCE_EQ(
x_dims.size() >= 2 && x_dims.size() <= 5,
true,
phi::errors::InvalidArgument(
"The size of input's dimensions should be between 2 and 5"
"But received: the size of input's dimensions is [%d]",
x_dims.size()));
ctx.template Alloc<T>(y);
int N, C, H, W, D;
paddle::operators::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
auto dtype = paddle::platform::CudnnDataType<T>::type;
#ifdef PADDLE_WITH_HIP
auto compute_format =
data_layout == DataLayout::kNHWC ? DataLayout::kNHWC : DataLayout::kNCHW;
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// HIP do not support compute format of NHWC
// auto compute_format = DataLayout::kNCHW;
#else
const bool fast_nhwc_batch_norm =
test_mode ||
(dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent);
auto compute_format = fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC
? DataLayout::kNHWC
: DataLayout::kNCHW;
#endif
DenseTensor transformed_x(x.type());
DenseTensor transformed_y(y->type());
if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW &&
x_dims.size() > 2) {
VLOG(3) << "Transform input tensor from NHWC to NCHW.";
ResizeToChannelFirst<Context, T>(ctx, &x, &transformed_x);
TransToChannelFirst<Context, T>(ctx, &x, &transformed_x);
ResizeToChannelFirst<Context, T>(ctx, y, &transformed_y);
} else {
transformed_x.ShareDataWith(x);
transformed_y.ShareDataWith(*y);
}
// ------------------- cudnn descriptors ---------------------
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// miopenTensorDescriptor_t data_desc_;
// miopenTensorDescriptor_t bn_param_desc_;
// miopenBatchNormMode_t mode_;
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t bn_param_desc_;
cudnnBatchNormMode_t mode_;
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
#endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// mode_ = miopenBNSpatial;
#elif CUDNN_VERSION_MIN(7, 0, 1)
if (FLAGS_cudnn_batchnorm_spatial_persistent) {
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
} else if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#else
if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#endif // CUDNN_VERSION_MIN(7, 0, 1)
VLOG(3) << "Setting descriptors.";
std::vector<int> dims;
std::vector<int> strides;
if (compute_format == DataLayout::kNCHW) {
dims = {N, C, H, W, D};
strides = {C * H * W * D, H * W * D, W * D, D, 1};
} else {
dims = {N, C, H, W, D};
strides = {H * W * D * C, 1, W * D * C, D * C, C};
}
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
// data_desc_, CudnnDataType<T>::type,
// x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
// const_cast<int *>(strides.data())));
// Note: PERSISTENT not implemented for inference
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDeriveBNTensorDescriptor(
// bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
dims.data(),
strides.data()));
// Note: PERSISTENT not implemented for inference
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDeriveBNTensorDescriptor(
bn_param_desc_,
data_desc_,
test_mode ? CUDNN_BATCHNORM_SPATIAL : mode_));
#endif
auto handle = ctx.cudnn_handle();
// Now, depending on whether we are running test or not, we have two paths.
  // It is training mode when it's not inference AND not using pre-trained
// model.
bool training = !test_mode && !use_global_stats;
if (!training) {
    // Only in test mode do we use the input estimated mean and variance.
const auto *est_mean = &mean;
const auto *est_var = &variance;
// Run inference mode.
PADDLE_ENFORCE_EQ(
est_mean->dims().size(),
1UL,
phi::errors::InvalidArgument(
"The size of mean's dimensions must equal to 1."
"But received: the size of mean's dimensions mean is [%d],"
"the dimensions of mean is [%s].",
est_mean->dims().size(),
est_mean->dims()));
PADDLE_ENFORCE_EQ(
est_var->dims().size(),
1UL,
phi::errors::InvalidArgument(
"The size of variance's dimensions must equal to 1."
"But received: the size of variance's dimensions is [%d],"
"the dimensions of variance is [%s].",
est_var->dims().size(),
est_var->dims()));
PADDLE_ENFORCE_EQ(
est_mean->dims()[0],
C,
phi::errors::InvalidArgument(
"The first dimension of mean must equal to the number of "
"Channels, which is [%d]. But received: the first dimension"
"of mean is [%d], the dimensions of mean is [%s].",
C,
est_mean->dims()[0],
est_mean->dims()));
PADDLE_ENFORCE_EQ(
est_var->dims()[0],
C,
phi::errors::InvalidArgument(
"The first dimension of variance must equal to the number"
"of Channels, which is [%d]. But received: the first dimension of"
"variance is [%d], the dimensions of variance is [%s].",
C,
est_var->dims()[0],
est_var->dims()));
#ifdef PADDLE_WITH_HIP
const int block_size = 256;
const int grid_size = (N * C * H * W * D + block_size - 1) / block_size;
if (compute_format == DataLayout::kNCHW) {
BNForwardInference<
T,
DataLayout::kNCHW><<<grid_size, block_size, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
epsilon,
transformed_y.template data<T>());
} else {
BNForwardInference<
T,
DataLayout::kNHWC><<<grid_size, block_size, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
epsilon,
transformed_y.template data<T>());
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationForwardInference(
// handle, miopenBNSpatial,
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kOne())),
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kZero())),
// data_desc_,
// static_cast<const void *>(transformed_x.template data<T>()),
// data_desc_,
// static_cast<void *>(
// transformed_y.template mutable_data<T>(ctx.GetPlace())),
// bn_param_desc_,
// const_cast<void *>(static_cast<const void *>(
// scale->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// bias->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// est_mean->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// est_var->template data<BatchNormParamType<T>>())),
// epsilon));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationForwardInference(
handle,
// Note: PERSISTENT not implemented for inference
CUDNN_BATCHNORM_SPATIAL,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
transformed_x.template data<T>(),
data_desc_,
ctx.template Alloc<T>(&transformed_y),
bn_param_desc_,
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(),
epsilon));
#endif
} else {
    // If a MomentumTensor input is provided, its value should override
    // `momentum`; momentum is only used in this training branch.
    // TODO: port the MomentumTensor handling below to this phi kernel.
// if (ctx.HasInput("MomentumTensor")) {
// const auto *mom_tensor = MomentumTensor;
// DenseTensor mom_cpu;
// paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(),
// &mom_cpu);
// momentum = mom_cpu.data<float>()[0];
// }
// Run training mode.
// obtain running mean and running inv var, and there is no need
// to initialize them.
mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
if ((N * H * W * D) == 1) {
// Only 1 element in normalization dimension,
// skip the batch norm calculation, let y = x.
paddle::framework::TensorCopy(x, ctx.GetPlace(), y);
} else {
double this_factor = 1. - momentum;
bool called = false;
#if CUDNN_VERSION_MIN(7, 4, 1)
called = true;
size_t workspace_size = 0;
size_t reserve_space_size = 0;
void *reserve_space_ptr = nullptr;
void *workspace_ptr = nullptr;
DenseTensor workspace_tensor;
      // Create reserve space and workspace for batch norm.
      // The reserve space is created per batch_norm op and is consumed by the
      // backward pass, so it must not be a temporary tensor.
// auto *reserve_space = ctx.Output<Tensor>("ReserveSpace");
PADDLE_ENFORCE_NOT_NULL(
reserve_space,
phi::errors::NotFound(
"The argument ReserveSpace of batch_norm op is not found."));
// --------------- cudnn batchnorm workspace ---------------
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::
cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
/*handle=*/handle,
/*mode=*/mode_,
                    /*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*xDesc=*/data_desc_,
/*zDesc=*/nullptr,
/*yDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/nullptr,
/*sizeInBytes=*/&workspace_size));
// -------------- cudnn batchnorm reserve space --------------
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::
cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
/*handle=*/handle,
/*mode=*/mode_,
/*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*activationDesc=*/nullptr,
/*xDesc=*/data_desc_,
/*sizeInBytes=*/&reserve_space_size));
reserve_space_ptr = reserve_space->mutable_data(
ctx.GetPlace(), transformed_x.type(), reserve_space_size);
workspace_ptr = workspace_tensor.mutable_data(
ctx.GetPlace(), transformed_x.type(), workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
handle,
mode_,
CUDNN_BATCHNORM_OPS_BN,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
transformed_x.template data<T>(),
nullptr,
nullptr,
data_desc_,
transformed_y.template data<T>(),
bn_param_desc_,
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
this_factor,
mean_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
variance_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon,
saved_mean->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
saved_variance->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
nullptr,
workspace_ptr,
workspace_size,
reserve_space_ptr,
reserve_space_size));
#endif // CUDNN_VERSION_MIN(7, 4, 1)
if (!called) {
#ifdef PADDLE_WITH_HIP
const int num = transformed_x.numel();
const int block = 256;
const int max_threads = ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
const int grid = std::min(C, max_blocks);
if (compute_format == DataLayout::kNCHW) {
BNForwardTraining<
T,
block,
DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
epsilon,
this_factor,
transformed_y.template data<T>(),
mean_out->template data<BatchNormParamType<T>>(),
variance_out->template data<BatchNormParamType<T>>(),
saved_mean->template data<BatchNormParamType<T>>(),
saved_variance->template data<BatchNormParamType<T>>());
} else {
BNForwardTraining<
T,
block,
DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
epsilon,
this_factor,
transformed_y.template data<T>(),
mean_out->template data<BatchNormParamType<T>>(),
variance_out->template data<BatchNormParamType<T>>(),
saved_mean->template data<BatchNormParamType<T>>(),
saved_variance->template data<BatchNormParamType<T>>());
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationForwardTraining(
// handle, mode_, const_cast<void *>(static_cast<const void *>(
// CudnnDataType<T>::kOne())),
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kZero())),
// data_desc_,
// static_cast<const void *>(transformed_x.template data<T>()),
// data_desc_,
// static_cast<void *>(
// transformed_y.template mutable_data<T>(ctx.GetPlace())),
// bn_param_desc_,
// const_cast<void *>(static_cast<const void *>(
// scale->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// bias->template data<BatchNormParamType<T>>())),
// this_factor,
// static_cast<void *>(
// mean_out->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace())),
// static_cast<void *>(variance_out->template mutable_data<
// BatchNormParamType<T>>(ctx.GetPlace())),
// epsilon,
// static_cast<void *>(
// saved_mean->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace())),
// static_cast<void *>(saved_variance->template mutable_data<
// BatchNormParamType<T>>(ctx.GetPlace()))));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationForwardTraining(
handle,
mode_,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
transformed_x.template data<T>(),
data_desc_,
ctx.template Alloc<T>(&transformed_y),
bn_param_desc_,
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
this_factor,
mean_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
variance_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon,
saved_mean->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
saved_variance->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace())));
#endif
}
}
}
if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW &&
x_dims.size() > 2) {
VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
TransToChannelLast<Context, T>(ctx, &transformed_y, y);
}
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// clean when exit.
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
#else
// clean when exit.
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
#endif
}
} // namespace phi
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(batch_norm,
GPU,
ALL_LAYOUT,
phi::BatchNormKernel,
float,
phi::dtype::float16) {}
#else
PD_REGISTER_KERNEL(batch_norm,
GPU,
ALL_LAYOUT,
phi::BatchNormKernel,
float,
double,
phi::dtype::float16) {
if (kernel_key.dtype() == phi::DataType::FLOAT16) {
kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32);
}
}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
using Tensor = DenseTensor;
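// Helpers used by the GPU batch norm kernels to move tensors between
// channel-last (NHWC / NDHWC) and channel-first (NCHW / NCDHW) layouts.
// The Resize* functions only set the destination shape and allocate memory;
// the Trans* functions perform the actual transpose.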
template <typename DeviceContext, typename T>
inline void ResizeToChannelFirst(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = phi::vectorize(input->dims());
in_dims_vec[1] = input->dims()[4];
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
in_dims_vec[4] = input->dims()[3];
transformed_input->Resize(phi::make_ddim(in_dims_vec));
context.template Alloc<T>(transformed_input);
} else if (dim == 2) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = phi::vectorize(input->dims());
in_dims_vec[1] = input->dims()[3];
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
transformed_input->Resize(phi::make_ddim(in_dims_vec));
context.template Alloc<T>(transformed_input);
} else if (dim == 1) {
transformed_input->Resize(input->dims());
auto in_dims_vec = phi::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[1];
transformed_input->Resize(phi::make_ddim(in_dims_vec));
context.template Alloc<T>(transformed_input);
}
}
template <typename DeviceContext, typename T>
inline void ResizeToChannelLast(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = phi::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[4];
in_dims_vec[4] = input->dims()[1];
transformed_input->Resize(phi::make_ddim(in_dims_vec));
context.template Alloc<T>(transformed_input);
} else if (dim == 2) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = phi::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[1];
transformed_input->Resize(phi::make_ddim(in_dims_vec));
context.template Alloc<T>(transformed_input);
} else if (dim == 1) {
transformed_input->Resize(input->dims());
auto in_dims_vec = phi::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[1];
transformed_input->Resize(phi::make_ddim(in_dims_vec));
context.template Alloc<T>(transformed_input);
}
}
template <typename DeviceContext, typename T>
inline void TransToChannelFirst(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
VLOG(5) << "Why am I called?";
int dim = input->dims().size() - 2;
if (dim == 3) {
std::vector<int> axis{0, 4, 1, 2, 3};
funcs::Transpose<DeviceContext, T, 5> trans5;
trans5(context, *input, transformed_input, axis);
} else if (dim == 2) {
std::vector<int> axis{0, 3, 1, 2};
funcs::Transpose<DeviceContext, T, 4> trans4;
trans4(context, *input, transformed_input, axis);
} else if (dim == 1) {
std::vector<int> axis{0, 2, 1};
funcs::Transpose<DeviceContext, T, 3> trans3;
trans3(context, *input, transformed_input, axis);
}
}
template <typename DeviceContext, typename T>
inline void TransToChannelLast(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
std::vector<int> axis{0, 2, 3, 4, 1};
funcs::Transpose<DeviceContext, T, 5> trans5;
trans5(context, *input, transformed_input, axis);
} else if (dim == 2) {
std::vector<int> axis{0, 2, 3, 1};
funcs::Transpose<DeviceContext, T, 4> trans4;
trans4(context, *input, transformed_input, axis);
} else if (dim == 1) {
std::vector<int> axis{0, 2, 1};
funcs::Transpose<DeviceContext, T, 3> trans3;
trans3(context, *input, transformed_input, axis);
}
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/gaussian_random_kernel.h"
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/random.h>
#include <thrust/transform.h>
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/distribution_helper.h"
#include "paddle/phi/kernels/funcs/index_impl.cu.h"
#include "paddle/fluid/framework/generator.h"
DECLARE_bool(use_curand);
namespace phi {
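// Thrust functor mapping an element index n to a sample of N(mean, std):
// each call seeds a minstd_rand engine, discards n + offset values and then
// draws from a normal distribution, so the result depends only on
// (seed, offset, n) and is reproducible across launches.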
template <typename T>
struct GaussianGenerator {
T mean_, std_;
unsigned int seed_;
unsigned int offset_ = 0;
__host__ __device__ GaussianGenerator(T mean, T std, int seed)
: mean_(mean), std_(std), seed_(seed) {}
__host__ __device__ GaussianGenerator(T mean, T std, int seed, int offset)
: mean_(mean), std_(std), seed_(seed), offset_(offset) {}
__host__ __device__ T operator()(const unsigned int n) const {
thrust::minstd_rand rng;
rng.seed(seed_);
using MT = typename phi::kps::details::MPTypeTrait<T>::Type;
thrust::normal_distribution<MT> dist(mean_, std_);
unsigned int new_n = n + offset_;
rng.discard(new_n);
MT out = dist(rng);
return static_cast<T>(out);
}
};
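// GPU gaussian_random kernel: when seed == 0 a random seed is drawn from
// std::random_device. If no explicit seed was given and the default CUDA
// generator was initialized from Python, sampling goes through either the
// curand-based distribution helper (FLAGS_use_curand) or the thrust
// GaussianGenerator with an offset derived from the generator state;
// otherwise the plain seeded GaussianGenerator is used.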
template <typename T, typename Context>
void GaussianRandomKernel(const Context& dev_ctx,
const ScalarArray& shape,
float mean,
float std,
int seed,
DataType dtype,
DenseTensor* out) {
auto tensor = out;
bool seed_flag = false;
if (seed == 0) {
std::random_device rd;
seed = rd();
seed_flag = true;
}
tensor->Resize(phi::make_ddim(shape.GetData()));
T* data = dev_ctx.template Alloc<T>(tensor);
int64_t size = tensor->numel();
int device_id = dev_ctx.GetPlace().GetDeviceId();
auto gen_cuda = paddle::framework::GetDefaultCUDAGenerator(device_id);
using MT = typename phi::kps::details::MPTypeTrait<T>::Type;
if (gen_cuda->GetIsInitPy() && seed_flag) {
if (FLAGS_use_curand) {
funcs::normal_distribution<MT> dist;
funcs::normal_transform<MT> trans(mean, std);
funcs::distribution_and_transform<T>(dev_ctx, tensor, dist, trans);
} else {
auto seed_offset = gen_cuda->IncrementOffset(1);
int64_t gen_offset = size * seed_offset.second;
auto func =
GaussianGenerator<MT>(mean, std, seed_offset.first, gen_offset);
IndexKernel<T, GaussianGenerator<MT>>(dev_ctx, tensor, func);
}
} else {
auto func = GaussianGenerator<MT>(mean, std, seed);
IndexKernel<T, GaussianGenerator<MT>>(dev_ctx, tensor, func);
}
}
} // namespace phi
PD_REGISTER_KERNEL(gaussian_random,
GPU,
ALL_LAYOUT,
phi::GaussianRandomKernel,
phi::dtype::float16,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/pad_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/pad_grad_kernel_impl.h"
PD_REGISTER_KERNEL(pad_grad,
GPU,
ALL_LAYOUT,
phi::PadGradKernel,
float,
double,
phi::dtype::float16,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/complex.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/pad_kernel_impl.h"
#include "paddle/phi/kernels/pad_kernel.h"
PD_REGISTER_KERNEL(pad,
GPU,
ALL_LAYOUT,
phi::PadKernel,
float,
double,
int,
int64_t,
phi::dtype::float16,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
@@ -178,6 +178,8 @@ struct IndexCalculator {
       : dim(dim) {
     dims = details::VectorToArray<int, kMaxRank>(cal_dims);
     strides = details::VectorToArray<int, kMaxRank>(full_strides);
+    reduce_strides = details::VectorToArray<int, kMaxRank>(cal_strides);
+#ifndef PADDLE_WITH_XPU_KP
     std::vector<paddle::platform::FastDivMod> cal_divmoders;
     // fast divmod
     for (auto i : cal_strides) {
@@ -185,9 +187,22 @@ struct IndexCalculator {
     }
     divmoders = details::VectorToArray<paddle::platform::FastDivMod, kMaxRank>(
         cal_divmoders);
+#endif
   }
 
   __device__ inline int operator()(int offset) const {
+#ifdef PADDLE_WITH_XPU_KP
+    int index = 0;
+#pragma unroll
+    for (int i = 0; i < kMaxRank; ++i) {
+      if (i == dim) {
+        break;
+      }
+      index += (offset / reduce_strides[i]) * strides[dims[i]];
+      offset = offset % reduce_strides[i];
+    }
+    return index;
+#else
     int index = 0;
 #pragma unroll
     for (int i = 0; i < kMaxRank; ++i) {
@@ -199,12 +214,16 @@ struct IndexCalculator {
       offset = divmod.val[1];
     }
     return index;
+#endif
   }
 
   int dim;
   phi::Array<int, kMaxRank> dims;
   phi::Array<int, kMaxRank> strides;
+  phi::Array<int, kMaxRank> reduce_strides;
+#ifndef PADDLE_WITH_XPU2
   phi::Array<paddle::platform::FastDivMod, kMaxRank> divmoders;
+#endif
 };
 
 template <bool ReduceLastDim = false>
@@ -247,7 +266,7 @@ struct ReduceIndexMapping {
   __device__ __forceinline__ int BlockDimY() {
 #ifdef PADDLE_WITH_XPU2
-    return dim.deal_size_y;
+    return 1;
 #else
     return blockDim.y;
 #endif
@@ -454,10 +473,14 @@ struct ReduceConfig {
     bool is_last_dim =
         (rank == 2) && (reduce_rank == 1) && (reduce_dim[0] == 1);
     if (rank == reduce_rank || is_last_dim) {
+#ifdef PADDLE_WITH_XPU_KP
+      reduce_type = static_cast<int>(ReduceType::kReduceAny);
+#else
       reduce_type = static_cast<int>(ReduceType::kReduceLastDim);
+#endif
     } else if (reduce_rank == 1) {
 // ReduceFirstDim and reduceSecondDim
-#ifdef PADDLE_WITH_XPU2
+#ifdef PADDLE_WITH_XPU_KP
       if (reduce_dim[0] == 0) {
         reduce_type = static_cast<int>(ReduceType::kReduceHigherDim);
       } else {
@@ -471,6 +494,7 @@ struct ReduceConfig {
     }
   }
 
+#ifndef PADDLE_WITH_XPU_KP
   void SetBlockDimForReduceAny(dim3* block_dim, dim3* grid_dim) {
     constexpr int min_reduce_num_per_thread = 16;
     constexpr int max_reduce_num_per_thread = 256;
...@@ -569,6 +593,7 @@ struct ReduceConfig { ...@@ -569,6 +593,7 @@ struct ReduceConfig {
grid_dim->y = details::AlignUp(reduce_num, blocking_size); grid_dim->y = details::AlignUp(reduce_num, blocking_size);
} }
} }
#endif
void SetBlockDim() { void SetBlockDim() {
// init // init
...@@ -577,14 +602,14 @@ struct ReduceConfig { ...@@ -577,14 +602,14 @@ struct ReduceConfig {
dim3 block_dim(block_num, 1, 1); dim3 block_dim(block_num, 1, 1);
dim3 grid_dim(left_num, 1, 1); dim3 grid_dim(left_num, 1, 1);
blocking_size = reduce_num; blocking_size = reduce_num;
#ifdef PADDLE_WITH_XPU2 #ifdef PADDLE_WITH_XPU_KP
if (reduce_last_dim) { if (reduce_last_dim) {
block_dim.x = 128; block_dim.x = 64;
block_dim.y = reduce_num; block_dim.y = reduce_num;
grid_dim.x = 8; grid_dim.x = 1;
grid_dim.y = 1; grid_dim.y = 8;
} else { } else {
block_dim.x = 128; block_dim.x = 64;
block_dim.y = left_num; block_dim.y = left_num;
grid_dim.x = 8; grid_dim.x = 8;
grid_dim.y = 1; grid_dim.y = 1;
...@@ -661,7 +686,7 @@ __global__ void ReduceAnyKernel(const Tx* x, ...@@ -661,7 +686,7 @@ __global__ void ReduceAnyKernel(const Tx* x,
store_offset = block.BlockIdY() * left_num + left_idx; store_offset = block.BlockIdY() * left_num + left_idx;
loop_left = min(block.GetLoopSize(), left_num - left_idx); loop_left = min(block.GetLoopSize(), left_num - left_idx);
stride_left = 1; stride_left = 1;
tid = threadIdx.x; tid = THREAD_ID_X;
} else { } else {
auto block = ReduceIndexMapping<false>(dim); auto block = ReduceIndexMapping<false>(dim);
input_idx = block.BlockIdY() * block.BlockDimY(); input_idx = block.BlockIdY() * block.BlockDimY();
...@@ -672,18 +697,20 @@ __global__ void ReduceAnyKernel(const Tx* x, ...@@ -672,18 +697,20 @@ __global__ void ReduceAnyKernel(const Tx* x,
loop_left = min(block.GetLoopSize(), left_num - left_idx); loop_left = min(block.GetLoopSize(), left_num - left_idx);
stride_left = block.BlockDimX() * block.GridDimX(); stride_left = block.BlockDimX() * block.GridDimX();
store_offset = block.BlockIdY() * left_num + left_idx; store_offset = block.BlockIdY() * left_num + left_idx;
tid = threadIdx.y; tid = THREAD_ID_Y;
} }
    // calculate the offset, i.e. the address where each thread really starts.     // calculate the offset, i.e. the address where each thread really starts.
// 1. reduce for each thread // 1. reduce for each thread
MPType input_compute[REDUCE_VEC_SIZE]; MPType input_compute[REDUCE_VEC_SIZE];
Tx input_reg[REDUCE_VEC_SIZE]; Tx input_reg[REDUCE_VEC_SIZE];
int input_idx_tmp = input_idx;
for (int i = 0; i < loop_left; i += stride_left) { for (int i = 0; i < loop_left; i += stride_left) {
int input_offset = left_index_calculator(left_idx + i); int input_offset = left_index_calculator(left_idx + i);
const Tx* input = x + input_offset; const _ptr_ Tx* input = x + input_offset;
MPType reduce_var = init; MPType reduce_var = init;
// load REDUCE_VEC_SIZE data once, and then compute // load REDUCE_VEC_SIZE data once, and then compute
int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride; int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride;
input_idx = input_idx_tmp;
for (; input_idx + block_size < bound; for (; input_idx + block_size < bound;
input_idx += REDUCE_VEC_SIZE * stride) { input_idx += REDUCE_VEC_SIZE * stride) {
kps::ReadDataReduce<Tx, kps::ReadDataReduce<Tx,
...@@ -775,7 +802,7 @@ __global__ void ReduceHigherDimKernel(const Tx* x, ...@@ -775,7 +802,7 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
int loop_size = min(reduce_num - idy, blocking_size); int loop_size = min(reduce_num - idy, blocking_size);
int store_offset = block.BlockIdY() * left_num + idz * block.GridDimY(); int store_offset = block.BlockIdY() * left_num + idz * block.GridDimY();
int block_offset = idy * left_num + idz * reduce_num; int block_offset = idy * left_num + idz * reduce_num;
const Tx* input = x + block_offset; const _ptr_ Tx* input = x + block_offset;
Tx reduce_input; Tx reduce_input;
for (; idx < size; idx += stride) { for (; idx < size; idx += stride) {
MPType reduce_var = init; MPType reduce_var = init;
...@@ -838,7 +865,7 @@ static void LaunchReduceKernel(const Tx* x_data, ...@@ -838,7 +865,7 @@ static void LaunchReduceKernel(const Tx* x_data,
const ReduceOp& reducer, const ReduceOp& reducer,
const TransformOp& transform, const TransformOp& transform,
MPType init, MPType init,
gpuStream_t stream, KPStream stream,
ReduceConfig<Ty> config) { ReduceConfig<Ty> config) {
if (config.reduce_type == kReduceLastDim) { if (config.reduce_type == kReduceLastDim) {
int stride_reduce = 1; int stride_reduce = 1;
...@@ -855,13 +882,14 @@ static void LaunchReduceKernel(const Tx* x_data, ...@@ -855,13 +882,14 @@ static void LaunchReduceKernel(const Tx* x_data,
0); 0);
dim.SetRem(config.reduce_num % config.block.x, 0, 0); dim.SetRem(config.reduce_num % config.block.x, 0, 0);
#ifdef PADDLE_WITH_XPU2 #ifdef PADDLE_WITH_XPU_KP
ReduceAnyKernel<Tx, ReduceAnyKernel<Tx,
Ty, Ty,
MPType, MPType,
ReduceOp, ReduceOp,
TransformOp, TransformOp,
OneDimIndexCal><<<8, 128, stream>>>(x_data, OneDimIndexCal><<<8, 64, 0, stream>>>(
x_data,
config.output_data, config.output_data,
reducer, reducer,
transform, transform,
...@@ -910,13 +938,13 @@ static void LaunchReduceKernel(const Tx* x_data, ...@@ -910,13 +938,13 @@ static void LaunchReduceKernel(const Tx* x_data,
0); 0);
dim.SetRem(config.reduce_num % config.block.x, 0, 0); dim.SetRem(config.reduce_num % config.block.x, 0, 0);
#ifdef PADDLE_WITH_XPU2 #ifdef PADDLE_WITH_XPU_KP
ReduceAnyKernel<Tx, ReduceAnyKernel<Tx,
Ty, Ty,
MPType, MPType,
ReduceOp, ReduceOp,
TransformOp, TransformOp,
IndexCalculator><<<8, 128, stream>>>( IndexCalculator><<<8, 64, 0, stream>>>(
x_data, x_data,
config.output_data, config.output_data,
reducer, reducer,
...@@ -965,12 +993,13 @@ static void LaunchReduceKernel(const Tx* x_data, ...@@ -965,12 +993,13 @@ static void LaunchReduceKernel(const Tx* x_data,
kps::DimConfig dim = kps::DimConfig dim =
kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
dim.SetRem(config.left_num % block.x, 0, 0); dim.SetRem(config.left_num % block.x, 0, 0);
#ifdef PADDLE_WITH_XPU2 #ifdef PADDLE_WITH_XPU_KP
ReduceHigherDimKernel<Ty, ReduceHigherDimKernel<
Ty,
Ty, Ty,
MPType, MPType,
ReduceOp, ReduceOp,
kps::IdentityFunctor<Ty, MPType>><<<8, 128, stream>>>( kps::IdentityFunctor<Ty, MPType>><<<8, 64, 0, stream>>>(
config.output_data, config.output_data,
y_data, y_data,
reducer, reducer,
...@@ -1011,7 +1040,7 @@ CubTensorReduceImpl(const Tx* x_data, ...@@ -1011,7 +1040,7 @@ CubTensorReduceImpl(const Tx* x_data,
const TransformOp& transform, const TransformOp& transform,
int reduce_num, int reduce_num,
const paddle::platform::Place& place, const paddle::platform::Place& place,
gpuStream_t stream) { KPStream stream) {
auto reducer = ReduceOp<Ty>(); auto reducer = ReduceOp<Ty>();
cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(x_data, cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(x_data,
transform); transform);
...@@ -1054,7 +1083,7 @@ CubTensorReduceImpl(const Tx* x_data, ...@@ -1054,7 +1083,7 @@ CubTensorReduceImpl(const Tx* x_data,
const TransformOp& transform, const TransformOp& transform,
int reduce_num, int reduce_num,
const paddle::platform::Place& place, const paddle::platform::Place& place,
gpuStream_t stream) { KPStream stream) {
PADDLE_THROW(phi::errors::InvalidArgument( PADDLE_THROW(phi::errors::InvalidArgument(
"Tx should not be float16 when using cub::DeviceReduce::Reduce().")); "Tx should not be float16 when using cub::DeviceReduce::Reduce()."));
} }
...@@ -1068,7 +1097,7 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx, ...@@ -1068,7 +1097,7 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
phi::DenseTensor* y, phi::DenseTensor* y,
const TransformOp& transform, const TransformOp& transform,
const std::vector<int>& origin_reduce_dims, const std::vector<int>& origin_reduce_dims,
gpuStream_t stream) { KPStream stream) {
y->mutable_data<Ty>(x.place()); y->mutable_data<Ty>(x.place());
auto x_dim = phi::vectorize<int>(x.dims()); auto x_dim = phi::vectorize<int>(x.dims());
...@@ -1098,11 +1127,13 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx, ...@@ -1098,11 +1127,13 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
config.SetOutputData(y_data, x.place(), &tmp); config.SetOutputData(y_data, x.place(), &tmp);
constexpr bool kIsTxFP16 = std::is_same<Tx, phi::dtype::float16>::value; constexpr bool kIsTxFP16 = std::is_same<Tx, phi::dtype::float16>::value;
bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16; bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16;
#ifndef PADDLE_WITH_XPU_KP
if (use_cub_reduce) { if (use_cub_reduce) {
CubTensorReduceImpl<Tx, Ty, ReduceOp, TransformOp>( CubTensorReduceImpl<Tx, Ty, ReduceOp, TransformOp>(
x_data, y_data, transform, config.reduce_num, x.place(), stream); x_data, y_data, transform, config.reduce_num, x.place(), stream);
return; return;
} }
#endif
using MPType = typename kps::details::MPTypeTrait<Ty>::Type; using MPType = typename kps::details::MPTypeTrait<Ty>::Type;
auto reducer = ReduceOp<MPType>(); auto reducer = ReduceOp<MPType>();
...@@ -1124,12 +1155,13 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx, ...@@ -1124,12 +1155,13 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
config.reduce_num % config.blocking_size, config.reduce_num % config.blocking_size,
0); 0);
#ifdef PADDLE_WITH_XPU2 #ifdef PADDLE_WITH_XPU_KP
ReduceHigherDimKernel<Tx, ReduceHigherDimKernel<Tx,
Ty, Ty,
MPType, MPType,
ReduceOp<MPType>, ReduceOp<MPType>,
TransformOp><<<8, 128, stream>>>(x_data, TransformOp><<<8, 64, 0, stream>>>(
x_data,
config.output_data, config.output_data,
reducer, reducer,
transform, transform,
...@@ -1163,13 +1195,13 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx, ...@@ -1163,13 +1195,13 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
dim2.SetRem(config.left_num % config.block.x, 0, 0); dim2.SetRem(config.left_num % config.block.x, 0, 0);
#ifdef PADDLE_WITH_XPU2 #ifdef PADDLE_WITH_XPU_KP
ReduceHigherDimKernel< ReduceHigherDimKernel<
Ty, Ty,
Ty, Ty,
MPType, MPType,
ReduceOp<MPType>, ReduceOp<MPType>,
kps::IdentityFunctor<Ty, MPType>><<<8, 128, stream>>>( kps::IdentityFunctor<Ty, MPType>><<<8, 64, 0, stream>>>(
config.output_data, config.output_data,
y_data, y_data,
reducer, reducer,
...@@ -1212,7 +1244,7 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx, ...@@ -1212,7 +1244,7 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
template <typename T, template <typename T,
template <typename> class ReduceOp, template <typename> class ReduceOp,
template <typename, typename> class TransformOp> template <typename, typename> class TransformOp>
void Reduce(const GPUContext& dev_ctx, void Reduce(const KPDevice& dev_ctx,
const DenseTensor& x, const DenseTensor& x,
bool reduce_all, bool reduce_all,
const std::vector<int64_t>& dims, const std::vector<int64_t>& dims,
...@@ -1227,7 +1259,7 @@ void Reduce(const GPUContext& dev_ctx, ...@@ -1227,7 +1259,7 @@ void Reduce(const GPUContext& dev_ctx,
reduce_num *= (x.dims())[i]; reduce_num *= (x.dims())[i];
} }
gpuStream_t stream = dev_ctx.stream(); KPStream stream = dev_ctx.stream();
if (out_dtype != phi::DataType::UNDEFINED && out_dtype != x.dtype()) { if (out_dtype != phi::DataType::UNDEFINED && out_dtype != x.dtype()) {
auto tmp_tensor = phi::Cast<T>(dev_ctx, x, out_dtype); auto tmp_tensor = phi::Cast<T>(dev_ctx, x, out_dtype);
......
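The PADDLE_WITH_XPU_KP branch added to IndexCalculator above maps a flat offset in reduce space back to an offset in the full tensor: it decomposes the offset digit by digit with reduce_strides and re-projects each digit through the full-tensor strides. A standalone host-side sketch of that stride walk, with illustrative names (not a Paddle API):

// Host-side sketch of the XPU_KP index mapping in IndexCalculator:
// decompose `offset` with the reduce-dim strides, then re-project each
// digit through the full-tensor strides of the corresponding dimension.
#include <vector>

int MapReduceOffset(int offset, int dim,
                    const std::vector<int>& dims,            // reduce dims
                    const std::vector<int>& reduce_strides,  // strides in reduce space
                    const std::vector<int>& strides) {       // strides in the full tensor
  int index = 0;
  for (int i = 0; i < dim; ++i) {
    index += (offset / reduce_strides[i]) * strides[dims[i]];
    offset %= reduce_strides[i];
  }
  return index;
}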
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/padding.h"
namespace phi {
template <typename T, typename Context>
void PadGradKernel(const Context& dev_ctx,
const DenseTensor& d_out,
const std::vector<int>& paddings,
float pad_value,
DenseTensor* d_x) {
if (d_x == nullptr) {
return;
}
dev_ctx.template Alloc<T>(d_x);
int rank = d_out.dims().size();
phi::funcs::PaddingGradFunctor<Context, T>(
rank, dev_ctx, paddings, d_out, d_x);
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <utility>
#include <vector>
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/padding.h"
namespace phi {
template <typename T, typename Context>
void PadKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int>& paddings,
float pad_value,
DenseTensor* out) {
dev_ctx.template Alloc<T>(out);
int rank = x.dims().size();
funcs::PaddingFunctor<Context, T>(
rank, dev_ctx, paddings, static_cast<T>(pad_value), x, out);
}
} // namespace phi
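PadKernel above only allocates the output and forwards to funcs::PaddingFunctor, so the shape logic lives entirely in the paddings vector. Assuming the usual fluid pad convention of one (before, after) pair per dimension, the output shape follows directly; the helper below is a hedged illustration of that convention, not part of phi:

// Illustrative helper (not part of phi), assuming `paddings` holds
// (before, after) pairs per dimension:
// out_dim[i] = x_dim[i] + paddings[2 * i] + paddings[2 * i + 1].
#include <cstdint>
#include <vector>

std::vector<int64_t> PaddedDims(const std::vector<int64_t>& x_dims,
                                const std::vector<int>& paddings) {
  std::vector<int64_t> out_dims(x_dims.size());
  for (size_t i = 0; i < x_dims.size(); ++i) {
    out_dims[i] = x_dims[i] + paddings[2 * i] + paddings[2 * i + 1];
  }
  return out_dims;  // e.g. x_dims = {2, 3}, paddings = {0, 1, 2, 2} -> {3, 7}
}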
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void PadGradKernel(const Context& dev_ctx,
const DenseTensor& d_out,
const std::vector<int>& paddings,
float pad_value,
DenseTensor* d_x);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void PadKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int>& paddings,
float pad_value,
DenseTensor* out);
} // namespace phi
set(SPARSE_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils) set(SPARSE_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils math_function)
register_kernels(DEPS ${SPARSE_KERNEL_DEPS} SUB_DIR "sparse_kernel") register_kernels(DEPS ${SPARSE_KERNEL_DEPS} SUB_DIR "sparse_kernel")
...@@ -107,7 +107,9 @@ void ProductRuleBook(const Context& dev_ctx, ...@@ -107,7 +107,9 @@ void ProductRuleBook(const Context& dev_ctx,
f_calc_rulebook(nullptr); f_calc_rulebook(nullptr);
// alloc the rulebook // alloc the rulebook
rulebook->ResizeAndAllocate({3, rulebook_len}); DenseTensorMeta rulebook_meta(
DataType::INT32, {3, rulebook_len}, DataLayout::NCHW);
rulebook->set_meta(rulebook_meta);
dev_ctx.Alloc(rulebook, rulebook->dtype(), rulebook->numel() * sizeof(int)); dev_ctx.Alloc(rulebook, rulebook->dtype(), rulebook->numel() * sizeof(int));
int* rulebook_ptr = rulebook->data<int>(); int* rulebook_ptr = rulebook->data<int>();
f_calc_rulebook(rulebook_ptr); f_calc_rulebook(rulebook_ptr);
......
...@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/phi/kernels/sparse/convolution_kernel.h" #include "paddle/phi/kernels/sparse/cpu/convolution.h"
#include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/sparse/cpu/convolution.h"
namespace phi { namespace phi {
namespace sparse { namespace sparse {
...@@ -55,7 +54,6 @@ void Conv3dKernel(const Context& dev_ctx, ...@@ -55,7 +54,6 @@ void Conv3dKernel(const Context& dev_ctx,
// 1. product rulebook // 1. product rulebook
DenseTensorMeta counter_meta( DenseTensorMeta counter_meta(
DataType::INT32, {kernel_size}, DataLayout::NCHW); DataType::INT32, {kernel_size}, DataLayout::NCHW);
// DenseTensor rulebook = phi::Empty<int, Context>(dev_ctx);
DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta));
ProductRuleBook<T, Context>(dev_ctx, ProductRuleBook<T, Context>(dev_ctx,
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <thrust/execution_policy.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include "glog/logging.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/primitive/compute_primitives.h"
#include "paddle/phi/kernels/sparse/convolution_kernel.h"
namespace phi {
namespace sparse {
// TODO(zhangkaihuo) replace this kernel with KP::InitWithDataIndex
__global__ void InitByIndexKernel(const int n, int* out1, int* out2) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = tid; i < n; i += gridDim.x * blockDim.x) {
out1[i] = i;
out2[i] = i;
}
}
/**
 * @brief: update the out index and indices
 * unique_keys: saves the index of the output feature list
 * unique_values: indicates the index of each key before deduplication
 * out_indexs: indicates the position of the output index in the rulebook
 * rulebook_len: indicates the length of the rulebook
 * out_dims: indicates the output dims
 * out_indices: the indices of the output, out_indices = IndexToPoint(unique_keys)
 * rulebook_out_indexs: the output index in the rulebook
**/
__global__ void UpdateIndexKernel(const int* unique_keys,
const int* unique_values,
const int* out_indexs,
const int non_zero_num,
const int rulebook_len,
const Dims4D out_dims,
int* out_indices,
int* rulebook_out_indexs) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
const int index = unique_keys[i];
int batch, x, y, z;
IndexToPoint<Dims4D>(index, out_dims, &batch, &x, &y, &z);
// get out indices
out_indices[i] = batch;
out_indices[i + non_zero_num] = z;
out_indices[i + non_zero_num * 2] = y;
out_indices[i + non_zero_num * 3] = x;
// update rulebook
int start = unique_values[i];
int end = i == non_zero_num - 1 ? rulebook_len : unique_values[i + 1];
// max(end-start) = kernel_size
for (int j = start; j < end; j++) {
rulebook_out_indexs[out_indexs[j]] = i;
}
}
}
/**
 * @brief product rulebook
 * for input_i in x_indices:
 *   if input_i participates in the convolution calculation:
 *       infer the output_i by input_i and kernel_i
 *       save output_i
 *
 * x_indices: the indices of input features
 * x_dims: the input dims
 * kernel_dims: the kernel dims
 * out_dims: the output dims
 * non_zero_num: the number of input features
 * rulebook: the rulebook to save the kernel index, input index and output index
 * counter: saves the number of times each location in the kernel participates in
 * the calculation
**/
__global__ void ProductRuleBookKernel(const int* x_indices,
const Dims4D x_dims,
const Dims4D kernel_dims,
const Dims4D out_dims,
const int64_t non_zero_num,
const Dims4D paddings,
const Dims4D dilations,
const Dims4D strides,
int* rulebook,
int* counter) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
extern __shared__ int counter_buf[]; // kernel_size
const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1];
const int offset = kernel_size * non_zero_num;
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
counter_buf[i] = 0;
}
__syncthreads();
for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
int kernel_index = 0;
for (int kz = 0; kz < kernel_dims[1]; kz++) {
for (int ky = 0; ky < kernel_dims[2]; ky++) {
for (int kx = 0; kx < kernel_dims[3]; kx++) {
int batch = x_indices[i];
int in_z = x_indices[i + non_zero_num];
int in_y = x_indices[i + 2 * non_zero_num];
int in_x = x_indices[i + 3 * non_zero_num];
int in_i = -1, out_index = -1;
if (Check(x_dims,
kernel_dims,
paddings,
dilations,
strides,
in_x,
in_y,
in_z,
kx,
ky,
kz)) {
int out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1];
int out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2];
int out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3];
in_i = i;
out_index =
PointToIndex<Dims4D>(batch, out_x, out_y, out_z, out_dims);
atomicAdd(&counter_buf[kernel_index], 1);
}
rulebook[kernel_index * non_zero_num + i] = in_i;
rulebook[kernel_index * non_zero_num + offset + i] = out_index;
++kernel_index;
}
}
}
}
__syncthreads();
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
atomicAdd(&counter[i], counter_buf[i]);
}
}
// TODO(zhangkaihuo): After the GatherCUDAKernel is migrated to phi, replace
// this kernel with phi::GatherCUDAKernel;
// Vectorization can be used to improve read and write bandwidth
/**
* brief: gather data from params according to indices
* params: the inputs
* indices: the indices you want to gather
* output: the outputs
* index_size: the size of indices
* slice_size: slice size corresponding to each index, here is the channel size
**/
template <typename T, typename IndexT = int>
__global__ void GatherKernel(const T* params,
const IndexT* indices,
T* output,
size_t index_size,
size_t slice_size) {
CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
int64_t indices_i = i / slice_size;
int64_t slice_i = i - indices_i * slice_size; // offset inside the slice
IndexT gather_i = indices[indices_i];
int64_t params_i = gather_i * slice_size + slice_i;
*(output + i) = *(params + params_i);
}
}
/**
* brief: scatter add
* input: the inputs
* unique_value: refer to UpdateIndexKernel notes
* out_index: the output feature index
* non_zero_num: the number of output features
* rulebook_len: the length of rulebook
* channels: the output channel size
* out: the outputs
**/
template <typename T>
__global__ void ScatterKernel(const T* input,
const int* unique_value,
const int* out_index,
const int non_zero_num,
const int rulebook_len,
const int channels,
T* out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = tid; i < non_zero_num * channels; i += gridDim.x * blockDim.x) {
int indices_i = i / channels;
int channels_i = i - indices_i * channels;
int start = unique_value[indices_i];
int end = indices_i == non_zero_num - 1 ? rulebook_len
: unique_value[indices_i + 1];
// max(end-start) = kernel_size
T sum = static_cast<T>(0);
for (int j = start; j < end; j++) {
const int out_feature_i = out_index[j];
sum += input[out_feature_i * channels + channels_i];
}
out[indices_i * channels + channels_i] = sum;
}
}
// brief: calculate the distance between start and end
__global__ void DistanceKernel(const int* start,
const int* end,
int* distance) {
if (threadIdx.x == 0) {
*distance = end - start;
}
}
// For the basic algorithm, refer to convolution_kernel.cc or
// the second paper.
// example:
// 1. the rulebook:
// the kernel_index: 0, 0, 0, 1, 1, 1, 2, 2, ....
// the out_index(key): 20, 30, 33, 30, 33, 20, 25
// 2. mark the index of out_index(value): 0, 1, 2, 3, 4, 5, 6, ....
// 3. sort the (key, value)
// 4. unique the (key, value):
// unique_key: 20, 25, 30, 33
// unique_values: 0, 2, 3, 5
// the index of unique_values is: 0, 1, 2, 3
// 5. update the out_index by unique_key, unique_value and the index of
// unique_value:
// the new out_index: 0, 2, 3, 2, 3, 0, 1
template <typename T, typename Context>
int ProductRuleBook(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const DDim& out_dims,
DenseTensor* rulebook,
DenseTensor* counter_per_kernel,
DenseTensor* offsets_per_kernel,
DenseTensor* out_index,
DenseTensor* unique_key,
DenseTensor* unique_value,
SparseCooTensor* out,
std::vector<int>* h_counter,
std::vector<int>* h_offsets) {
const auto& kernel_dims = kernel.dims();
const int64_t non_zero_num = x.nnz();
const auto& non_zero_indices = x.non_zero_indices();
const int* indices_ptr = non_zero_indices.data<int>();
dev_ctx.Alloc(counter_per_kernel,
counter_per_kernel->dtype(),
sizeof(int) * counter_per_kernel->numel());
int* counter_ptr = counter_per_kernel->data<int>();
dev_ctx.Alloc(offsets_per_kernel,
offsets_per_kernel->dtype(),
sizeof(int) * offsets_per_kernel->numel());
int* offsets_ptr = offsets_per_kernel->data<int>();
int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
rulebook->ResizeAndAllocate({2, kernel_size * non_zero_num});
dev_ctx.Alloc(rulebook, rulebook->dtype(), sizeof(int) * rulebook->numel());
int* rulebook_ptr = rulebook->data<int>();
const auto x_dims = x.dims();
Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]);
Dims4D d_kernel_dims(1, kernel_dims[2], kernel_dims[1], kernel_dims[0]);
Dims4D d_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]);
Dims4D d_paddings(1, paddings[2], paddings[1], paddings[0]);
Dims4D d_strides(1, strides[2], strides[1], strides[0]);
Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]);
// 1. product rule book
phi::funcs::SetConstant<Context, int> set_zero;
set_zero(dev_ctx, counter_per_kernel, 0);
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);
ProductRuleBookKernel<<<config.block_per_grid.x,
config.thread_per_block.x,
kernel_size * sizeof(int),
dev_ctx.stream()>>>(indices_ptr,
d_x_dims,
d_kernel_dims,
d_out_dims,
non_zero_num,
d_paddings,
d_dilations,
d_strides,
rulebook_ptr,
counter_ptr);
// 2. remove -1
#ifdef PADDLE_WITH_HIP
int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()),
#else
int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()),
#endif
rulebook_ptr,
rulebook_ptr + 2 * kernel_size * non_zero_num,
-1);
#ifdef PADDLE_WITH_HIP
thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
#else
thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),
#endif
counter_ptr,
counter_ptr + kernel_size,
offsets_ptr);
#ifdef PADDLE_WITH_HIP
phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0],
counter_ptr,
kernel_size * sizeof(int),
hipMemcpyDeviceToHost,
dev_ctx.stream());
phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0],
offsets_ptr,
kernel_size * sizeof(int),
hipMemcpyDeviceToHost,
dev_ctx.stream());
#else
phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0],
counter_ptr,
kernel_size * sizeof(int),
cudaMemcpyDeviceToHost,
dev_ctx.stream());
phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0],
offsets_ptr,
kernel_size * sizeof(int),
cudaMemcpyDeviceToHost,
dev_ctx.stream());
#endif
dev_ctx.Wait();
int rulebook_len =
(*h_counter)[kernel_size - 1] + (*h_offsets)[kernel_size - 1];
  // 3. sort or merge the out index
out_index->ResizeAndAllocate({rulebook_len});
unique_value->ResizeAndAllocate({rulebook_len});
unique_key->ResizeAndAllocate({rulebook_len});
dev_ctx.Alloc(
out_index, out_index->dtype(), sizeof(int) * out_index->numel());
int* out_index_ptr = out_index->data<int>();
dev_ctx.Alloc(
unique_value, unique_value->dtype(), sizeof(int) * unique_value->numel());
int* unique_value_ptr = unique_value->data<int>();
dev_ctx.Alloc(
unique_key, unique_key->dtype(), sizeof(int) * unique_key->numel());
int* unique_key_ptr = unique_key->data<int>();
config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1);
InitByIndexKernel<<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(
rulebook_len, out_index_ptr, unique_value_ptr);
#ifdef PADDLE_WITH_HIP
phi::backends::gpu::GpuMemcpyAsync(unique_key_ptr,
rulebook_ptr + rulebook_len,
rulebook_len * sizeof(int),
hipMemcpyDeviceToDevice,
dev_ctx.stream());
#else
phi::backends::gpu::GpuMemcpyAsync(unique_key_ptr,
rulebook_ptr + rulebook_len,
rulebook_len * sizeof(int),
cudaMemcpyDeviceToDevice,
dev_ctx.stream());
#endif
  // Compared with thrust::sort_by_key, thrust::merge_by_key may achieve higher
  // performance, but thrust::merge_by_key is limited by the data size
#ifdef PADDLE_WITH_HIP
thrust::sort_by_key(thrust::hip::par.on(dev_ctx.stream()),
#else
thrust::sort_by_key(thrust::cuda::par.on(dev_ctx.stream()),
#endif
unique_key_ptr,
unique_key_ptr + rulebook_len,
out_index_ptr);
// 4. unique
thrust::pair<int*, int*> new_end =
#ifdef PADDLE_WITH_HIP
thrust::unique_by_key(thrust::hip::par.on(dev_ctx.stream()),
#else
thrust::unique_by_key(thrust::cuda::par.on(dev_ctx.stream()),
#endif
unique_key_ptr,
unique_key_ptr + rulebook_len,
unique_value_ptr);
// thrust::distance doesn't support stream parameters
// const int out_non_zero_num = thrust::distance(unique_key_ptr,
// new_end.first);
  // launch on the compute stream so it is ordered after unique_by_key above
  DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>(unique_key_ptr,
new_end.first,
rulebook_ptr + 2 * kernel_size * non_zero_num - 1);
int out_non_zero_num = 0;
#ifdef PADDLE_WITH_HIP
phi::backends::gpu::GpuMemcpyAsync(
&out_non_zero_num,
rulebook_ptr + 2 * kernel_size * non_zero_num - 1,
sizeof(int),
hipMemcpyDeviceToHost,
dev_ctx.stream());
#else
phi::backends::gpu::GpuMemcpyAsync(
&out_non_zero_num,
rulebook_ptr + 2 * kernel_size * non_zero_num - 1,
sizeof(int),
cudaMemcpyDeviceToHost,
dev_ctx.stream());
#endif
dev_ctx.Wait();
// 5. update out_indices and rulebook by unique_value_ptr
const int64_t sparse_dim = 4;
DenseTensorMeta indices_meta(
DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW);
DenseTensorMeta values_meta(
x.dtype(), {out_non_zero_num, kernel_dims[4]}, x.layout());
phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta));
phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta));
dev_ctx.Alloc(
&out_indices, out_indices.dtype(), sizeof(int) * out_indices.numel());
int* out_indices_ptr = out_indices.data<int>();
config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1);
UpdateIndexKernel<<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(unique_key_ptr,
unique_value_ptr,
out_index_ptr,
out_non_zero_num,
rulebook_len,
d_out_dims,
out_indices_ptr,
rulebook_ptr + rulebook_len);
out->SetMember(out_indices, out_values, out_dims, true);
return rulebook_len;
}
/**
* x: (N, D, H, W, C)
* kernel: (D, H, W, C, OC)
* out: (N, D, H, W, OC)
**/
template <typename T, typename Context>
void Conv3dKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
SparseCooTensor* out,
DenseTensor* rulebook) {
// update padding and dilation
  // Currently, only x.layout == NDHWC and groups == 1 are supported
// if x.layout != NDHWC then transpose(x), transpose(weight)
const auto& x_dims = x.dims();
const auto& kernel_dims = kernel.dims();
int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
DDim out_dims = {1, 1, 1, 1, 1};
GetOutShape(x_dims, kernel_dims, paddings, dilations, strides, &out_dims);
out->set_dims(out_dims);
const int in_channels = kernel_dims[3];
const int out_channels = kernel_dims[4];
std::vector<int> offsets(kernel_size + 1), h_counter(kernel_size);
// Second algorithm:
// https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf
// 1. product rulebook
DenseTensorMeta counter_meta(
DataType::INT32, {kernel_size}, DataLayout::NCHW);
DenseTensorMeta offsets_meta(
DataType::INT32, {kernel_size}, DataLayout::NCHW);
DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta));
DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(offsets_meta));
DenseTensor out_index = phi::Empty<int, Context>(dev_ctx);
DenseTensor unique_key = phi::Empty<int, Context>(dev_ctx);
DenseTensor unique_value = phi::Empty<int, Context>(dev_ctx);
int n = ProductRuleBook<T, Context>(dev_ctx,
x,
kernel,
paddings,
dilations,
strides,
out_dims,
rulebook,
&counter_per_kernel,
&offsets_per_kernel,
&out_index,
&unique_key,
&unique_value,
out,
&h_counter,
&offsets);
const int* counter_ptr = counter_per_kernel.data<int>();
  const int* offsets_ptr = offsets_per_kernel.data<int>();
// 2. gather
DenseTensorMeta in_features_meta(
x.dtype(), {n, in_channels}, DataLayout::NCHW);
DenseTensorMeta out_features_meta(
x.dtype(), {n, out_channels}, DataLayout::NCHW);
phi::DenseTensor in_features =
phi::Empty(dev_ctx, std::move(in_features_meta));
phi::DenseTensor out_features =
phi::Empty(dev_ctx, std::move(out_features_meta));
dev_ctx.Alloc(
&in_features, in_features.dtype(), sizeof(T) * in_features.numel());
T* in_features_ptr = in_features.data<T>();
dev_ctx.Alloc(
&out_features, out_features.dtype(), sizeof(T) * out_features.numel());
T* out_features_ptr = out_features.data<T>();
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1);
GatherKernel<T, int><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(x.non_zero_elements().data<T>(),
rulebook->data<int>(),
in_features_ptr,
n,
in_channels);
  // 3. call gemm for every weight
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
auto* out_values = out->mutable_non_zero_elements();
dev_ctx.Alloc(
out_values, out_values->dtype(), sizeof(T) * out_values->numel());
T* out_values_ptr = out_values->data<T>();
const T* kernel_ptr = kernel.data<T>();
for (int i = 0; i < kernel_size; i++) {
if (h_counter[i] <= 0) {
continue;
}
// call gemm: (n, in_channels) * (in_channels, out_channels)
const int M = h_counter[i];
const int K = in_channels;
const int N = out_channels;
T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels;
const T* tmp_kernel_ptr = kernel_ptr + i * K * N;
T* tmp_out_ptr = out_features_ptr + offsets[i] * out_channels;
blas.GEMM(CblasNoTrans,
CblasNoTrans,
M,
N,
K,
static_cast<T>(1),
tmp_in_ptr,
tmp_kernel_ptr,
static_cast<T>(0),
tmp_out_ptr);
}
// 4. scatter
config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, out->nnz() * out_channels, 1);
ScatterKernel<T><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(out_features_ptr,
unique_value.data<int>(),
out_index.data<int>(),
out->nnz(),
n,
out_channels,
out_values_ptr);
}
} // namespace sparse
} // namespace phi
PD_REGISTER_KERNEL(sparse_conv3d,
GPU,
ALL_LAYOUT,
phi::sparse::Conv3dKernel,
float,
double,
phi::dtype::float16) {
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
}
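Steps 3 to 5 of ProductRuleBook above (sort_by_key, unique_by_key, UpdateIndexKernel) deduplicate the flattened output coordinates and rewrite the rulebook so each entry points at a compacted output row. A host-side sketch of that remapping, using std:: algorithms instead of thrust:: and illustrative names; with the keys from the comment above, {20, 30, 33, 30, 33, 20, 25}, it reproduces the remapped out_index {0, 2, 3, 2, 3, 0, 1}:

// Host-side sketch of the sort / unique / update remapping (illustrative
// only). Keys are the flattened output coordinates, assumed non-negative.
#include <algorithm>
#include <numeric>
#include <vector>

std::vector<int> RemapOutIndex(const std::vector<int>& keys) {
  const int n = static_cast<int>(keys.size());
  std::vector<int> sorted_pos(n);  // positions into the rulebook
  std::iota(sorted_pos.begin(), sorted_pos.end(), 0);
  std::sort(sorted_pos.begin(), sorted_pos.end(),
            [&](int a, int b) { return keys[a] < keys[b]; });
  std::vector<int> remapped(n);
  int unique_id = -1;
  int prev_key = -1;  // sentinel; keys are non-negative
  for (int i = 0; i < n; ++i) {           // walk the keys in sorted order
    if (keys[sorted_pos[i]] != prev_key) {
      ++unique_id;                        // first occurrence of a new key
      prev_key = keys[sorted_pos[i]];
    }
    remapped[sorted_pos[i]] = unique_id;  // rulebook slot -> output row
  }
  return remapped;
}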
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature BatchNormOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("batch_norm",
{"X", "Scale", "Bias", "Mean", "Variance"},
{"momentum",
"epsilon",
"data_layout",
"is_test",
"use_global_stats",
"trainable_statistics",
"fuse_with_relu"},
{"Y",
"MeanOut",
"VarianceOut",
"SavedMean",
"SavedVariance",
"ReserveSpace"});
}
KernelSignature BatchNormGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"batch_norm_grad",
{GradVarName("Y"),
"X",
"Scale",
"Bias",
"SavedMean",
"SavedVariance",
"ReserveSpace",
"Mean",
"Variance"},
{"momentum",
"epsilon",
"data_layout",
"is_test",
"use_global_stats",
"trainable_statistics",
"fuse_with_relu"},
{GradVarName("X"), GradVarName("Scale"), GradVarName("Bias")});
}
KernelSignature BatchNormGradGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("batch_norm_grad_grad",
{"DDX",
"DDScale",
"DDBias",
"DY",
"X",
"Scale",
"SavedMean",
"SavedVariance",
"Mean",
"Variance"},
{"momentum",
"epsilon",
"data_layout",
"is_test",
"use_global_stats",
"trainable_statistics",
"fuse_with_relu"},
{"DX", "DScale", "DDY"});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(batch_norm, phi::BatchNormOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(batch_norm_grad,
phi::BatchNormGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(batch_norm_grad_grad,
phi::BatchNormGradGradOpArgumentMapping);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature GaussianRandomOpArgumentMapping(
const ArgumentMappingContext& ctx) {
if (ctx.InputSize("ShapeTensorList") > 0) {
return KernelSignature("gaussian_random",
{},
{"ShapeTensorList", "mean", "std", "seed", "dtype"},
{"Out"});
}
const auto& shape = paddle::any_cast<std::vector<int64_t>>(ctx.Attr("shape"));
if (ctx.HasInput("ShapeTensor") && shape.empty()) {
return KernelSignature("gaussian_random",
{},
{"ShapeTensor", "mean", "std", "seed", "dtype"},
{"Out"});
}
return KernelSignature("gaussian_random",
{},
{"shape", "mean", "std", "seed", "dtype"},
{"Out"});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(gaussian_random,
phi::GaussianRandomOpArgumentMapping);
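The mapping above prefers runtime shape inputs over the static attribute. A condensed sketch of that dispatch order (illustrative only, not a Paddle API):

// A non-empty ShapeTensorList wins, then ShapeTensor when the static
// "shape" attribute is empty, otherwise the attribute itself.
const char* ShapeArgName(bool has_shape_tensor_list,
                         bool has_shape_tensor,
                         bool shape_attr_empty) {
  if (has_shape_tensor_list) return "ShapeTensorList";
  if (has_shape_tensor && shape_attr_empty) return "ShapeTensor";
  return "shape";
}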
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature PadGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("pad_grad",
{GradVarName("Out")},
{"paddings", "pad_value"},
{GradVarName("X")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(pad_grad, phi::PadGradOpArgumentMapping);
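The *_sig.cc files above all follow the same pattern: list the fluid op's inputs, attributes, and outputs in exactly the order the phi kernel expects, then register the mapping function. A minimal sketch for a hypothetical op "my_scale" (not a real Paddle op):

// Minimal argument-mapping sketch for a hypothetical "my_scale" op.
#include "paddle/phi/core/compat/op_utils.h"

namespace phi {

KernelSignature MyScaleOpArgumentMapping(const ArgumentMappingContext& ctx) {
  return KernelSignature("my_scale", {"X"}, {"scale", "bias"}, {"Out"});
}

}  // namespace phi

PD_REGISTER_ARG_MAPPING_FN(my_scale, MyScaleOpArgumentMapping);

For grad ops, GradVarName("Out") resolves to the gradient variable paired with "Out" (conventionally the name with an "@GRAD" suffix), which is how pad_grad above maps the incoming gradient of "Out" to the outgoing gradient of "X".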
...@@ -25,3 +25,4 @@ cc_test(test_concat_api SRCS test_concat_api.cc DEPS phi_tensor phi_api phi_api_ ...@@ -25,3 +25,4 @@ cc_test(test_concat_api SRCS test_concat_api.cc DEPS phi_tensor phi_api phi_api_
cc_test(test_split_api SRCS test_split_api.cc DEPS phi_tensor phi_api phi_api_utils) cc_test(test_split_api SRCS test_split_api.cc DEPS phi_tensor phi_api phi_api_utils)
cc_test(test_data_transform SRCS test_data_transform.cc DEPS phi_tensor phi_api phi_api_utils) cc_test(test_data_transform SRCS test_data_transform.cc DEPS phi_tensor phi_api phi_api_utils)
cc_test(test_sparse_utils_api SRCS test_sparse_utils_api.cc DEPS phi_tensor phi_api phi_api_utils) cc_test(test_sparse_utils_api SRCS test_sparse_utils_api.cc DEPS phi_tensor phi_api phi_api_utils)
cc_test(test_sparse_conv_api SRCS test_sparse_conv_api.cc DEPS phi_tensor phi_api phi_api_utils)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See
the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <memory>
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/api/include/sparse_api.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
template <typename T>
void TestConv3dBase(const std::vector<int>& indices,
const std::vector<T>& features,
const phi::DDim& x_dims,
const std::vector<T>& kernel,
const phi::DDim& kernel_dims,
const std::vector<int>& correct_out_indices,
const std::vector<T>& correct_out_features,
const phi::DDim& correct_out_dims,
const int non_zero_num,
const std::vector<int>& paddings,
const std::vector<int>& strides,
const std::vector<int>& dilations,
const float diff = 1e-3) {
const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
const int in_channels = kernel_dims[3];
const int out_channels = kernel_dims[4];
phi::DenseTensor indices_tensor(
alloc.get(),
phi::DenseTensorMeta(
phi::DataType::INT32, {4, non_zero_num}, phi::DataLayout::NCHW));
memcpy(
indices_tensor.data<int>(), indices.data(), indices.size() * sizeof(int));
phi::DenseTensor features_tensor(
alloc.get(),
phi::DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
{non_zero_num, in_channels},
phi::DataLayout::NHWC));
memcpy(
features_tensor.data<T>(), features.data(), features.size() * sizeof(T));
auto x_tensor = std::make_shared<phi::SparseCooTensor>(
indices_tensor, features_tensor, x_dims);
paddle::experimental::Tensor x(x_tensor);
auto kernel_tensor = std::make_shared<phi::DenseTensor>(
alloc.get(),
phi::DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
kernel_dims,
phi::DataLayout::NHWC));
paddle::experimental::Tensor weight(kernel_tensor);
memcpy(kernel_tensor->mutable_data<T>(paddle::platform::CPUPlace()),
kernel.data(),
kernel.size() * sizeof(T));
if (!std::is_same<T, phi::dtype::float16>::value) {
auto outs = paddle::experimental::sparse::conv3d(
x, weight, paddings, dilations, strides, 1);
auto out = std::dynamic_pointer_cast<phi::SparseCooTensor>(
std::get<0>(outs).impl());
ASSERT_EQ(correct_out_dims.size(), out->dims().size());
for (int i = 0; i < correct_out_dims.size(); i++) {
ASSERT_EQ(correct_out_dims[i], out->dims()[i]);
}
ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, out->nnz());
int cmp_indices = memcmp(correct_out_indices.data(),
out->non_zero_indices().data<int>(),
correct_out_indices.size() * sizeof(int));
ASSERT_EQ(cmp_indices, 0);
for (uint64_t i = 0; i < correct_out_features.size(); i++) {
float tmp = std::fabs(static_cast<float>(
correct_out_features[i] - out->non_zero_elements().data<T>()[i]));
ASSERT_LT(tmp, diff);
}
}
}
void TestConv3d(const std::vector<int>& indices,
const std::vector<float>& features,
const phi::DDim& x_dims,
const std::vector<float>& kernel,
const phi::DDim& kernel_dims,
const std::vector<int>& correct_out_indices,
const std::vector<float>& correct_out_features,
const phi::DDim& correct_out_dims,
const int non_zero_num,
const std::vector<int>& paddings,
const std::vector<int>& strides,
const std::vector<int>& dilations) {
// test float
TestConv3dBase<float>(indices,
features,
x_dims,
kernel,
kernel_dims,
correct_out_indices,
correct_out_features,
correct_out_dims,
non_zero_num,
paddings,
strides,
dilations);
}
TEST(API, sparse_conv2d) {
const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
const int in_channels = 1;
const int out_channels = 1;
phi::DDim x_dims = {1, 1, 5, 5, in_channels};
phi::DDim kernel_dims = {1, 3, 3, in_channels, out_channels};
phi::DDim out_dims = {1, 1, 3, 3, out_channels};
std::vector<int> paddings = {0, 0, 0};
std::vector<int> strides = {1, 1, 1};
std::vector<int> dilations = {1, 1, 1};
const int non_zero_num = 3;
std::vector<int> indices_flatten = {0, 0, 0, 0, 0, 0, 0, 4, 0, 3, 2, 4};
std::vector<float> features = {-0.79394531, -0.3125, -0.55029297};
  // 1 * 3 * 3 = 9 kernel weights
std::vector<float> kernel = {0.65820312,
0.75048828,
0.21411133,
0.17370605,
0.85546875,
0.53076172,
0.28833008,
0.71044922,
0.00659943};
std::vector<int> out_indices_flatten = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 2, 2, 2, 1, 2, 0, 1, 2};
std::vector<float> out_features = {
-0.17004, -0.71338, -0.00206, -0.22205, -0.09009};
TestConv3d(indices_flatten,
features,
x_dims,
kernel,
kernel_dims,
out_indices_flatten,
out_features,
out_dims,
non_zero_num,
paddings,
strides,
dilations);
}
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <memory> #include <memory>
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
#include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h"
...@@ -151,6 +152,107 @@ void TestConv3dBase(const std::vector<int>& indices, ...@@ -151,6 +152,107 @@ void TestConv3dBase(const std::vector<int>& indices,
f_verify(grads[1].data<T>(), kernel_grad); f_verify(grads[1].data<T>(), kernel_grad);
} }
} }
// test gpu
#if defined(PADDLE_WITH_CUDA)
phi::GPUContext dev_ctx_gpu;
dev_ctx_gpu.PartialInitWithoutAllocator();
dev_ctx_gpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream())
.get());
dev_ctx_gpu.SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx_gpu.PartialInitWithAllocator();
DenseTensor d_indices_tensor = phi::Empty(
dev_ctx_gpu,
DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW));
dev_ctx_gpu.Alloc(&d_indices_tensor,
d_indices_tensor.dtype(),
sizeof(int) * d_indices_tensor.numel());
phi::Copy(
dev_ctx_gpu, indices_tensor, phi::GPUPlace(), true, &d_indices_tensor);
DenseTensor d_features_tensor = phi::Empty(
dev_ctx_gpu,
DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
{non_zero_num, in_channels},
DataLayout::NHWC));
dev_ctx_gpu.Alloc(&d_features_tensor,
d_features_tensor.dtype(),
sizeof(T) * d_features_tensor.numel());
phi::Copy(
dev_ctx_gpu, features_tensor, phi::GPUPlace(), true, &d_features_tensor);
SparseCooTensor d_x_tensor(d_indices_tensor, d_features_tensor, x_dims);
DenseTensor d_kernel_tensor = phi::Empty(
dev_ctx_gpu,
DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
kernel_dims,
DataLayout::NHWC));
dev_ctx_gpu.Alloc(&d_kernel_tensor,
d_kernel_tensor.dtype(),
sizeof(T) * d_kernel_tensor.numel());
phi::Copy(
dev_ctx_gpu, kernel_tensor, phi::GPUPlace(), true, &d_kernel_tensor);
DenseTensor d_rulebook = phi::Empty<int, phi::GPUContext>(dev_ctx_gpu);
SparseCooTensor d_out = sparse::Conv3d<T>(dev_ctx_gpu,
d_x_tensor,
d_kernel_tensor,
paddings,
dilations,
strides,
1,
&d_rulebook);
ASSERT_EQ(correct_out_dims.size(), d_out.dims().size());
ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz());
for (int i = 0; i < correct_out_dims.size(); i++) {
ASSERT_EQ(correct_out_dims[i], d_out.dims()[i]);
}
DenseTensor h_indices_tensor = phi::Empty(
dev_ctx_cpu,
DenseTensorMeta(DataType::INT32, {4, d_out.nnz()}, DataLayout::NCHW));
dev_ctx_cpu.Alloc(&h_indices_tensor,
h_indices_tensor.dtype(),
sizeof(int) * h_indices_tensor.numel());
phi::Copy(dev_ctx_gpu,
d_out.non_zero_indices(),
phi::CPUPlace(),
true,
&h_indices_tensor);
int cmp_indices2 = memcmp(correct_out_indices.data(),
h_indices_tensor.data<int>(),
correct_out_indices.size() * sizeof(int));
ASSERT_EQ(cmp_indices2, 0);
DenseTensor h_features_tensor = phi::Empty(
dev_ctx_cpu,
DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
{d_out.nnz()},
d_out.layout()));
dev_ctx_cpu.Alloc(&h_features_tensor,
h_features_tensor.dtype(),
sizeof(T) * h_features_tensor.numel());
phi::Copy(dev_ctx_gpu,
d_out.non_zero_elements(),
phi::CPUPlace(),
true,
&h_features_tensor);
for (uint64_t i = 0; i < correct_out_features.size(); i++) {
float tmp = std::fabs(static_cast<float>(correct_out_features[i] -
h_features_tensor.data<T>()[i]));
ASSERT_LT(tmp, diff);
}
#endif
} }
void TestConv3d(const std::vector<int>& indices, void TestConv3d(const std::vector<int>& indices,
......
...@@ -1430,6 +1430,22 @@ class Fleet(object): ...@@ -1430,6 +1430,22 @@ class Fleet(object):
# cache original feed forward program # cache original feed forward program
self.origin_main_program = loss.block.program self.origin_main_program = loss.block.program
# add distributed attr
if not hasattr(self.origin_main_program, "distributed_info_"):
setattr(self.origin_main_program, "distributed_info_", dict())
self.origin_main_program.distributed_info_[
"dp_degree"] = self._user_defined_strategy.sharding_configs[
"dp_degree"]
self.origin_main_program.distributed_info_[
"mp_degree"] = self._user_defined_strategy.sharding_configs[
"mp_degree"]
self.origin_main_program.distributed_info_[
"pp_degree"] = self._user_defined_strategy.sharding_configs[
"pp_degree"]
self.origin_main_program.distributed_info_[
"sharding_degree"] = self._user_defined_strategy.sharding_configs[
"sharding_degree"]
context["origin_main_program"] = self.origin_main_program context["origin_main_program"] = self.origin_main_program
context["loss"] = loss context["loss"] = loss
if startup_program == None: if startup_program == None:
......
...@@ -351,10 +351,10 @@ endif() ...@@ -351,10 +351,10 @@ endif()
set_tests_properties(test_graph PROPERTIES TIMEOUT 120) set_tests_properties(test_graph PROPERTIES TIMEOUT 120)
set_tests_properties(test_quantization_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_quantization_pass PROPERTIES TIMEOUT 120)
set_tests_properties(test_imperative_qat_channelwise PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_qat_channelwise PROPERTIES TIMEOUT 200)
set_tests_properties(test_user_defined_quantization PROPERTIES TIMEOUT 120) set_tests_properties(test_user_defined_quantization PROPERTIES TIMEOUT 200)
set_tests_properties(test_imperative_qat PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_qat PROPERTIES TIMEOUT 200)
set_tests_properties(test_imperative_out_scale PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_out_scale PROPERTIES TIMEOUT 200)
if(LINUX AND WITH_MKLDNN) if(LINUX AND WITH_MKLDNN)
set_tests_properties(test_quant2_int8_mobilenetv1_mkldnn PROPERTIES TIMEOUT 120) set_tests_properties(test_quant2_int8_mobilenetv1_mkldnn PROPERTIES TIMEOUT 120)
set_tests_properties(convert_model2dot_ernie PROPERTIES TIMEOUT 120) set_tests_properties(convert_model2dot_ernie PROPERTIES TIMEOUT 120)
......
...@@ -26,7 +26,7 @@ import paddle.fluid as fluid ...@@ -26,7 +26,7 @@ import paddle.fluid as fluid
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.optimizer import AdamOptimizer from paddle.fluid.optimizer import AdamOptimizer
from paddle.fluid.framework import IrGraph from paddle.fluid.framework import IrGraph, _test_eager_guard
from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware
from paddle.fluid.dygraph.container import Sequential from paddle.fluid.dygraph.container import Sequential
from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
...@@ -122,7 +122,7 @@ class ImperativeLenet(fluid.dygraph.Layer): ...@@ -122,7 +122,7 @@ class ImperativeLenet(fluid.dygraph.Layer):
class TestImperativeOutSclae(unittest.TestCase): class TestImperativeOutSclae(unittest.TestCase):
def test_out_scale_acc(self): def func_out_scale_acc(self):
seed = 1000 seed = 1000
lr = 0.001 lr = 0.001
...@@ -166,9 +166,14 @@ class TestImperativeOutSclae(unittest.TestCase): ...@@ -166,9 +166,14 @@ class TestImperativeOutSclae(unittest.TestCase):
loss_list[i] > loss_list[i + 1], loss_list[i] > loss_list[i + 1],
msg='Failed to do the imperative qat.') msg='Failed to do the imperative qat.')
def test_out_scale_acc(self):
with _test_eager_guard():
self.func_out_scale_acc()
self.func_out_scale_acc()
class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase): class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase):
def test_save_quantized_model(self): def func_save_quantized_model(self):
lr = 0.001 lr = 0.001
load_param_path = "test_save_quantized_model/lenet.pdparams" load_param_path = "test_save_quantized_model/lenet.pdparams"
...@@ -206,6 +211,11 @@ class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase): ...@@ -206,6 +211,11 @@ class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase):
loss_list[i] > loss_list[i + 1], loss_list[i] > loss_list[i + 1],
msg='Failed to do the imperative qat.') msg='Failed to do the imperative qat.')
def test_save_quantized_model(self):
with _test_eager_guard():
self.func_save_quantized_model()
self.func_save_quantized_model()
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
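The func_* / test_* split introduced here (and repeated in the quantization tests below) runs every case twice: once inside the eager guard and once in the default dygraph mode. A minimal standalone sketch of the pattern, assuming only _test_eager_guard:

import unittest
from paddle.fluid.framework import _test_eager_guard

class ExamplePattern(unittest.TestCase):
    def func_case(self):
        self.assertTrue(True)       # the real assertions live here

    def test_case(self):
        with _test_eager_guard():   # first pass: eager mode
            self.func_case()
        self.func_case()            # second pass: legacy dygraph mode

if __name__ == '__main__':
    unittest.main()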
...@@ -29,6 +29,7 @@ import paddle.fluid as fluid ...@@ -29,6 +29,7 @@ import paddle.fluid as fluid
from paddle.fluid.contrib.slim.quantization import * from paddle.fluid.contrib.slim.quantization import *
from paddle.fluid.log_helper import get_logger from paddle.fluid.log_helper import get_logger
from paddle.dataset.common import download from paddle.dataset.common import download
from paddle.fluid.framework import _test_eager_guard
from imperative_test_utils import fix_model_dict, ImperativeLenet, ImperativeLinearBn from imperative_test_utils import fix_model_dict, ImperativeLenet, ImperativeLinearBn
from imperative_test_utils import ImperativeLinearBn_hook from imperative_test_utils import ImperativeLinearBn_hook
...@@ -194,7 +195,7 @@ class TestImperativePTQ(unittest.TestCase): ...@@ -194,7 +195,7 @@ class TestImperativePTQ(unittest.TestCase):
break break
return top1_correct_num / total_num return top1_correct_num / total_num
def test_ptq(self): def func_ptq(self):
start_time = time.time() start_time = time.time()
self.set_vars() self.set_vars()
...@@ -244,9 +245,14 @@ class TestImperativePTQ(unittest.TestCase): ...@@ -244,9 +245,14 @@ class TestImperativePTQ(unittest.TestCase):
end_time = time.time() end_time = time.time()
print("total time: %ss \n" % (end_time - start_time)) print("total time: %ss \n" % (end_time - start_time))
def test_ptq(self):
with _test_eager_guard():
self.func_ptq()
self.func_ptq()
class TestImperativePTQfuse(TestImperativePTQ): class TestImperativePTQfuse(TestImperativePTQ):
def test_ptq(self): def func_ptq(self):
start_time = time.time() start_time = time.time()
self.set_vars() self.set_vars()
...@@ -305,6 +311,11 @@ class TestImperativePTQfuse(TestImperativePTQ): ...@@ -305,6 +311,11 @@ class TestImperativePTQfuse(TestImperativePTQ):
end_time = time.time() end_time = time.time()
print("total time: %ss \n" % (end_time - start_time)) print("total time: %ss \n" % (end_time - start_time))
def test_ptq(self):
with _test_eager_guard():
self.func_ptq()
self.func_ptq()
class TestImperativePTQHist(TestImperativePTQ): class TestImperativePTQHist(TestImperativePTQ):
def set_vars(self): def set_vars(self):
......
...@@ -32,7 +32,7 @@ from paddle.nn import Linear, Conv2D, Softmax, Conv2DTranspose ...@@ -32,7 +32,7 @@ from paddle.nn import Linear, Conv2D, Softmax, Conv2DTranspose
from paddle.fluid.log_helper import get_logger from paddle.fluid.log_helper import get_logger
from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
from paddle.nn.quant.quant_layers import QuantizedConv2D, QuantizedConv2DTranspose from paddle.nn.quant.quant_layers import QuantizedConv2D, QuantizedConv2DTranspose
from paddle.fluid.framework import _test_eager_guard
from imperative_test_utils import fix_model_dict, ImperativeLenet from imperative_test_utils import fix_model_dict, ImperativeLenet
paddle.enable_static() paddle.enable_static()
...@@ -55,7 +55,7 @@ class TestImperativeQat(unittest.TestCase): ...@@ -55,7 +55,7 @@ class TestImperativeQat(unittest.TestCase):
self.activation_quantize_type = 'moving_average_abs_max' self.activation_quantize_type = 'moving_average_abs_max'
print('weight_quantize_type', self.weight_quantize_type) print('weight_quantize_type', self.weight_quantize_type)
def test_qat(self): def func_qat(self):
self.set_vars() self.set_vars()
imperative_qat = ImperativeQuantAware( imperative_qat = ImperativeQuantAware(
...@@ -193,6 +193,11 @@ class TestImperativeQat(unittest.TestCase): ...@@ -193,6 +193,11 @@ class TestImperativeQat(unittest.TestCase):
np.allclose(after_save, before_save.numpy()), np.allclose(after_save, before_save.numpy()),
msg='Failed to save the inference quantized model.') msg='Failed to save the inference quantized model.')
def test_qat(self):
with _test_eager_guard():
self.func_qat()
self.func_qat()
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -27,7 +27,7 @@ import paddle.fluid as fluid ...@@ -27,7 +27,7 @@ import paddle.fluid as fluid
from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware
from paddle.fluid.log_helper import get_logger from paddle.fluid.log_helper import get_logger
from paddle.dataset.common import download from paddle.dataset.common import download
from paddle.fluid.framework import _test_eager_guard
from imperative_test_utils import fix_model_dict, ImperativeLenet from imperative_test_utils import fix_model_dict, ImperativeLenet
os.environ["CPU_NUM"] = "1" os.environ["CPU_NUM"] = "1"
......
...@@ -30,7 +30,7 @@ from paddle.fluid.dygraph import Pool2D ...@@ -30,7 +30,7 @@ from paddle.fluid.dygraph import Pool2D
from paddle.fluid.dygraph import Linear from paddle.fluid.dygraph import Linear
from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose
from paddle.fluid.log_helper import get_logger from paddle.fluid.log_helper import get_logger
from paddle.fluid.framework import _test_eager_guard
os.environ["CPU_NUM"] = "1" os.environ["CPU_NUM"] = "1"
_logger = get_logger( _logger = get_logger(
...@@ -157,7 +157,7 @@ class TestUserDefinedActPreprocess(unittest.TestCase): ...@@ -157,7 +157,7 @@ class TestUserDefinedActPreprocess(unittest.TestCase):
_logger.info("test act_preprocess") _logger.info("test act_preprocess")
self.imperative_qat = ImperativeQuantAware(act_preprocess_layer=PACT) self.imperative_qat = ImperativeQuantAware(act_preprocess_layer=PACT)
def test_quant_aware_training(self): def func_quant_aware_training(self):
imperative_qat = self.imperative_qat imperative_qat = self.imperative_qat
seed = 1 seed = 1
np.random.seed(seed) np.random.seed(seed)
...@@ -243,6 +243,11 @@ class TestUserDefinedActPreprocess(unittest.TestCase): ...@@ -243,6 +243,11 @@ class TestUserDefinedActPreprocess(unittest.TestCase):
train(lenet) train(lenet)
test(lenet) test(lenet)
def test_quant_aware_training(self):
with _test_eager_guard():
self.func_quant_aware_training()
self.func_quant_aware_training()
class TestUserDefinedWeightPreprocess(TestUserDefinedActPreprocess): class TestUserDefinedWeightPreprocess(TestUserDefinedActPreprocess):
def setUp(self): def setUp(self):
......
...@@ -32,6 +32,7 @@ from paddle.fluid.dygraph.nn import Pool2D ...@@ -32,6 +32,7 @@ from paddle.fluid.dygraph.nn import Pool2D
from paddle.fluid.log_helper import get_logger from paddle.fluid.log_helper import get_logger
from imperative_test_utils import fix_model_dict, train_lenet, ImperativeLenetWithSkipQuant from imperative_test_utils import fix_model_dict, train_lenet, ImperativeLenetWithSkipQuant
from paddle.fluid.framework import _test_eager_guard
os.environ["CPU_NUM"] = "1" os.environ["CPU_NUM"] = "1"
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
...@@ -42,7 +43,8 @@ _logger = get_logger( ...@@ -42,7 +43,8 @@ _logger = get_logger(
class TestImperativeOutSclae(unittest.TestCase): class TestImperativeOutSclae(unittest.TestCase):
def test_out_scale_acc(self): def func_out_scale_acc(self):
paddle.disable_static()
seed = 1000 seed = 1000
lr = 0.1 lr = 0.1
...@@ -125,6 +127,11 @@ class TestImperativeOutSclae(unittest.TestCase): ...@@ -125,6 +127,11 @@ class TestImperativeOutSclae(unittest.TestCase):
if find_matmul: if find_matmul:
self.assertTrue(matmul_skip_count == 1) self.assertTrue(matmul_skip_count == 1)
def test_out_scale_acc(self):
with _test_eager_guard():
self.func_out_scale_acc()
self.func_out_scale_acc()
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -155,8 +155,7 @@ def prune_model(main_program=None, ...@@ -155,8 +155,7 @@ def prune_model(main_program=None,
n=2, n=2,
m=4, m=4,
mask_algo='mask_1d', mask_algo='mask_1d',
with_mask=True, with_mask=True):
sharding=False):
r""" r"""
Pruning parameters of supported layers in :attr:`main_program` via Pruning parameters of supported layers in :attr:`main_program` via
specified mask generation function given by :attr:`mask_algo`. This specified mask generation function given by :attr:`mask_algo`. This
...@@ -179,7 +178,6 @@ def prune_model(main_program=None, ...@@ -179,7 +178,6 @@ def prune_model(main_program=None,
mask_algo (string, optional): The function name to generate sparse mask. Default is `mask_1d`. mask_algo (string, optional): The function name to generate sparse mask. Default is `mask_1d`.
The valid inputs should be one of 'mask_1d', 'mask_2d_greedy' and 'mask_2d_best'. The valid inputs should be one of 'mask_1d', 'mask_2d_greedy' and 'mask_2d_best'.
with_mask (bool, optional): Whether to also prune the mask Variables related to the parameters. True means they are pruned as well, False means they are not. Default is True. with_mask (bool, optional): Whether to also prune the mask Variables related to the parameters. True means they are pruned as well, False means they are not. Default is True.
sharding (bool, optional): Whether to turn on sharding (model parallel) during training. Please consider turning it ON when encountering OOM using sharding. Default is False.
Returns: Returns:
dictionary: A dictionary with key: `parameter name` (string) and value: its corresponding mask Variable. dictionary: A dictionary with key: `parameter name` (string) and value: its corresponding mask Variable.
Examples: Examples:
...@@ -221,7 +219,10 @@ def prune_model(main_program=None, ...@@ -221,7 +219,10 @@ def prune_model(main_program=None,
# Must call `exe.run(startup_program)` first before calling `sparsity.prune_model` # Must call `exe.run(startup_program)` first before calling `sparsity.prune_model`
sparsity.prune_model(main_program, mask_algo='mask_2d_best') sparsity.prune_model(main_program, mask_algo='mask_2d_best')
""" """
if sharding: if main_program is not None and hasattr(
main_program,
"distributed_info_") and main_program.distributed_info_[
"sharding_degree"] > 1 and paddle.fluid.is_compiled_with_cuda():
gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
place = paddle.CUDAPlace(gpu_id) place = paddle.CUDAPlace(gpu_id)
else: else:
......
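prune_model no longer takes a `sharding` argument; whether to pick the sharded GPU place is derived from the metadata recorded on the program. A hedged sketch of the equivalent predicate (the non-sharding branch is elided in the hunk above, so only the condition is mirrored here):

import os
import paddle

def use_sharded_gpu_place(main_program):
    # mirrors the new check: sharding degree recorded on the program and a CUDA build
    info = getattr(main_program, "distributed_info_", None)
    return (info is not None and info.get("sharding_degree", 0) > 1
            and paddle.fluid.is_compiled_with_cuda())

# callers simply drop the old flag, e.g. sparsity.prune_model(train_prog);
# when the predicate holds, pruning runs on
# paddle.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0)))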
...@@ -99,18 +99,19 @@ def param_guard(parameters): ...@@ -99,18 +99,19 @@ def param_guard(parameters):
yield yield
def _convert_into_variable(var_base): def _convert_into_variable(tensor):
""" """
Convert Varbase into Variable. Convert Varbase into Variable.
""" """
if isinstance(var_base, core.VarBase): if isinstance(tensor, (core.eager.Tensor, core.VarBase)):
# Check whether has been created before. # Check whether has been created before.
new_var = var_base.block._find_var_recursive(var_base.name) new_var = tensor.block._find_var_recursive(tensor.name)
if new_var is not None: if new_var is not None:
assert isinstance(new_var, framework.Variable) assert isinstance(new_var, framework.Variable)
# Convert ParamBase into Parameter with same attributes in dy2stat. # Convert ParamBase into Parameter with same attributes in dy2stat.
elif isinstance(var_base, framework.ParamBase): elif isinstance(tensor,
new_var = var_base._to_static_var(to_parameter=True) (framework.EagerParamBase, framework.ParamBase)):
new_var = tensor._to_static_var(to_parameter=True)
else: else:
# Note(Aurelius84): Convert VarBase in self._buffers into Variable with # Note(Aurelius84): Convert VarBase in self._buffers into Variable with
# same attributes and set persistable=True to allow saving this var. # same attributes and set persistable=True to allow saving this var.
...@@ -120,13 +121,13 @@ def _convert_into_variable(var_base): ...@@ -120,13 +121,13 @@ def _convert_into_variable(var_base):
# But if its shape is empty while created from `create_variable()`, we consider this buffer # But if its shape is empty while created from `create_variable()`, we consider this buffer
# non-persistable. See case of `drop_state` in lstm api. # non-persistable. See case of `drop_state` in lstm api.
is_persistable = len(var_base.shape) > 0 is_persistable = len(tensor.shape) > 0
new_var = var_base._to_static_var( new_var = tensor._to_static_var(
to_parameter=False, persistable=is_persistable) to_parameter=False, persistable=is_persistable)
return new_var return new_var
else: else:
return var_base return tensor
def enabled(): def enabled():
......
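The same widening of isinstance checks recurs throughout this commit: wherever the legacy VarBase or ParamBase was accepted, the eager Tensor and EagerParamBase are now accepted as well. A hedged sketch of the pattern (the eager types are only present in builds that ship the eager mode):

from paddle.fluid import core, framework

# accept both the legacy dygraph types and their eager counterparts
TENSOR_TYPES = (core.VarBase, core.eager.Tensor)
PARAM_TYPES = (framework.ParamBase, framework.EagerParamBase)

def is_dygraph_tensor(obj):
    return isinstance(obj, TENSOR_TYPES)

def is_dygraph_param(obj):
    return isinstance(obj, PARAM_TYPES)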
...@@ -61,7 +61,8 @@ class NestSequence(object): ...@@ -61,7 +61,8 @@ class NestSequence(object):
def _get_var_ids(self): def _get_var_ids(self):
var_ids = [] var_ids = []
for idx, var in enumerate(self.__input_list): for idx, var in enumerate(self.__input_list):
if isinstance(var, (framework.Variable, core.VarBase)): if isinstance(var, (framework.Variable, core.VarBase,
core.eager.Tensor)):
var_ids.append(idx) var_ids.append(idx)
return var_ids return var_ids
...@@ -73,7 +74,8 @@ class NestSequence(object): ...@@ -73,7 +74,8 @@ class NestSequence(object):
if need_check: if need_check:
warning_types = set() warning_types = set()
for var in self.__input_list: for var in self.__input_list:
if not isinstance(var, (framework.Variable, core.VarBase)): if not isinstance(var, (framework.Variable, core.VarBase,
core.eager.Tensor)):
warning_types.add(type(var)) warning_types.add(type(var))
if warning_types: if warning_types:
logging_utils.warn( logging_utils.warn(
...@@ -301,10 +303,17 @@ class PartialProgramLayer: ...@@ -301,10 +303,17 @@ class PartialProgramLayer:
for name in block.vars: for name in block.vars:
if "@GRAD" in name: if "@GRAD" in name:
var_desc = block.vars[name].desc var_desc = block.vars[name].desc
var_base = None
if not core._in_eager_mode():
var_base = core.VarBase(var_desc.dtype(), var_base = core.VarBase(var_desc.dtype(),
var_desc.shape(), var_desc.shape(),
var_desc.name(), var_desc.name(),
var_desc.type(), False) var_desc.type(), False)
else:
var_base = core.eager.Tensor(var_desc.dtype(),
var_desc.shape(),
var_desc.name(),
var_desc.type(), False)
double_grads.append(var_base) double_grads.append(var_base)
return self._valid_vars(double_grads) return self._valid_vars(double_grads)
...@@ -386,13 +395,22 @@ class PartialProgramLayer: ...@@ -386,13 +395,22 @@ class PartialProgramLayer:
expected_place = framework._current_expected_place() expected_place = framework._current_expected_place()
for i, value in enumerate(flatten_inputs): for i, value in enumerate(flatten_inputs):
if isinstance(value, np.ndarray): if isinstance(value, np.ndarray):
var = None
if not core._in_eager_mode():
var = core.VarBase( var = core.VarBase(
value=value, value=value,
name=self._inputs[i].desc.name(), name=self._inputs[i].desc.name(),
persistable=False, persistable=False,
place=expected_place, place=expected_place,
zero_copy=True) zero_copy=True)
elif isinstance(value, core.VarBase): else:
var = core.eager.Tensor(
value=value,
name=self._inputs[i].desc.name(),
persistable=False,
place=expected_place,
zero_copy=True)
elif isinstance(value, (core.VarBase, core.eager.Tensor)):
# NOTE(Aurelius84): If var is on CPUPlace, it will be transformed multi times # NOTE(Aurelius84): If var is on CPUPlace, it will be transformed multi times
# into CUDAPlace when it's as input of multi Ops. so we move it in advance # into CUDAPlace when it's as input of multi Ops. so we move it in advance
# to avoid this problem. # to avoid this problem.
...@@ -411,9 +429,16 @@ class PartialProgramLayer: ...@@ -411,9 +429,16 @@ class PartialProgramLayer:
var = self._outputs[var_id] var = self._outputs[var_id]
assert isinstance(var, framework.Variable) assert isinstance(var, framework.Variable)
var_desc = var.desc var_desc = var.desc
var_base = None
if not core._in_eager_mode():
var_base = core.VarBase(var_desc.dtype(), var_base = core.VarBase(var_desc.dtype(),
var_desc.shape(), var_desc.shape(),
var_desc.name(), var_desc.type(), False) var_desc.name(), var_desc.type(), False)
else:
var_base = core.eager.Tensor(var_desc.dtype(),
var_desc.shape(),
var_desc.name(),
var_desc.type(), False)
return var_base return var_base
# Create VarBase to receive output data. # Create VarBase to receive output data.
...@@ -423,9 +448,16 @@ class PartialProgramLayer: ...@@ -423,9 +448,16 @@ class PartialProgramLayer:
def _create_scope_vec(self): def _create_scope_vec(self):
# Hold forward variables # Hold forward variables
tmp_scope_vec = None
if not core._in_eager_mode():
tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [], tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [],
"program_out_scope", "program_out_scope",
core.VarDesc.VarType.STEP_SCOPES, True) core.VarDesc.VarType.STEP_SCOPES, True)
# TODO(jiabin): Support this later.
# else:
# tmp_scope_vec = core.eager.Tensor(core.VarDesc.VarType.FP32, [],
# "program_out_scope",
# core.VarDesc.VarType.STEP_SCOPES, True)
inner_scope = core.Scope() inner_scope = core.Scope()
tmp_scope_vec.value().set_scope(inner_scope) tmp_scope_vec.value().set_scope(inner_scope)
...@@ -450,7 +482,8 @@ class PartialProgramLayer: ...@@ -450,7 +482,8 @@ class PartialProgramLayer:
return main_program.clone(for_test=True) return main_program.clone(for_test=True)
def _is_no_value(self, var): def _is_no_value(self, var):
if isinstance(var, core.VarBase) and var.shape == [1]: if isinstance(var,
(core.VarBase, core.eager.Tensor)) and var.shape == [1]:
# NOTE: .numpy() will insert MemcpySync operation, it hits performance. # NOTE: .numpy() will insert MemcpySync operation, it hits performance.
if var.numpy()[0] == RETURN_NO_VALUE_MAGIC_NUM: if var.numpy()[0] == RETURN_NO_VALUE_MAGIC_NUM:
return True return True
...@@ -460,7 +493,7 @@ class PartialProgramLayer: ...@@ -460,7 +493,7 @@ class PartialProgramLayer:
""" """
Removes invalid value for various-length return statement Removes invalid value for various-length return statement
""" """
if isinstance(out_vars, core.VarBase): if isinstance(out_vars, (core.VarBase, core.eager.Tensor)):
if self._is_no_value(out_vars): if self._is_no_value(out_vars):
return None return None
return out_vars return out_vars
...@@ -527,7 +560,7 @@ class PartialProgramLayer: ...@@ -527,7 +560,7 @@ class PartialProgramLayer:
param_and_buffer_names_set = set() param_and_buffer_names_set = set()
for i, var in enumerate(self._params): for i, var in enumerate(self._params):
# self._params contains parameters and buffers with persistable=True. # self._params contains parameters and buffers with persistable=True.
if not isinstance(var, core.VarBase): if not isinstance(var, (core.VarBase, core.eager.Tensor)):
raise TypeError( raise TypeError(
'Type of self._params[{}] in PartialProgramLayer should be Parameter or Variable, but received {}.'. 'Type of self._params[{}] in PartialProgramLayer should be Parameter or Variable, but received {}.'.
format(i, type(var))) format(i, type(var)))
...@@ -559,10 +592,18 @@ def _create_fake_var(): ...@@ -559,10 +592,18 @@ def _create_fake_var():
""" """
Create a fake_var (force on CPU) to handle empty input or output Create a fake_var (force on CPU) to handle empty input or output
""" """
if not core._in_eager_mode():
return [ return [
core.VarBase(core.VarDesc.VarType.FP32, [], "Fake_var", core.VarBase(core.VarDesc.VarType.FP32, [], "Fake_var",
core.VarDesc.VarType.RAW, False) core.VarDesc.VarType.RAW, False)
] ]
else:
return []
# TODO(jiabin): Support this later
# return [
# core.eager.Tensor(core.VarDesc.VarType.FP32, [], "Fake_var",
# core.VarDesc.VarType.RAW, False)
# ]
def partial_program_from(concrete_program): def partial_program_from(concrete_program):
......
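The branches above all construct the same object from a var_desc and differ only in which constructor they call; a compact sketch of that choice as a helper (it mirrors the code shown, it is not part of the commit):

from paddle.fluid import core

def make_var_from_desc(var_desc, persistable=False):
    # pick the constructor matching the current mode
    ctor = core.eager.Tensor if core._in_eager_mode() else core.VarBase
    return ctor(var_desc.dtype(), var_desc.shape(), var_desc.name(),
                var_desc.type(), persistable)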
...@@ -25,7 +25,7 @@ import threading ...@@ -25,7 +25,7 @@ import threading
import six import six
import paddle import paddle
from paddle.fluid import core from paddle.fluid import core, dygraph
from paddle.fluid.compiler import BuildStrategy, CompiledProgram, ExecutionStrategy from paddle.fluid.compiler import BuildStrategy, CompiledProgram, ExecutionStrategy
from paddle.fluid.data_feeder import check_type from paddle.fluid.data_feeder import check_type
from paddle.fluid.layers.utils import flatten, pack_sequence_as from paddle.fluid.layers.utils import flatten, pack_sequence_as
...@@ -898,6 +898,7 @@ def save(layer, path, input_spec=None, **configs): ...@@ -898,6 +898,7 @@ def save(layer, path, input_spec=None, **configs):
state_var_dict[var.name] = var state_var_dict[var.name] = var
# 3. share parameters from Layer to scope & record var info # 3. share parameters from Layer to scope & record var info
with dygraph.guard():
for param_or_buffer in concrete_program.parameters: for param_or_buffer in concrete_program.parameters:
# share to scope # share to scope
if param_or_buffer.type == core.VarDesc.VarType.VOCAB: if param_or_buffer.type == core.VarDesc.VarType.VOCAB:
...@@ -915,12 +916,14 @@ def save(layer, path, input_spec=None, **configs): ...@@ -915,12 +916,14 @@ def save(layer, path, input_spec=None, **configs):
if param_or_buffer.name not in extra_var_info: if param_or_buffer.name not in extra_var_info:
extra_info_dict = dict() extra_info_dict = dict()
if param_or_buffer.name in state_names_dict: if param_or_buffer.name in state_names_dict:
extra_info_dict['structured_name'] = state_names_dict[ extra_info_dict[
'structured_name'] = state_names_dict[
param_or_buffer.name] param_or_buffer.name]
extra_info_dict[ extra_info_dict[
'stop_gradient'] = param_or_buffer.stop_gradient 'stop_gradient'] = param_or_buffer.stop_gradient
if isinstance(param_or_buffer, ParamBase): if isinstance(param_or_buffer, ParamBase):
extra_info_dict['trainable'] = param_or_buffer.trainable extra_info_dict[
'trainable'] = param_or_buffer.trainable
extra_var_info[param_or_buffer.name] = extra_info_dict extra_var_info[param_or_buffer.name] = extra_info_dict
# 4. build input & output of save_infernece_model # 4. build input & output of save_infernece_model
......
...@@ -94,7 +94,7 @@ def monkey_patch_varbase(): ...@@ -94,7 +94,7 @@ def monkey_patch_varbase():
# Note: getattr(self, attr, None) will call x.grad=x.gradient(), but gradient() is only available in dygraph. # Note: getattr(self, attr, None) will call x.grad=x.gradient(), but gradient() is only available in dygraph.
# It will fail. So, for properties that differ between dynamic and static graph, we should not use getattr(self, attr, None). # It will fail. So, for properties that differ between dynamic and static graph, we should not use getattr(self, attr, None).
attr_not_need_keys = ['grad', 'T'] attr_not_need_keys = ['grad', 'T']
if isinstance(self, ParamBase): if isinstance(self, (ParamBase, EagerParamBase)):
attr_kwargs = self.__dict__.copy() attr_kwargs = self.__dict__.copy()
else: else:
attr_names = [] attr_names = []
...@@ -111,7 +111,7 @@ def monkey_patch_varbase(): ...@@ -111,7 +111,7 @@ def monkey_patch_varbase():
attr_kwargs.update(kwargs) attr_kwargs.update(kwargs)
if to_parameter or isinstance(self, ParamBase): if to_parameter or isinstance(self, (ParamBase, EagerParamBase)):
del attr_kwargs['persistable'] del attr_kwargs['persistable']
# NOTE(Aurelius84): All parameters should be placed into global block. # NOTE(Aurelius84): All parameters should be placed into global block.
attr_kwargs['block'] = attr_kwargs['block'].program.global_block() attr_kwargs['block'] = attr_kwargs['block'].program.global_block()
......
...@@ -1821,7 +1821,7 @@ def _pack_loaded_dict(load_obj): ...@@ -1821,7 +1821,7 @@ def _pack_loaded_dict(load_obj):
@static_only @static_only
def _legacy_save(param_dict, model_path, protocol=2): def _legacy_save(param_dict, model_path, protocol=2):
def get_tensor(var): def get_tensor(var):
if isinstance(var, core.VarBase): if isinstance(var, (core.VarBase, core.eager.Tensor)):
return var.numpy() return var.numpy()
elif isinstance(var, core.LoDTensor): elif isinstance(var, core.LoDTensor):
return np.array(var) return np.array(var)
......
...@@ -10148,6 +10148,9 @@ def flatten(x, axis=1, name=None): ...@@ -10148,6 +10148,9 @@ def flatten(x, axis=1, name=None):
check_variable_and_dtype( check_variable_and_dtype(
x, 'x', ['float32', 'float64', 'int8', 'int32', 'int64', 'uint8'], x, 'x', ['float32', 'float64', 'int8', 'int32', 'int64', 'uint8'],
'flatten') 'flatten')
if in_dygraph_mode():
return _C_ops.flatten2(x, 'axis', axis)[0]
helper = LayerHelper('flatten', **locals()) helper = LayerHelper('flatten', **locals())
if not (isinstance(x, Variable)): if not (isinstance(x, Variable)):
......
...@@ -663,6 +663,8 @@ def assign(input, output=None): ...@@ -663,6 +663,8 @@ def assign(input, output=None):
}) })
if is_inplace and in_dygraph_mode(): if is_inplace and in_dygraph_mode():
# TODO(jiabin): Remove this when we support inplace
if not core._in_eager_mode():
output._bump_inplace_version() output._bump_inplace_version()
return output return output
......
...@@ -98,7 +98,7 @@ class TestFleetWithASPSharding(unittest.TestCase): ...@@ -98,7 +98,7 @@ class TestFleetWithASPSharding(unittest.TestCase):
feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place) feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place)
exe.run(startup_prog) exe.run(startup_prog)
sparsity.prune_model(train_prog, sharding=True) sparsity.prune_model(train_prog)
data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1))) data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1)))
exe.run(train_prog, feed=feeder.feed([data])) exe.run(train_prog, feed=feeder.feed([data]))
......
...@@ -520,6 +520,7 @@ def predict_static(args, data): ...@@ -520,6 +520,7 @@ def predict_static(args, data):
paddle.enable_static() paddle.enable_static()
exe = fluid.Executor(args.place) exe = fluid.Executor(args.place)
# load inference model # load inference model
[inference_program, feed_target_names, [inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model( fetch_targets] = fluid.io.load_inference_model(
args.model_save_dir, args.model_save_dir,
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import random
import numpy as np
import os
import shutil
import paddle
from paddle.fluid import core
import datetime
from datetime import timedelta
import paddle.fluid.core as core
from paddle.fluid.framework import _test_eager_guard
from paddle.fluid.dygraph.parallel import ParallelEnv
class TestProcessGroupFp32(unittest.TestCase):
def setUp(self):
paddle.seed(2022)
random.seed(2022)
np.random.seed(2022)
self.config()
def config(self):
self.dtype = "float32"
self.shape = (2, 10, 5)
def test_create_process_group_gloo(self):
with _test_eager_guard():
nranks = ParallelEnv().nranks
rank = ParallelEnv().local_rank
is_master = True if rank == 0 else False
store = paddle.fluid.core.TCPStore("127.0.0.1", 6172, is_master,
nranks, datetime.timedelta(0))
gloo_store = paddle.fluid.core.GlooStore(store)
opt = paddle.fluid.core.GlooOptions()
pg = paddle.fluid.core.ProcessGroupGloo(gloo_store, rank, nranks)
# test allreduce sum
# rank 0
paddle.device.set_device('cpu')
x = np.random.random(self.shape).astype(self.dtype)
tensor_x = paddle.to_tensor(x)
# rank 1
y = np.random.random(self.shape).astype(self.dtype)
tensor_y = paddle.to_tensor(y)
sum_result = x + y
if rank == 0:
task = pg.allreduce(tensor_x)
task.wait()
assert np.array_equal(tensor_x, sum_result)
else:
task = pg.allreduce(tensor_y)
task.wait()
assert np.array_equal(tensor_y, sum_result)
print("test allreduce sum api ok")
# test allreduce max
# rank 0
x = np.random.random(self.shape).astype(self.dtype)
tensor_x = paddle.to_tensor(x)
# rank 1
y = np.random.random(self.shape).astype(self.dtype)
tensor_y = paddle.to_tensor(y)
max_result = paddle.maximum(tensor_x, tensor_y)
if rank == 0:
task = pg.allreduce(tensor_x, core.ReduceOp.MAX)
task.wait()
assert np.array_equal(tensor_x, max_result)
else:
task = pg.allreduce(tensor_y, core.ReduceOp.MAX)
task.wait()
assert np.array_equal(tensor_y, max_result)
print("test allreduce max api ok")
# test broadcast
# rank 0
x = np.random.random(self.shape).astype(self.dtype)
tensor_x = paddle.to_tensor(x)
# rank 1
y = np.random.random(self.shape).astype(self.dtype)
tensor_y = paddle.to_tensor(y)
broadcast_result = paddle.assign(tensor_x)
if rank == 0:
task = pg.broadcast(tensor_x, 0)
task.synchronize()
assert task.is_completed()
assert np.array_equal(broadcast_result, tensor_x)
else:
task = pg.broadcast(tensor_y, 0)
task.synchronize()
assert task.is_completed()
assert np.array_equal(broadcast_result, tensor_y)
print("test broadcast api ok")
if __name__ == "__main__":
unittest.main()
...@@ -144,23 +144,109 @@ class TestProcessGroupFp32(unittest.TestCase): ...@@ -144,23 +144,109 @@ class TestProcessGroupFp32(unittest.TestCase):
print("test barrier api ok\n") print("test barrier api ok\n")
# test send/recv # test allgather
# rank 0 # rank 0
x = np.random.random(self.shape).astype(self.dtype) x = np.random.random(self.shape).astype(self.dtype)
y = np.random.random(self.shape).astype(self.dtype)
tensor_x = paddle.to_tensor(x) tensor_x = paddle.to_tensor(x)
tensor_y = paddle.to_tensor(y)
out_shape = list(self.shape)
out_shape[0] *= 2
out = np.random.random(out_shape).astype(self.dtype)
tensor_out = paddle.to_tensor(out)
if pg.rank() == 0: if pg.rank() == 0:
task = pg.send(tensor_x, dst=1) task = pg.all_gather(tensor_x, tensor_out)
task.wait() task.wait()
paddle.device.cuda.synchronize() paddle.device.cuda.synchronize()
# rank 1 # rank 1
else: else:
task = pg.all_gather(tensor_y, tensor_out)
task.wait()
paddle.device.cuda.synchronize()
out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2])
out_2 = paddle.slice(tensor_out, [0], [out_shape[0] // 2],
[out_shape[0]])
assert np.array_equal(tensor_x, out_1)
assert np.array_equal(tensor_y, out_2)
print("test allgather api ok\n")
# test alltoall
# rank 0
x = np.random.random(self.shape).astype(self.dtype)
y = np.random.random(self.shape).astype(self.dtype) y = np.random.random(self.shape).astype(self.dtype)
out1 = np.random.random(self.shape).astype(self.dtype)
out2 = np.random.random(self.shape).astype(self.dtype)
tensor_x = paddle.to_tensor(x)
tensor_y = paddle.to_tensor(y) tensor_y = paddle.to_tensor(y)
task = pg.recv(tensor_y, src=0) tensor_out1 = paddle.to_tensor(out1)
tensor_out2 = paddle.to_tensor(out2)
raw_tensor_x_2 = paddle.slice(tensor_x, [0], [self.shape[0] // 2],
[self.shape[0]])
raw_tensor_y_1 = paddle.slice(tensor_y, [0], [0],
[self.shape[0] // 2])
if pg.rank() == 0:
task = pg.alltoall(tensor_x, tensor_out1)
task.wait() task.wait()
paddle.device.cuda.synchronize() paddle.device.cuda.synchronize()
assert np.array_equal(tensor_x, tensor_y) # rank 1
print("test send/recv api ok\n") else:
task = pg.alltoall(tensor_y, tensor_out2)
task.wait()
paddle.device.cuda.synchronize()
out1_2 = paddle.slice(tensor_out1, [0], [self.shape[0] // 2],
[self.shape[0]])
out2_1 = paddle.slice(tensor_out2, [0], [0], [self.shape[0] // 2])
if pg.rank() == 0:
assert np.array_equal(out1_2.numpy(), raw_tensor_y_1.numpy())
else:
assert np.array_equal(out2_1, raw_tensor_x_2)
print("test alltoall api ok\n")
# test Reduce
# rank 0
x = np.random.random(self.shape).astype(self.dtype)
y = np.random.random(self.shape).astype(self.dtype)
tensor_x = paddle.to_tensor(x)
tensor_y = paddle.to_tensor(y)
sum_result = tensor_x + tensor_y
if pg.rank() == 0:
task = pg.reduce(tensor_x, 0)
task.wait()
paddle.device.cuda.synchronize()
# rank 1
else:
task = pg.reduce(tensor_y, 0)
task.wait()
paddle.device.cuda.synchronize()
if pg.rank() == 0:
assert np.array_equal(tensor_x, sum_result)
print("test reduce sum api ok\n")
# test Scatter
# rank 0
in_shape = list(self.shape)
in_shape[0] *= 2
x = np.random.random(in_shape).astype(self.dtype)
y = np.random.random(self.shape).astype(self.dtype)
tensor_x = paddle.to_tensor(x)
tensor_y = paddle.to_tensor(y)
if pg.rank() == 0:
task = pg.scatter(tensor_x, tensor_y, 0)
task.wait()
paddle.device.cuda.synchronize()
# rank 1
else:
task = pg.scatter(tensor_x, tensor_y, 0)
task.wait()
paddle.device.cuda.synchronize()
out1 = paddle.slice(tensor_x, [0], [0], [self.shape[0]])
out2 = paddle.slice(tensor_x, [0], [self.shape[0]],
[self.shape[0] * 2])
if pg.rank() == 0:
assert np.array_equal(tensor_y, out1)
else:
assert np.array_equal(tensor_y, out2)
print("test scatter api ok\n")
class TestProcessGroupFp16(TestProcessGroupFp32): class TestProcessGroupFp16(TestProcessGroupFp32):
......
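For two ranks, the allgather check above expects the output tensor to stack both ranks' inputs along axis 0, which is why out_shape[0] is doubled before the call. A small NumPy sketch of the expected layout:

import numpy as np

def expected_allgather(x_rank0, x_rank1):
    # gathered result: rank-0 slice first, then rank-1 slice, so dim 0 doubles
    return np.concatenate([x_rank0, x_rank1], axis=0)

x = np.ones((2, 10, 5), dtype='float32')
y = np.zeros((2, 10, 5), dtype='float32')
out = expected_allgather(x, y)
assert out.shape[0] == 2 * x.shape[0]
assert np.array_equal(out[:2], x) and np.array_equal(out[2:], y)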
...@@ -162,6 +162,7 @@ class TestIRPassBase(unittest.TestCase): ...@@ -162,6 +162,7 @@ class TestIRPassBase(unittest.TestCase):
for k, v in self.get_strategy().items(): for k, v in self.get_strategy().items():
setattr(build_strategy, k, v) setattr(build_strategy, k, v)
self.check_before_applied(main2, startup2) self.check_before_applied(main2, startup2)
apply_build_strategy(main2, startup2, build_strategy, apply_build_strategy(main2, startup2, build_strategy,
{"use_cuda": self.use_cuda}) {"use_cuda": self.use_cuda})
self.check_after_applied(main2, startup2) self.check_after_applied(main2, startup2)
......
...@@ -320,7 +320,7 @@ class TestBatchNormOpInference(unittest.TestCase): ...@@ -320,7 +320,7 @@ class TestBatchNormOpInference(unittest.TestCase):
def test_check_output(self): def test_check_output(self):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
for place in places: for place in places:
...@@ -342,13 +342,13 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): ...@@ -342,13 +342,13 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference):
def test_check_output(self): def test_check_output(self):
places = [] places = []
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
if core.is_float16_supported(place): if core.is_float16_supported(place):
places.append(place) places.append(place)
for place in places: for place in places:
for data_format in ["NCHW", "NHWC"]: #for data_format in ["NCHW", "NHWC"]:
for data_format in ["NCHW"]:
self.check_with_place(place, data_format, self.dtype, self.check_with_place(place, data_format, self.dtype,
[2, 3, 4, 5]) [2, 3, 4, 5])
self.check_with_place(place, data_format, self.dtype, [2, 3]) self.check_with_place(place, data_format, self.dtype, [2, 3])
...@@ -517,7 +517,7 @@ class TestBatchNormOpTraining(unittest.TestCase): ...@@ -517,7 +517,7 @@ class TestBatchNormOpTraining(unittest.TestCase):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
for place in places: for place in places:
...@@ -657,7 +657,7 @@ class TestDygraphBatchNormAPIError(unittest.TestCase): ...@@ -657,7 +657,7 @@ class TestDygraphBatchNormAPIError(unittest.TestCase):
class TestDygraphBatchNormTrainableStats(unittest.TestCase): class TestDygraphBatchNormTrainableStats(unittest.TestCase):
def test_dygraph(self): def test_dygraph(self):
places = [fluid.CPUPlace()] places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0)) places.append(fluid.CUDAPlace(0))
for p in places: for p in places:
shape = [4, 10, 4, 4] shape = [4, 10, 4, 4]
...@@ -678,7 +678,7 @@ class TestDygraphBatchNormTrainableStats(unittest.TestCase): ...@@ -678,7 +678,7 @@ class TestDygraphBatchNormTrainableStats(unittest.TestCase):
def test_static(self): def test_static(self):
places = [fluid.CPUPlace()] places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0)) places.append(fluid.CUDAPlace(0))
for p in places: for p in places:
exe = fluid.Executor(p) exe = fluid.Executor(p)
...@@ -716,4 +716,6 @@ class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase): ...@@ -716,4 +716,6 @@ class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase):
if __name__ == '__main__': if __name__ == '__main__':
import paddle
paddle.enable_static()
unittest.main() unittest.main()
...@@ -28,7 +28,7 @@ import paddle ...@@ -28,7 +28,7 @@ import paddle
class TestBatchNorm(unittest.TestCase): class TestBatchNorm(unittest.TestCase):
def test_name(self): def test_name(self):
places = [fluid.CPUPlace()] places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0)) places.append(fluid.CUDAPlace(0))
for p in places: for p in places:
with fluid.dygraph.guard(p): with fluid.dygraph.guard(p):
...@@ -36,7 +36,7 @@ class TestBatchNorm(unittest.TestCase): ...@@ -36,7 +36,7 @@ class TestBatchNorm(unittest.TestCase):
def test_error(self): def test_error(self):
places = [fluid.CPUPlace()] places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0)) places.append(fluid.CUDAPlace(0))
for p in places: for p in places:
#paddle.disable_static() #paddle.disable_static()
...@@ -83,7 +83,7 @@ class TestBatchNorm(unittest.TestCase): ...@@ -83,7 +83,7 @@ class TestBatchNorm(unittest.TestCase):
def test_dygraph(self): def test_dygraph(self):
places = [fluid.CPUPlace()] places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0)) places.append(fluid.CUDAPlace(0))
for p in places: for p in places:
shape = [4, 10, 4, 4] shape = [4, 10, 4, 4]
...@@ -135,7 +135,7 @@ class TestBatchNorm(unittest.TestCase): ...@@ -135,7 +135,7 @@ class TestBatchNorm(unittest.TestCase):
def test_static(self): def test_static(self):
places = [fluid.CPUPlace()] places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0)) places.append(fluid.CUDAPlace(0))
for p in places: for p in places:
exe = fluid.Executor(p) exe = fluid.Executor(p)
...@@ -177,7 +177,7 @@ class TestBatchNormChannelLast(unittest.TestCase): ...@@ -177,7 +177,7 @@ class TestBatchNormChannelLast(unittest.TestCase):
else: else:
paddle.set_default_dtype("float64") paddle.set_default_dtype("float64")
self.places = [fluid.CPUPlace()] self.places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
self.places.append(fluid.CUDAPlace(0)) self.places.append(fluid.CUDAPlace(0))
def tearDown(self): def tearDown(self):
...@@ -247,7 +247,7 @@ class TestBatchNormChannelLast(unittest.TestCase): ...@@ -247,7 +247,7 @@ class TestBatchNormChannelLast(unittest.TestCase):
class TestBatchNormUseGlobalStats(unittest.TestCase): class TestBatchNormUseGlobalStats(unittest.TestCase):
def setUp(self): def setUp(self):
self.places = [fluid.CPUPlace()] self.places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
self.places.append(fluid.CUDAPlace(0)) self.places.append(fluid.CUDAPlace(0))
self.init_test() self.init_test()
...@@ -300,4 +300,6 @@ class TestBatchNormUseGlobalStatsCase3(TestBatchNormUseGlobalStats): ...@@ -300,4 +300,6 @@ class TestBatchNormUseGlobalStatsCase3(TestBatchNormUseGlobalStats):
if __name__ == '__main__': if __name__ == '__main__':
import paddle
paddle.enable_static()
unittest.main() unittest.main()
...@@ -22,6 +22,9 @@ class TestProcessGroup(TestMultipleGpus): ...@@ -22,6 +22,9 @@ class TestProcessGroup(TestMultipleGpus):
def test_process_group_nccl(self): def test_process_group_nccl(self):
self.run_mnist_2gpu('process_group_nccl.py') self.run_mnist_2gpu('process_group_nccl.py')
def test_process_group_gloo(self):
self.run_mnist_2gpu('process_group_gloo.py')
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -16,6 +16,7 @@ from __future__ import print_function ...@@ -16,6 +16,7 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
import paddle
import paddle import paddle
import paddle.fluid.core as core import paddle.fluid.core as core
...@@ -1001,4 +1002,5 @@ create_test_cudnn_channel_last_fp16_class( ...@@ -1001,4 +1002,5 @@ create_test_cudnn_channel_last_fp16_class(
TestWithDilation_AsyPadding, grad_check=False) TestWithDilation_AsyPadding, grad_check=False)
if __name__ == '__main__': if __name__ == '__main__':
paddle.enable_static()
unittest.main() unittest.main()
...@@ -771,13 +771,13 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase): ...@@ -771,13 +771,13 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase):
self.assertTrue(np.array_equal(egr_tensor.numpy(), ori_arr)) self.assertTrue(np.array_equal(egr_tensor.numpy(), ori_arr))
ori_place = egr_tensor.place ori_place = egr_tensor.place
new_arr = np.random.rand(4, 4, 16, 32).astype('float32') new_arr = np.random.rand(4, 16, 16, 32).astype('float32')
self.assertFalse(np.array_equal(egr_tensor.numpy(), new_arr)) self.assertFalse(np.array_equal(egr_tensor.numpy(), new_arr))
egr_tensor._set_value(new_arr) egr_tensor.set_value(new_arr)
self.assertEqual(egr_tensor.stop_gradient, True) self.assertEqual(egr_tensor.stop_gradient, True)
self.assertTrue(egr_tensor.place._equals(ori_place)) self.assertTrue(egr_tensor.place._equals(ori_place))
self.assertEqual(egr_tensor.shape, [4, 4, 16, 32]) self.assertEqual(egr_tensor.shape, [4, 16, 16, 32])
self.assertTrue(np.array_equal(egr_tensor.numpy(), new_arr)) self.assertTrue(np.array_equal(egr_tensor.numpy(), new_arr))
...@@ -880,7 +880,7 @@ class EagerParamBaseUsageTestCase(unittest.TestCase): ...@@ -880,7 +880,7 @@ class EagerParamBaseUsageTestCase(unittest.TestCase):
new_weight = np.ones([1, 3]).astype('float32') new_weight = np.ones([1, 3]).astype('float32')
self.assertFalse(np.array_equal(linear.weight.numpy(), new_weight)) self.assertFalse(np.array_equal(linear.weight.numpy(), new_weight))
linear.weight._set_value(new_weight) linear.weight.set_value(new_weight)
self.assertTrue(np.array_equal(linear.weight.numpy(), new_weight)) self.assertTrue(np.array_equal(linear.weight.numpy(), new_weight))
self.assertTrue(linear.weight.place._equals(ori_place)) self.assertTrue(linear.weight.place._equals(ori_place))
......
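These eager-tensor tests now go through the public set_value() instead of the old _set_value(); the call overwrites the data in place while the place and stop_gradient flag stay unchanged, as the assertions above verify. A short dygraph sketch:

import numpy as np
import paddle

t = paddle.randn([2, 3])                          # float32 by default
t.set_value(np.zeros([2, 3], dtype='float32'))    # in-place overwrite
assert np.array_equal(t.numpy(), np.zeros([2, 3], dtype='float32'))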
...@@ -231,4 +231,5 @@ class TestExpandV2API(unittest.TestCase): ...@@ -231,4 +231,5 @@ class TestExpandV2API(unittest.TestCase):
if __name__ == "__main__": if __name__ == "__main__":
paddle.enable_static()
unittest.main() unittest.main()
...@@ -23,6 +23,7 @@ import paddle.fluid as fluid ...@@ -23,6 +23,7 @@ import paddle.fluid as fluid
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid import compiler from paddle.fluid import compiler
import paddle.fluid.unique_name as unique_name import paddle.fluid.unique_name as unique_name
import paddle
class TestInplaceANBOpTraining(unittest.TestCase): class TestInplaceANBOpTraining(unittest.TestCase):
...@@ -138,14 +139,14 @@ class TestInplaceANBOpTraining(unittest.TestCase): ...@@ -138,14 +139,14 @@ class TestInplaceANBOpTraining(unittest.TestCase):
outs[0].name if not only_forward else None, outs[0].name if not only_forward else None,
build_strategy=build_strategy, build_strategy=build_strategy,
exec_strategy=exec_strategy) exec_strategy=exec_strategy)
bn_fetches = exe.run(program=comp_prog1, bn_fetches = exe.run(program=main,
feed={'input': data}, feed={'input': data},
fetch_list=fetch_name) fetch_list=fetch_name)
fetch_outs.append(bn_fetches) fetch_outs.append(bn_fetches)
fetch_names.append(fetch_name) fetch_names.append(fetch_name)
for bn_val, inplace_abn_val, name1, name2 in zip(*(fetch_outs + for bn_val, inplace_abn_val, name1, name2 in zip(*(
fetch_names)): fetch_outs + fetch_names)):
self.assertTrue( self.assertTrue(
np.allclose( np.allclose(
bn_val, inplace_abn_val, atol=1e-2), bn_val, inplace_abn_val, atol=1e-2),
...@@ -156,6 +157,7 @@ class TestInplaceANBOpTraining(unittest.TestCase): ...@@ -156,6 +157,7 @@ class TestInplaceANBOpTraining(unittest.TestCase):
def test_op(self): def test_op(self):
use_cudas = [False, True] if core.is_compiled_with_cuda() else [False] use_cudas = [False, True] if core.is_compiled_with_cuda() else [False]
#use_cudas = [False]
for use_cuda in use_cudas: for use_cuda in use_cudas:
place = core.CUDAPlace(0) if use_cuda else core.CPUPlace() place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
layouts = ["NCHW", "NHWC"] layouts = ["NCHW", "NHWC"]
...@@ -186,4 +188,5 @@ class TestInplaceANBOpTraining(unittest.TestCase): ...@@ -186,4 +188,5 @@ class TestInplaceANBOpTraining(unittest.TestCase):
if __name__ == '__main__': if __name__ == '__main__':
paddle.enable_static()
unittest.main() unittest.main()
...@@ -21,6 +21,7 @@ import paddle.fluid as fluid ...@@ -21,6 +21,7 @@ import paddle.fluid as fluid
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
import paddle.fluid.core as core import paddle.fluid.core as core
import gradient_checker import gradient_checker
import paddle
from decorator_helper import prog_scope from decorator_helper import prog_scope
...@@ -167,4 +168,5 @@ class TestBatchNormDoubleGradCheckCase6(TestBatchNormDoubleGradCheckCase5): ...@@ -167,4 +168,5 @@ class TestBatchNormDoubleGradCheckCase6(TestBatchNormDoubleGradCheckCase5):
if __name__ == "__main__": if __name__ == "__main__":
paddle.enable_static()
unittest.main() unittest.main()
...@@ -24,6 +24,7 @@ from simple_nets import init_data, simple_fc_net, fc_with_batchnorm ...@@ -24,6 +24,7 @@ from simple_nets import init_data, simple_fc_net, fc_with_batchnorm
import seresnext_net import seresnext_net
from test_parallel_executor_transformer import transformer, get_feed_data_reader, DeviceType from test_parallel_executor_transformer import transformer, get_feed_data_reader, DeviceType
from fake_reader import fake_imdb_reader from fake_reader import fake_imdb_reader
import paddle
def lstm_net(use_feed): def lstm_net(use_feed):
...@@ -309,4 +310,5 @@ class TestProgramPruneBackward(unittest.TestCase): ...@@ -309,4 +310,5 @@ class TestProgramPruneBackward(unittest.TestCase):
if __name__ == '__main__': if __name__ == '__main__':
paddle.enable_static()
unittest.main() unittest.main()
...@@ -507,4 +507,5 @@ class TestReshapeZeroTensor(unittest.TestCase): ...@@ -507,4 +507,5 @@ class TestReshapeZeroTensor(unittest.TestCase):
if __name__ == "__main__": if __name__ == "__main__":
paddle.enable_static()
unittest.main() unittest.main()
...@@ -533,10 +533,6 @@ class TestTensorRegisterHook(unittest.TestCase): ...@@ -533,10 +533,6 @@ class TestTensorRegisterHook(unittest.TestCase):
size=[self.batch_size, self.in_size]).astype('float32') size=[self.batch_size, self.in_size]).astype('float32')
data_t = paddle.to_tensor(data) data_t = paddle.to_tensor(data)
if _in_eager_mode():
with self.assertRaises(TypeError):
out = jit_net(data_t)
else:
with self.assertRaises(AssertionError): with self.assertRaises(AssertionError):
out = jit_net(data_t) out = jit_net(data_t)
......
...@@ -1402,7 +1402,8 @@ def gather(x, index, axis=None, name=None): ...@@ -1402,7 +1402,8 @@ def gather(x, index, axis=None, name=None):
return _C_ops.gather(x, index, None, "axis", axis, "overwrite", False) return _C_ops.gather(x, index, None, "axis", axis, "overwrite", False)
check_variable_and_dtype( check_variable_and_dtype(
x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'], x, 'x',
['float16', 'float32', 'float64', 'int16', 'int32', 'int64', 'uint8'],
'gather') 'gather')
check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather') check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather')
......
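With 'int16' added to the whitelist, a static-graph gather over int16 input now passes the dtype check at graph-build time. A minimal sketch (whether an int16 kernel is registered for actual execution is an assumption, not shown in this hunk):

import paddle

paddle.enable_static()
x = paddle.static.data(name='x', shape=[4, 3], dtype='int16')
index = paddle.static.data(name='index', shape=[2], dtype='int32')
out = paddle.gather(x, index, axis=0)   # no longer rejected by check_variable_and_dtype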
...@@ -43,7 +43,9 @@ class BaseAPI(object): ...@@ -43,7 +43,9 @@ class BaseAPI(object):
self.is_base_api = False self.is_base_api = False
self.invoke = api_item_yaml['invoke'] self.invoke = api_item_yaml['invoke']
else: else:
self.infer_meta = self.parse_infer_meta(api_item_yaml['infer_meta']) if 'infer_meta' in api_item_yaml:
self.infer_meta = self.parse_infer_meta(api_item_yaml[
'infer_meta'])
self.kernel = self.parse_kernel(api_item_yaml['kernel']) self.kernel = self.parse_kernel(api_item_yaml['kernel'])
self.support_selected_rows_kernel = False if len(self.kernel[ self.support_selected_rows_kernel = False if len(self.kernel[
'func']) == 1 else True 'func']) == 1 else True
...@@ -182,9 +184,9 @@ class BaseAPI(object): ...@@ -182,9 +184,9 @@ class BaseAPI(object):
'Tensor': 'Tensor', 'Tensor': 'Tensor',
'Tensor[]': 'std::vector<Tensor>' 'Tensor[]': 'std::vector<Tensor>'
} }
if re.search(r'\(\w*\)', output_item): if re.search(r'\([a-zA-Z0-9_@]*\)', output_item):
result = re.search( result = re.search(
r"(?P<out_type>[a-zA-Z0-9_[\]]+)\s*\((?P<name>\w+)\)", r"(?P<out_type>[a-zA-Z0-9_[\]]+)\s*\((?P<name>[a-zA-Z0-9_@]+)\)",
output_item) output_item)
out_type = result.group('out_type') out_type = result.group('out_type')
assert out_type in output_type_map, \ assert out_type in output_type_map, \
...@@ -499,11 +501,8 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. ...@@ -499,11 +501,8 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self.
def get_kernel_args(self, code_indent): def get_kernel_args(self, code_indent):
input_trans_map = { input_trans_map = {
'const Tensor&': 'const phi::DenseTensor&', 'const Tensor&': 'const phi::DenseTensor&',
'const Tensor &': 'const phi::DenseTensor&',
'const std::vector<Tensor>&': 'const std::vector<Tensor>&':
'const std::vector<phi::DenseTensor>&', 'const std::vector<phi::DenseTensor>&',
'const std::vector<Tensor> &':
'const std::vector<phi::DenseTensor>&',
'const paddle::optional<Tensor>&': 'const paddle::optional<Tensor>&':
'paddle::optional<const phi::DenseTensor&>', 'paddle::optional<const phi::DenseTensor&>',
'const paddle::optional<std::vector<Tensor>>&': 'const paddle::optional<std::vector<Tensor>>&':
...@@ -592,7 +591,6 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. ...@@ -592,7 +591,6 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self.
def get_selected_rows_kernel_args(self, code_indent): def get_selected_rows_kernel_args(self, code_indent):
input_trans_map = { input_trans_map = {
'const Tensor&': 'const phi::SelectedRows&', 'const Tensor&': 'const phi::SelectedRows&',
'const Tensor &': 'const phi::SelectedRows&',
'const paddle::optional<Tensor>&': 'const paddle::optional<Tensor>&':
'paddle::optional<const phi::SelectedRows&>' 'paddle::optional<const phi::SelectedRows&>'
} }
......
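The widened output regex now also matches '@'-qualified names, which the sparse yaml entries further down rely on (for example Tensor(out@SparseCooTensor)). A quick check of the pattern:

import re

pattern = r"(?P<out_type>[a-zA-Z0-9_[\]]+)\s*\((?P<name>[a-zA-Z0-9_@]+)\)"
m = re.search(pattern, 'Tensor(out@SparseCooTensor)')
print(m.group('out_type'), m.group('name'))   # -> Tensor out@SparseCooTensor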
...@@ -105,7 +105,7 @@ def source_include(header_file_path): ...@@ -105,7 +105,7 @@ def source_include(header_file_path):
#include "paddle/phi/api/lib/api_custom_impl.h" #include "paddle/phi/api/lib/api_custom_impl.h"
#include "paddle/phi/api/lib/api_registry.h" #include "paddle/phi/api/lib/api_registry.h"
#include "paddle/phi/api/lib/api_utils.h" #include "paddle/phi/api/lib/api_gen_utils.h"
#include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/data_transform.h"
#include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/api/lib/utils/storage.h"
......
...@@ -56,8 +56,9 @@ class BackwardAPI(BaseAPI): ...@@ -56,8 +56,9 @@ class BackwardAPI(BaseAPI):
# check the attributes of backward # check the attributes of backward
for attr in self.attrs['names']: for attr in self.attrs['names']:
assert attr in fw_attrs['names'] and self.attrs['attr_info'][attr][0] == fw_attrs['attr_info'][attr][0], \ assert (attr in fw_attrs['names'] and self.attrs['attr_info'][attr][0] == fw_attrs['attr_info'][attr][0]) or \
f"{self.api} : Attribute error: The attribute({attr}) of backward isn't consistent with forward api. \ self.attrs['attr_info'][attr][1] is not None, \
f"{self.api} : Attribute error: The attribute({attr}) of backward isn't consistent with forward api or doesn't have default value. \
Please check the args of {self.api} in yaml." Please check the args of {self.api} in yaml."
# check the output of backward # check the output of backward
...@@ -145,7 +146,7 @@ def source_include(header_file_path): ...@@ -145,7 +146,7 @@ def source_include(header_file_path):
#include "paddle/phi/api/lib/api_custom_impl.h" #include "paddle/phi/api/lib/api_custom_impl.h"
#include "paddle/phi/api/lib/api_registry.h" #include "paddle/phi/api/lib/api_registry.h"
#include "paddle/phi/api/lib/api_utils.h" #include "paddle/phi/api/lib/api_gen_utils.h"
#include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/data_transform.h"
#include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/api/lib/utils/storage.h"
......
- sparse_api : conv3d
args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups)
output : Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor)
kernel :
func : sparse_conv3d
layout : x
- sparse_api : to_dense
args : (Tensor x, Backend backend)
output : Tensor(out@DenseTensor)
invoke : to_dense_impl(x, backend)
- sparse_api : to_sparse_coo
args : (Tensor x, Backend backend, int64_t sparse_dim)
output : Tensor(out@SparseCooTensor)
invoke : to_sparse_coo_impl(x, backend, sparse_dim)
- sparse_api : to_sparse_csr
args : (Tensor x, Backend backend)
output : Tensor(out@SparseCsrTensor)
invoke : to_sparse_csr_impl(x, backend)
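These entries feed the generator script that follows: every item keyed by sparse_api becomes one SparseAPI object. A hedged sketch of loading the file (the file name is illustrative, not taken from this diff):

import yaml

with open('sparse_api.yaml') as f:       # illustrative path
    apis = yaml.load(f, Loader=yaml.FullLoader)

for item in apis:
    print(item['sparse_api'], '->', item['output'])
# e.g. conv3d -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor)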
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import yaml
import argparse
import re
from api_base import BaseAPI
class SparseAPI(BaseAPI):
def __init__(self, api_item_yaml):
super(SparseAPI, self).__init__(api_item_yaml)
def get_api_name(self, api_item_yaml):
return api_item_yaml['sparse_api']
def get_api_func_name(self):
return self.api
def get_return_type(self, out_type_list):
return out_type_list[0] if len(
out_type_list) == 1 else "std::tuple<" + ",".join(
out_type_list) + ">"
def gene_api_declaration(self):
return f"""
// {", ".join(self.outputs['names'])}
PADDLE_API {self.outputs['return_type']} {self.get_api_func_name()}({self.args_str['args_declare']});
"""
def get_kernel_tensor_out_type(self, output_name):
sparse_type = 'TensorType::DENSE_TENSOR'
if output_name.endswith('@SparseCooTensor'):
sparse_type = 'TensorType::SPARSE_COO'
elif output_name.endswith('@SparseCsrTensor'):
sparse_type = 'TensorType::SPARSE_CSR'
return sparse_type
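    # For illustration (hypothetical, not part of the generated output): with the
    # output annotations used in sparse_api.yaml, the suffix convention above
    # resolves as
    #   'out@SparseCooTensor' -> 'TensorType::SPARSE_COO'
    #   'out@SparseCsrTensor' -> 'TensorType::SPARSE_CSR'
    #   'out@DenseTensor'     -> 'TensorType::DENSE_TENSOR'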
def gene_output(self,
output_type_list,
set_out_func,
code_indent,
inplace_flag=False):
kernel_output = ""
output_names = []
output_create = ""
if len(output_type_list) == 1:
kernel_output = 'kernel_out'
output_names.append('kernel_out')
inplace_assign = " = " + self.inplace_map[self.outputs['names'][
0]] if inplace_flag and self.inplace_map is not None and self.outputs[
'names'][0] in self.inplace_map else ""
output_create = f"""
{self.outputs['return_type']} out{inplace_assign};
auto* kernel_out = {set_out_func}(&out, {self.get_kernel_tensor_out_type(self.outputs['names'][0])});"""
elif len(output_type_list) > 1:
output_create = f"""
{self.outputs['return_type']} out;"""
for i in range(len(output_type_list)):
kernel_output = kernel_output + f'kernel_out_{i}, '
output_names.append(f'kernel_out_{i}')
if inplace_flag and self.inplace_map is not None and self.outputs[
'names'][i] in self.inplace_map:
output_create = output_create + f"""
std::get<{i}>(out) = {self.inplace_map[self.outputs['names'][i]]};"""
output_create = output_create + f"""
auto* kernel_out_{i} = {set_out_func}(&std::get<{i}>(out), {self.get_kernel_tensor_out_type(self.outputs['names'][i])});"""
kernel_output = kernel_output[:-2]
else:
raise ValueError(
"{} : Output error: the output should not be empty.".format(
self.api))
return kernel_output, output_names, output_create
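    # For illustration: for a single sparse output such as to_sparse_coo, the
    # strings assembled above come out roughly as follows (exact whitespace of
    # the generated C++ may differ):
    #   kernel_output -> 'kernel_out'
    #   output_names  -> ['kernel_out']
    #   output_create -> 'Tensor out;
    #                     auto* kernel_out = SetSparseKernelOutput(&out, TensorType::SPARSE_COO);'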
def gen_sparse_kernel_context(self, kernel_output_names):
input_trans_map = {
            'const Tensor&': 'const phi::TensorBase&',
            'const std::vector<Tensor>&': 'const std::vector<phi::TensorBase>&',
            'const paddle::optional<Tensor>&':
            'paddle::optional<const phi::TensorBase&>'
        }
        out_trans_map = {
            'Tensor': 'phi::TensorBase*',
            'std::vector<Tensor>': 'std::vector<phi::TensorBase*>'
}
input_names = self.inputs['names']
input_infos = self.inputs['input_info']
attr_names = self.attrs['names']
kernel_param = self.kernel['param']
if kernel_param is None:
kernel_param = input_names + attr_names
kernel_context_code = ""
for param in kernel_param:
if param in input_names:
if param in self.optional_vars:
raise ValueError(
f"{self.api} : Unsupport optional input({param}) for sparse api."
)
else:
kernel_context_code = kernel_context_code + f"""
kernel_context.EmplaceBackInput({param}.impl().get());"""
continue
if param in attr_names:
# set attr for kernel_context
if 'ScalarArray' in self.attrs['attr_info'][param][0]:
param = 'phi::ScalarArray(' + param + ')'
elif 'Scalar' in self.attrs['attr_info'][param][0]:
param = 'phi::Scalar(' + param + ')'
elif isinstance(param, bool):
param = str(param).lower()
else:
                    param = str(param)
kernel_context_code = kernel_context_code + f"""
kernel_context.EmplaceBackAttr({param});"""
for out_name in kernel_output_names:
kernel_context_code = kernel_context_code + f"""
kernel_context.EmplaceBackOutput({out_name});"""
return kernel_context_code
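    # For illustration: for the conv3d entry in sparse_api.yaml this method emits
    # roughly one EmplaceBackInput per tensor input, one EmplaceBackAttr per
    # attribute and one EmplaceBackOutput per kernel output, e.g.
    #   kernel_context.EmplaceBackInput(x.impl().get());
    #   kernel_context.EmplaceBackInput(kernel.impl().get());
    #   kernel_context.EmplaceBackAttr(paddings);
    #   ...
    #   kernel_context.EmplaceBackOutput(kernel_out_0);
    #   kernel_context.EmplaceBackOutput(kernel_out_1);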
def gen_sparse_kernel_code(self, inplace_flag=False):
_, kernel_output_names, output_create = self.gene_output(
self.outputs['types'], 'SetSparseKernelOutput', '', inplace_flag)
kernel_context_code = self.gen_sparse_kernel_context(
kernel_output_names)
return f"""
auto phi_kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
"{self.kernel['func'][0]}", {{kernel_backend, kernel_layout, kernel_data_type}});
VLOG(6) << "{self.api} api sparse kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
VLOG(6) << "{self.api} api sparse kernel: " << phi_kernel;
auto* dev_ctx = GetDeviceContextByBackend(kernel_backend);
auto kernel_context = phi::KernelContext(dev_ctx);
{output_create}
{kernel_context_code}
phi_kernel(&kernel_context);
return out;"""
def gene_base_api_code(self, inplace_flag=False):
api_func_name = self.get_api_func_name()
return f"""
PADDLE_API {self.outputs['return_type']} {api_func_name}({self.args_str["args_define"]}) {{
{self.gene_kernel_select()}
{self.gen_sparse_kernel_code(inplace_flag)}
}}
"""
def header_include():
return """
#include <tuple>
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/common/scalar_array.h"
#include "paddle/utils/optional.h"
"""
def source_include(header_file_path):
return f"""
#include "{header_file_path}"
#include <memory>
#include "glog/logging.h"
#include "paddle/phi/api/lib/api_registry.h"
#include "paddle/phi/api/lib/api_gen_utils.h"
#include "paddle/phi/api/lib/data_transform.h"
#include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/api/lib/sparse_api_custom_impl.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/declarations.h"
"""
def api_register():
return """
PD_REGISTER_API(Test);
"""
def api_namespace():
return ("""
namespace paddle {
namespace experimental {
namespace sparse {
""", """
} // namespace sparse
} // namespace experimental
} // namespace paddle
""")
def generate_api(api_yaml_path, header_file_path, source_file_path):
with open(api_yaml_path, 'r') as f:
apis = yaml.load(f, Loader=yaml.FullLoader)
header_file = open(header_file_path, 'w')
source_file = open(source_file_path, 'w')
namespace = api_namespace()
header_file.write("#pragma once\n")
header_file.write(header_include())
header_file.write(namespace[0])
include_header_file = "paddle/phi/api/include/sparse_api.h"
source_file.write(source_include(include_header_file))
source_file.write(namespace[0])
for api in apis:
sparse_api = SparseAPI(api)
header_file.write(sparse_api.gene_api_declaration())
source_file.write(sparse_api.gene_api_code())
header_file.write(namespace[1])
source_file.write(namespace[1])
source_file.write(api_register())
header_file.close()
source_file.close()
def main():
parser = argparse.ArgumentParser(
description='Generate PaddlePaddle C++ Sparse API files')
parser.add_argument(
'--api_yaml_path',
help='path to sparse api yaml file',
default='python/paddle/utils/code_gen/sparse_api.yaml')
parser.add_argument(
'--api_header_path',
help='output of generated api header code file',
default='paddle/phi/api/include/sparse_api.h')
parser.add_argument(
'--api_source_path',
help='output of generated api source code file',
default='paddle/phi/api/lib/sparse_api.cc')
options = parser.parse_args()
api_yaml_path = options.api_yaml_path
header_file_path = options.api_header_path
source_file_path = options.api_source_path
generate_api(api_yaml_path, header_file_path, source_file_path)
if __name__ == '__main__':
main()
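A possible way to drive the generator from a build step, mirroring the argparse defaults above (the paths are the script's own defaults and assume the Paddle source root as the working directory):

# Hypothetical driver: runs the generator with its default paths, assuming the
# current directory is the Paddle source root.
import subprocess

subprocess.run([
    "python", "python/paddle/utils/code_gen/sparse_api_gen.py",
    "--api_yaml_path", "python/paddle/utils/code_gen/sparse_api.yaml",
    "--api_header_path", "paddle/phi/api/include/sparse_api.h",
    "--api_source_path", "paddle/phi/api/lib/sparse_api.cc",
], check=True)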