diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc index b8ccd8e744dab1b2dcb31551893aa0df0180fbbc..f86b4b706b3e246629ec944e06857b88d3cfaad8 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc @@ -414,6 +414,16 @@ std::future BrpcPsClient::load(uint32_t table_id, return send_cmd(table_id, PS_LOAD_ONE_TABLE, {epoch, mode}); } +std::future BrpcPsClient::Load(const LoadSaveContext &load_context) { + if (load_context.table_id < 0) { + return send_cmd(-1, PS_LOAD_ALL_TABLE, + {load_context.epoch, load_context.mode}); + } else { + return send_cmd(load_context.table_id, PS_LOAD_ONE_TABLE, + {load_context.epoch, load_context.mode}); + } +} + std::future BrpcPsClient::save(const std::string &epoch, const std::string &mode) { VLOG(1) << "BrpcPsClient::save path " << epoch; @@ -427,6 +437,19 @@ std::future BrpcPsClient::save(uint32_t table_id, return send_save_cmd(table_id, PS_SAVE_ONE_TABLE, {epoch, mode}); } +std::future BrpcPsClient::Save(const LoadSaveContext &save_context) { + if (save_context.table_id < 0) { + VLOG(1) << "BrpcPsClient::save path " << save_context.epoch; + return send_save_cmd(-1, PS_SAVE_ALL_TABLE, + {save_context.epoch, save_context.mode}); + } else { + VLOG(1) << "BrpcPsClient::save one table path " << save_context.epoch + << " table_id " << save_context.table_id; + return send_save_cmd(save_context.table_id, PS_SAVE_ONE_TABLE, + {save_context.epoch, save_context.mode}); + } +} + std::future BrpcPsClient::clear() { return send_cmd(-1, PS_CLEAR_ALL_TABLE, {}); } @@ -505,6 +528,44 @@ std::future BrpcPsClient::barrier(size_t table_id, return send_cmd(table_id, PS_BARRIER, {std::to_string(barrier_type)}); } +std::future BrpcPsClient::Pull(RequestContext &pull_context) { + if (pull_context.value_type == Dense) { // pull dense + Region *dense_region = + reinterpret_cast(pull_context.dense_values); + pull_dense(dense_region, pull_context.num, pull_context.table); + } else { // pull sparse + uint64_t *keys = reinterpret_cast(pull_context.keys); + float **select_values = + reinterpret_cast(pull_context.sparse_values); + size_t table_id = pull_context.table; + size_t num = pull_context.num; + bool is_training = pull_context.is_training; + if (pull_context.training_mode == Geo) { // for geo + pull_sparse_param(select_values, table_id, keys, num, is_training); + } else if (pull_context.training_mode == Async) { // for async + pull_sparse(select_values, table_id, keys, num, is_training); + } + } +} + +std::future BrpcPsClient::Push(RequestContext &push_context) { + if (push_context.value_type == Dense) { // push dense + const Region *dense_region = push_context.push_context.push_dense_values; + push_dense(dense_region, push_context.num, push_context.table); + } else { // push sparse + size_t table_id = push_context.table; + size_t num = push_context.num; + bool is_training = push_context.is_training; + if (push_context.training_mode == Geo) { // for geo + // TODO(zhaocaibei) + } else if (push_context.training_mode == Async) { // for async + const uint64_t *keys = push_context.push_context.keys; + const float **update_values = push_context.push_context.push_values; + push_sparse(table_id, keys, update_values, num); + } + } +} + std::future BrpcPsClient::pull_geo_param(size_t table_id, std::vector *values, std::vector *keys, diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.h b/paddle/fluid/distributed/ps/service/brpc_ps_client.h 
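// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original patch): how a caller might
// exercise the new context-based BrpcPsClient entry points added above.
// Assumes the template arguments elided by the diff rendering are
// std::future<int32_t> (matching the existing load()/save()/pull_dense()
// signatures), and that Region, ValueType and LoadSaveContext/RequestContext
// are the types introduced elsewhere in this patch. The table id and
// checkpoint path below are placeholders, not values taken from the patch.
#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h"

namespace example {
using paddle::distributed::BrpcPsClient;
using paddle::distributed::Dense;
using paddle::distributed::LoadSaveContext;
using paddle::distributed::Region;
using paddle::distributed::RequestContext;

void DemoContextApi(BrpcPsClient* client, Region* regions, size_t region_num) {
  // Load every table from a checkpoint; table_id < 0 maps to PS_LOAD_ALL_TABLE.
  LoadSaveContext load_ctx;
  load_ctx.table_id = -1;
  load_ctx.epoch = "/path/to/checkpoint";  // placeholder path
  load_ctx.mode = "0";
  client->Load(load_ctx).wait();

  // Dense pull through the unified RequestContext instead of pull_dense().
  RequestContext pull_ctx;
  pull_ctx.value_type = Dense;
  pull_ctx.dense_values = regions;
  pull_ctx.num = region_num;
  pull_ctx.table = 0;  // placeholder table id
  client->Pull(pull_ctx);
}
}  // namespace example
// ---------------------------------------------------------------------------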
index 59ed59933db868ae4c05b69529a2c12fd0f689e2..8b0cb0741b4004fbad444a9919ec540289067f55 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_client.h +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.h @@ -163,12 +163,17 @@ class BrpcPsClient : public PSClient { std::future load(uint32_t table_id, const std::string &epoch, const std::string &mode) override; + std::future Load(const LoadSaveContext &load_context) override; + std::future save(const std::string &epoch, const std::string &mode) override; std::future save(uint32_t table_id, const std::string &epoch, const std::string &mode) override; + virtual std::future Save( + const LoadSaveContext &save_context) override; + std::future clear() override; std::future clear(uint32_t table_id) override; @@ -199,6 +204,10 @@ class BrpcPsClient : public PSClient { const uint64_t *keys, size_t num, bool is_training); + virtual std::future Pull(RequestContext &pull_context) override; + + virtual std::future Push(RequestContext &push_context) override; + virtual std::future print_table_stat(uint32_t table_id); virtual std::future barrier(size_t table_id, uint32_t barrier_type); diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_server.h b/paddle/fluid/distributed/ps/service/brpc_ps_server.h index 4310c247438ceb9bff541fdd21e00ff70ff7b4fd..d81a3a5df07f1de534cd646138fecc4dc2c970e1 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_server.h +++ b/paddle/fluid/distributed/ps/service/brpc_ps_server.h @@ -51,7 +51,7 @@ class BrpcPsServer : public PSServer { _server.Join(); return 0; } - virtual int32_t port(); + int32_t port(); private: virtual int32_t initialize(); diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.h b/paddle/fluid/distributed/ps/service/graph_brpc_server.h index aee0190850753786ce0f083257458caf50a63d26..a978d97b296b0a529a121fcfb9723639421d1e5e 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_server.h +++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.h @@ -43,7 +43,7 @@ class GraphBrpcServer : public PSServer { _server.Join(); return 0; } - virtual int32_t port(); + int32_t port(); std::condition_variable *export_cv() { return &cv_; } diff --git a/paddle/fluid/distributed/ps/service/ps_client.h b/paddle/fluid/distributed/ps/service/ps_client.h index 21719fbdbf1d64ad26ae0053b73812440ed08b66..8a2bfbe31602be299366fdcbeb264e45a5c4f703 100644 --- a/paddle/fluid/distributed/ps/service/ps_client.h +++ b/paddle/fluid/distributed/ps/service/ps_client.h @@ -26,6 +26,7 @@ #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" +#include "paddle/fluid/distributed/ps/table/table.h" #include "paddle/fluid/platform/timer.h" namespace paddle { @@ -59,6 +60,41 @@ class PSClientClosure : public google::protobuf::Closure { std::vector>> _promises; }; +struct LoadSaveContext { + int table_id; + std::string epoch; + std::string mode; +}; + +enum TrainingMode { Async = 0, Sync = 1, Geo = 3 }; + +enum TrainingPhase { Init = 0, Train = 1, Save = 2 }; + +// enum ValueType { +// Sparse = 0, +// Dense = 1 +// }; + +struct PushContext { + const uint64_t *keys; + const float **push_values; + const Region *push_dense_values; +}; + +struct RequestContext { + int table; + TrainingMode training_mode; // 1 for async, 2 for geo, 3 for sync + TrainingPhase training_phase; // 1 for init, 2 for train + ValueType value_type; // 1 for sparse, 2 for dense + void *keys; + void **sparse_values; // 
for sparse values + Region *dense_values; // for dense values + PushContext push_context; + size_t num; + bool is_training; + void *callback; +}; + class PSClient { public: PSClient() {} @@ -86,6 +122,9 @@ class PSClient { // 指定table数据load virtual std::future load(uint32_t table_id, const std::string &epoch, const std::string &mode) = 0; + // context配置load选项 + virtual std::future Load(const LoadSaveContext &load_context) = 0; + // 全量table数据save value_accessor根据mode,可能有不同的save条件 virtual std::future save(const std::string &epoch, const std::string &mode) = 0; @@ -93,6 +132,8 @@ class PSClient { virtual std::future save(uint32_t table_id, const std::string &epoch, const std::string &mode) = 0; + virtual std::future Save(const LoadSaveContext &save_context) = 0; + // 清空table数据 virtual std::future clear() = 0; virtual std::future clear(uint32_t table_id) = 0; @@ -107,6 +148,8 @@ class PSClient { virtual std::future pull_dense(Region *regions, size_t region_num, size_t table_id) = 0; // 保留 + virtual std::future Push(RequestContext &push_context) = 0; + // firstly push dense param for parameter server // this is neccessary because dense weight initialized in trainer on cold // start @@ -117,6 +160,9 @@ class PSClient { virtual std::future push_dense(const Region *regions, size_t region_num, size_t table_id) = 0; + + virtual std::future Pull(RequestContext &pull_context) = 0; + // 使用keys进行pull请求,结果填充values // keys和values的个数均为num个,每个value占用select_size空间 // future结束前keys和values缓冲区不能再次使用 diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.cc b/paddle/fluid/distributed/ps/service/ps_local_client.cc index 972cce135f189bee6dbba9e0b89baa288816827b..9e364b6d3ed7aabe3cd3bc944e697e11ac808a33 100644 --- a/paddle/fluid/distributed/ps/service/ps_local_client.cc +++ b/paddle/fluid/distributed/ps/service/ps_local_client.cc @@ -56,6 +56,19 @@ int32_t PsLocalClient::initialize() { return done(); } +std::future PsLocalClient::Load(const LoadSaveContext& load_context) { + if (load_context.table_id < 0) { + for (auto& it : _table_map) { + load(it.first, load_context.epoch, load_context.mode); + } + return done(); + } else { + auto* table_ptr = table(load_context.table_id); + table_ptr->load(load_context.epoch, load_context.mode); + return done(); + } +} + ::std::future PsLocalClient::save(const std::string& epoch, const std::string& mode) { // TODO @@ -74,6 +87,21 @@ int32_t PsLocalClient::initialize() { return done(); } +::std::future PsLocalClient::Save( + const LoadSaveContext& save_context) { + if (save_context.table_id < 0) { + for (auto& it : _table_map) { + save(it.first, save_context.epoch, save_context.mode); + } + return done(); + } else { + auto* table_ptr = table(save_context.table_id); + table_ptr->flush(); + table_ptr->save(save_context.epoch, save_context.mode); + return done(); + } +} + ::std::future PsLocalClient::clear() { // TODO return done(); @@ -93,6 +121,51 @@ int32_t PsLocalClient::initialize() { return done(); } +::std::future PsLocalClient::Pull(RequestContext& pull_context) { + if (pull_context.value_type == Dense) { // pull dense + Region* dense_region = reinterpret_cast(pull_context.dense_values); + pull_dense(dense_region, pull_context.num, pull_context.table); + } else { // pull sparse + uint64_t* keys = reinterpret_cast(pull_context.keys); + char** select_values = reinterpret_cast(pull_context.sparse_values); + size_t table_id = pull_context.table; + size_t num = pull_context.num; + pull_sparse_ptr(select_values, table_id, keys, num); + } +} + +::std::future 
PsLocalClient::Push(RequestContext& push_context) { + if (push_context.value_type == Dense) { // push dense + if (push_context.training_phase == Init) { + const Region* regions = push_context.push_context.push_dense_values; + size_t region_num = push_context.num; + push_dense_param(regions, region_num, push_context.table); + } else { + if (push_context.training_mode == Geo) { // geo + float* total_send_data = + reinterpret_cast(push_context.dense_values); + size_t total_send_data_size = push_context.num; + push_dense_raw_gradient(push_context.table, total_send_data, + total_send_data_size, push_context.callback); + } else { // async and sync + const Region* regions = push_context.push_context.push_dense_values; + size_t region_num = push_context.num; + push_dense(regions, region_num, push_context.table); + } + } + } else { // push sparse + if (push_context.training_mode == Async) { + const uint64_t* keys = push_context.push_context.keys; + const float** update_values = push_context.push_context.push_values; + size_t table_id = push_context.table; + size_t num = push_context.num; + push_sparse(table_id, keys, update_values, num); + } else { + // TODO + } + } +} + ::std::future PsLocalClient::pull_dense(Region* regions, size_t region_num, size_t table_id) { diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.h b/paddle/fluid/distributed/ps/service/ps_local_client.h index e73974ac562861d86e679ddbc213335d10731281..83ca558e3d2cb1f62235cda06c221b0d9367b043 100644 --- a/paddle/fluid/distributed/ps/service/ps_local_client.h +++ b/paddle/fluid/distributed/ps/service/ps_local_client.h @@ -39,12 +39,16 @@ class PsLocalClient : public PSClient { virtual ::std::future load(uint32_t table_id, const std::string& epoch, const std::string& mode) override; + virtual std::future Load( + const LoadSaveContext& load_context) override; virtual ::std::future save(const std::string& epoch, const std::string& mode) override; virtual ::std::future save(uint32_t table_id, const std::string& epoch, const std::string& mode) override; + virtual std::future Save( + const LoadSaveContext& save_context) override; virtual ::std::future clear() override; virtual ::std::future clear(uint32_t table_id) override; @@ -55,6 +59,10 @@ class PsLocalClient : public PSClient { virtual ::std::future pull_dense(Region* regions, size_t region_num, size_t table_id); + virtual ::std::future Pull(RequestContext& pull_context) override; + + virtual ::std::future Push(RequestContext& push_context) override; + virtual ::std::future push_dense(const Region* regions, size_t region_num, size_t table_id); diff --git a/paddle/fluid/distributed/ps/service/ps_local_server.h b/paddle/fluid/distributed/ps/service/ps_local_server.h index 91f8bc4c9127115c9b5595270973d011778c6262..31b52126fc5767b445dfb605ff46b3fbc63c620c 100644 --- a/paddle/fluid/distributed/ps/service/ps_local_server.h +++ b/paddle/fluid/distributed/ps/service/ps_local_server.h @@ -28,7 +28,6 @@ class PsLocalServer : public PSServer { virtual uint64_t start() { return 0; } virtual uint64_t start(const std::string &ip, uint32_t port) { return 0; } virtual int32_t stop() { return 0; } - virtual int32_t port() { return 0; } virtual int32_t configure( const PSParameter &config, PSEnvironment &env, size_t server_rank, const std::vector &server_sub_program = {}) { diff --git a/paddle/fluid/distributed/ps/service/server.cc b/paddle/fluid/distributed/ps/service/server.cc index 5f1974e3e610c6772457514759bff83db944bf52..893f671359e40ce632185c78bade16404d23afc0 100644 --- 
a/paddle/fluid/distributed/ps/service/server.cc +++ b/paddle/fluid/distributed/ps/service/server.cc @@ -67,8 +67,6 @@ int32_t PSServer::configure( _config = config.server_param(); _rank = server_rank; _environment = &env; - _shuffled_ins = - paddle::framework::MakeChannel>(); size_t shard_num = env.get_ps_servers().size(); const auto &downpour_param = _config.downpour_server_param(); diff --git a/paddle/fluid/distributed/ps/service/server.h b/paddle/fluid/distributed/ps/service/server.h index 160d4a612829531d619c69a0cd5e9cd091f94868..d2804405b41989cbd9b5bed0afaf6d481d0658db 100644 --- a/paddle/fluid/distributed/ps/service/server.h +++ b/paddle/fluid/distributed/ps/service/server.h @@ -69,11 +69,6 @@ class PSServer { const PSParameter &config, PSEnvironment &env, size_t server_rank, const std::vector &server_sub_program = {}); - // return server_ip - virtual std::string ip() { return butil::my_ip_cstr(); } - // return server_port - virtual int32_t port() = 0; - virtual uint64_t start(const std::string &ip, uint32_t port) = 0; virtual int32_t stop() = 0; @@ -94,15 +89,6 @@ class PSServer { return &_table_map; } - typedef std::function MsgHandlerFunc; - virtual int registe_pserver2pserver_msg_handler(int msg_type, - MsgHandlerFunc handler) { - _msg_handler_map[msg_type] = handler; - return 0; - } - - paddle::framework::Channel> _shuffled_ins; - protected: virtual int32_t initialize() = 0; @@ -111,7 +97,6 @@ class PSServer { ServerParameter _config; PSEnvironment *_environment; std::unordered_map> _table_map; - std::unordered_map _msg_handler_map; protected: std::shared_ptr scope_; diff --git a/paddle/fluid/distributed/ps/table/accessor.h b/paddle/fluid/distributed/ps/table/accessor.h index 7c91a6086498037e56f9b89dc13243cfeb827c5c..07c211bb9c12866e3646a0dbdebfba189eb2507e 100644 --- a/paddle/fluid/distributed/ps/table/accessor.h +++ b/paddle/fluid/distributed/ps/table/accessor.h @@ -45,6 +45,17 @@ struct DataConverter { std::string deconverter; }; +struct AccessorInfo { + size_t dim; + size_t size; + size_t select_size; + size_t select_dim; + size_t update_size; + size_t update_dim; + size_t mf_size; + size_t fea_dim; +}; + class ValueAccessor { public: ValueAccessor() {} @@ -68,6 +79,8 @@ class ValueAccessor { } virtual int initialize() = 0; + virtual void GetTableInfo(AccessorInfo& info) = 0; + // value维度 virtual size_t dim() = 0; // value各个维度的size @@ -163,6 +176,7 @@ class ValueAccessor { TableAccessorParameter _config; std::unordered_map> _data_coverter_map; + AccessorInfo _accessor_info; }; REGISTER_PSCORE_REGISTERER(ValueAccessor); } // namespace distributed diff --git a/paddle/fluid/distributed/ps/table/common_dense_table.cc b/paddle/fluid/distributed/ps/table/common_dense_table.cc index 607469e2f7b0d5df79d4cb7477e0eaa3f4a8323a..cc0f5867a3d651bca9323452d1eb97355de4c160 100644 --- a/paddle/fluid/distributed/ps/table/common_dense_table.cc +++ b/paddle/fluid/distributed/ps/table/common_dense_table.cc @@ -128,6 +128,21 @@ int32_t CommonDenseTable::set_global_lr(float* lr) { return 0; } +int32_t CommonDenseTable::Pull(TableContext& context) { + CHECK(context.value_type == Dense); + float* pull_values = context.pull_context.values; + return pull_dense(pull_values, context.num); +} + +int32_t CommonDenseTable::Push(TableContext& context) { + CHECK(context.value_type == Dense); + if (context.pull_context.values != nullptr) { + const float* values = context.push_context.values; + return push_dense(values, context.num); + } + return 0; +} + int32_t CommonDenseTable::pull_dense(float* pull_values, 
size_t num) { std::copy(values_[param_idx_].begin(), values_[param_idx_].end(), pull_values); diff --git a/paddle/fluid/distributed/ps/table/common_dense_table.h b/paddle/fluid/distributed/ps/table/common_dense_table.h index a4c0f29ddb8770c8adc0d6885929aaac8a028e90..cad49a0a449c4735a74261574436a78789694d9b 100644 --- a/paddle/fluid/distributed/ps/table/common_dense_table.h +++ b/paddle/fluid/distributed/ps/table/common_dense_table.h @@ -40,6 +40,8 @@ class CommonDenseTable : public DenseTable { const std::string& name); virtual int32_t initialize_value(); virtual int32_t initialize_optimizer(); + virtual int32_t Pull(TableContext& context); + virtual int32_t Push(TableContext& context); int32_t pull_dense(float* pull_values, size_t num) override; int32_t push_dense_param(const float* values, size_t num) override; int32_t push_dense(const float* values, size_t num) override; diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h index 7946569525cc4bb1351046632dfe5894611c4b67..f6f127621b947c41122f7803a90f39b640713b8e 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.h +++ b/paddle/fluid/distributed/ps/table/common_graph_table.h @@ -454,6 +454,9 @@ class GraphTable : public SparseTable { int32_t get_server_index_by_id(int64_t id); Node *find_node(int64_t id); + virtual int32_t Pull(TableContext &context) { return 0; } + virtual int32_t Push(TableContext &context) { return 0; } + virtual int32_t pull_sparse(float *values, const PullSparseValue &pull_value) { return 0; diff --git a/paddle/fluid/distributed/ps/table/common_sparse_table.cc b/paddle/fluid/distributed/ps/table/common_sparse_table.cc index b44d08b937a96c806142f5d7f1ba2ae0bcdb0f5e..45be53335e1a181f7c1e2abb7326ac6b9800703f 100644 --- a/paddle/fluid/distributed/ps/table/common_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/common_sparse_table.cc @@ -355,6 +355,32 @@ int32_t CommonSparseTable::pour() { return 0; } +int32_t CommonSparseTable::Pull(TableContext& context) { + CHECK(context.value_type == Sparse); + if (context.use_ptr) { + char** pull_values = context.pull_context.ptr_values; + const uint64_t* keys = context.pull_context.keys; + return pull_sparse_ptr(pull_values, keys, context.num); + } else { + float* pull_values = context.pull_context.values; + const PullSparseValue& pull_value = context.pull_context.pull_value; + return pull_sparse(pull_values, pull_value); + } +} + +int32_t CommonSparseTable::Push(TableContext& context) { + CHECK(context.value_type == Sparse); + if (context.pull_context.values != nullptr) { + const float* values = context.push_context.values; + const uint64_t* keys = context.push_context.keys; + return push_sparse(keys, values, context.num); + } else { + const float** values = context.push_context.ptr_values; + const uint64_t* keys = context.push_context.keys; + return push_sparse(keys, values, context.num); + } +} + int32_t CommonSparseTable::pull_sparse(float* pull_values, const PullSparseValue& pull_value) { auto shard_num = task_pool_size_; diff --git a/paddle/fluid/distributed/ps/table/common_sparse_table.h b/paddle/fluid/distributed/ps/table/common_sparse_table.h index 82481dcd584e42b9b2bca1bcc5862b361e372b05..138c5447420663eae5ad94ea03a84360a46f8b3d 100644 --- a/paddle/fluid/distributed/ps/table/common_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/common_sparse_table.h @@ -121,6 +121,9 @@ class CommonSparseTable : public SparseTable { virtual int32_t push_dense(const float* values, size_t num) 
{ return 0; } // unused method end + virtual int32_t Pull(TableContext& context); + virtual int32_t Push(TableContext& context); + virtual int32_t initialize(); virtual int32_t initialize_shard() { return 0; } virtual int32_t initialize_value(); diff --git a/paddle/fluid/distributed/ps/table/common_table.h b/paddle/fluid/distributed/ps/table/common_table.h index bac826dfe0e20b42d5cc47467356bc5614383a44..3d291c0152246bffa748ea57cf1c96eff6f2f343 100644 --- a/paddle/fluid/distributed/ps/table/common_table.h +++ b/paddle/fluid/distributed/ps/table/common_table.h @@ -119,6 +119,9 @@ class BarrierTable : public Table { virtual void *get_shard(size_t shard_idx) { return 0; } + virtual int32_t Pull(TableContext &context) { return 0; } + virtual int32_t Push(TableContext &context) { return 0; } + int32_t pull_dense(float *values, size_t num) override { return 0; } int32_t push_dense(const float *values, size_t num) override { return 0; } diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_accessor.cc index 866bd8114ccea329123e16585c33366e759d5df8..43e143dca901bb8264f666a1e4fd89a52102d894 100644 --- a/paddle/fluid/distributed/ps/table/ctr_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.cc @@ -38,6 +38,16 @@ int CtrCommonAccessor::initialize() { return 0; } +void CtrCommonAccessor::GetTableInfo(AccessorInfo& info) { + info.dim = dim(); + info.size = size(); + info.select_dim = select_dim(); + info.select_size = select_size(); + info.update_dim = update_dim(); + info.update_size = update_size(); + info.fea_dim = fea_dim(); +} + size_t CtrCommonAccessor::dim() { return common_feature_value.dim(); } size_t CtrCommonAccessor::dim_size(size_t dim) { diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.h b/paddle/fluid/distributed/ps/table/ctr_accessor.h index 1e31fec04649b19882269fa9cce5f5d7fb4978c1..bc46217955a8a677a9e5e16f740e2636d633908f 100644 --- a/paddle/fluid/distributed/ps/table/ctr_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.h @@ -126,6 +126,7 @@ class CtrCommonAccessor : public ValueAccessor { virtual int initialize(); virtual ~CtrCommonAccessor() {} + virtual void GetTableInfo(AccessorInfo& info); // value维度 virtual size_t dim(); // value各个维度的size diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc index b07bcf70ad7af416fc66e036c0061f9556cc4eae..bccf1fdebafa03442047048825ef85207711b6b3 100644 --- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc @@ -37,6 +37,16 @@ int DownpourCtrDoubleAccessor::initialize() { return 0; } +void DownpourCtrDoubleAccessor::GetTableInfo(AccessorInfo& info) { + info.dim = dim(); + info.size = size(); + info.select_dim = select_dim(); + info.select_size = select_size(); + info.update_dim = update_dim(); + info.update_size = update_size(); + info.fea_dim = fea_dim(); +} + size_t DownpourCtrDoubleAccessor::dim() { auto embedx_dim = _config.embedx_dim(); return DownpourCtrDoubleFeatureValue::dim(embedx_dim); diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.h b/paddle/fluid/distributed/ps/table/ctr_double_accessor.h index d7c717ace098821c3434888c9ff0ad699c923867..d7942634e86003c484710aad1d969e4d6371cb7f 100644 --- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.h @@ -168,6 +168,7 @@ class DownpourCtrDoubleAccessor : public ValueAccessor { 
DownpourCtrDoubleAccessor() {} virtual ~DownpourCtrDoubleAccessor() {} virtual int initialize(); + virtual void GetTableInfo(AccessorInfo& info); // value维度 virtual size_t dim(); // value各个维度的size diff --git a/paddle/fluid/distributed/ps/table/depends/sparse_utils.h b/paddle/fluid/distributed/ps/table/depends/sparse_utils.h index 708f7786bf3b0975791fcc74dddf62d2eb01e450..98e0250acc4d686dbde561ffb03edeb96444c406 100644 --- a/paddle/fluid/distributed/ps/table/depends/sparse_utils.h +++ b/paddle/fluid/distributed/ps/table/depends/sparse_utils.h @@ -58,7 +58,7 @@ struct PullSparseValue { std::vector* offset_shard) const { offset_shard->reserve(numel_ / shard_num + 1); for (int x = 0; x < numel_; ++x) { - if (feasigns_[x] % shard_num == shard_id) { + if (int(feasigns_[x] % shard_num) == shard_id) { offset_shard->push_back(x); } } diff --git a/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.cc b/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.cc index 5f22c3a436f1f0b89e6289844a9c56fbe888625d..e8ca7430351de7cbdc1e98607d6d9b884b6a376a 100644 --- a/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.cc +++ b/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.cc @@ -37,6 +37,16 @@ int DownpourCtrAccessor::initialize() { return 0; } +void DownpourCtrAccessor::GetTableInfo(AccessorInfo& info) { + info.dim = dim(); + info.size = size(); + info.select_dim = select_dim(); + info.select_size = select_size(); + info.update_dim = update_dim(); + info.update_size = update_size(); + info.fea_dim = fea_dim(); +} + size_t DownpourCtrAccessor::dim() { auto embedx_dim = _config.embedx_dim(); return DownpourCtrFeatureValue::dim(embedx_dim); diff --git a/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.h b/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.h index 5de7b12e01f0d6e619ee14b852b7aa308ec3b497..11991ad044ff63353c9a898469ec915163c2dea9 100644 --- a/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.h +++ b/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.h @@ -160,6 +160,7 @@ class DownpourCtrAccessor : public ValueAccessor { virtual ~DownpourCtrAccessor() {} virtual int initialize(); + virtual void GetTableInfo(AccessorInfo& info); // value维度 virtual size_t dim(); // value各个维度的size diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h index 89c4fc15ae27998da3a3c7c3092baa9eee9846a0..3b43f99543fddabfaa24fc7da562203fc3f0d633 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h +++ b/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h @@ -48,6 +48,8 @@ class MemorySparseGeoTable : public SparseTable { virtual int32_t save(const std::string& path, const std::string& param) { return 0; } + virtual int32_t Pull(TableContext& context) { return 0; } + virtual int32_t Push(TableContext& context) { return 0; } virtual int32_t flush() { return 0; } virtual int32_t shrink(const std::string& param) { return 0; } virtual void clear() { return; } diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc index 7ce6e9005cf56ca295a6620a209551e303c112f3..98454ca747d314d76bb63706e853ded835df736a 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc @@ -390,6 +390,26 @@ std::pair MemorySparseTable::print_table_stat() { return {feasign_size, mf_size}; } +int32_t MemorySparseTable::Pull(TableContext& context) { + 
CHECK(context.value_type == Sparse); + if (context.use_ptr) { + char** pull_values = context.pull_context.ptr_values; + const uint64_t* keys = context.pull_context.keys; + return pull_sparse_ptr(pull_values, keys, context.num); + } else { + float* pull_values = context.pull_context.values; + const PullSparseValue& pull_value = context.pull_context.pull_value; + return pull_sparse(pull_values, pull_value); + } +} + +int32_t MemorySparseTable::Push(TableContext& context) { + CHECK(context.value_type == Sparse); + + const uint64_t* keys = context.push_context.keys; + return push_sparse(keys, context.push_context.ptr_values, context.num); +} + int32_t MemorySparseTable::pull_sparse(float* pull_values, const PullSparseValue& pull_value) { CostTimer timer("pserver_sparse_select_all"); diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_table.h index 5770f25f8f41dec286993d6b586959c8c0d3a0c0..d26c67319760da0496ae8a1c164adf0d5b63b1f2 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.h @@ -48,6 +48,9 @@ class MemorySparseTable : public SparseTable { virtual int32_t push_dense(const float* values, size_t num) { return 0; } // unused method end + virtual int32_t Pull(TableContext& context); + virtual int32_t Push(TableContext& context); + virtual int32_t initialize(); virtual int32_t initialize_shard() { return 0; } virtual int32_t initialize_value(); diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc index 60514b4e19ffaf63f285e25f1355660fabe58d48..5bc58bc5a1108b5f342036d9bd72c96287458401 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc @@ -61,6 +61,21 @@ int32_t SSDSparseTable::initialize() { return 0; } +int32_t SSDSparseTable::Pull(TableContext& context) { + CHECK(context.value_type == Sparse); + if (context.use_ptr) { + char** pull_values = context.pull_context.ptr_values; + const uint64_t* keys = context.pull_context.keys; + return pull_sparse_ptr(pull_values, keys, context.num); + } else { + float* pull_values = context.pull_context.values; + const PullSparseValue& pull_value = context.pull_context.pull_value; + return pull_sparse(pull_values, pull_value); + } +} + +int32_t SSDSparseTable::Push(TableContext& context) { return 0; } + int32_t SSDSparseTable::pull_sparse(float* pull_values, const PullSparseValue& pull_value) { auto shard_num = task_pool_size_; diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h index f5e8a7067e0e041f9913bef8e43ad8b35bdb2783..3a703d7d966d3e6026d13c0658f5979120cd2073 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h @@ -42,6 +42,9 @@ class SSDSparseTable : public CommonSparseTable { // exchange data virtual int32_t update_table(); + virtual int32_t Pull(TableContext& context); + virtual int32_t Push(TableContext& context); + virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value); virtual int32_t pull_sparse_ptr(char** pull_values, const uint64_t* keys, diff --git a/paddle/fluid/distributed/ps/table/table.h b/paddle/fluid/distributed/ps/table/table.h index da1bb668ccfa3c5f1a4f876a396847b6b3853772..2bd2a42b6c58f0753de86aa4e60ac7e0611bd7f7 100644 --- a/paddle/fluid/distributed/ps/table/table.h +++ 
b/paddle/fluid/distributed/ps/table/table.h @@ -32,6 +32,30 @@ namespace paddle { namespace distributed { + +enum ValueType { Sparse = 0, Dense = 1 }; + +struct PullContext { + const uint64_t *keys; + const PullSparseValue pull_value; + float *values; + char **ptr_values; +}; + +struct TablePushContext { + const uint64_t *keys; + const float *values; + const float **ptr_values; +}; + +struct TableContext { + ValueType value_type; + PullContext pull_context; + TablePushContext push_context; + size_t num; + bool use_ptr; +}; + class Table { public: Table() {} @@ -39,6 +63,8 @@ class Table { virtual int32_t initialize(const TableParameter &config, const FsClientParameter &fs_config); + virtual int32_t Pull(TableContext &context) = 0; + virtual int32_t Push(TableContext &context) = 0; virtual int32_t pull_dense(float *values, size_t num) = 0; virtual int32_t push_dense(const float *values, size_t num) = 0; // for push global_step diff --git a/paddle/fluid/distributed/ps/table/tensor_accessor.cc b/paddle/fluid/distributed/ps/table/tensor_accessor.cc index 70a580c1e53a931dc2affd29db01b72691c68a39..8c5349bff832caaa0a1b411723df8b3e9bcdcd4f 100644 --- a/paddle/fluid/distributed/ps/table/tensor_accessor.cc +++ b/paddle/fluid/distributed/ps/table/tensor_accessor.cc @@ -20,6 +20,16 @@ namespace distributed { int CommMergeAccessor::initialize() { return 0; } +void CommMergeAccessor::GetTableInfo(AccessorInfo &info) { + info.dim = dim(); + info.size = size(); + info.select_dim = select_dim(); + info.select_size = select_size(); + info.update_dim = update_dim(); + info.update_size = update_size(); + info.fea_dim = fea_dim(); +} + // value 维度 size_t CommMergeAccessor::dim() { return 0; } diff --git a/paddle/fluid/distributed/ps/table/tensor_accessor.h b/paddle/fluid/distributed/ps/table/tensor_accessor.h index 5041b8fdf8733eff676b5fce1a972e39182df48e..1873b743b44ec736f0470c3eff1f5b0280c235bf 100644 --- a/paddle/fluid/distributed/ps/table/tensor_accessor.h +++ b/paddle/fluid/distributed/ps/table/tensor_accessor.h @@ -30,6 +30,7 @@ class CommMergeAccessor : public ValueAccessor { CommMergeAccessor() {} virtual ~CommMergeAccessor() {} virtual int initialize(); + virtual void GetTableInfo(AccessorInfo &info); // value维度 virtual size_t dim(); // value各个维度的size diff --git a/paddle/fluid/distributed/ps/table/tensor_table.h b/paddle/fluid/distributed/ps/table/tensor_table.h index 64d81327acc55ba0655bfc33efaa0d9d9f59649e..23a62365c0f5a374f3820e2e790e6085cfda1c06 100644 --- a/paddle/fluid/distributed/ps/table/tensor_table.h +++ b/paddle/fluid/distributed/ps/table/tensor_table.h @@ -48,6 +48,8 @@ class TensorTable : public Table { TensorTable() {} virtual ~TensorTable() {} + virtual int32_t Pull(TableContext &context) { return 0; } + virtual int32_t Push(TableContext &context) { return 0; } int32_t pull_dense(float *values, size_t num) override { return 0; } int32_t push_dense(const float *values, size_t num) override { return 0; } diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.cc b/paddle/fluid/distributed/ps/wrapper/fleet.cc index 0588dbdf0fc61298d33eeb6db5b3de91a6de8256..c887cfeb71eef1c8b861b0d5958dca983e9feaaf 100644 --- a/paddle/fluid/distributed/ps/wrapper/fleet.cc +++ b/paddle/fluid/distributed/ps/wrapper/fleet.cc @@ -30,6 +30,32 @@ bool FleetWrapper::is_initialized_ = false; std::shared_ptr FleetWrapper::pserver_ptr_ = NULL; +void FleetWrapper::Stop() { StopServer(); } + +void FleetWrapper::Load(WrapperContext& context) { + auto table_id = context.table_id; + if (table_id >= 0 && context.meta != "") 
{ +    LoadSparseOnServer(context.path, context.meta, context.table_id); +    return; +  } +  if (table_id < 0) {  // load all +    LoadModel(context.path, context.mode); +  } else {  // load one table +    LoadModelOneTable(table_id, context.path, context.mode); +  } +  return; +} + +void FleetWrapper::Save(WrapperContext& context) { +  auto table_id = context.table_id; +  if (table_id < 0) { +    SaveModel(context.path, context.mode); +  } else { +    SaveModelOneTable(table_id, context.path, context.mode); +  } +  return; +} + void FleetWrapper::SetClient2ClientConfig(int request_timeout_ms, int connect_timeout_ms, int max_retry) { diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.h b/paddle/fluid/distributed/ps/wrapper/fleet.h index a535b8c5bf8f9bf72a2fa895b8a0fd82ffb2e0a3..d68c453c6d51b04131ce562cafddbbdb06ac0356 100644 --- a/paddle/fluid/distributed/ps/wrapper/fleet.h +++ b/paddle/fluid/distributed/ps/wrapper/fleet.h @@ -25,6 +25,7 @@ limitations under the License. */ #include "paddle/fluid/distributed/ps/service/communicator/communicator_common.h" #include "paddle/fluid/distributed/ps/service/ps_service/service.h" +#include "paddle/fluid/distributed/ps/wrapper/ps_wrapper.h" #include "paddle/fluid/framework/archive.h" #include "paddle/fluid/framework/io/fs.h" #include "paddle/fluid/framework/io/shell.h" @@ -54,7 +55,7 @@ using framework::Variable; using RpcCtxMap = std::unordered_map; -class FleetWrapper { +class FleetWrapper : public PSWrapper { public: virtual ~FleetWrapper() {} FleetWrapper() { @@ -68,7 +69,13 @@ class FleetWrapper { // pserver request max retry client2client_max_retry_ = 3; } + virtual int32_t Initialize(InitContext& context) { return 0; } + virtual void Stop() override; + + virtual void Load(WrapperContext& context) override; + + virtual void Save(WrapperContext& context) override; // set client to client communication config void SetClient2ClientConfig(int request_timeout_ms, int connect_timeout_ms, int max_retry); diff --git a/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h b/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h index c92835aa995adfd3158fc344b490efb2c3133ec0..ca02ad31195ef2cdee649f5348d3f735c38097b8 100755 --- a/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h +++ b/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h @@ -1,18 +1,84 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_WRAPPER_H_ -#define PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_WRAPPER_H_ - -#endif  // PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_WRAPPER_H_ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + +    http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/distributed/ps/service/communicator/communicator_common.h" +#include "paddle/fluid/distributed/ps/service/ps_service/service.h" +#include "paddle/fluid/framework/archive.h" +#include "paddle/fluid/framework/io/fs.h" +#include "paddle/fluid/framework/io/shell.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN + +namespace paddle { +namespace framework { +class Scope; +class SelectedRows; +class Variable; +}  // namespace framework +}  // namespace paddle + +namespace paddle { +namespace distributed { + +class PSCore; + +using framework::LoDTensor; +using framework::Scope; +using phi::SelectedRows; +using framework::Variable; + +using RpcCtxMap = std::unordered_map; + +struct WrapperContext { +  uint32_t table_id; +  const std::string path; +  const int mode; +  const std::string meta; +}; + +struct InitContext { +  const std::vector dev_ids;  // for gpu +}; + +class PSWrapper { + public: +  virtual ~PSWrapper() {} +  PSWrapper() {} +  // init server + +  virtual int32_t Initialize(InitContext& context) = 0; + +  virtual void Stop() = 0; + +  virtual void Load(WrapperContext& context) = 0; + +  virtual void Save(WrapperContext& context) = 0; +}; + +}  // end namespace distributed +}  // end namespace paddle diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index b8d59e8dd8b4c60e28323955effd232eb2b51945..df2cdc35626a8aa27899f7340fa14285299a11d1 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -2032,7 +2032,15 @@ static std::string GenerateSingleOpBase( const char* ATTRS_TEMPLATE = "  auto& %s = this->attr_map_;\n"; std::string grad_attrs_str = paddle::string::Sprintf(ATTRS_TEMPLATE, attrs_name); - +  if (fwd_op_type == "cast") { +    // switch in/out dtype +    const char* CAST_GRAD = +        "  auto temp_type = %s[\"in_dtype\"];\n" +        "  %s[\"in_dtype\"] = %s[\"out_dtype\"];\n" +        "  %s[\"out_dtype\"] = temp_type;\n"; +    grad_attrs_str += paddle::string::Sprintf(CAST_GRAD, attrs_name, attrs_name, +                                              attrs_name, attrs_name); +  } // Handle dynamic grad attributes grad_attrs_str += HandleDynamicGradAttributes(fwd_op_type, attrs_name); generated_grad_function_body += grad_attrs_str; diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index 163d25e85ce8c085087331c6e3273075aed5e5f4..038ad09aa4d8bef1282c024559b60d0eed7e48d1 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -93,7 +93,7 @@ void GradTensorHolder::add(size_t slot_id, size_t rank, // Create new tensor->impl and fill it with 1.0 if (t.defined()) { // Fill 1.0 -      buffer_[slot_id][rank] = paddle::experimental::ones_like(t); +      buffer_[slot_id][rank] = paddle::experimental::ones_like(t,
t.dtype()); } } } diff --git a/paddle/fluid/operators/pad3d_op.cc b/paddle/fluid/operators/pad3d_op.cc index 7b9a4ab1557bf0ce0ed2bd348298373f0ba672cf..e4952a243262bedc5477908cd8aedeb158e344b8 100644 --- a/paddle/fluid/operators/pad3d_op.cc +++ b/paddle/fluid/operators/pad3d_op.cc @@ -16,7 +16,9 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -24,734 +26,10 @@ namespace operators { using framework::Tensor; -template -void ConstPad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, const int out_d, - const int out_h, const int out_w, const T value) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - out_data[out_d * out_height * out_width + out_h * out_width + out_w] = - (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || - in_h >= in_height || in_w >= in_width) - ? value - : in_data[in_d * in_height * in_width + in_h * in_width + in_w]; -} - -template -void ConstPad3DFuncNDHWC(const T* in_data, T* out_data, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, const int out_h, - const int out_w, const T value) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - if (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || - in_h >= in_height || in_w >= in_width) { - for (int c = 0; c < channels; ++c) { - out_data[out_index + c] = value; - } - } else { - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - out_data[out_index + c] = in_data[in_index + c]; - } - } -} - -template -void ReflectPad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const int out_d, const int out_h, const int out_w, - const T value) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = std::max(in_d, -in_d); // reflect by 0 - in_d = std::min(in_d, 2 * in_depth - in_d - 2); // reflect by in_depth - in_h = std::max(in_h, -in_h); // reflect by 0 - in_h = std::min(in_h, 2 * in_height - in_h - 2); // reflect by in_height - in_w = std::max(in_w, -in_w); // reflect by 0 - in_w = std::min(in_w, 2 * in_width - in_w - 2); // reflect by in_width - - out_data[out_d * out_height * out_width + out_h * out_width + out_w] = - in_data[in_d * in_height * in_width + in_h * in_width + in_w]; -} - -template -void ReflectPad3DFuncNDHWC(const T* in_data, T* out_data, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, const int out_h, - const int out_w, const T value) { - int 
in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = std::max(in_d, -in_d); - in_d = std::min(in_d, 2 * in_depth - in_d - 2); - in_h = std::max(in_h, -in_h); - in_h = std::min(in_h, 2 * in_height - in_h - 2); - in_w = std::max(in_w, -in_w); - in_w = std::min(in_w, 2 * in_width - in_w - 2); - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - out_data[out_index + c] = in_data[in_index + c]; - } -} - -template -void ReplicatePad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const int out_d, const int out_h, const int out_w, - const T value) { - int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); - int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); - int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); - - out_data[out_d * out_height * out_width + out_h * out_width + out_w] = - in_data[in_d * in_height * in_width + in_h * in_width + in_w]; -} - -template -void ReplicatePad3DFuncNDHWC(const T* in_data, T* out_data, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, - const int out_h, const int out_w, const T value) { - int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); - int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); - int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - out_data[out_index + c] = in_data[in_index + c]; - } -} - -template -void CircularPad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const int out_d, const int out_h, const int out_w, - const T value) { - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - out_data[out_d * out_height * out_width + out_h * out_width + out_w] = - in_data[in_d * in_height * in_width + in_h * in_width + in_w]; -} - -template -void CircularPad3DFuncNDHWC(const T* in_data, T* out_data, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, - const int out_h, const int out_w, const T value) { - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - const int in_index = - (in_d * 
in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - out_data[out_index + c] = in_data[in_index + c]; - } -} - -template -void Pad3DNCDHW(const T* in_data, const int num, const int channels, - const int in_depth, const int in_height, const int in_width, - const int out_depth, const int out_height, const int out_width, - const int pad_front, const int pad_top, const int pad_left, - T value, T* out_data, - void (*pad_func)(const T*, T*, const int, const int, const int, - const int, const int, const int, const int, - const int, const int, const int, const int, - const int, const T)) { - for (int n = 0; n < num; ++n) { - for (int c = 0; c < channels; ++c) { - for (int out_d = 0; out_d < out_depth; ++out_d) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - pad_func(in_data, out_data, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, - pad_left, out_d, out_h, out_w, value); - } - } - } - in_data += in_depth * in_height * in_width; - out_data += out_depth * out_height * out_width; - } - } -} - -template -void Pad3DNDHWC(const T* in_data, const int num, const int channels, - const int in_depth, const int in_height, const int in_width, - const int out_depth, const int out_height, const int out_width, - const int pad_front, const int pad_top, const int pad_left, - T value, T* out_data, - void (*pad_func)(const T*, T*, const int, const int, const int, - const int, const int, const int, const int, - const int, const int, const int, const int, - const int, const int, const T)) { - for (int n = 0; n < num; ++n) { - for (int out_d = 0; out_d < out_depth; ++out_d) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - pad_func(in_data, out_data, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, - pad_left, out_d, out_h, out_w, value); - } - } - } - in_data += in_depth * in_height * in_width * channels; - out_data += out_depth * out_height * out_width * channels; - } -} - -template -void ConstPad3DGradNCDHW(T* d_in_data, const T* d_out_data, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, const int out_d, - const int out_h, const int out_w) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - if (!(in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || - in_h >= in_height || in_w >= in_width)) { - d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] = - d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; - } -} - -template -void ConstPad3DGradNDHWC(T* d_in_data, const T* d_out_data, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, const int out_h, - const int out_w) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - if (!(in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || - in_h >= in_height || in_w >= in_width)) { - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 
0; c < channels; ++c) { - d_in_data[in_index + c] = d_out_data[out_index + c]; - } - } -} - -template -void ReflectPad3DGradNCDHW(T* d_in_data, const T* d_out_data, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, const int out_h, - const int out_w) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = std::max(in_d, -in_d); // reflect by 0 - in_d = std::min(in_d, 2 * in_depth - in_d - 2); // reflect by in_depth - in_h = std::max(in_h, -in_h); // reflect by 0 - in_h = std::min(in_h, 2 * in_height - in_h - 2); // reflect by in_height - in_w = std::max(in_w, -in_w); // reflect by 0 - in_w = std::min(in_w, 2 * in_width - in_w - 2); // reflect by in_width - - d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] += - d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; -} - -template -void ReflectPad3DGradNDHWC(T* d_in_data, const T* d_out_data, - const int channels, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const int out_d, const int out_h, const int out_w) { - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = std::max(in_d, -in_d); - in_d = std::min(in_d, 2 * in_depth - in_d - 2); - in_h = std::max(in_h, -in_h); - in_h = std::min(in_h, 2 * in_height - in_h - 2); - in_w = std::max(in_w, -in_w); - in_w = std::min(in_w, 2 * in_width - in_w - 2); - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - d_in_data[in_index + c] += d_out_data[out_index + c]; - } -} - -template -void ReplicatePad3DGradNCDHW(T* d_in_data, const T* d_out_data, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, - const int out_h, const int out_w) { - int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); - int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); - int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); - - d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] += - d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; -} - -template -void ReplicatePad3DGradNDHWC(T* d_in_data, const T* d_out_data, - const int channels, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const int out_d, const int out_h, - const int out_w) { - int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); - int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); - int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - d_in_data[in_index + c] += d_out_data[out_index + c]; - } -} - -template -void 
CircularPad3DGradNCDHW(T* d_in_data, const T* d_out_data, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const int out_d, - const int out_h, const int out_w) { - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] += - d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; -} - -template -void CircularPad3DGradNDHWC(T* d_in_data, const T* d_out_data, - const int channels, const int in_depth, - const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const int out_d, const int out_h, const int out_w) { - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - const int out_index = - (out_d * out_height * out_width + out_h * out_width + out_w) * channels; - const int in_index = - (in_d * in_height * in_width + in_h * in_width + in_w) * channels; - for (int c = 0; c < channels; ++c) { - d_in_data[in_index + c] += d_out_data[out_index + c]; - } -} - -template -void Pad3DGradNCDHW(T* d_in_data, const int num, const int channels, - const int in_depth, const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, const int pad_top, - const int pad_left, const T* d_out_data, - void (*pad_func)(T*, const T*, const int, const int, - const int, const int, const int, const int, - const int, const int, const int, const int, - const int, const int)) { - for (int n = 0; n < num; ++n) { - for (int c = 0; c < channels; ++c) { - for (int out_d = 0; out_d < out_depth; ++out_d) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - pad_func(d_in_data, d_out_data, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, - pad_left, out_d, out_h, out_w); - } - } - } - d_in_data += in_depth * in_height * in_width; - d_out_data += out_depth * out_height * out_width; - } - } -} - -template -void Pad3DGradNDHWC(T* d_in_data, const int num, const int channels, - const int in_depth, const int in_height, const int in_width, - const int out_depth, const int out_height, - const int out_width, const int pad_front, const int pad_top, - const int pad_left, const T* d_out_data, - void (*pad_func)(T*, const T*, const int, const int, - const int, const int, const int, const int, - const int, const int, const int, const int, - const int, const int, const int)) { - for (int n = 0; n < num; ++n) { - for (int out_d = 0; out_d < out_depth; ++out_d) { - for (int out_h = 0; out_h < out_height; ++out_h) { - for (int out_w = 0; out_w < out_width; ++out_w) { - pad_func(d_in_data, d_out_data, channels, in_depth, in_height, - in_width, out_depth, out_height, out_width, pad_front, - pad_top, pad_left, out_d, out_h, out_w); - } - } - } - d_in_data += in_depth * in_height * in_width * channels; - d_out_data += out_depth * out_height * out_width * channels; - } -} - -static inline std::vector GetPaddings( - const framework::ExecutionContext& context) { 
- std::vector paddings(6); - auto* paddings_t = context.Input("Paddings"); - if (paddings_t) { - auto paddings_data = paddings_t->data(); - std::memcpy(paddings.data(), paddings_data, paddings.size() * sizeof(int)); - } else { - auto pads = context.Attr>("paddings"); - std::copy(pads.begin(), pads.end(), paddings.data()); - } - return paddings; -} - -template -class Pad3dCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::vector pads = GetPaddings(context); - auto mode = context.Attr("mode"); - auto data_format = context.Attr("data_format"); - T value = static_cast(context.Attr("value")); - - auto* x = context.Input("X"); - auto in_dims = x->dims(); - const T* in_data = x->data(); - - auto* out = context.Output("Out"); - if (data_format == "NCDHW") { - out->Resize({in_dims[0], in_dims[1], in_dims[2] + pads[4] + pads[5], - in_dims[3] + pads[2] + pads[3], - in_dims[4] + pads[0] + pads[1]}); - } else { - out->Resize({in_dims[0], in_dims[1] + pads[4] + pads[5], - in_dims[2] + pads[2] + pads[3], - in_dims[3] + pads[0] + pads[1], in_dims[4]}); - } - auto out_dims = out->dims(); - T* out_data = out->mutable_data(context.GetPlace()); - - int channels = in_dims[1]; - int in_depth = in_dims[2]; - int in_height = in_dims[3]; - int in_width = in_dims[4]; - int out_depth = out_dims[2]; - int out_height = out_dims[3]; - int out_width = out_dims[4]; - if (data_format == "NDHWC") { - channels = in_dims[4]; - in_depth = in_dims[1]; - in_height = in_dims[2]; - in_width = in_dims[3]; - out_depth = out_dims[1]; - out_height = out_dims[2]; - out_width = out_dims[3]; - } - - if (mode == "reflect") { - PADDLE_ENFORCE_GT(in_depth, pads[4], - platform::errors::InvalidArgument( - "The depth of Input(X)'s dimension should be " - "greater than pad_front" - " in reflect mode" - ", but received depth(%d) and pad_front(%d).", - in_depth, pads[4])); - PADDLE_ENFORCE_GT(in_depth, pads[5], - platform::errors::InvalidArgument( - "The depth of Input(X)'s dimension should be " - "greater than pad_back" - " in reflect mode" - ", but received depth(%d) and pad_back(%d).", - in_depth, pads[5])); - - PADDLE_ENFORCE_GT(in_height, pads[2], - platform::errors::InvalidArgument( - "The height of Input(X)'s dimension should be " - "greater than pad_top" - " in reflect mode" - ", but received depth(%d) and pad_top(%d).", - in_height, pads[2])); - PADDLE_ENFORCE_GT(in_height, pads[3], - platform::errors::InvalidArgument( - "The height of Input(X)'s dimension should be " - "greater than pad_bottom" - " in reflect mode" - ", but received depth(%d) and pad_bottom(%d).", - in_height, pads[3])); - - PADDLE_ENFORCE_GT(in_width, pads[0], - platform::errors::InvalidArgument( - "The width of Input(X)'s dimension should be " - "greater than pad_left" - " in reflect mode" - ", but received depth(%d) and pad_left(%d).", - in_width, pads[0])); - PADDLE_ENFORCE_GT(in_width, pads[1], - platform::errors::InvalidArgument( - "The width of Input(X)'s dimension should be " - "greater than pad_right" - " in reflect mode" - ", but received depth(%d) and pad_right(%d).", - in_width, pads[1])); - } else if (mode == "circular" || mode == "replicate") { - PADDLE_ENFORCE_NE(in_depth * in_height * in_width, 0, - platform::errors::InvalidArgument( - "The input tensor size can not be 0 for circular " - "or replicate padding mode.")); - } - - const int pad_left = pads[0]; - const int pad_top = pads[2]; - const int pad_front = pads[4]; - const int num = in_dims[0]; - if (data_format == 
"NCDHW") { - std::map - func_map; - - func_map["reflect"] = ReflectPad3DFuncNCDHW; - func_map["replicate"] = ReplicatePad3DFuncNCDHW; - func_map["circular"] = CircularPad3DFuncNCDHW; - func_map["constant"] = ConstPad3DFuncNCDHW; - Pad3DNCDHW(in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - value, out_data, func_map[mode]); - } else { - std::map - func_map; - - func_map["reflect"] = ReflectPad3DFuncNDHWC; - func_map["replicate"] = ReplicatePad3DFuncNDHWC; - func_map["circular"] = CircularPad3DFuncNDHWC; - func_map["constant"] = ConstPad3DFuncNDHWC; - Pad3DNDHWC(in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - value, out_data, func_map[mode]); - } - } -}; - -template -class Pad3dGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::vector pads = GetPaddings(context); - auto mode = context.Attr("mode"); - auto data_format = context.Attr("data_format"); - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_in = context.Output(framework::GradVarName("X")); - auto d_in_dims = d_in->dims(); - auto d_out_dims = d_out->dims(); - const T* d_out_data = d_out->data(); - T* d_in_data = d_in->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(context.template device_context(), - d_in, static_cast(0)); - const int pad_left = pads[0]; - const int pad_top = pads[2]; - const int pad_front = pads[4]; - const int num = d_in_dims[0]; - if (data_format == "NCDHW") { - const int channels = d_in_dims[1]; - const int in_depth = d_in_dims[2]; - const int in_height = d_in_dims[3]; - const int in_width = d_in_dims[4]; - const int out_depth = d_out_dims[2]; - const int out_height = d_out_dims[3]; - const int out_width = d_out_dims[4]; - - std::map - func_map; - - func_map["reflect"] = ReflectPad3DGradNCDHW; - func_map["replicate"] = ReplicatePad3DGradNCDHW; - func_map["circular"] = CircularPad3DGradNCDHW; - func_map["constant"] = ConstPad3DGradNCDHW; - - Pad3DGradNCDHW(d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, - pad_left, d_out_data, func_map[mode]); - } else { - const int channels = d_in_dims[4]; - const int in_depth = d_in_dims[1]; - const int in_height = d_in_dims[2]; - const int in_width = d_in_dims[3]; - const int out_depth = d_out_dims[1]; - const int out_height = d_out_dims[2]; - const int out_width = d_out_dims[3]; - - std::map - func_map; - - func_map["reflect"] = ReflectPad3DGradNDHWC; - func_map["replicate"] = ReplicatePad3DGradNDHWC; - func_map["circular"] = CircularPad3DGradNDHWC; - func_map["constant"] = ConstPad3DGradNDHWC; - - Pad3DGradNDHWC(d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, - pad_left, d_out_data, func_map[mode]); - } - } -}; - class Pad3dOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Pad3d"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Pad3d"); - - auto x_dim = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ(x_dim.size(), 5, - platform::errors::InvalidArgument( - "The size of Input(X)'s dimension should be equal to " - "5, but received %d. 
", - x_dim.size())); - - std::vector out_dims(x_dim.size()); - auto data_format = ctx->Attrs().Get("data_format"); - out_dims[0] = x_dim[0]; - if (ctx->HasInput("Paddings")) { - auto paddings_dim = ctx->GetInputDim("Paddings"); - PADDLE_ENFORCE_EQ(paddings_dim.size(), 1, - platform::errors::InvalidArgument( - "Size of Input(Paddings)'s dimension should be " - "equal to 1, but received %d.", - paddings_dim.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(paddings_dim[0], 6, - platform::errors::InvalidArgument( - "Shape of Input(Paddings) should be equal to " - "[6], but received [%d].", - paddings_dim[0])); - } - out_dims[1] = x_dim[1]; - out_dims[2] = x_dim[2]; - out_dims[3] = x_dim[3]; - } else { - auto paddings = ctx->Attrs().Get>("paddings"); - PADDLE_ENFORCE_EQ( - paddings.size(), 6, - platform::errors::InvalidArgument( - "Size of paddings should be equal to 4, but received %d.", - static_cast(paddings.size()))); - if (data_format == "NCDHW") { - out_dims[1] = x_dim[1]; // channel - out_dims[2] = ((!ctx->IsRuntime()) && (x_dim[2] < 0)) - ? x_dim[2] - : (x_dim[2] + paddings[4] + paddings[5]); // depth - - out_dims[3] = ((!ctx->IsRuntime()) && (x_dim[3] < 0)) - ? x_dim[3] - : (x_dim[3] + paddings[2] + paddings[3]); // height - - out_dims[4] = ((!ctx->IsRuntime()) && (x_dim[4] < 0)) - ? x_dim[4] - : (x_dim[4] + paddings[0] + paddings[1]); // width - } else { // NDHWC - out_dims[4] = x_dim[4]; // channel - - out_dims[1] = ((!ctx->IsRuntime()) && (x_dim[1] < 0)) - ? x_dim[1] - : (x_dim[1] + paddings[4] + paddings[5]); // depth - out_dims[2] = ((!ctx->IsRuntime()) && (x_dim[2] < 0)) - ? x_dim[2] - : (x_dim[2] + paddings[2] + paddings[3]); // height - out_dims[3] = ((!ctx->IsRuntime()) && (x_dim[3] < 0)) - ? x_dim[3] - : (x_dim[3] + paddings[0] + paddings[1]); // width - } - } - - ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -921,15 +199,14 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(Pad3dOpGradNoNeedBufferVarsInferer, "X"); namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(pad3d, Pad3dInferShapeFunctor, + PD_INFER_META(phi::Pad3dInferMeta)); + REGISTER_OPERATOR(pad3d, ops::Pad3dOp, ops::Pad3dOpMaker, ops::Pad3dOpGradMaker, - ops::Pad3dOpGradMaker); + ops::Pad3dOpGradMaker, + Pad3dInferShapeFunctor); REGISTER_OPERATOR(pad3d_grad, ops::Pad3dOpGrad, ops::Pad3dOpDoubleGradMaker, ops::Pad3dOpDoubleGradMaker, ops::Pad3dOpGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL(pad3d, ops::Pad3dCPUKernel, - ops::Pad3dCPUKernel, ops::Pad3dCPUKernel, - ops::Pad3dCPUKernel); -REGISTER_OP_CPU_KERNEL(pad3d_grad, ops::Pad3dGradCPUKernel, - ops::Pad3dGradCPUKernel); diff --git a/paddle/fluid/operators/pad3d_op.cu b/paddle/fluid/operators/pad3d_op.cu deleted file mode 100644 index 9ab0eb9d445da9b1d0f64cf4f7a721026dab5476..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pad3d_op.cu +++ /dev/null @@ -1,793 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using platform::PADDLE_CUDA_NUM_THREADS; - -using framework::Tensor; - -template -__global__ void Pad3DConstNCDHW(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T value, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int nc = index / out_width; - - const int out_w = index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - out_data[index] = - (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || - in_h >= in_height || in_w >= in_width) - ? value - : in_data[nc * in_depth * in_height * in_width + - in_d * in_height * in_width + in_h * in_width + in_w]; - } -} - -template -__global__ void Pad3DConstNDHWC(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T value, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int n = index / channels; - const int c = index % channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - const int in_d = out_d - pad_front; - const int in_h = out_h - pad_top; - const int in_w = out_w - pad_left; - - out_data[index] = - (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || - in_h >= in_height || in_w >= in_width) - ? 
value - : in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c]; - } -} - -template -__global__ void Pad3DReflectNCDHW(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int nc = index / out_width; - - const int out_w = index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = max(in_d, -in_d); // reflect by 0 - in_d = min(in_d, 2 * in_depth - in_d - 2); // reflect by in_depth - in_h = max(in_h, -in_h); // reflect by 0 - in_h = min(in_h, 2 * in_height - in_h - 2); // reflect by in_height - in_w = max(in_w, -in_w); // reflect by 0 - in_w = min(in_w, 2 * in_width - in_w - 2); // reflect by in_width - out_data[index] = - in_data[(nc * in_depth * in_height + in_d * in_height + in_h) * - in_width + - in_w]; - } -} - -template -__global__ void Pad3DReflectNDHWC(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int n = index / channels; - const int c = index % channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = max(in_d, -in_d); - in_d = min(in_d, 2 * in_depth - in_d - 2); - in_h = max(in_h, -in_h); - in_h = min(in_h, 2 * in_height - in_h - 2); - in_w = max(in_w, -in_w); - in_w = min(in_w, 2 * in_width - in_w - 2); - - out_data[index] = in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c]; - } -} - -template -__global__ void Pad3DReplicateNCDHW(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int nc = index / out_width; - - const int out_w = index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); - int in_h = min(in_height - 1, max(out_h - pad_top, 0)); - int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - - out_data[index] = - in_data[(nc * in_depth * in_height + in_d * in_height + in_h) * - in_width + - in_w]; - } -} - -template -__global__ void Pad3DReplicateNDHWC(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T* out_data) { - 
CUDA_KERNEL_LOOP(index, nthreads) { - int n = index / channels; - const int c = index % channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - - int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); - int in_h = min(in_height - 1, max(out_h - pad_top, 0)); - int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - - out_data[index] = in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c]; - } -} - -template -__global__ void Pad3DCircularNCDHW(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int nc = index / out_width; - - const int out_w = index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - out_data[index] = - in_data[(nc * in_depth * in_height + in_d * in_height + in_h) * - in_width + - in_w]; - } -} - -template -__global__ void Pad3DCircularNDHWC(const int nthreads, const T* in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, T* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - int n = index / channels; - const int c = index % channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - out_data[index] = in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c]; - } -} - -template -__global__ void Pad3DGradConstNCDHW(const int in_size, T* d_in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const T* d_out_data) { - CUDA_KERNEL_LOOP(in_index, in_size) { - const int in_w = in_index % in_width; - - int nc = in_index / in_width; - const int in_h = nc % in_height; - - nc /= in_height; - const int in_d = nc % in_depth; - - nc /= in_depth; - - const int out_d = in_d + pad_front; - const int out_h = in_h + pad_top; - const int out_w = in_w + pad_left; - d_in_data[in_index] = - d_out_data[nc * out_depth * out_height * out_width + - out_d * out_height * out_width + out_h * out_width + out_w]; - } -} - -template -__global__ void Pad3DGradConstNDHWC(const int in_size, T* d_in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int 
out_width, - const int pad_front, const int pad_top, - const int pad_left, const T* d_out_data) { - CUDA_KERNEL_LOOP(in_index, in_size) { - const int c = in_index % channels; - int n = in_index / channels; - - const int in_w = n % in_width; - n /= in_width; - - const int in_h = n % in_height; - n /= in_height; - - const int in_d = n % in_depth; - n /= in_depth; - - const int out_d = in_d + pad_front; - const int out_h = in_h + pad_top; - const int out_w = in_w + pad_left; - - d_in_data[in_index] = - d_out_data[n * out_depth * out_height * out_width * channels + - out_d * out_height * out_width * channels + - out_h * out_width * channels + out_w * channels + c]; - } -} - -template -__global__ void Pad3DGradReflectNCDHW(const int out_size, T* d_in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - int nc = out_index / out_width; - const int out_w = out_index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = max(in_d, -in_d); - in_h = max(in_h, -in_h); - in_w = max(in_w, -in_w); - - in_d = min(in_d, 2 * in_depth - in_d - 2); - in_h = min(in_h, 2 * in_height - in_h - 2); - in_w = min(in_w, 2 * in_width - in_w - 2); - - platform::CudaAtomicAdd( - &d_in_data[nc * in_depth * in_height * in_width + - in_d * in_height * in_width + in_h * in_width + in_w], - d_out_data[out_index]); - } -} - -template -__global__ void Pad3DGradReflectNDHWC(const int out_size, T* d_in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, const int out_width, - const int pad_front, const int pad_top, - const int pad_left, const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - const int c = out_index % channels; - int n = out_index / channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - - int in_d = out_d - pad_front; - int in_h = out_h - pad_top; - int in_w = out_w - pad_left; - - in_d = max(in_d, -in_d); - in_h = max(in_h, -in_h); - in_w = max(in_w, -in_w); - - in_d = min(in_d, in_depth * 2 - in_d - 2); - in_h = min(in_h, in_height * 2 - in_h - 2); - in_w = min(in_w, in_width * 2 - in_w - 2); - platform::CudaAtomicAdd( - &d_in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c], - d_out_data[out_index]); - } -} - -template -__global__ void Pad3DGradReplicateNCDHW( - const int out_size, T* d_in_data, const int num, const int channels, - const int in_depth, const int in_height, const int in_width, - const int out_depth, const int out_height, const int out_width, - const int pad_front, const int pad_top, const int pad_left, - const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - int nc = out_index / out_width; - const int out_w = out_index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - const int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); - const int in_h = min(in_height 
- 1, max(out_h - pad_top, 0)); - const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - - platform::CudaAtomicAdd( - &d_in_data[nc * in_depth * in_height * in_width + - in_d * in_height * in_width + in_h * in_width + in_w], - d_out_data[out_index]); - } -} - -template -__global__ void Pad3DGradReplicateNDHWC( - const int out_size, T* d_in_data, const int num, const int channels, - const int in_depth, const int in_height, const int in_width, - const int out_depth, const int out_height, const int out_width, - const int pad_front, const int pad_top, const int pad_left, - const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - const int c = out_index % channels; - int n = out_index / channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - - const int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); - const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); - const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - - platform::CudaAtomicAdd( - &d_in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c], - d_out_data[out_index]); - } -} - -template -__global__ void Pad3DGradCircularNCDHW(const int out_size, T* d_in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - int nc = out_index / out_width; - const int out_w = out_index % out_width; - const int out_h = nc % out_height; - nc /= out_height; - const int out_d = nc % out_depth; - nc /= out_depth; - - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - platform::CudaAtomicAdd( - &d_in_data[nc * in_depth * in_height * in_width + - in_d * in_height * in_width + in_h * in_width + in_w], - d_out_data[out_index]); - } -} - -template -__global__ void Pad3DGradCircularNDHWC(const int out_size, T* d_in_data, - const int num, const int channels, - const int in_depth, const int in_height, - const int in_width, const int out_depth, - const int out_height, - const int out_width, const int pad_front, - const int pad_top, const int pad_left, - const T* d_out_data) { - CUDA_KERNEL_LOOP(out_index, out_size) { - const int c = out_index % channels; - int n = out_index / channels; - const int out_w = n % out_width; - n /= out_width; - const int out_h = n % out_height; - n /= out_height; - const int out_d = n % out_depth; - n /= out_depth; - - int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; - int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; - int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - - platform::CudaAtomicAdd( - &d_in_data[n * in_depth * in_height * in_width * channels + - in_d * in_height * in_width * channels + - in_h * in_width * channels + in_w * channels + c], - d_out_data[out_index]); - } -} - -static inline std::vector GetPaddings( - const framework::ExecutionContext& context) { - std::vector paddings(6); - auto* paddings_data = context.Input("Paddings"); - if (paddings_data) { - Tensor pads; - framework::TensorCopySync(*paddings_data, 
platform::CPUPlace(), &pads); - auto pads_data = pads.data(); - std::memcpy(paddings.data(), pads_data, paddings.size() * sizeof(int)); - } else { - auto pads = context.Attr>("paddings"); - std::copy(pads.begin(), pads.end(), paddings.data()); - } - return paddings; -} - -template -class Pad3dCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::vector pads = GetPaddings(context); - auto mode = context.Attr("mode"); - auto data_format = context.Attr("data_format"); - T value = static_cast(context.Attr("value")); - - auto* x = context.Input("X"); - auto in_dims = x->dims(); - const T* in_data = x->data(); - auto* out = context.Output("Out"); - auto out_dims = out->dims(); - if (data_format == "NCDHW") { - out_dims[0] = in_dims[0]; - out_dims[1] = in_dims[1]; - out_dims[2] = in_dims[2] + pads[4] + pads[5]; - out_dims[3] = in_dims[3] + pads[2] + pads[3]; - out_dims[4] = in_dims[4] + pads[0] + pads[1]; - } else { - out_dims[0] = in_dims[0]; - out_dims[1] = in_dims[1] + pads[4] + pads[5]; - out_dims[2] = in_dims[2] + pads[2] + pads[3]; - out_dims[3] = in_dims[3] + pads[0] + pads[1]; - out_dims[4] = in_dims[4]; - } - T* out_data = out->mutable_data(out_dims, context.GetPlace()); - - int channels = in_dims[1]; - int in_depth = in_dims[2]; - int in_height = in_dims[3]; - int in_width = in_dims[4]; - int out_depth = out_dims[2]; - int out_height = out_dims[3]; - int out_width = out_dims[4]; - if (data_format == "NDHWC") { - channels = in_dims[4]; - in_depth = in_dims[1]; - in_height = in_dims[2]; - in_width = in_dims[3]; - out_depth = out_dims[1]; - out_height = out_dims[2]; - out_width = out_dims[3]; - } - - if (mode == "reflect") { - PADDLE_ENFORCE_GT(in_depth, pads[4], - platform::errors::InvalidArgument( - "The depth of Input(X)'s dimension should be " - "greater than pad_front" - " in reflect mode" - ", but received depth(%d) and pad_front(%d).", - in_depth, pads[4])); - PADDLE_ENFORCE_GT(in_depth, pads[5], - platform::errors::InvalidArgument( - "The depth of Input(X)'s dimension should be " - "greater than pad_back" - " in reflect mode" - ", but received depth(%d) and pad_back(%d).", - in_depth, pads[5])); - - PADDLE_ENFORCE_GT(in_height, pads[2], - platform::errors::InvalidArgument( - "The height of Input(X)'s dimension should be " - "greater than pad_top" - " in reflect mode" - ", but received depth(%d) and pad_top(%d).", - in_height, pads[2])); - PADDLE_ENFORCE_GT(in_height, pads[3], - platform::errors::InvalidArgument( - "The height of Input(X)'s dimension should be " - "greater than pad_bottom" - " in reflect mode" - ", but received depth(%d) and pad_bottom(%d).", - in_height, pads[3])); - - PADDLE_ENFORCE_GT(in_width, pads[0], - platform::errors::InvalidArgument( - "The width of Input(X)'s dimension should be " - "greater than pad_left" - " in reflect mode" - ", but received depth(%d) and pad_left(%d).", - in_width, pads[0])); - PADDLE_ENFORCE_GT(in_width, pads[1], - platform::errors::InvalidArgument( - "The width of Input(X)'s dimension should be " - "greater than pad_right" - " in reflect mode" - ", but received depth(%d) and pad_right(%d).", - in_width, pads[1])); - } else if (mode == "circular" || mode == "replicate") { - PADDLE_ENFORCE_NE(in_depth * in_height * in_width, 0, - platform::errors::InvalidArgument( - "The input tensor size can not be 0 for circular " - "or replicate padding mode.")); - } - - const int pad_left = pads[0]; - const int pad_top = pads[2]; - const int pad_front = pads[4]; - 
const int num = in_dims[0]; - - auto stream = context.cuda_device_context().stream(); - int block = PADDLE_CUDA_NUM_THREADS; - const int out_size = out->numel(); - int grid = (out_size + block - 1) / block; - - if (data_format == "NCDHW") { - if (mode == "reflect") { - Pad3DReflectNCDHW<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - out_data); - } else if (mode == "replicate") { - Pad3DReplicateNCDHW<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - out_data); - } else if (mode == "circular") { - Pad3DCircularNCDHW<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - out_data); - } else { - Pad3DConstNCDHW<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - value, out_data); - } - } else { - if (mode == "reflect") { - Pad3DReflectNDHWC<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - out_data); - } else if (mode == "replicate") { - Pad3DReplicateNDHWC<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - out_data); - } else if (mode == "circular") { - Pad3DCircularNDHWC<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - out_data); - } else { - Pad3DConstNDHWC<<>>( - out_size, in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - value, out_data); - } - } - } -}; - -template -class Pad3dGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::vector pads = GetPaddings(context); - auto mode = context.Attr("mode"); - auto data_format = context.Attr("data_format"); - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_in = context.Output(framework::GradVarName("X")); - auto d_in_dims = d_in->dims(); - auto d_out_dims = d_out->dims(); - const T* d_out_data = d_out->data(); - T* d_in_data = d_in->mutable_data(context.GetPlace()); - - phi::funcs::SetConstant set_zero; - set_zero(context.template device_context(), - d_in, static_cast(0)); - - const int pad_left = pads[0]; - const int pad_top = pads[2]; - const int pad_front = pads[4]; - - const int num = d_in_dims[0]; - - auto stream = context.cuda_device_context().stream(); - int block = PADDLE_CUDA_NUM_THREADS; - const int out_size = d_out->numel(); - const int in_size = d_in->numel(); - int grid = (out_size + block - 1) / block; - - if (data_format == "NCDHW") { - const int channels = d_in_dims[1]; - const int in_depth = d_in_dims[2]; - const int in_height = d_in_dims[3]; - const int in_width = d_in_dims[4]; - const int out_depth = d_out_dims[2]; - const int out_height = d_out_dims[3]; - const int out_width = d_out_dims[4]; - - if (mode == "reflect") { - Pad3DGradReflectNCDHW<<>>( - out_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } else if (mode == "replicate") { - Pad3DGradReplicateNCDHW<<>>( - out_size, d_in_data, num, channels, in_depth, in_height, in_width, - 
out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } else if (mode == "circular") { - Pad3DGradCircularNCDHW<<>>( - out_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } else { - grid = (in_size + block - 1) / block; - Pad3DGradConstNCDHW<<>>( - in_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } - } else { - const int channels = d_in_dims[4]; - const int in_depth = d_in_dims[1]; - const int in_height = d_in_dims[2]; - const int in_width = d_in_dims[3]; - const int out_depth = d_out_dims[1]; - const int out_height = d_out_dims[2]; - const int out_width = d_out_dims[3]; - if (mode == "reflect") { - Pad3DGradReflectNDHWC<<>>( - out_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } else if (mode == "replicate") { - Pad3DGradReplicateNDHWC<<>>( - out_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } else if (mode == "circular") { - Pad3DGradCircularNDHWC<<>>( - out_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } else { - grid = (in_size + block - 1) / block; - Pad3DGradConstNDHWC<<>>( - in_size, d_in_data, num, channels, in_depth, in_height, in_width, - out_depth, out_height, out_width, pad_front, pad_top, pad_left, - d_out_data); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(pad3d, ops::Pad3dCUDAKernel, - ops::Pad3dCUDAKernel, - ops::Pad3dCUDAKernel, ops::Pad3dCUDAKernel, - ops::Pad3dCUDAKernel); -REGISTER_OP_CUDA_KERNEL(pad3d_grad, ops::Pad3dGradCUDAKernel, - ops::Pad3dGradCUDAKernel, - ops::Pad3dGradCUDAKernel); diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 01186922b51af83e3e12dcd1f6b48c52c8fb8aa8..24bb2c9e65cde6909d5f934eb41b0a363044dde9 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -31,6 +31,7 @@ limitations under the License. 
*/ #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" + namespace paddle { namespace pybind { @@ -62,10 +63,10 @@ int TensorDtype2NumpyDtype(phi::DataType dtype) { return pybind11::detail::npy_api::NPY_INT32_; case phi::DataType::INT64: return pybind11::detail::npy_api::NPY_INT64_; - case phi::DataType::FLOAT16: - return pybind11::detail::NPY_FLOAT16_; case phi::DataType::BFLOAT16: return pybind11::detail::NPY_UINT16_; + case phi::DataType::FLOAT16: + return pybind11::detail::NPY_FLOAT16_; case phi::DataType::FLOAT32: return pybind11::detail::npy_api::NPY_FLOAT_; case phi::DataType::FLOAT64: diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index bcbc8f5262ce33a5c994f5acb5c19406162109d3..7c5f38744f8923805d1e9b521c58813293cdce9b 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -877,6 +877,77 @@ void PadInferMeta(const MetaTensor& input, out->set_dtype(input.dtype()); } +void Pad3dInferMeta(const MetaTensor& x, + const ScalarArray& paddings_scalar_array, + const std::string& mode, + float value, + const std::string& data_format, + MetaTensor* out, + MetaConfig config) { + auto x_dim = x.dims(); + PADDLE_ENFORCE_EQ(x_dim.size(), + 5, + errors::InvalidArgument( + "The size of Input(X)'s dimension should be equal to " + "5, but received %d. ", + x_dim.size())); + + std::vector out_dims(x_dim.size()); + out_dims[0] = x_dim[0]; + if (paddings_scalar_array.FromTensor()) { + if (config.is_runtime) { + PADDLE_ENFORCE_EQ( + paddings_scalar_array.GetData().size(), + 6, + errors::InvalidArgument("Shape of Input(Paddings) should be equal to " + "[6], but received [%d].", + paddings_scalar_array.GetData().size())); + } + out_dims[1] = x_dim[1]; + out_dims[2] = x_dim[2]; + out_dims[3] = x_dim[3]; + } else { + auto paddings = paddings_scalar_array.GetData(); + + PADDLE_ENFORCE_EQ( + paddings.size(), + 6, + errors::InvalidArgument( + "Size of paddings should be equal to 6, but received %d.", + static_cast(paddings.size()))); + if (data_format == "NCDHW") { + out_dims[1] = x_dim[1]; // channel + out_dims[2] = ((!config.is_runtime) && (x_dim[2] < 0)) + ? x_dim[2] + : (x_dim[2] + paddings[4] + paddings[5]); // depth + + out_dims[3] = ((!config.is_runtime) && (x_dim[3] < 0)) + ? x_dim[3] + : (x_dim[3] + paddings[2] + paddings[3]); // height + + out_dims[4] = ((!config.is_runtime) && (x_dim[4] < 0)) + ? x_dim[4] + : (x_dim[4] + paddings[0] + paddings[1]); // width + } else { // NDHWC + out_dims[4] = x_dim[4]; // channel + + out_dims[1] = ((!config.is_runtime) && (x_dim[1] < 0)) + ? x_dim[1] + : (x_dim[1] + paddings[4] + paddings[5]); // depth + out_dims[2] = ((!config.is_runtime) && (x_dim[2] < 0)) + ? x_dim[2] + : (x_dim[2] + paddings[2] + paddings[3]); // height + out_dims[3] = ((!config.is_runtime) && (x_dim[3] < 0)) + ? 
x_dim[3]
+                      : (x_dim[3] + paddings[0] + paddings[1]);  // width
+    }
+  }
+
+  out->set_dims(phi::make_ddim(out_dims));
+  out->set_dtype(x.dtype());
+  out->share_lod(x);
+}
+
 void PixelShuffleInferMeta(const MetaTensor& x,
                            int upscale_factor,
                            const std::string& data_format,
diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h
index 1b4ff7c69ac36f06042f5dc3678018c57aa1b0a1..d84283a65c4d19445dce61e9cf8ee6f70a83905f 100644
--- a/paddle/phi/infermeta/unary.h
+++ b/paddle/phi/infermeta/unary.h
@@ -147,6 +147,14 @@ void PadInferMeta(const MetaTensor& input,
                   MetaTensor* out,
                   MetaConfig config = MetaConfig());
 
+void Pad3dInferMeta(const MetaTensor& x,
+                    const ScalarArray& paddings,
+                    const std::string& mode,
+                    float value,
+                    const std::string& data_format,
+                    MetaTensor* out,
+                    MetaConfig config = MetaConfig());
+
 void PixelShuffleInferMeta(const MetaTensor& x,
                            int upscale_factor,
                            const std::string& data_format,
diff --git a/paddle/phi/kernels/cpu/pad3d_grad_kernel.cc b/paddle/phi/kernels/cpu/pad3d_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b1adb3e206da97918dc69ee4694de1be525b382e
--- /dev/null
+++ b/paddle/phi/kernels/cpu/pad3d_grad_kernel.cc
@@ -0,0 +1,480 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
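+
+// The grad kernel in this file mirrors the forward CPU kernel
+// (pad3d_kernel.cc): one per-element functor per padding mode ("constant",
+// "reflect", "replicate", "circular") and per layout (NCDHW / NDHWC), with
+// Pad3DGradNCDHW / Pad3DGradNDHWC walking the output grid and dispatching
+// through a function pointer picked from a func_map keyed by mode.
+//
+// Paddings are ordered {pad_left, pad_right, pad_top, pad_bottom,
+// pad_front, pad_back}. As an illustrative example with hypothetical
+// shapes: an NCDHW input of [2, 3, 4, 5, 6] padded with {1, 1, 2, 2, 3, 3}
+// gives an output of [2, 3, 4 + 3 + 3, 5 + 2 + 2, 6 + 1 + 1]
+// = [2, 3, 10, 9, 8], which is the relationship Pad3dInferMeta above
+// encodes.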
+ +#include "paddle/phi/kernels/pad3d_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void ConstPad3DGradNCDHW(T* d_in_data, + const T* d_out_data, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w) { + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + if (!(in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || + in_h >= in_height || in_w >= in_width)) { + d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] = + d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; + } +} + +template +void ConstPad3DGradNDHWC(T* d_in_data, + const T* d_out_data, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w) { + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + const int out_index = + (out_d * out_height * out_width + out_h * out_width + out_w) * channels; + if (!(in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || + in_h >= in_height || in_w >= in_width)) { + const int in_index = + (in_d * in_height * in_width + in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + d_in_data[in_index + c] = d_out_data[out_index + c]; + } + } +} + +template +void ReflectPad3DGradNCDHW(T* d_in_data, + const T* d_out_data, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w) { + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + in_d = std::max(in_d, -in_d); // reflect by 0 + in_d = std::min(in_d, 2 * in_depth - in_d - 2); // reflect by in_depth + in_h = std::max(in_h, -in_h); // reflect by 0 + in_h = std::min(in_h, 2 * in_height - in_h - 2); // reflect by in_height + in_w = std::max(in_w, -in_w); // reflect by 0 + in_w = std::min(in_w, 2 * in_width - in_w - 2); // reflect by in_width + + d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] += + d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; +} + +template +void ReflectPad3DGradNDHWC(T* d_in_data, + const T* d_out_data, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w) { + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + in_d = std::max(in_d, -in_d); + in_d = std::min(in_d, 2 * in_depth - in_d - 2); + in_h = std::max(in_h, -in_h); + in_h = std::min(in_h, 2 * in_height - in_h - 2); + in_w = std::max(in_w, -in_w); + in_w = std::min(in_w, 2 * in_width - in_w - 2); + + const int out_index = + (out_d * out_height * out_width + out_h * out_width + out_w) * channels; + const int in_index = + (in_d * 
in_height * in_width + in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + d_in_data[in_index + c] += d_out_data[out_index + c]; + } +} + +template +void ReplicatePad3DGradNCDHW(T* d_in_data, + const T* d_out_data, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w) { + int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); + int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); + int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); + + d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] += + d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; +} + +template +void ReplicatePad3DGradNDHWC(T* d_in_data, + const T* d_out_data, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w) { + int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); + int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); + int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); + + const int out_index = + (out_d * out_height * out_width + out_h * out_width + out_w) * channels; + const int in_index = + (in_d * in_height * in_width + in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + d_in_data[in_index + c] += d_out_data[out_index + c]; + } +} + +template +void CircularPad3DGradNCDHW(T* d_in_data, + const T* d_out_data, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w) { + int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; + int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; + int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; + d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] += + d_out_data[out_d * out_height * out_width + out_h * out_width + out_w]; +} + +template +void CircularPad3DGradNDHWC(T* d_in_data, + const T* d_out_data, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w) { + int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; + int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; + int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; + + const int out_index = + (out_d * out_height * out_width + out_h * out_width + out_w) * channels; + const int in_index = + (in_d * in_height * in_width + in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + d_in_data[in_index + c] += d_out_data[out_index + c]; + } +} + +template +void Pad3DGradNCDHW(T* d_in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int 
pad_left, + const T* d_out_data, + void (*pad_func)(T*, + const T*, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int)) { + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + for (int out_d = 0; out_d < out_depth; ++out_d) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + pad_func(d_in_data, + d_out_data, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + out_d, + out_h, + out_w); + } + } + } + d_in_data += in_depth * in_height * in_width; + d_out_data += out_depth * out_height * out_width; + } + } +} + +template +void Pad3DGradNDHWC(T* d_in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const T* d_out_data, + void (*pad_func)(T*, + const T*, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int)) { + for (int n = 0; n < num; ++n) { + for (int out_d = 0; out_d < out_depth; ++out_d) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + pad_func(d_in_data, + d_out_data, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + out_d, + out_h, + out_w); + } + } + } + d_in_data += in_depth * in_height * in_width * channels; + d_out_data += out_depth * out_height * out_width * channels; + } +} + +template +void Pad3dGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const ScalarArray& paddings, + const std::string& mode, + float pad_value, + const std::string& data_format, + DenseTensor* x_grad) { + std::vector pads = paddings.GetData(); + + auto* d_out = &out_grad; + auto* d_in = x_grad; + auto d_in_dims = d_in->dims(); + auto d_out_dims = d_out->dims(); + const T* d_out_data = d_out->data(); + T* d_in_data = dev_ctx.template Alloc(d_in); + phi::funcs::SetConstant()(dev_ctx, d_in, static_cast(0)); + + const int pad_left = pads[0]; + const int pad_top = pads[2]; + const int pad_front = pads[4]; + const int num = d_in_dims[0]; + if (data_format == "NCDHW") { + const int channels = d_in_dims[1]; + const int in_depth = d_in_dims[2]; + const int in_height = d_in_dims[3]; + const int in_width = d_in_dims[4]; + const int out_depth = d_out_dims[2]; + const int out_height = d_out_dims[3]; + const int out_width = d_out_dims[4]; + + std::map + func_map; + + func_map["reflect"] = ReflectPad3DGradNCDHW; + func_map["replicate"] = ReplicatePad3DGradNCDHW; + func_map["circular"] = CircularPad3DGradNCDHW; + func_map["constant"] = ConstPad3DGradNCDHW; + + Pad3DGradNCDHW(d_in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + d_out_data, + func_map[mode]); + } else { + const int channels = d_in_dims[4]; + const int in_depth = d_in_dims[1]; + const int in_height = d_in_dims[2]; + const int in_width = d_in_dims[3]; + const int out_depth = d_out_dims[1]; + const int out_height = d_out_dims[2]; + const int out_width = d_out_dims[3]; + + std::map + func_map; + + func_map["reflect"] = ReflectPad3DGradNDHWC; + 
func_map["replicate"] = ReplicatePad3DGradNDHWC; + func_map["circular"] = CircularPad3DGradNDHWC; + func_map["constant"] = ConstPad3DGradNDHWC; + + Pad3DGradNDHWC(d_in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + d_out_data, + func_map[mode]); + } +} +} // namespace phi + +PD_REGISTER_KERNEL( + pad3d_grad, CPU, ALL_LAYOUT, phi::Pad3dGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/pad3d_kernel.cc b/paddle/phi/kernels/cpu/pad3d_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..0dc01f485f3aa9ba6ff0b6d089887ff04847054c --- /dev/null +++ b/paddle/phi/kernels/cpu/pad3d_kernel.cc @@ -0,0 +1,578 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pad3d_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void ConstPad3DFuncNCDHW(const T* in_data, + T* out_data, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w, + const T value) { + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + out_data[out_d * out_height * out_width + out_h * out_width + out_w] = + (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || + in_h >= in_height || in_w >= in_width) + ? 
value + : in_data[in_d * in_height * in_width + in_h * in_width + in_w]; +} + +template +void ConstPad3DFuncNDHWC(const T* in_data, + T* out_data, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w, + const T value) { + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + const int out_index = + (out_d * out_height * out_width + out_h * out_width + out_w) * channels; + if (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || + in_h >= in_height || in_w >= in_width) { + for (int c = 0; c < channels; ++c) { + out_data[out_index + c] = value; + } + } else { + const int in_index = + (in_d * in_height * in_width + in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + out_data[out_index + c] = in_data[in_index + c]; + } + } +} + +template +void ReflectPad3DFuncNCDHW(const T* in_data, + T* out_data, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w, + const T value) { + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + in_d = std::max(in_d, -in_d); // reflect by 0 + in_d = std::min(in_d, 2 * in_depth - in_d - 2); // reflect by in_depth + in_h = std::max(in_h, -in_h); // reflect by 0 + in_h = std::min(in_h, 2 * in_height - in_h - 2); // reflect by in_height + in_w = std::max(in_w, -in_w); // reflect by 0 + in_w = std::min(in_w, 2 * in_width - in_w - 2); // reflect by in_width + + out_data[out_d * out_height * out_width + out_h * out_width + out_w] = + in_data[in_d * in_height * in_width + in_h * in_width + in_w]; +} + +template +void ReflectPad3DFuncNDHWC(const T* in_data, + T* out_data, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w, + const T value) { + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + in_d = std::max(in_d, -in_d); + in_d = std::min(in_d, 2 * in_depth - in_d - 2); + in_h = std::max(in_h, -in_h); + in_h = std::min(in_h, 2 * in_height - in_h - 2); + in_w = std::max(in_w, -in_w); + in_w = std::min(in_w, 2 * in_width - in_w - 2); + + const int out_index = + (out_d * out_height * out_width + out_h * out_width + out_w) * channels; + const int in_index = + (in_d * in_height * in_width + in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + out_data[out_index + c] = in_data[in_index + c]; + } +} + +template +void ReplicatePad3DFuncNCDHW(const T* in_data, + T* out_data, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w, + const T value) { + int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); + int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); + int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); + + 
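+  // Replicate (edge) padding: the source index is clamped into
+  // [0, dim - 1], so each out-of-range output element copies the nearest
+  // border element of the input.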
out_data[out_d * out_height * out_width + out_h * out_width + out_w] = + in_data[in_d * in_height * in_width + in_h * in_width + in_w]; +} + +template +void ReplicatePad3DFuncNDHWC(const T* in_data, + T* out_data, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w, + const T value) { + int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0)); + int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); + int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); + + const int out_index = + (out_d * out_height * out_width + out_h * out_width + out_w) * channels; + const int in_index = + (in_d * in_height * in_width + in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + out_data[out_index + c] = in_data[in_index + c]; + } +} + +template +void CircularPad3DFuncNCDHW(const T* in_data, + T* out_data, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w, + const T value) { + int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; + int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; + int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; + + out_data[out_d * out_height * out_width + out_h * out_width + out_w] = + in_data[in_d * in_height * in_width + in_h * in_width + in_w]; +} + +template +void CircularPad3DFuncNDHWC(const T* in_data, + T* out_data, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const int out_d, + const int out_h, + const int out_w, + const T value) { + int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; + int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; + int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; + + const int out_index = + (out_d * out_height * out_width + out_h * out_width + out_w) * channels; + const int in_index = + (in_d * in_height * in_width + in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + out_data[out_index + c] = in_data[in_index + c]; + } +} + +template +void Pad3DNCDHW(const T* in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + T value, + T* out_data, + void (*pad_func)(const T*, + T*, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const T)) { + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + for (int out_d = 0; out_d < out_depth; ++out_d) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + pad_func(in_data, + out_data, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + out_d, + out_h, + out_w, + value); + } + } + } + in_data += in_depth * 
in_height * in_width; + out_data += out_depth * out_height * out_width; + } + } +} + +template +void Pad3DNDHWC(const T* in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + T value, + T* out_data, + void (*pad_func)(const T*, + T*, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const T)) { + for (int n = 0; n < num; ++n) { + for (int out_d = 0; out_d < out_depth; ++out_d) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + pad_func(in_data, + out_data, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + out_d, + out_h, + out_w, + value); + } + } + } + in_data += in_depth * in_height * in_width * channels; + out_data += out_depth * out_height * out_width * channels; + } +} + +template +void Pad3dKernel(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& paddings, + const std::string& mode, + float pad_value, + const std::string& data_format, + DenseTensor* out) { + T value = static_cast(pad_value); + std::vector pads = paddings.GetData(); + + auto in_dims = x.dims(); + const T* in_data = x.data(); + + if (data_format == "NCDHW") { + out->Resize({in_dims[0], + in_dims[1], + in_dims[2] + pads[4] + pads[5], + in_dims[3] + pads[2] + pads[3], + in_dims[4] + pads[0] + pads[1]}); + } else { + out->Resize({in_dims[0], + in_dims[1] + pads[4] + pads[5], + in_dims[2] + pads[2] + pads[3], + in_dims[3] + pads[0] + pads[1], + in_dims[4]}); + } + + auto out_dims = out->dims(); + T* out_data = dev_ctx.template Alloc(out); + + int channels = in_dims[1]; + int in_depth = in_dims[2]; + int in_height = in_dims[3]; + int in_width = in_dims[4]; + int out_depth = out_dims[2]; + int out_height = out_dims[3]; + int out_width = out_dims[4]; + if (data_format == "NDHWC") { + channels = in_dims[4]; + in_depth = in_dims[1]; + in_height = in_dims[2]; + in_width = in_dims[3]; + out_depth = out_dims[1]; + out_height = out_dims[2]; + out_width = out_dims[3]; + } + + if (mode == "reflect") { + PADDLE_ENFORCE_GT( + in_depth, + pads[4], + errors::InvalidArgument("The depth of Input(X)'s dimension should be " + "greater than pad_front" + " in reflect mode" + ", but received depth(%d) and pad_front(%d).", + in_depth, + pads[4])); + PADDLE_ENFORCE_GT( + in_depth, + pads[5], + errors::InvalidArgument("The depth of Input(X)'s dimension should be " + "greater than pad_back" + " in reflect mode" + ", but received depth(%d) and pad_back(%d).", + in_depth, + pads[5])); + + PADDLE_ENFORCE_GT( + in_height, + pads[2], + errors::InvalidArgument("The height of Input(X)'s dimension should be " + "greater than pad_top" + " in reflect mode" + ", but received depth(%d) and pad_top(%d).", + in_height, + pads[2])); + PADDLE_ENFORCE_GT( + in_height, + pads[3], + errors::InvalidArgument("The height of Input(X)'s dimension should be " + "greater than pad_bottom" + " in reflect mode" + ", but received depth(%d) and pad_bottom(%d).", + in_height, + pads[3])); + + PADDLE_ENFORCE_GT( + in_width, + pads[0], + errors::InvalidArgument("The width of Input(X)'s dimension should be " + "greater than pad_left" + " in reflect mode" + ", but received depth(%d) and pad_left(%d).", + in_width, + 
pads[0])); + PADDLE_ENFORCE_GT( + in_width, + pads[1], + errors::InvalidArgument("The width of Input(X)'s dimension should be " + "greater than pad_right" + " in reflect mode" + ", but received depth(%d) and pad_right(%d).", + in_width, + pads[1])); + } else if (mode == "circular" || mode == "replicate") { + PADDLE_ENFORCE_NE(in_depth * in_height * in_width, + 0, + errors::InvalidArgument( + "The input tensor size can not be 0 for circular " + "or replicate padding mode.")); + } + + const int pad_left = pads[0]; + const int pad_top = pads[2]; + const int pad_front = pads[4]; + const int num = in_dims[0]; + if (data_format == "NCDHW") { + std::map + func_map; + + func_map["reflect"] = ReflectPad3DFuncNCDHW; + func_map["replicate"] = ReplicatePad3DFuncNCDHW; + func_map["circular"] = CircularPad3DFuncNCDHW; + func_map["constant"] = ConstPad3DFuncNCDHW; + Pad3DNCDHW(in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + value, + out_data, + func_map[mode]); + } else { + std::map + func_map; + + func_map["reflect"] = ReflectPad3DFuncNDHWC; + func_map["replicate"] = ReplicatePad3DFuncNDHWC; + func_map["circular"] = CircularPad3DFuncNDHWC; + func_map["constant"] = ConstPad3DFuncNDHWC; + Pad3DNDHWC(in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + value, + out_data, + func_map[mode]); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + pad3d, CPU, ALL_LAYOUT, phi::Pad3dKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..5ca8f3d73daded476052b77459bd68f2184ab290 --- /dev/null +++ b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu @@ -0,0 +1,507 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/pad3d_grad_kernel.h" + +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void Pad3DGradConstNCDHW(const int in_size, + T* d_in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const T* d_out_data) { + CUDA_KERNEL_LOOP(in_index, in_size) { + const int in_w = in_index % in_width; + + int nc = in_index / in_width; + const int in_h = nc % in_height; + + nc /= in_height; + const int in_d = nc % in_depth; + + nc /= in_depth; + + const int out_d = in_d + pad_front; + const int out_h = in_h + pad_top; + const int out_w = in_w + pad_left; + d_in_data[in_index] = + d_out_data[nc * out_depth * out_height * out_width + + out_d * out_height * out_width + out_h * out_width + out_w]; + } +} + +template +__global__ void Pad3DGradConstNDHWC(const int in_size, + T* d_in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const T* d_out_data) { + CUDA_KERNEL_LOOP(in_index, in_size) { + const int c = in_index % channels; + int n = in_index / channels; + + const int in_w = n % in_width; + n /= in_width; + + const int in_h = n % in_height; + n /= in_height; + + const int in_d = n % in_depth; + n /= in_depth; + + const int out_d = in_d + pad_front; + const int out_h = in_h + pad_top; + const int out_w = in_w + pad_left; + + d_in_data[in_index] = + d_out_data[n * out_depth * out_height * out_width * channels + + out_d * out_height * out_width * channels + + out_h * out_width * channels + out_w * channels + c]; + } +} + +template +__global__ void Pad3DGradReflectNCDHW(const int out_size, + T* d_in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const T* d_out_data) { + CUDA_KERNEL_LOOP(out_index, out_size) { + int nc = out_index / out_width; + const int out_w = out_index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + const int out_d = nc % out_depth; + nc /= out_depth; + + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + in_d = max(in_d, -in_d); + in_h = max(in_h, -in_h); + in_w = max(in_w, -in_w); + + in_d = min(in_d, 2 * in_depth - in_d - 2); + in_h = min(in_h, 2 * in_height - in_h - 2); + in_w = min(in_w, 2 * in_width - in_w - 2); + + paddle::platform::CudaAtomicAdd( + &d_in_data[nc * in_depth * in_height * in_width + + in_d * in_height * in_width + in_h * in_width + in_w], + d_out_data[out_index]); + } +} + +template +__global__ void Pad3DGradReflectNDHWC(const int out_size, + T* d_in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const 
int pad_left, + const T* d_out_data) { + CUDA_KERNEL_LOOP(out_index, out_size) { + const int c = out_index % channels; + int n = out_index / channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int out_d = n % out_depth; + n /= out_depth; + + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + in_d = max(in_d, -in_d); + in_h = max(in_h, -in_h); + in_w = max(in_w, -in_w); + + in_d = min(in_d, in_depth * 2 - in_d - 2); + in_h = min(in_h, in_height * 2 - in_h - 2); + in_w = min(in_w, in_width * 2 - in_w - 2); + paddle::platform::CudaAtomicAdd( + &d_in_data[n * in_depth * in_height * in_width * channels + + in_d * in_height * in_width * channels + + in_h * in_width * channels + in_w * channels + c], + d_out_data[out_index]); + } +} + +template +__global__ void Pad3DGradReplicateNCDHW(const int out_size, + T* d_in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const T* d_out_data) { + CUDA_KERNEL_LOOP(out_index, out_size) { + int nc = out_index / out_width; + const int out_w = out_index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + const int out_d = nc % out_depth; + nc /= out_depth; + + const int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); + const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); + const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); + + paddle::platform::CudaAtomicAdd( + &d_in_data[nc * in_depth * in_height * in_width + + in_d * in_height * in_width + in_h * in_width + in_w], + d_out_data[out_index]); + } +} + +template +__global__ void Pad3DGradReplicateNDHWC(const int out_size, + T* d_in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const T* d_out_data) { + CUDA_KERNEL_LOOP(out_index, out_size) { + const int c = out_index % channels; + int n = out_index / channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int out_d = n % out_depth; + n /= out_depth; + + const int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); + const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); + const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); + + paddle::platform::CudaAtomicAdd( + &d_in_data[n * in_depth * in_height * in_width * channels + + in_d * in_height * in_width * channels + + in_h * in_width * channels + in_w * channels + c], + d_out_data[out_index]); + } +} + +template +__global__ void Pad3DGradCircularNCDHW(const int out_size, + T* d_in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const T* d_out_data) { + CUDA_KERNEL_LOOP(out_index, out_size) { + int nc = out_index / out_width; + const int out_w = out_index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + const int out_d = nc % out_depth; + nc /= out_depth; + + int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; + int in_h = ((out_h - 
pad_top) % in_height + in_height) % in_height; + int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; + + paddle::platform::CudaAtomicAdd( + &d_in_data[nc * in_depth * in_height * in_width + + in_d * in_height * in_width + in_h * in_width + in_w], + d_out_data[out_index]); + } +} + +template +__global__ void Pad3DGradCircularNDHWC(const int out_size, + T* d_in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + const T* d_out_data) { + CUDA_KERNEL_LOOP(out_index, out_size) { + const int c = out_index % channels; + int n = out_index / channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int out_d = n % out_depth; + n /= out_depth; + + int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; + int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; + int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; + + paddle::platform::CudaAtomicAdd( + &d_in_data[n * in_depth * in_height * in_width * channels + + in_d * in_height * in_width * channels + + in_h * in_width * channels + in_w * channels + c], + d_out_data[out_index]); + } +} + +template +void Pad3dGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const ScalarArray& paddings, + const std::string& mode, + float pad_value, + const std::string& data_format, + DenseTensor* x_grad) { + std::vector pads = paddings.GetData(); + auto* d_out = &out_grad; + auto* d_in = x_grad; + auto d_in_dims = d_in->dims(); + auto d_out_dims = d_out->dims(); + const T* d_out_data = d_out->data(); + T* d_in_data = dev_ctx.template Alloc(d_in); + + phi::funcs::SetConstant()(dev_ctx, d_in, static_cast(0)); + + const int pad_left = pads[0]; + const int pad_top = pads[2]; + const int pad_front = pads[4]; + + const int num = d_in_dims[0]; + + auto stream = dev_ctx.stream(); + int block = PADDLE_CUDA_NUM_THREADS; + const int out_size = d_out->numel(); + const int in_size = d_in->numel(); + int grid = (out_size + block - 1) / block; + + if (data_format == "NCDHW") { + const int channels = d_in_dims[1]; + const int in_depth = d_in_dims[2]; + const int in_height = d_in_dims[3]; + const int in_width = d_in_dims[4]; + const int out_depth = d_out_dims[2]; + const int out_height = d_out_dims[3]; + const int out_width = d_out_dims[4]; + + if (mode == "reflect") { + Pad3DGradReflectNCDHW<<>>(out_size, + d_in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + d_out_data); + } else if (mode == "replicate") { + Pad3DGradReplicateNCDHW<<>>(out_size, + d_in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + d_out_data); + } else if (mode == "circular") { + Pad3DGradCircularNCDHW<<>>(out_size, + d_in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + d_out_data); + } else { + grid = (in_size + block - 1) / block; + Pad3DGradConstNCDHW<<>>(in_size, + d_in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + d_out_data); + } + } else { + const int channels = d_in_dims[4]; + const int 
in_depth = d_in_dims[1]; + const int in_height = d_in_dims[2]; + const int in_width = d_in_dims[3]; + const int out_depth = d_out_dims[1]; + const int out_height = d_out_dims[2]; + const int out_width = d_out_dims[3]; + if (mode == "reflect") { + Pad3DGradReflectNDHWC<<>>(out_size, + d_in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + d_out_data); + } else if (mode == "replicate") { + Pad3DGradReplicateNDHWC<<>>(out_size, + d_in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + d_out_data); + } else if (mode == "circular") { + Pad3DGradCircularNDHWC<<>>(out_size, + d_in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + d_out_data); + } else { + grid = (in_size + block - 1) / block; + Pad3DGradConstNDHWC<<>>(in_size, + d_in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + d_out_data); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + pad3d_grad, GPU, ALL_LAYOUT, phi::Pad3dGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/pad3d_kernel.cu b/paddle/phi/kernels/gpu/pad3d_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..2cef77cc0eef96d910d1b4f8c1b0ba736034063a --- /dev/null +++ b/paddle/phi/kernels/gpu/pad3d_kernel.cu @@ -0,0 +1,588 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pad3d_kernel.h" + +#include + +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void Pad3DConstNCDHW(const int nthreads, + const T* in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + T value, + T* out_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int nc = index / out_width; + + const int out_w = index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + const int out_d = nc % out_depth; + nc /= out_depth; + + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + out_data[index] = + (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || + in_h >= in_height || in_w >= in_width) + ? 
value + : in_data[nc * in_depth * in_height * in_width + + in_d * in_height * in_width + in_h * in_width + in_w]; + } +} + +template +__global__ void Pad3DConstNDHWC(const int nthreads, + const T* in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + T value, + T* out_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int n = index / channels; + const int c = index % channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int out_d = n % out_depth; + n /= out_depth; + const int in_d = out_d - pad_front; + const int in_h = out_h - pad_top; + const int in_w = out_w - pad_left; + + out_data[index] = + (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth || + in_h >= in_height || in_w >= in_width) + ? value + : in_data[n * in_depth * in_height * in_width * channels + + in_d * in_height * in_width * channels + + in_h * in_width * channels + in_w * channels + c]; + } +} + +template +__global__ void Pad3DReflectNCDHW(const int nthreads, + const T* in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + T* out_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int nc = index / out_width; + + const int out_w = index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + const int out_d = nc % out_depth; + nc /= out_depth; + + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + in_d = max(in_d, -in_d); // reflect by 0 + in_d = min(in_d, 2 * in_depth - in_d - 2); // reflect by in_depth + in_h = max(in_h, -in_h); // reflect by 0 + in_h = min(in_h, 2 * in_height - in_h - 2); // reflect by in_height + in_w = max(in_w, -in_w); // reflect by 0 + in_w = min(in_w, 2 * in_width - in_w - 2); // reflect by in_width + out_data[index] = + in_data[(nc * in_depth * in_height + in_d * in_height + in_h) * + in_width + + in_w]; + } +} + +template +__global__ void Pad3DReflectNDHWC(const int nthreads, + const T* in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + T* out_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int n = index / channels; + const int c = index % channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int out_d = n % out_depth; + n /= out_depth; + int in_d = out_d - pad_front; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + + in_d = max(in_d, -in_d); + in_d = min(in_d, 2 * in_depth - in_d - 2); + in_h = max(in_h, -in_h); + in_h = min(in_h, 2 * in_height - in_h - 2); + in_w = max(in_w, -in_w); + in_w = min(in_w, 2 * in_width - in_w - 2); + + out_data[index] = in_data[n * in_depth * in_height * in_width * channels + + in_d * in_height * in_width * channels + + in_h * in_width * channels + in_w * channels + c]; + } +} + +template +__global__ void Pad3DReplicateNCDHW(const int nthreads, + const T* in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, 
+ const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + T* out_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int nc = index / out_width; + + const int out_w = index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + const int out_d = nc % out_depth; + nc /= out_depth; + + int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); + int in_h = min(in_height - 1, max(out_h - pad_top, 0)); + int in_w = min(in_width - 1, max(out_w - pad_left, 0)); + + out_data[index] = + in_data[(nc * in_depth * in_height + in_d * in_height + in_h) * + in_width + + in_w]; + } +} + +template +__global__ void Pad3DReplicateNDHWC(const int nthreads, + const T* in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + T* out_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int n = index / channels; + const int c = index % channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int out_d = n % out_depth; + n /= out_depth; + + int in_d = min(in_depth - 1, max(out_d - pad_front, 0)); + int in_h = min(in_height - 1, max(out_h - pad_top, 0)); + int in_w = min(in_width - 1, max(out_w - pad_left, 0)); + + out_data[index] = in_data[n * in_depth * in_height * in_width * channels + + in_d * in_height * in_width * channels + + in_h * in_width * channels + in_w * channels + c]; + } +} + +template +__global__ void Pad3DCircularNCDHW(const int nthreads, + const T* in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + T* out_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int nc = index / out_width; + + const int out_w = index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + const int out_d = nc % out_depth; + nc /= out_depth; + + int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; + int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; + int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; + + out_data[index] = + in_data[(nc * in_depth * in_height + in_d * in_height + in_h) * + in_width + + in_w]; + } +} + +template +__global__ void Pad3DCircularNDHWC(const int nthreads, + const T* in_data, + const int num, + const int channels, + const int in_depth, + const int in_height, + const int in_width, + const int out_depth, + const int out_height, + const int out_width, + const int pad_front, + const int pad_top, + const int pad_left, + T* out_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int n = index / channels; + const int c = index % channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int out_d = n % out_depth; + n /= out_depth; + + int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth; + int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; + int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; + + out_data[index] = in_data[n * in_depth * in_height * in_width * channels + + in_d * in_height * in_width * channels + + in_h * in_width * channels + in_w * channels + c]; + } +} + +template +void 
Pad3dKernel(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& paddings, + const std::string& mode, + float pad_value, + const std::string& data_format, + DenseTensor* out) { + std::vector pads = paddings.GetData(); + + auto in_dims = x.dims(); + const T* in_data = x.data(); + auto out_dims = out->dims(); + T value = static_cast(pad_value); + + if (data_format == "NCDHW") { + out_dims[0] = in_dims[0]; + out_dims[1] = in_dims[1]; + out_dims[2] = in_dims[2] + pads[4] + pads[5]; + out_dims[3] = in_dims[3] + pads[2] + pads[3]; + out_dims[4] = in_dims[4] + pads[0] + pads[1]; + } else { + out_dims[0] = in_dims[0]; + out_dims[1] = in_dims[1] + pads[4] + pads[5]; + out_dims[2] = in_dims[2] + pads[2] + pads[3]; + out_dims[3] = in_dims[3] + pads[0] + pads[1]; + out_dims[4] = in_dims[4]; + } + out->Resize(out_dims); + T* out_data = dev_ctx.template Alloc(out); + + int channels = in_dims[1]; + int in_depth = in_dims[2]; + int in_height = in_dims[3]; + int in_width = in_dims[4]; + int out_depth = out_dims[2]; + int out_height = out_dims[3]; + int out_width = out_dims[4]; + if (data_format == "NDHWC") { + channels = in_dims[4]; + in_depth = in_dims[1]; + in_height = in_dims[2]; + in_width = in_dims[3]; + out_depth = out_dims[1]; + out_height = out_dims[2]; + out_width = out_dims[3]; + } + + if (mode == "reflect") { + PADDLE_ENFORCE_GT( + in_depth, + pads[4], + errors::InvalidArgument("The depth of Input(X)'s dimension should be " + "greater than pad_front" + " in reflect mode" + ", but received depth(%d) and pad_front(%d).", + in_depth, + pads[4])); + PADDLE_ENFORCE_GT( + in_depth, + pads[5], + errors::InvalidArgument("The depth of Input(X)'s dimension should be " + "greater than pad_back" + " in reflect mode" + ", but received depth(%d) and pad_back(%d).", + in_depth, + pads[5])); + + PADDLE_ENFORCE_GT( + in_height, + pads[2], + errors::InvalidArgument("The height of Input(X)'s dimension should be " + "greater than pad_top" + " in reflect mode" + ", but received depth(%d) and pad_top(%d).", + in_height, + pads[2])); + PADDLE_ENFORCE_GT( + in_height, + pads[3], + errors::InvalidArgument("The height of Input(X)'s dimension should be " + "greater than pad_bottom" + " in reflect mode" + ", but received depth(%d) and pad_bottom(%d).", + in_height, + pads[3])); + + PADDLE_ENFORCE_GT( + in_width, + pads[0], + errors::InvalidArgument("The width of Input(X)'s dimension should be " + "greater than pad_left" + " in reflect mode" + ", but received depth(%d) and pad_left(%d).", + in_width, + pads[0])); + PADDLE_ENFORCE_GT( + in_width, + pads[1], + errors::InvalidArgument("The width of Input(X)'s dimension should be " + "greater than pad_right" + " in reflect mode" + ", but received depth(%d) and pad_right(%d).", + in_width, + pads[1])); + } else if (mode == "circular" || mode == "replicate") { + PADDLE_ENFORCE_NE(in_depth * in_height * in_width, + 0, + errors::InvalidArgument( + "The input tensor size can not be 0 for circular " + "or replicate padding mode.")); + } + + const int pad_left = pads[0]; + const int pad_top = pads[2]; + const int pad_front = pads[4]; + const int num = in_dims[0]; + + auto stream = dev_ctx.stream(); + int block = PADDLE_CUDA_NUM_THREADS; + const int out_size = out->numel(); + int grid = (out_size + block - 1) / block; + + if (data_format == "NCDHW") { + if (mode == "reflect") { + Pad3DReflectNCDHW<<>>(out_size, + in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + out_data); + } 
else if (mode == "replicate") { + Pad3DReplicateNCDHW<<>>(out_size, + in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + out_data); + } else if (mode == "circular") { + Pad3DCircularNCDHW<<>>(out_size, + in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + out_data); + } else { + Pad3DConstNCDHW<<>>(out_size, + in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + value, + out_data); + } + } else { + if (mode == "reflect") { + Pad3DReflectNDHWC<<>>(out_size, + in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + out_data); + } else if (mode == "replicate") { + Pad3DReplicateNDHWC<<>>(out_size, + in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + out_data); + } else if (mode == "circular") { + Pad3DCircularNDHWC<<>>(out_size, + in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + out_data); + } else { + Pad3DConstNDHWC<<>>(out_size, + in_data, + num, + channels, + in_depth, + in_height, + in_width, + out_depth, + out_height, + out_width, + pad_front, + pad_top, + pad_left, + value, + out_data); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(pad3d, + GPU, + ALL_LAYOUT, + phi::Pad3dKernel, + phi::dtype::float16, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/pad3d_grad_kernel.h b/paddle/phi/kernels/pad3d_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..38f1e5335e8c240058fb3b52a8ae59a0c438b61c --- /dev/null +++ b/paddle/phi/kernels/pad3d_grad_kernel.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void Pad3dGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const ScalarArray& paddings, + const std::string& mode, + float pad_value, + const std::string& data_format, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/pad3d_kernel.h b/paddle/phi/kernels/pad3d_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..d8876c3e7bc74f6f03413f00279bfaa355907c6b --- /dev/null +++ b/paddle/phi/kernels/pad3d_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void Pad3dKernel(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& paddings, + const std::string& mode, + float pad_value, + const std::string& data_format, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/pad3d_sig.cc b/paddle/phi/ops/compat/pad3d_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..c43b98fa27e6baef55ad1dcbc11cb764ba9cb944 --- /dev/null +++ b/paddle/phi/ops/compat/pad3d_sig.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature Pad3dOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("Paddings")) { + return KernelSignature( + "pad3d", {"X"}, {"Paddings", "mode", "value", "data_format"}, {"Out"}); + } + + return KernelSignature( + "pad3d", {"X"}, {"paddings", "mode", "value", "data_format"}, {"Out"}); +} + +KernelSignature Pad3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("Paddings")) { + return KernelSignature("pad3d_grad", + {"X", GradVarName("Out")}, + {"Paddings", "mode", "value", "data_format"}, + {GradVarName("X")}); + } + return KernelSignature("pad3d_grad", + {"X", GradVarName("Out")}, + {"paddings", "mode", "value", "data_format"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(pad3d, phi::Pad3dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pad3d_grad, phi::Pad3dGradOpArgumentMapping); diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 9439982858530e1e81156be4b32ef2d91dc4a33a..b4c5a36d288b7ee0f6e771d72b21bd54d1e3d669 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -612,7 +612,7 @@ def grad(outputs, if no_grad_vars is None: no_grad_vars = [] - elif isinstance(no_grad_vars, core.VarBase): + elif isinstance(no_grad_vars, (core.VarBase, core.eager.Tensor)): no_grad_vars = [no_grad_vars] elif isinstance(no_grad_vars, core.eager.Tensor): no_grad_vars = [no_grad_vars] @@ -718,13 +718,13 @@ def to_variable(value, name=None, zero_copy=None, dtype=None): y.shape # [3L, 2L] """ - support_type = (list, tuple, np.ndarray, core.VarBase, framework.Variable, - core.Tensor, core.LoDTensor) + support_type = (list, tuple, np.ndarray, core.eager.Tensor, core.VarBase, + framework.Variable, core.Tensor, core.LoDTensor) if 
not isinstance(value, support_type): raise TypeError( "The type of 'value' in fluid.dygraph.to_variable must be %s, but received %s." % (support_type, type(value))) - if isinstance(value, (core.VarBase, framework.Variable)): + if isinstance(value, (core.eager.Tensor, core.VarBase, framework.Variable)): return value elif isinstance(value, (core.Tensor, core.LoDTensor)): return core.VarBase(value) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 2b67a2029727f6b8f917239094a1b906d5cd6a62..af30b2b2444b44f1b27e8f277eb380557255517d 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -28,6 +28,7 @@ from .math_op_patch import monkey_patch_math_varbase from .parallel import scale_loss from paddle.fluid.data_feeder import convert_dtype, _PADDLE_DTYPE_2_NUMPY_DTYPE import paddle.utils.deprecated as deprecated +from paddle import _C_ops class TensorHookRemoveHelper(object): @@ -782,7 +783,7 @@ def monkey_patch_varbase(): @framework.dygraph_only def clone(self): - return _C_ops_.assign(self) + return _C_ops.assign(self) @framework.dygraph_only def value(self): diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index d0a94238a7aeb21f9d1baf8154cbe3b7f2b77a72..fb787215d910e9924622147b86c328af5e1994de 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -316,7 +316,8 @@ def _dygraph_not_support_(func): def _dygraph_only_(func): def __impl__(*args, **kwargs): - assert in_dygraph_mode( + assert ( + in_dygraph_mode() or _in_eager_mode() ), "We only support '%s()' in dynamic graph mode, please call 'paddle.disable_static()' to enter dynamic graph mode." % func.__name__ return func(*args, **kwargs) diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 9348b0b50a1c08e7103dc3cc32169f4a6a40591c..c45045509201df89d6a07b8c0aadc7ef9130cf2f 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -886,6 +886,7 @@ class TestDistributeFpnProposals(LayerTest): refer_level=4, refer_scale=224, rois_num=rois_num_dy) + print(type(multi_rois_dy)) output_dy = multi_rois_dy + [restore_ind_dy] + rois_num_per_level_dy output_dy_np = [] for output in output_dy: @@ -973,4 +974,5 @@ class TestBoxDecoderAndAssign(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily.py index cc2e14d6d6c2ef237351e372c75ca7e700de3fbf..341ec852c52197f689870f0a6c45141ebe318301 100644 --- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily.py +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily.py @@ -50,3 +50,7 @@ class TestExponentialFamilyException(unittest.TestCase): def test_entropy_exception(self): with self.assertRaises(NotImplementedError): paddle.distribution.ExponentialFamily.entropy(self.dist) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/distribution/test_kl.py b/python/paddle/fluid/tests/unittests/distribution/test_kl.py index a1413722446e287688d7e120a3ef31ea67cc798b..55358380c8b23fdfd512b259aca06901d5623e38 100644 --- a/python/paddle/fluid/tests/unittests/distribution/test_kl.py +++ 
b/python/paddle/fluid/tests/unittests/distribution/test_kl.py @@ -112,3 +112,7 @@ class TestKLExpfamilyExpFamily(unittest.TestCase): kl._kl_expfamily_expfamily(self.p, self.q), rtol=config.RTOL.get(config.DEFAULT_DTYPE), atol=config.ATOL.get(config.DEFAULT_DTYPE)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_concat.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_concat.py index 737c085dde6acf5e3645b2127f42b1d8b5a7aa1d..34b6f6dc8e5453b42e10c45d5423c6e17d2d0506 100644 --- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_concat.py +++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_concat.py @@ -20,6 +20,7 @@ import sys sys.path.append("../") from op_test import OpTest +import paddle from paddle import fluid @@ -115,4 +116,5 @@ class TestSequenceConcatOpError(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_Tensor_type.py b/python/paddle/fluid/tests/unittests/test_Tensor_type.py index 59395b94279ea7ec4fe43221deede7e82be8f38e..f1427d29782b969d9571f79c9a7bc62bf4e77070 100644 --- a/python/paddle/fluid/tests/unittests/test_Tensor_type.py +++ b/python/paddle/fluid/tests/unittests/test_Tensor_type.py @@ -39,6 +39,7 @@ class TensorTypeTest(unittest.TestCase): tensorx = paddle.tensor.logic.Tensor(inx) typex_str = str(type(tensorx)) + expectx = "" self.assertEqual((typex_str == expectx), True) diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index ecac22553cbcda7cc2dae179603f407eddc8652a..d05c9a3c313bb634effd9280e3d9503142166ee4 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -1202,4 +1202,5 @@ class TestMultiTensorAdam(unittest.TestCase): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py index b440e745b1082e98a832ea076cc052cbc106eeab..789cfa82658f43d2adb148fe41fd2fb380e96fba 100644 --- a/python/paddle/fluid/tests/unittests/test_base_layer.py +++ b/python/paddle/fluid/tests/unittests/test_base_layer.py @@ -451,4 +451,5 @@ class TestLayerTo(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_create_parameter.py b/python/paddle/fluid/tests/unittests/test_create_parameter.py index 763fb64816c9c66055b3ead2886e4ba29e0406f7..199558acd4ef64f4d63c04920ba0b0e0295df96c 100644 --- a/python/paddle/fluid/tests/unittests/test_create_parameter.py +++ b/python/paddle/fluid/tests/unittests/test_create_parameter.py @@ -18,6 +18,7 @@ import numpy as np import paddle.fluid as fluid from paddle.fluid import Program, program_guard from paddle.fluid import ParamAttr, initializer +import paddle class TestCreateParameterError(unittest.TestCase): @@ -50,4 +51,5 @@ class TestCreateParameterError(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_ctc_align.py b/python/paddle/fluid/tests/unittests/test_ctc_align.py index f5934debfd7b663b24a0949012ea2aa85e07ece8..ffc5bc184efc222d3adb57e158814c0f592b9405 100644 --- a/python/paddle/fluid/tests/unittests/test_ctc_align.py +++ b/python/paddle/fluid/tests/unittests/test_ctc_align.py @@ -20,6 +20,7 @@ import numpy as 
np from op_test import OpTest from test_softmax_op import stable_softmax import paddle.fluid as fluid +import paddle def CTCAlign(input, lod, blank, merge_repeated, padding=0, input_length=None): @@ -229,4 +230,5 @@ class BadInputTestCTCAlignr(unittest.TestCase): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_diff_op.py b/python/paddle/fluid/tests/unittests/test_diff_op.py index 1ae780f488d2dc6bf37f88505a67723ea867dd94..4a96827bd7c3c56320a58261abe1824786164d10 100644 --- a/python/paddle/fluid/tests/unittests/test_diff_op.py +++ b/python/paddle/fluid/tests/unittests/test_diff_op.py @@ -211,4 +211,5 @@ class TestDiffOpPreAppendAxis(TestDiffOp): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py b/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py index f95546f15f0024ccd8b7cd8464f0a8eb70662d8d..27d82fcc8903be20a378a45e0f4f3b01aa3d3bb7 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py @@ -190,4 +190,5 @@ class TestDygraphRemoveWeightNorm(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_exponential_op.py b/python/paddle/fluid/tests/unittests/test_exponential_op.py index ccbc0a1676302b4c29b524601930cc855847e0fc..7a3ae203be62d644f076ae9b6bc2bf5b8641ccdf 100644 --- a/python/paddle/fluid/tests/unittests/test_exponential_op.py +++ b/python/paddle/fluid/tests/unittests/test_exponential_op.py @@ -209,4 +209,5 @@ class TestExponentialAPI(unittest.TestCase): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fmin_op.py b/python/paddle/fluid/tests/unittests/test_fmin_op.py index 5cdf096be6708c47dd1f56dc97243be70c6d63d5..7231823c375324aa7bbf7d45db14b4457ca4a8dd 100644 --- a/python/paddle/fluid/tests/unittests/test_fmin_op.py +++ b/python/paddle/fluid/tests/unittests/test_fmin_op.py @@ -189,3 +189,8 @@ class TestElementwiseFmin2Op(OpTest): """test_check_grad_ingore_y""" self.check_grad( ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y')) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index bff10c9c4ca26d342a6849a0b23a490058d6b7f7..8dc822c69b2c5df34968fbcd39b8d8438700add2 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -1025,4 +1025,5 @@ class TestDiracInitializer3(TestDiracInitializer1): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_inner.py b/python/paddle/fluid/tests/unittests/test_inner.py index de9decd0b8961115b7ee2e6dac44bfb40fcc5c1f..ff9f15ebbfc8204de042d7731ed94035152f46eb 100644 --- a/python/paddle/fluid/tests/unittests/test_inner.py +++ b/python/paddle/fluid/tests/unittests/test_inner.py @@ -163,4 +163,5 @@ class TestMultiplyError(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_io_save_load.py b/python/paddle/fluid/tests/unittests/test_io_save_load.py index 89ca28510b9b929b1fe36e0c9883da020e71555c..83aadbf68d569f904d56abfcab91236bd637095b 100644 --- 
a/python/paddle/fluid/tests/unittests/test_io_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_io_save_load.py
@@ -88,4 +88,5 @@ class TestWhenTrainWithNoGrad(unittest.TestCase):


 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_isclose_op.py b/python/paddle/fluid/tests/unittests/test_isclose_op.py
index aa39284d11349eed027a1a496ce6d8b2b5e92e3d..2bb58d7c5741f2655bdcbffecedf8762704c07f3 100644
--- a/python/paddle/fluid/tests/unittests/test_isclose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_isclose_op.py
@@ -210,6 +210,9 @@ class TestIscloseOpFloat64(TestIscloseOp):
         self.atol = np.array([0]).astype("float64")
         self.equal_nan = False

+    def test_check_output(self):
+        self.check_output()
+

 class TestIscloseOpLargeDimInput(TestIscloseOp):
     def set_args(self):
@@ -222,4 +225,5 @@ class TestIscloseOpLargeDimInput(TestIscloseOp):


 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_log_softmax.py b/python/paddle/fluid/tests/unittests/test_log_softmax.py
index 16f954708d4d4149f46a18cfd48e35dfbe147153..423eeaf3ada45e7d04dca3512bdba0b067583222 100644
--- a/python/paddle/fluid/tests/unittests/test_log_softmax.py
+++ b/python/paddle/fluid/tests/unittests/test_log_softmax.py
@@ -175,4 +175,5 @@ class TestNNFunctionalLogSoftmaxAPI(unittest.TestCase):


 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py
index 6d94144fc7788d0dc79cfb10f97667a257621a04..60dd4948f996e505f59d7e12b92569000843c528 100644
--- a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py
@@ -555,4 +555,5 @@ class TestLRScheduler(unittest.TestCase):


 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_mean_iou.py b/python/paddle/fluid/tests/unittests/test_mean_iou.py
index e2e118ac9e3b46499055c2dd46755d5401d5abd5..4e89a9034a341777f09958d9709b64a12020ec28 100644
--- a/python/paddle/fluid/tests/unittests/test_mean_iou.py
+++ b/python/paddle/fluid/tests/unittests/test_mean_iou.py
@@ -19,6 +19,7 @@ import unittest
 import numpy as np
 from op_test import OpTest
 import paddle.fluid as fluid
+import paddle


 def compute_mean_iou(predictions, labels, num_classes, in_wrongs, in_corrects,
@@ -129,4 +130,5 @@ class TestMeanIOUOpError(unittest.TestCase):


 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_minus_op.py b/python/paddle/fluid/tests/unittests/test_minus_op.py
index 54253b17b967871b03628023c5a9fdb339af1828..461ff6a9273cdb39c73901da3f77fca021335f0c 100644
--- a/python/paddle/fluid/tests/unittests/test_minus_op.py
+++ b/python/paddle/fluid/tests/unittests/test_minus_op.py
@@ -17,6 +17,7 @@ from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle


 class TestMinusOp(OpTest):
@@ -36,4 +37,5 @@ class TestMinusOp(OpTest):


 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_norm_all.py b/python/paddle/fluid/tests/unittests/test_norm_all.py
index 575bc653618a583e883783cd1fffe1db371eccff..ef912699455d1b4ea2aa2899f20d0e2e09634f77 100644
--- a/python/paddle/fluid/tests/unittests/test_norm_all.py
+++ b/python/paddle/fluid/tests/unittests/test_norm_all.py
@@ -588,4 +588,5 @@ class API_NormTest(unittest.TestCase):


 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py
index 9e0cf6ddef2d619e4d3b32260f7ddf5f31186ae5..8945d35c131fd8de89e2a421bbbd4b16aa01c9d8 100644
--- a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py
@@ -315,7 +315,9 @@ class TestSaveLoadAny(unittest.TestCase):
             paddle.save(tensor, path)
             t_dygraph = paddle.load(path)
             np_dygraph = paddle.load(path, return_numpy=True)
-            self.assertTrue(isinstance(t_dygraph, paddle.fluid.core.VarBase))
+            self.assertTrue(
+                isinstance(t_dygraph, (paddle.fluid.core.VarBase,
+                                       paddle.fluid.core.eager.Tensor)))
             self.assertTrue(np.array_equal(tensor.numpy(), np_dygraph))
             self.assertTrue(np.array_equal(tensor.numpy(), t_dygraph.numpy()))
             paddle.enable_static()
@@ -685,27 +687,34 @@ class TestSaveLoadAny(unittest.TestCase):
                         np.array(v), np.array(load_tensor2['k2'][k])))
             self.assertTrue(load_tensor2['epoch'] == 123)

-            self.assertTrue(isinstance(load_tensor3[0], fluid.core.VarBase))
+            self.assertTrue(
+                isinstance(load_tensor3[0], (fluid.core.VarBase,
+                                             fluid.core.eager.Tensor)))
             self.assertTrue(np.array_equal(load_tensor3[0].numpy(), obj3[0]))
-            self.assertTrue(isinstance(load_tensor3[1], fluid.core.VarBase))
+            self.assertTrue(
+                isinstance(load_tensor3[1], (fluid.core.VarBase,
+                                             fluid.core.eager.Tensor)))
             self.assertTrue(np.array_equal(load_tensor3[1].numpy(), obj3[1]))

             for k, v in state_dict.items():
                 self.assertTrue(
-                    isinstance(load_tensor3[2]["state_dict"][k],
-                               fluid.core.VarBase))
+                    isinstance(load_tensor3[2]["state_dict"][k], (
+                        fluid.core.VarBase, fluid.core.eager.Tensor)))
                 self.assertTrue(
                     np.array_equal(load_tensor3[2]["state_dict"][k].numpy(),
                                    np.array(v)))

             for k, v in state_dict.items():
                 self.assertTrue(
-                    isinstance(load_tensor3[2]["opt"][k], fluid.core.VarBase))
+                    isinstance(load_tensor3[2]["opt"][k], (
+                        fluid.core.VarBase, fluid.core.eager.Tensor)))
                 self.assertTrue(
                     np.array_equal(load_tensor3[2]["opt"][k].numpy(),
                                    np.array(v)))

-            self.assertTrue(isinstance(load_tensor4[0], fluid.core.VarBase))
+            self.assertTrue(
+                isinstance(load_tensor4[0], (fluid.core.VarBase,
+                                             fluid.core.eager.Tensor)))
             self.assertTrue(np.array_equal(load_tensor4[0].numpy(), obj4[0]))

             load_array1 = paddle.load(path1, return_numpy=True)
diff --git a/python/paddle/fluid/tests/unittests/test_renorm_op.py b/python/paddle/fluid/tests/unittests/test_renorm_op.py
index 3ea2002a9786fdd3f6c034e84176d0cae46ca591..e00a892cf7197bc94d85e9082651e26a4bb3bbb9 100644
--- a/python/paddle/fluid/tests/unittests/test_renorm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_renorm_op.py
@@ -54,7 +54,7 @@ class TestRenormAPI(unittest.TestCase):
     def test_dygraph_api(self):
         self.input_data()
         # case axis none
-        with fluid.dygraph.guard():
+        with fluid.dygraph.guard(fluid.CPUPlace()):
             input = [[[2.0, 2, -2], [3, 0.3, 3]], [[2, -8, 2], [3.1, 3.7, 3]]]
             x = paddle.to_tensor(input, stop_gradient=False)
             y = paddle.renorm(x, 1.0, 2, 2.05)
@@ -94,4 +94,5 @@ class TestRenormAPI(unittest.TestCase):


 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_retinanet_detection_output.py b/python/paddle/fluid/tests/unittests/test_retinanet_detection_output.py
index ca324b4a8fd0581e7483c12321f54acaa1965f54..1bfc1b00aa8227e6ccaefcaf1044774ed1404f45 100644
--- a/python/paddle/fluid/tests/unittests/test_retinanet_detection_output.py
+++ b/python/paddle/fluid/tests/unittests/test_retinanet_detection_output.py
@@ -23,6 +23,7 @@ from test_multiclass_nms_op import iou
 from test_multiclass_nms_op import nms
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
+import paddle


 def multiclass_nms(prediction, class_num, keep_top_k, nms_threshold):
@@ -518,4 +519,5 @@ class TestRetinanetDetectionOutOpError(unittest.TestCase):


 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py
index 9a97f57aaae5f290b20e34242b1b43e5e352223d..74409c8671059673121d0a73ed85d2cad8e3d6f2 100644
--- a/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py
@@ -178,4 +178,5 @@ class SmoothL1Loss(unittest.TestCase):


 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_tile_op.py b/python/paddle/fluid/tests/unittests/test_tile_op.py
index a01cf590e0cc6cd136335e9a74720a2b460dbb53..6f3d99f80ce21d9c3de194a8fe4c03fdbc83416f 100644
--- a/python/paddle/fluid/tests/unittests/test_tile_op.py
+++ b/python/paddle/fluid/tests/unittests/test_tile_op.py
@@ -22,7 +22,7 @@ import paddle.fluid as fluid
 from paddle.fluid import compiler, Program, program_guard


-# Situation 1: repeat_times is a list (without tensor)
+#Situation 1: repeat_times is a list (without tensor)
 class TestTileOpRank1(OpTest):
     def setUp(self):
         self.op_type = "tile"
diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py
index dbd40c349bbc81d39b8a929ee5b3e7b81a083406..57a7f94bedce9fb3cd9981e6ae21f6d902fd04d9 100644
--- a/python/paddle/fluid/tests/unittests/test_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_var_base.py
@@ -1361,4 +1361,5 @@ class TestVarBaseCopyGradientFrom(unittest.TestCase):


 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py
index 15d5640b11fe501e0d9f83168c434f9f02d7877c..59e285c1200b88cadd2016421b1a8de70c7dad34 100644
--- a/python/paddle/hapi/model.py
+++ b/python/paddle/hapi/model.py
@@ -68,8 +68,9 @@ def to_list(value):


 def to_numpy(var):
-    assert isinstance(var, (Variable, fluid.core.VarBase)), "not a variable"
-    if isinstance(var, fluid.core.VarBase):
+    assert isinstance(var, (Variable, fluid.core.VarBase,
+                            fluid.core.eager.Tensor)), "not a variable"
+    if isinstance(var, (fluid.core.VarBase, fluid.core.eager.Tensor)):
         return var.numpy()
     t = global_scope().find_var(var.name).get_tensor()
     return np.array(t)
diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py
index 3ff91aa077954510a8e38bdee06b03968796b0f0..ba8aecd5d03ed8f4603eb6d4ebbbd799b3fc7a54 100644
--- a/python/paddle/metric/metrics.py
+++ b/python/paddle/metric/metrics.py
@@ -282,7 +282,7 @@ class Accuracy(Metric):
         Return:
             Tensor: the accuracy of current step.
         """
-        if isinstance(correct, paddle.Tensor):
+        if isinstance(correct, (paddle.Tensor, paddle.fluid.core.eager.Tensor)):
             correct = correct.numpy()
         num_samples = np.prod(np.array(correct.shape[:-1]))
         accs = []
@@ -410,12 +410,12 @@ class Precision(Metric):
                 the shape should keep the same as preds.
                 The data type is 'int32' or 'int64'.
         """
-        if isinstance(preds, paddle.Tensor):
+        if isinstance(preds, (paddle.Tensor, paddle.fluid.core.eager.Tensor)):
             preds = preds.numpy()
         elif not _is_numpy_(preds):
             raise ValueError("The 'preds' must be a numpy ndarray or Tensor.")

-        if isinstance(labels, paddle.Tensor):
+        if isinstance(labels, (paddle.Tensor, paddle.fluid.core.eager.Tensor)):
             labels = labels.numpy()
         elif not _is_numpy_(labels):
             raise ValueError("The 'labels' must be a numpy ndarray or Tensor.")
@@ -543,12 +543,12 @@ class Recall(Metric):
                 the shape should keep the same as preds.
                 Shape: [batch_size, 1], Dtype: 'int32' or 'int64'.
         """
-        if isinstance(preds, paddle.Tensor):
+        if isinstance(preds, (paddle.Tensor, paddle.fluid.core.eager.Tensor)):
             preds = preds.numpy()
         elif not _is_numpy_(preds):
             raise ValueError("The 'preds' must be a numpy ndarray or Tensor.")

-        if isinstance(labels, paddle.Tensor):
+        if isinstance(labels, (paddle.Tensor, paddle.fluid.core.eager.Tensor)):
             labels = labels.numpy()
         elif not _is_numpy_(labels):
             raise ValueError("The 'labels' must be a numpy ndarray or Tensor.")
@@ -698,12 +698,12 @@ class Auc(Metric):
                 (batch_size, 1), labels[i] is either o or 1,
                 representing the label of the instance i.
         """
-        if isinstance(labels, paddle.Tensor):
+        if isinstance(labels, (paddle.Tensor, paddle.fluid.core.eager.Tensor)):
             labels = labels.numpy()
         elif not _is_numpy_(labels):
             raise ValueError("The 'labels' must be a numpy ndarray or Tensor.")

-        if isinstance(preds, paddle.Tensor):
+        if isinstance(preds, (paddle.Tensor, paddle.fluid.core.eager.Tensor)):
             preds = preds.numpy()
         elif not _is_numpy_(preds):
             raise ValueError("The 'preds' must be a numpy ndarray or Tensor.")
diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py
index c227d2e6cee54a4da05b72ecc031061376e9425f..3b1ed17a70d3bdc57c20f88c121e973265e32fcd 100755
--- a/python/paddle/tensor/logic.py
+++ b/python/paddle/tensor/logic.py
@@ -462,7 +462,7 @@ def is_tensor(x):
            print(check) #False

    """
-    return isinstance(x, Tensor)
+    return isinstance(x, (Tensor, paddle.fluid.core.eager.Tensor))


 def _bitwise_op(op_name, x, y, out=None, name=None, binary_op=True):
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index e7cf8059ae5aa52fe1ba205d265a7f02e8be85c8..a4972061d420853b4790dce5df457d981127194c 100755
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -264,6 +264,9 @@ def fill_diagonal_tensor(x, y, offset=0, dim1=0, dim2=1, name=None):

 setattr(core.VarBase, 'fill_diagonal_tensor', fill_diagonal_tensor)

+if core._in_eager_mode():
+    setattr(core.eager.Tensor, 'fill_diagonal_tensor', fill_diagonal_tensor)
+

 @dygraph_only
 def tolist(x):
@@ -890,12 +893,20 @@ def stack(x, axis=0, name=None):

            x1 = paddle.to_tensor([[1.0, 2.0]])
            x2 = paddle.to_tensor([[3.0, 4.0]])
            x3 = paddle.to_tensor([[5.0, 6.0]])
+
            out = paddle.stack([x1, x2, x3], axis=0)
            print(out.shape)  # [3, 1, 2]
            print(out)
            # [[[1., 2.]],
            #  [[3., 4.]],
            #  [[5., 6.]]]
+
+           out = paddle.stack([x1, x2, x3], axis=-2)
+           print(out.shape)  # [1, 3, 2]
+           print(out)
+           # [[[1., 2.],
+           #   [3., 4.],
+           #   [5., 6.]]]
    """
    return layers.stack(x, axis, name)
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index 0b04c83c3d05790b3633463f17eba78847279431..e1dd5f5e61d96d54873800770a77d37ca36db8fe 100755
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -1337,7 +1337,7 @@ def renorm(x, p, axis, max_norm):
            raise ValueError("the axis:{} should not be less than -1 * length of input_shape:{}".format(axis,-1 * len(input_shape)))
        axis = axis + len(input_shape)
    if paddle.in_dynamic_mode():
-       out = core.ops.renorm(x, 'p',p, 'axis',axis, 'max_norm', max_norm)
+       out = _C_ops.renorm(x, 'p',p, 'axis',axis, 'max_norm', max_norm)
        return out

    inputs = {'X': x}
diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py
index 1a3dbd68066a72384589ac24579e0540b5484a6e..9fd200bf0344d58d6a2705d768afffc7ce92dcc2 100644
--- a/python/paddle/vision/transforms/transforms.py
+++ b/python/paddle/vision/transforms/transforms.py
@@ -327,12 +327,17 @@ class ToTensor(BaseTransform):
            import paddle.vision.transforms as T
            import paddle.vision.transforms.functional as F

-           fake_img = Image.fromarray((np.random.rand(224, 224, 3) * 255.).astype(np.uint8))
+           fake_img = Image.fromarray((np.random.rand(4, 5, 3) * 255.).astype(np.uint8))

            transform = T.ToTensor()

            tensor = transform(fake_img)
-
+
+           print(tensor.shape)
+           # [3, 4, 5]
+
+           print(tensor.dtype)
+           # paddle.float32
    """

    def __init__(self, data_format='CHW', keys=None):