diff --git a/.gitignore b/.gitignore
index 801790d0a472080af607e9fbcde0284902a4ead8..664c45b7202f6bf93712062ffa1d003b575afffd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -52,12 +52,12 @@ tools/__pycache__
 
 # This file is automatically generated.
 # TODO(zhiqiang) Move this file to build directory.
-paddle/infrt/dialect/pd_ops.td
+paddle/infrt/dialect/pd/ir/pd_ops.td
 paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td
 paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td
 tools/infrt/kernels.json
 tools/infrt/kernel_signature.json
-paddle/infrt/dialect/pd_ops_info.h
+paddle/infrt/dialect/pd/common/pd_ops_info.h
 .lit_test_times.txt
 paddle/infrt/tests/dialect/Output
 paddle/infrt/tests/lit.cfg.py
diff --git a/cmake/external/paddle2onnx.cmake b/cmake/external/paddle2onnx.cmake
index 661c3675c84b27a7ed8210fec0cfeaa2c858487c..ba6f0396008fc25dd21d462a2d19285a6cbe9080 100644
--- a/cmake/external/paddle2onnx.cmake
+++ b/cmake/external/paddle2onnx.cmake
@@ -61,6 +61,7 @@ set(PADDLE2ONNX_OPTIONAL_ARGS
     -DONNX_CUSTOM_PROTOC_PATH=${PROTOC_BIN_PATH}
     -DWITH_STATIC=OFF
     -DCMAKE_INSTALL_PREFIX=${PADDLE2ONNX_INSTALL_DIR}
+    -DCMAKE_INSTALL_LIBDIR=${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}
     -DCMAKE_POSITION_INDEPENDENT_CODE=ON
     -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
     ${EXTERNAL_OPTIONAL_ARGS}
diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt
index 3e734b1b9ed241f54e14d8a7c94b834674db1054..8641b36a1be8ea51dc4ad911214c2cebe6121e20 100644
--- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt
+++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt
@@ -4,7 +4,7 @@ if(WITH_PYTHON)
 endif()
 proto_library(interceptor_message_proto SRCS interceptor_message.proto)
 
-if(WITH_DISTRIBUTE AND WITH_PSCORE AND NOT (WITH_ASCEND OR WITH_ASCEND_CL))
+if(WITH_DISTRIBUTE AND WITH_PSCORE)
   set(BRPC_DEPS brpc ssl crypto protobuf zlib leveldb snappy gflags glog)
 else()
   set(BRPC_DEPS "")
diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc
index 8d2ec5c41d86499393f62c65c4519960669b8fd8..80a6b4667aa1a0dbfd957a390c9202ea1a4d2b68 100644
--- a/paddle/fluid/distributed/fleet_executor/message_bus.cc
+++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc
@@ -67,8 +67,7 @@ bool MessageBus::IsInit() const { return is_init_; }
 
 MessageBus::~MessageBus() {
   VLOG(3) << "Message bus releases resource.";
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
   server_.Stop(1000);
   server_.Join();
 #endif
@@ -87,8 +86,7 @@ bool MessageBus::Send(int64_t dst_rank,
       IsInit(), true,
       platform::errors::PreconditionNotMet(
           "Using message bus since it has not been initialized."));
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
   int retry_time = 0;
   // message bus will retry sending for 10 times
   while (retry_time < 10) {
     ++retry_time;
@@ -173,8 +171,7 @@ void MessageBus::ListenPort() {
     LOG(INFO) << "No need listen to port since training on single card.";
     return;
   }
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
   // function keep listen the port and handle the message
   PADDLE_ENFORCE_EQ(
       server_.AddService(&message_service_, brpc::SERVER_DOESNT_OWN_SERVICE), 0,
@@ -203,8 +200,7 @@ void MessageBus::ListenPort() {
 #endif
 }
 
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
 bool MessageBus::SendInterRank(int64_t dst_rank,
                                const InterceptorMessage& interceptor_message) {
   const auto& dst_addr = GetAddr(dst_rank);
diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.h b/paddle/fluid/distributed/fleet_executor/message_bus.h
index d805ac81606b8928b069174bf8aadc693db2aa0c..dfd65fdbc00d445a11f60f4e1cde4f4da77b80dc 100644
--- a/paddle/fluid/distributed/fleet_executor/message_bus.h
+++ b/paddle/fluid/distributed/fleet_executor/message_bus.h
@@ -20,8 +20,7 @@
 #include
 #include
 
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
 #include "brpc/channel.h"
 #include "brpc/server.h"
 #include "paddle/fluid/distributed/fleet_executor/message_service.h"
@@ -64,8 +63,7 @@ class MessageBus final {
 
   const std::string& GetAddr(int64_t rank) const;
 
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
   // send the message inter rank (dst is different rank with src)
   bool SendInterRank(int64_t dst_rank,
                      const InterceptorMessage& interceptor_message);
@@ -81,8 +79,7 @@ class MessageBus final {
   // the ip needs to be listened
   std::string addr_;
 
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
   MessageServiceImpl message_service_;
   // brpc server
   brpc::Server server_;
diff --git a/paddle/fluid/distributed/fleet_executor/message_service.cc b/paddle/fluid/distributed/fleet_executor/message_service.cc
index c3fff98f684ad5f0feb74f30fd51404d4693c7f9..1c66d83ea34d702733b3a5c0386abb62d4e1ec8a 100644
--- a/paddle/fluid/distributed/fleet_executor/message_service.cc
+++ b/paddle/fluid/distributed/fleet_executor/message_service.cc
@@ -11,8 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
 #include "paddle/fluid/distributed/fleet_executor/message_service.h"
 #include "brpc/server.h"
 #include "paddle/fluid/distributed/fleet_executor/global.h"
diff --git a/paddle/fluid/distributed/fleet_executor/message_service.h b/paddle/fluid/distributed/fleet_executor/message_service.h
index 02f73471e3b911adc622ca990bca70b7a5f3033d..5ab687ff93dc4fc2ccd0884456cdbf2d6c3c6fcb 100644
--- a/paddle/fluid/distributed/fleet_executor/message_service.h
+++ b/paddle/fluid/distributed/fleet_executor/message_service.h
@@ -11,8 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
 #pragma once
 
 #include "brpc/server.h"
diff --git a/paddle/fluid/distributed/ps.proto b/paddle/fluid/distributed/ps.proto
index 0ae87812bce434be5e664aefea4bba19ae147d28..fac30e26c388c65af13135699a886a3c69031d57 100644
--- a/paddle/fluid/distributed/ps.proto
+++ b/paddle/fluid/distributed/ps.proto
@@ -115,6 +115,7 @@ message TableParameter {
   optional CommonAccessorParameter common = 6;
   optional TableType type = 7;
   optional bool compress_in_save = 8 [ default = false ];
+  optional GraphParameter graph_parameter = 9;
 }
 
 message TableAccessorParameter {
@@ -211,3 +212,25 @@ message SparseAdamSGDParameter { // SparseAdamSGDRule
   optional double ada_epsilon = 5 [ default = 1e-08 ];
   repeated float weight_bounds = 6;
 }
+
+message GraphParameter {
+  optional int32 task_pool_size = 1 [ default = 24 ];
+  optional bool gpups_mode = 2 [ default = false ];
+  optional string gpups_graph_sample_class = 3
+      [ default = "CompleteGraphSampler" ];
+  optional string gpups_graph_sample_args = 4 [ default = "" ];
+  optional bool use_cache = 5 [ default = true ];
+  optional float cache_ratio = 6 [ default = 0.3 ];
+  optional int32 cache_ttl = 7 [ default = 5 ];
+  optional GraphFeature graph_feature = 8;
+  optional string table_name = 9 [ default = "" ];
+  optional string table_type = 10 [ default = "" ];
+  optional int32 gpups_mode_shard_num = 11 [ default = 127 ];
+  optional int32 gpu_num = 12 [ default = 1 ];
+}
+
+message GraphFeature {
+  repeated string name = 1;
+  repeated string dtype = 2;
+  repeated int32 shape = 3;
+}
\ No newline at end of file
diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_client.cc b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc
index 301708f6b7bb3d465d8dcbd2b94bbc4c217fcc77..a3db88e3b679da63a9b205cc013d579cf9a4be2f 100644
--- a/paddle/fluid/distributed/ps/service/graph_brpc_client.cc
+++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc
@@ -44,7 +44,7 @@ void GraphPsService_Stub::service(
   }
 }
 
-int GraphBrpcClient::get_server_index_by_id(uint64_t id) {
+int GraphBrpcClient::get_server_index_by_id(int64_t id) {
   int shard_num = get_shard_num();
   int shard_per_server = shard_num % server_size == 0
                             ? shard_num / server_size
@@ -53,7 +53,7 @@ int GraphBrpcClient::get_server_index_by_id(uint64_t id) {
 }
 
 std::future<int32_t> GraphBrpcClient::get_node_feat(
-    const uint32_t &table_id, const std::vector<uint64_t> &node_ids,
+    const uint32_t &table_id, const std::vector<int64_t> &node_ids,
     const std::vector<std::string> &feature_names,
     std::vector<std::vector<std::string>> &res) {
   std::vector<int> request2server;
@@ -66,7 +66,7 @@ std::future<int32_t> GraphBrpcClient::get_node_feat(
     }
   }
   size_t request_call_num = request2server.size();
-  std::vector<std::vector<uint64_t>> node_id_buckets(request_call_num);
+  std::vector<std::vector<int64_t>> node_id_buckets(request_call_num);
   std::vector<std::vector<int>> query_idx_buckets(request_call_num);
   for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) {
     int server_index = get_server_index_by_id(node_ids[query_idx]);
@@ -129,7 +129,7 @@ std::future<int32_t> GraphBrpcClient::get_node_feat(
 
     closure->request(request_idx)
         ->add_params((char *)node_id_buckets[request_idx].data(),
-                     sizeof(uint64_t) * node_num);
+                     sizeof(int64_t) * node_num);
     std::string joint_feature_name =
         paddle::string::join_strings(feature_names, '\t');
     closure->request(request_idx)
@@ -179,9 +179,9 @@ std::future<int32_t> GraphBrpcClient::clear_nodes(uint32_t table_id) {
   return fut;
 }
 std::future<int32_t> GraphBrpcClient::add_graph_node(
-    uint32_t table_id, std::vector<uint64_t> &node_id_list,
+    uint32_t table_id, std::vector<int64_t> &node_id_list,
     std::vector<bool> &is_weighted_list) {
-  std::vector<std::vector<uint64_t>> request_bucket;
+  std::vector<std::vector<int64_t>> request_bucket;
   std::vector<std::vector<bool>> is_weighted_bucket;
   bool add_weight = is_weighted_list.size() > 0;
   std::vector<int> server_index_arr;
@@ -191,7 +191,7 @@ std::future<int32_t> GraphBrpcClient::add_graph_node(
     if (index_mapping[server_index] == -1) {
       index_mapping[server_index] = request_bucket.size();
       server_index_arr.push_back(server_index);
-      request_bucket.push_back(std::vector<uint64_t>());
+      request_bucket.push_back(std::vector<int64_t>());
       if (add_weight) is_weighted_bucket.push_back(std::vector<bool>());
     }
     request_bucket[index_mapping[server_index]].push_back(
@@ -229,7 +229,7 @@ std::future<int32_t> GraphBrpcClient::add_graph_node(
       size_t node_num = request_bucket[request_idx].size();
       closure->request(request_idx)
           ->add_params((char *)request_bucket[request_idx].data(),
-                       sizeof(uint64_t) * node_num);
+                       sizeof(int64_t) * node_num);
       if (add_weight) {
         bool weighted[is_weighted_bucket[request_idx].size() + 1];
         for (size_t j = 0; j < is_weighted_bucket[request_idx].size(); j++)
@@ -248,8 +248,8 @@ std::future<int32_t> GraphBrpcClient::add_graph_node(
   return fut;
 }
 std::future<int32_t> GraphBrpcClient::remove_graph_node(
-    uint32_t table_id, std::vector<uint64_t> &node_id_list) {
-  std::vector<std::vector<uint64_t>> request_bucket;
+    uint32_t table_id, std::vector<int64_t> &node_id_list) {
+  std::vector<std::vector<int64_t>> request_bucket;
   std::vector<int> server_index_arr;
   std::vector<int> index_mapping(server_size, -1);
   for (size_t query_idx = 0; query_idx < node_id_list.size(); ++query_idx) {
@@ -257,7 +257,7 @@ std::future<int32_t> GraphBrpcClient::remove_graph_node(
     if (index_mapping[server_index] == -1) {
       index_mapping[server_index] = request_bucket.size();
       server_index_arr.push_back(server_index);
-      request_bucket.push_back(std::vector<uint64_t>());
+      request_bucket.push_back(std::vector<int64_t>());
     }
     request_bucket[index_mapping[server_index]].push_back(
         node_id_list[query_idx]);
@@ -291,7 +291,7 @@ std::future<int32_t> GraphBrpcClient::remove_graph_node(
 
     closure->request(request_idx)
         ->add_params((char *)request_bucket[request_idx].data(),
-                     sizeof(uint64_t) * node_num);
+                     sizeof(int64_t) * node_num);
     // PsService_Stub rpc_stub(get_cmd_channel(server_index));
     GraphPsService_Stub rpc_stub =
         getServiceStub(get_cmd_channel(server_index));
@@ -303,9 +303,9 @@ std::future<int32_t> GraphBrpcClient::remove_graph_node(
 }
 // char* &buffer,int &actual_size
 std::future<int32_t> GraphBrpcClient::batch_sample_neighbors(
-    uint32_t table_id, std::vector<uint64_t> node_ids, int sample_size,
-    // std::vector<std::vector<std::pair<uint64_t, float>>> &res,
-    std::vector<std::vector<uint64_t>> &res,
+    uint32_t table_id, std::vector<int64_t> node_ids, int sample_size,
+    // std::vector<std::vector<std::pair<int64_t, float>>> &res,
+    std::vector<std::vector<int64_t>> &res,
     std::vector<std::vector<float>> &res_weight, bool need_weight,
     int server_index) {
   if (server_index != -1) {
@@ -337,7 +337,7 @@ std::future<int32_t> GraphBrpcClient::batch_sample_neighbors(
           int start = 0;
           while (start < actual_size) {
             res[node_idx].emplace_back(
-                *(uint64_t *)(node_buffer + offset + start));
+                *(int64_t *)(node_buffer + offset + start));
             start += GraphNode::id_size;
             if (need_weight) {
               res_weight[node_idx].emplace_back(
@@ -358,7 +358,7 @@ std::future<int32_t> GraphBrpcClient::batch_sample_neighbors(
     closure->request(0)->set_table_id(table_id);
     closure->request(0)->set_client_id(_client_id);
     closure->request(0)->add_params((char *)node_ids.data(),
-                                    sizeof(uint64_t) * node_ids.size());
+                                    sizeof(int64_t) * node_ids.size());
     closure->request(0)->add_params((char *)&sample_size, sizeof(int));
     closure->request(0)->add_params((char *)&need_weight, sizeof(bool));
     ;
@@ -380,14 +380,14 @@ std::future<int32_t> GraphBrpcClient::batch_sample_neighbors(
       server2request[server_index] = request2server.size();
      request2server.push_back(server_index);
     }
-    // res.push_back(std::vector<std::pair<uint64_t, float>>());
+    // res.push_back(std::vector<std::pair<int64_t, float>>());
     res.push_back({});
     if (need_weight) {
       res_weight.push_back({});
     }
   }
   size_t request_call_num = request2server.size();
-  std::vector<std::vector<uint64_t>> node_id_buckets(request_call_num);
+  std::vector<std::vector<int64_t>> node_id_buckets(request_call_num);
   std::vector<std::vector<int>> query_idx_buckets(request_call_num);
   for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) {
     int server_index = get_server_index_by_id(node_ids[query_idx]);
@@ -428,7 +428,7 @@ std::future<int32_t> GraphBrpcClient::batch_sample_neighbors(
             int start = 0;
             while (start < actual_size) {
               res[query_idx].emplace_back(
-                  *(uint64_t *)(node_buffer + offset + start));
+                  *(int64_t *)(node_buffer + offset + start));
               start += GraphNode::id_size;
               if (need_weight) {
                 res_weight[query_idx].emplace_back(
@@ -459,7 +459,7 @@ std::future<int32_t> GraphBrpcClient::batch_sample_neighbors(
 
     closure->request(request_idx)
         ->add_params((char *)node_id_buckets[request_idx].data(),
-                     sizeof(uint64_t) * node_num);
+                     sizeof(int64_t) * node_num);
     closure->request(request_idx)
         ->add_params((char *)&sample_size, sizeof(int));
     closure->request(request_idx)
@@ -476,7 +476,7 @@ std::future<int32_t> GraphBrpcClient::batch_sample_neighbors(
 }
 std::future<int32_t> GraphBrpcClient::random_sample_nodes(
     uint32_t table_id, int server_index, int sample_size,
-    std::vector<uint64_t> &ids) {
+    std::vector<int64_t> &ids) {
   DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) {
     int ret = 0;
     auto *closure = (DownpourBrpcClosure *)done;
@@ -490,7 +490,7 @@ std::future<int32_t> GraphBrpcClient::random_sample_nodes(
       auto size = io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size);
       int index = 0;
       while (index < bytes_size) {
-        ids.push_back(*(uint64_t *)(buffer + index));
+        ids.push_back(*(int64_t *)(buffer + index));
         index += GraphNode::id_size;
       }
       delete[] buffer;
@@ -633,7 +633,7 @@ std::future<int32_t> GraphBrpcClient::pull_graph_list(
 }
 
 std::future<int32_t> GraphBrpcClient::set_node_feat(
-    const uint32_t &table_id, const std::vector<uint64_t> &node_ids,
+    const uint32_t &table_id, const std::vector<int64_t> &node_ids,
     const std::vector<std::string> &feature_names,
     const std::vector<std::vector<std::string>> &features) {
   std::vector<int> request2server;
@@ -646,7 +646,7 @@ std::future<int32_t> GraphBrpcClient::set_node_feat(
     }
   }
   size_t request_call_num = request2server.size();
-  std::vector<std::vector<uint64_t>> node_id_buckets(request_call_num);
+  std::vector<std::vector<int64_t>> node_id_buckets(request_call_num);
   std::vector<std::vector<int>> query_idx_buckets(request_call_num);
   std::vector<std::vector<std::vector<std::string>>> features_idx_buckets(
       request_call_num);
@@ -696,7 +696,7 @@ std::future<int32_t> GraphBrpcClient::set_node_feat(
 
     closure->request(request_idx)
         ->add_params((char *)node_id_buckets[request_idx].data(),
-                     sizeof(uint64_t) * node_num);
+                     sizeof(int64_t) * node_num);
     std::string joint_feature_name =
         paddle::string::join_strings(feature_names, '\t');
     closure->request(request_idx)
diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_client.h b/paddle/fluid/distributed/ps/service/graph_brpc_client.h
index 06e753d028baa2d9c0002620dc445d4204046180..e2b8a518615dc511a726c4be104cb03900dd2e9a 100644
--- a/paddle/fluid/distributed/ps/service/graph_brpc_client.h
+++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.h
@@ -63,8 +63,8 @@ class GraphBrpcClient : public BrpcPsClient {
   virtual ~GraphBrpcClient() {}
   // given a batch of nodes, sample graph_neighbors for each of them
   virtual std::future<int32_t> batch_sample_neighbors(
-      uint32_t table_id, std::vector<uint64_t> node_ids, int sample_size,
-      std::vector<std::vector<uint64_t>>& res,
+      uint32_t table_id, std::vector<int64_t> node_ids, int sample_size,
+      std::vector<std::vector<int64_t>>& res,
       std::vector<std::vector<float>>& res_weight, bool need_weight,
       int server_index = -1);
 
@@ -75,20 +75,20 @@ class GraphBrpcClient : public BrpcPsClient {
   virtual std::future<int32_t> random_sample_nodes(uint32_t table_id,
                                                    int server_index,
                                                    int sample_size,
-                                                   std::vector<uint64_t>& ids);
+                                                   std::vector<int64_t>& ids);
   virtual std::future<int32_t> get_node_feat(
-      const uint32_t& table_id, const std::vector<uint64_t>& node_ids,
+      const uint32_t& table_id, const std::vector<int64_t>& node_ids,
       const std::vector<std::string>& feature_names,
       std::vector<std::vector<std::string>>& res);
 
   virtual std::future<int32_t> set_node_feat(
-      const uint32_t& table_id, const std::vector<uint64_t>& node_ids,
+      const uint32_t& table_id, const std::vector<int64_t>& node_ids,
       const std::vector<std::string>& feature_names,
       const std::vector<std::vector<std::string>>& features);
 
   virtual std::future<int32_t> clear_nodes(uint32_t table_id);
   virtual std::future<int32_t> add_graph_node(
-      uint32_t table_id, std::vector<uint64_t>& node_id_list,
+      uint32_t table_id, std::vector<int64_t>& node_id_list,
       std::vector<bool>& is_weighted_list);
   virtual std::future<int32_t> use_neighbors_sample_cache(uint32_t table_id,
                                                           size_t size_limit,
@@ -96,11 +96,11 @@ class GraphBrpcClient : public BrpcPsClient {
   virtual std::future<int32_t> load_graph_split_config(uint32_t table_id,
                                                        std::string path);
   virtual std::future<int32_t> remove_graph_node(
-      uint32_t table_id, std::vector<uint64_t>& node_id_list);
+      uint32_t table_id, std::vector<int64_t>& node_id_list);
   virtual int32_t initialize();
   int get_shard_num() { return shard_num; }
   void set_shard_num(int shard_num) { this->shard_num = shard_num; }
-  int get_server_index_by_id(uint64_t id);
+  int get_server_index_by_id(int64_t id);
   void set_local_channel(int index) {
     this->local_channel = get_cmd_channel(index);
   }
diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc
index 441f489fb3097cda51fc62dc35e93264a1f7caef..20a55e4d11983dad37b9e2e7845923dded881d3b 100644
--- a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc
+++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc
@@ -140,9 +140,9 @@ int32_t GraphBrpcService::add_graph_node(Table *table,
     return 0;
   }
 
-  size_t node_num = request.params(0).size() / sizeof(uint64_t);
-  uint64_t *node_data = (uint64_t *)(request.params(0).c_str());
-  std::vector<uint64_t> node_ids(node_data, node_data + node_num);
+  size_t node_num = request.params(0).size() / sizeof(int64_t);
+  int64_t *node_data = (int64_t *)(request.params(0).c_str());
+  std::vector<int64_t> node_ids(node_data, node_data + node_num);
   std::vector<bool> is_weighted_list;
   if (request.params_size() == 2) {
     size_t weight_list_size = request.params(1).size() / sizeof(bool);
@@ -165,9 +165,9 @@ int32_t GraphBrpcService::remove_graph_node(Table *table,
         "graph_get_node_feat request requires at least 1 argument");
     return 0;
   }
-  size_t node_num = request.params(0).size() / sizeof(uint64_t);
-  uint64_t *node_data = (uint64_t *)(request.params(0).c_str());
-  std::vector<uint64_t> node_ids(node_data, node_data + node_num);
+  size_t node_num = request.params(0).size() / sizeof(int64_t);
+  int64_t *node_data = (int64_t *)(request.params(0).c_str());
+  std::vector<int64_t> node_ids(node_data, node_data + node_num);
 
   ((GraphTable *)table)->remove_graph_node(node_ids);
   return 0;
@@ -386,9 +386,9 @@ int32_t GraphBrpcService::graph_random_sample_neighbors(
         "graph_random_sample_neighbors request requires at least 3 arguments");
     return 0;
   }
-  size_t node_num = request.params(0).size() / sizeof(uint64_t);
-  uint64_t *node_data = (uint64_t *)(request.params(0).c_str());
-  int sample_size = *(uint64_t *)(request.params(1).c_str());
+  size_t node_num = request.params(0).size() / sizeof(int64_t);
+  int64_t *node_data = (int64_t *)(request.params(0).c_str());
+  int sample_size = *(int64_t *)(request.params(1).c_str());
   bool need_weight = *(bool *)(request.params(2).c_str());
   std::vector<std::shared_ptr<char>> buffers(node_num);
   std::vector<int> actual_sizes(node_num, 0);
@@ -407,7 +407,7 @@ int32_t GraphBrpcService::graph_random_sample_neighbors(
 int32_t GraphBrpcService::graph_random_sample_nodes(
     Table *table, const PsRequestMessage &request, PsResponseMessage &response,
     brpc::Controller *cntl) {
-  size_t size = *(uint64_t *)(request.params(0).c_str());
+  size_t size = *(int64_t *)(request.params(0).c_str());
   std::unique_ptr<char[]> buffer;
   int actual_size;
   if (((GraphTable *)table)->random_sample_nodes(size, buffer, actual_size) ==
@@ -430,9 +430,9 @@ int32_t GraphBrpcService::graph_get_node_feat(Table *table,
         "graph_get_node_feat request requires at least 2 arguments");
     return 0;
   }
-  size_t node_num = request.params(0).size() / sizeof(uint64_t);
-  uint64_t *node_data = (uint64_t *)(request.params(0).c_str());
-  std::vector<uint64_t> node_ids(node_data, node_data + node_num);
+  size_t node_num = request.params(0).size() / sizeof(int64_t);
+  int64_t *node_data = (int64_t *)(request.params(0).c_str());
+  std::vector<int64_t> node_ids(node_data, node_data + node_num);
   std::vector<std::string> feature_names =
       paddle::string::split_string<std::string>(request.params(1), "\t");
 
@@ -464,16 +464,16 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers(
         "at least 3 arguments");
     return 0;
   }
-  size_t node_num = request.params(0).size() / sizeof(uint64_t),
+  size_t node_num = request.params(0).size() / sizeof(int64_t),
          size_of_size_t = sizeof(size_t);
-  uint64_t *node_data = (uint64_t *)(request.params(0).c_str());
-  int sample_size = *(uint64_t *)(request.params(1).c_str());
-  bool need_weight = *(uint64_t *)(request.params(2).c_str());
-  // std::vector<uint64_t> res = ((GraphTable
+  int64_t *node_data = (int64_t *)(request.params(0).c_str());
+  int sample_size = *(int64_t *)(request.params(1).c_str());
+  bool need_weight = *(int64_t *)(request.params(2).c_str());
+  // std::vector<int64_t> res = ((GraphTable
  // *)table).filter_out_non_exist_nodes(node_data, sample_size);
   std::vector<int> request2server;
   std::vector<int> server2request(server_size, -1);
-  std::vector<uint64_t> local_id;
+  std::vector<int64_t> local_id;
   std::vector<int> local_query_idx;
   size_t rank = get_rank();
   for (int query_idx = 0; query_idx < node_num; ++query_idx) {
@@ -496,7 +496,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers(
   std::vector<std::shared_ptr<char>> local_buffers;
   std::vector<int> local_actual_sizes;
   std::vector<size_t> seq;
-  std::vector<std::vector<uint64_t>> node_id_buckets(request_call_num);
+  std::vector<std::vector<int64_t>> node_id_buckets(request_call_num);
   std::vector<std::vector<int>> query_idx_buckets(request_call_num);
   for (int query_idx = 0; query_idx < node_num; ++query_idx) {
     int server_index =
@@ -583,7 +583,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers(
 
       closure->request(request_idx)
          ->add_params((char *)node_id_buckets[request_idx].data(),
-                       sizeof(uint64_t) * node_num);
+                       sizeof(int64_t) * node_num);
       closure->request(request_idx)
           ->add_params((char *)&sample_size, sizeof(int));
       closure->request(request_idx)
@@ -618,9 +618,9 @@ int32_t GraphBrpcService::graph_set_node_feat(Table *table,
         "graph_set_node_feat request requires at least 3 arguments");
     return 0;
   }
-  size_t node_num = request.params(0).size() / sizeof(uint64_t);
-  uint64_t *node_data = (uint64_t *)(request.params(0).c_str());
-  std::vector<uint64_t> node_ids(node_data, node_data + node_num);
+  size_t node_num = request.params(0).size() / sizeof(int64_t);
+  int64_t *node_data = (int64_t *)(request.params(0).c_str());
+  std::vector<int64_t> node_ids(node_data, node_data + node_num);
 
   std::vector<std::string> feature_names =
       paddle::string::split_string<std::string>(request.params(1), "\t");
diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc
index 088edcb75bbc67d6d2acef9609b442f6fa38c332..c8be0f797109078509eeced53920845ac4c51684 100644
--- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc
+++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc
@@ -44,9 +44,9 @@ void GraphPyService::add_table_feat_conf(std::string table_name,
   }
 }
 
-void add_graph_node(std::vector<uint64_t> node_ids,
+void add_graph_node(std::vector<int64_t> node_ids,
                     std::vector<bool> weight_list) {}
-void remove_graph_node(std::vector<uint64_t> node_ids) {}
+void remove_graph_node(std::vector<int64_t> node_ids) {}
 void GraphPyService::set_up(std::string ips_str, int shard_num,
                             std::vector<std::string> node_types,
                             std::vector<std::string> edge_types) {
@@ -260,7 +260,7 @@ void GraphPyClient::clear_nodes(std::string name) {
 }
 
 void GraphPyClient::add_graph_node(std::string name,
-                                   std::vector<uint64_t>& node_ids,
+                                   std::vector<int64_t>& node_ids,
                                    std::vector<bool>& weight_list) {
   if (this->table_id_map.count(name)) {
     uint32_t table_id = this->table_id_map[name];
@@ -271,7 +271,7 @@ void GraphPyClient::add_graph_node(std::string name,
 }
 
 void GraphPyClient::remove_graph_node(std::string name,
-                                      std::vector<uint64_t>& node_ids) {
+                                      std::vector<int64_t>& node_ids) {
   if (this->table_id_map.count(name)) {
     uint32_t table_id = this->table_id_map[name];
     auto status = get_ps_client()->remove_graph_node(table_id, node_ids);
@@ -290,13 +290,12 @@ void GraphPyClient::load_node_file(std::string name, std::string filepath) {
   }
 }
 
-std::pair<std::vector<std::vector<uint64_t>>, std::vector<float>>
+std::pair<std::vector<std::vector<int64_t>>, std::vector<float>>
 GraphPyClient::batch_sample_neighbors(std::string name,
-                                      std::vector<uint64_t> node_ids,
+                                      std::vector<int64_t> node_ids,
                                       int sample_size, bool return_weight,
                                       bool return_edges) {
-  // std::vector<std::vector<std::pair<uint64_t, float>>> v;
-  std::vector<std::vector<uint64_t>> v;
+  std::vector<std::vector<int64_t>> v;
   std::vector<std::vector<float>> v1;
   if (this->table_id_map.count(name)) {
     uint32_t table_id = this->table_id_map[name];
@@ -309,7 +308,7 @@ GraphPyClient::batch_sample_neighbors(std::string name,
   // res.first[1]: slice index
   // res.first[2]: src nodes
   // res.second: edges weight
-  std::pair<std::vector<std::vector<uint64_t>>, std::vector<float>> res;
+  std::pair<std::vector<std::vector<int64_t>>, std::vector<float>> res;
   res.first.push_back({});
   res.first.push_back({});
   if (return_edges) res.first.push_back({});
@@ -342,10 +341,10 @@ void GraphPyClient::use_neighbors_sample_cache(std::string name,
     status.wait();
   }
 }
-std::vector<uint64_t> GraphPyClient::random_sample_nodes(std::string name,
-                                                         int server_index,
-                                                         int sample_size) {
-  std::vector<uint64_t> v;
+std::vector<int64_t> GraphPyClient::random_sample_nodes(std::string name,
+                                                        int server_index,
+                                                        int sample_size) {
+  std::vector<int64_t> v;
   if (this->table_id_map.count(name)) {
     uint32_t table_id = this->table_id_map[name];
     auto status =
@@ -357,7 +356,7 @@ std::vector<uint64_t> GraphPyClient::random_sample_nodes(std::string name,
 
 // (name, dtype, ndarray)
 std::vector<std::vector<std::string>> GraphPyClient::get_node_feat(
-    std::string node_type, std::vector<uint64_t> node_ids,
+    std::string node_type, std::vector<int64_t> node_ids,
     std::vector<std::string> feature_names) {
   std::vector<std::vector<std::string>> v(
       feature_names.size(), std::vector<std::string>(node_ids.size()));
@@ -371,7 +370,7 @@ std::vector<std::vector<std::string>> GraphPyClient::get_node_feat(
 }
 
 void GraphPyClient::set_node_feat(
-    std::string node_type, std::vector<uint64_t> node_ids,
+    std::string node_type, std::vector<int64_t> node_ids,
     std::vector<std::string> feature_names,
     const std::vector<std::vector<std::string>> features) {
   if (this->table_id_map.count(node_type)) {
diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h
index c25ef5035453ded0996cfe190dec71b0ce4b9b4a..85707137c1800ed9486148584ce22a78c52a47fd 100644
--- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h
+++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h
@@ -70,18 +70,34 @@ class GraphPyService {
     ::paddle::distributed::TableAccessorParameter* accessor_proto =
         sparse_table_proto->mutable_accessor();
-    ::paddle::distributed::CommonAccessorParameter* common_proto =
-        sparse_table_proto->mutable_common();
+    // ::paddle::distributed::CommonAccessorParameter* common_proto =
+    //     sparse_table_proto->mutable_common();
+    ::paddle::distributed::GraphParameter* graph_proto =
+        sparse_table_proto->mutable_graph_parameter();
+
+    ::paddle::distributed::GraphFeature* graph_feature =
+        graph_proto->mutable_graph_feature();
+
+    graph_proto->set_task_pool_size(24);
+
+    graph_proto->set_table_name(table_name);
+    graph_proto->set_table_type(table_type);
+    graph_proto->set_use_cache(false);
 
     // Set GraphTable Parameter
-    common_proto->set_table_name(table_name);
-    common_proto->set_name(table_type);
+    // common_proto->set_table_name(table_name);
+    // common_proto->set_name(table_type);
+    // for (size_t i = 0; i < feat_name.size(); i++) {
+    //   common_proto->add_params(feat_dtype[i]);
+    //   common_proto->add_dims(feat_shape[i]);
+    //   common_proto->add_attributes(feat_name[i]);
+    // }
+
     for (size_t i = 0; i < feat_name.size(); i++) {
-      common_proto->add_params(feat_dtype[i]);
-      common_proto->add_dims(feat_shape[i]);
-      common_proto->add_attributes(feat_name[i]);
+      graph_feature->add_dtype(feat_dtype[i]);
+      graph_feature->add_shape(feat_shape[i]);
+      graph_feature->add_name(feat_name[i]);
     }
-
     accessor_proto->set_accessor_class("CommMergeAccessor");
   }
@@ -143,24 +159,24 @@ class GraphPyClient : public GraphPyService {
   void load_edge_file(std::string name, std::string filepath, bool reverse);
   void load_node_file(std::string name, std::string filepath);
   void clear_nodes(std::string name);
-  void add_graph_node(std::string name, std::vector<uint64_t>& node_ids,
+  void add_graph_node(std::string name, std::vector<int64_t>& node_ids,
                       std::vector<bool>& weight_list);
-  void remove_graph_node(std::string name, std::vector<uint64_t>& node_ids);
+  void remove_graph_node(std::string name, std::vector<int64_t>& node_ids);
   int get_client_id() { return client_id; }
   void set_client_id(int client_id) { this->client_id = client_id; }
   void start_client();
-  std::pair<std::vector<std::vector<uint64_t>>, std::vector<float>>
-  batch_sample_neighbors(std::string name, std::vector<uint64_t> node_ids,
+  std::pair<std::vector<std::vector<int64_t>>, std::vector<float>>
+  batch_sample_neighbors(std::string name, std::vector<int64_t> node_ids,
                          int sample_size, bool return_weight,
                          bool return_edges);
-  std::vector<uint64_t> random_sample_nodes(std::string name, int server_index,
-                                            int sample_size);
+  std::vector<int64_t> random_sample_nodes(std::string name, int server_index,
+                                           int sample_size);
   std::vector<std::vector<std::string>> get_node_feat(
-      std::string node_type, std::vector<uint64_t> node_ids,
+      std::string node_type, std::vector<int64_t> node_ids,
       std::vector<std::string> feature_names);
   void use_neighbors_sample_cache(std::string name, size_t total_size_limit,
                                   size_t ttl);
-  void set_node_feat(std::string node_type, std::vector<uint64_t> node_ids,
+  void set_node_feat(std::string node_type, std::vector<int64_t> node_ids,
                      std::vector<std::string> feature_names,
                      const std::vector<std::vector<std::string>> features);
   std::vector<FeatureNode> pull_graph_list(std::string name, int server_index,
diff --git a/paddle/fluid/distributed/ps/table/CMakeLists.txt b/paddle/fluid/distributed/ps/table/CMakeLists.txt
index be916bf2e800308cdebbbfbe4e5ff4c467cf3f6f..2fa5ecb4051c568fa0697b236bcfb9c00e4319bf 100644
--- a/paddle/fluid/distributed/ps/table/CMakeLists.txt
+++ b/paddle/fluid/distributed/ps/table/CMakeLists.txt
@@ -53,7 +53,6 @@ cc_library(memory_sparse_table SRCS memory_sparse_table.cc DEPS ps_framework_pro
 set_source_files_properties(memory_sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 cc_library(memory_sparse_geo_table SRCS memory_sparse_geo_table.cc DEPS ps_framework_proto ${TABLE_DEPS} common_table)
 
-
 cc_library(table SRCS table.cc DEPS memory_sparse_table memory_sparse_geo_table common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost)
 
 target_link_libraries(table -fopenmp)
diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc
index 54b98cb96ce5196bb5133f777b2571f4d3d43c6e..2c07bd65d63d408b1bff12eda7bcf8fba3336db6 100644
--- a/paddle/fluid/distributed/ps/table/common_graph_table.cc
+++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc
@@ -27,6 +27,288 @@
 namespace paddle {
 namespace distributed {
 
+#ifdef PADDLE_WITH_HETERPS
+
+int CompleteGraphSampler::run_graph_sampling() {
+  pthread_rwlock_t *rw_lock = graph_table->rw_lock.get();
+  pthread_rwlock_rdlock(rw_lock);
+  std::cout << "in graph sampling" << std::endl;
+  sample_nodes.clear();
+  sample_neighbors.clear();
+  sample_res.clear();
+  sample_nodes.resize(gpu_num);
+  sample_neighbors.resize(gpu_num);
+  sample_res.resize(gpu_num);
+  std::vector<std::vector<std::vector<paddle::framework::GpuPsGraphNode>>>
+      sample_nodes_ex(graph_table->task_pool_size_);
+  std::vector<std::vector<std::vector<int64_t>>> sample_neighbors_ex(
+      graph_table->task_pool_size_);
+  for (int i = 0; i < graph_table->task_pool_size_; i++) {
+    sample_nodes_ex[i].resize(gpu_num);
+    sample_neighbors_ex[i].resize(gpu_num);
+  }
+  std::vector<std::future<int>> tasks;
+  for (size_t i = 0; i < graph_table->shards.size(); ++i) {
+    tasks.push_back(
+        graph_table->_shards_task_pool[i % graph_table->task_pool_size_]
+            ->enqueue([&, i, this]() -> int {
+              if (this->status == GraphSamplerStatus::terminating) return 0;
+              paddle::framework::GpuPsGraphNode node;
+              std::vector<Node *> &v =
+                  this->graph_table->shards[i]->get_bucket();
+              size_t ind = i % this->graph_table->task_pool_size_;
+              for (size_t j = 0; j < v.size(); j++) {
+                size_t location = v[j]->get_id() % this->gpu_num;
+                node.node_id = v[j]->get_id();
+                node.neighbor_size = v[j]->get_neighbor_size();
+                node.neighbor_offset =
+                    (int)sample_neighbors_ex[ind][location].size();
+                sample_nodes_ex[ind][location].emplace_back(node);
+                for (int k = 0; k < node.neighbor_size; k++)
+                  sample_neighbors_ex[ind][location].push_back(
+                      v[j]->get_neighbor_id(k));
+              }
+              return 0;
+            }));
+  }
+  for (size_t i = 0; i < tasks.size(); i++) tasks[i].get();
+  tasks.clear();
+  for (size_t i = 0; i < gpu_num; i++) {
+    tasks.push_back(
+        graph_table->_shards_task_pool[i % graph_table->task_pool_size_]
+            ->enqueue([&, i, this]() -> int {
+              if (this->status == GraphSamplerStatus::terminating) return 0;
+              int total_offset = 0;
+              size_t ind = i % this->graph_table->task_pool_size_;
+              for (int j = 0; j < this->graph_table->task_pool_size_; j++) {
+                for (size_t k = 0; k < sample_nodes_ex[j][ind].size(); k++) {
+                  sample_nodes[ind].push_back(sample_nodes_ex[j][ind][k]);
+                  sample_nodes[ind].back().neighbor_offset += total_offset;
+                }
+                size_t neighbor_size = sample_neighbors_ex[j][ind].size();
+                total_offset += neighbor_size;
+                for (size_t k = 0; k < neighbor_size; k++) {
+                  sample_neighbors[ind].push_back(
+                      sample_neighbors_ex[j][ind][k]);
+                }
+              }
+              return 0;
+            }));
+  }
+  for (size_t i = 0; i < tasks.size(); i++) tasks[i].get();
+
+  if (this->status == GraphSamplerStatus::terminating) {
+    pthread_rwlock_unlock(rw_lock);
+    return 0;
+  }
+  for (size_t i = 0; i < gpu_num; i++) {
+    sample_res[i].node_list = sample_nodes[i].data();
+    sample_res[i].neighbor_list = sample_neighbors[i].data();
+    sample_res[i].node_size = sample_nodes[i].size();
+    sample_res[i].neighbor_size = sample_neighbors[i].size();
+  }
+  pthread_rwlock_unlock(rw_lock);
+  if (this->status == GraphSamplerStatus::terminating) {
+    return 0;
+  }
+  callback(sample_res);
+  return 0;
+}
+void CompleteGraphSampler::init(size_t gpu_num, GraphTable *graph_table,
+                                std::vector<std::string> args) {
+  this->gpu_num = gpu_num;
+  this->graph_table = graph_table;
+}
+
+int BasicBfsGraphSampler::run_graph_sampling() {
+  pthread_rwlock_t *rw_lock = graph_table->rw_lock.get();
+  pthread_rwlock_rdlock(rw_lock);
+  while (rounds > 0 && status == GraphSamplerStatus::running) {
+    for (size_t i = 0; i < sample_neighbors_map.size(); i++) {
+      sample_neighbors_map[i].clear();
+    }
+    sample_neighbors_map.clear();
+    std::vector<int> nodes_left(graph_table->shards.size(),
+                                node_num_for_each_shard);
+    std::promise<int> prom;
+    std::future<int> fut = prom.get_future();
+    sample_neighbors_map.resize(graph_table->task_pool_size_);
+    int task_size = 0;
+    std::vector<std::future<int>> tasks;
+    int init_size = 0;
+    //__sync_fetch_and_add
+    std::function<int(int, int)> bfs = [&, this](int i, int id) -> int {
+      VLOG(0) << "in bfs " << i << " " << id;
+      if (this->status == GraphSamplerStatus::terminating) {
+        int task_left = __sync_sub_and_fetch(&task_size, 1);
+        if (task_left == 0) {
+          prom.set_value(0);
+        }
+        return 0;
+      }
+      size_t ind = i % this->graph_table->task_pool_size_;
+      if (nodes_left[i] > 0) {
+        nodes_left[i]--;
+        auto iter = sample_neighbors_map[ind].find(id);
+        if (iter == sample_neighbors_map[ind].end()) {
+          sample_neighbors_map[ind][id] = std::vector<int64_t>();
+          iter = sample_neighbors_map[ind].find(id);
+          Node *node = graph_table->shards[i]->find_node(id);
+          if (node != NULL) {
+            size_t edge_fetch_size =
+                std::min((size_t) this->edge_num_for_each_node,
+                         node->get_neighbor_size());
+            for (size_t k = 0; k < edge_fetch_size; k++) {
+              int64_t neighbor_id = node->get_neighbor_id(k);
+              int node_location = neighbor_id % this->graph_table->shard_num %
+                                  this->graph_table->task_pool_size_;
+              __sync_add_and_fetch(&task_size, 1);
+              graph_table->_shards_task_pool[node_location]->enqueue(
+                  bfs, neighbor_id % this->graph_table->shard_num, neighbor_id);
+              iter->second.push_back(neighbor_id);
+            }
+          }
+        }
+      }
+      int task_left = __sync_sub_and_fetch(&task_size, 1);
+      if (task_left == 0) {
+        prom.set_value(0);
+      }
+      return 0;
+    };
+    for (size_t i = 0; i < graph_table->shards.size(); ++i) {
+      std::vector<Node *> &v = graph_table->shards[i]->get_bucket();
+      if (v.size() > 0) {
+        init_size++;
+        __sync_add_and_fetch(&task_size, 1);
+        int64_t id = v[0]->get_id();
+        graph_table->_shards_task_pool[i % graph_table->task_pool_size_]
+            ->enqueue(bfs, i, id);
+      }  // if
+    }
+    if (init_size == 0) {
+      prom.set_value(0);
+    }
+    fut.get();
+    if (this->status == GraphSamplerStatus::terminating) {
+      pthread_rwlock_unlock(rw_lock);
+      return 0;
+    }
+    std::cout << "bfs over" << std::endl;
+    sample_nodes.clear();
+    sample_neighbors.clear();
+    sample_res.clear();
+    sample_nodes.resize(gpu_num);
+    sample_neighbors.resize(gpu_num);
+    sample_res.resize(gpu_num);
+    std::vector<std::vector<std::vector<paddle::framework::GpuPsGraphNode>>>
+        sample_nodes_ex(graph_table->task_pool_size_);
+    std::vector<std::vector<std::vector<int64_t>>> sample_neighbors_ex(
+        graph_table->task_pool_size_);
+    for (int i = 0; i < graph_table->task_pool_size_; i++) {
+      sample_nodes_ex[i].resize(gpu_num);
+      sample_neighbors_ex[i].resize(gpu_num);
+    }
+    tasks.clear();
+    for (size_t i = 0; i < (size_t)graph_table->task_pool_size_; ++i) {
+      tasks.push_back(
+          graph_table->_shards_task_pool[i]->enqueue([&, i, this]() -> int {
+            if (this->status == GraphSamplerStatus::terminating) {
+              return 0;
+            }
+            paddle::framework::GpuPsGraphNode node;
+            auto iter = sample_neighbors_map[i].begin();
+            size_t ind = i;
+            for (; iter != sample_neighbors_map[i].end(); iter++) {
+              size_t location = iter->first % this->gpu_num;
+              node.node_id = iter->first;
+              node.neighbor_size = iter->second.size();
+              node.neighbor_offset =
+                  (int)sample_neighbors_ex[ind][location].size();
+              sample_nodes_ex[ind][location].emplace_back(node);
+              for (auto k : iter->second)
+                sample_neighbors_ex[ind][location].push_back(k);
+            }
+            return 0;
+          }));
+    }
+
+    for (size_t i = 0; i < tasks.size(); i++) {
+      tasks[i].get();
+      sample_neighbors_map[i].clear();
+    }
+    tasks.clear();
+    if (this->status == GraphSamplerStatus::terminating) {
+      pthread_rwlock_unlock(rw_lock);
+      return 0;
+    }
+    for (size_t i = 0; i < gpu_num; i++) {
+      tasks.push_back(
+          graph_table->_shards_task_pool[i % graph_table->task_pool_size_]
+              ->enqueue([&, i, this]() -> int {
+                if (this->status == GraphSamplerStatus::terminating) {
+                  pthread_rwlock_unlock(rw_lock);
+                  return 0;
+                }
+                int total_offset = 0;
+                size_t ind = i % graph_table->task_pool_size_;
+                for (int j = 0; j < this->graph_table->task_pool_size_; j++) {
+                  for (size_t k = 0; k < sample_nodes_ex[j][ind].size(); k++) {
+                    sample_nodes[i].push_back(sample_nodes_ex[j][ind][k]);
+                    sample_nodes[i].back().neighbor_offset += total_offset;
+                    // neighbor_offset[i].push_back(total_offset +
+                    //                              neighbor_offset_ex[j][i][k]);
+                  }
+                  size_t neighbor_size = sample_neighbors_ex[j][ind].size();
+                  total_offset += neighbor_size;
+                  for (size_t k = 0; k < neighbor_size; k++) {
+                    sample_neighbors[ind].push_back(
+                        sample_neighbors_ex[j][ind][k]);
+                  }
+                }
+                return 0;
+              }));
+    }
+    for (size_t i = 0; i < tasks.size(); i++) tasks[i].get();
+    if (this->status == GraphSamplerStatus::terminating) {
+      pthread_rwlock_unlock(rw_lock);
+      return 0;
+    }
+    // int64_t total_neighbors =
+    // std::accumulate(shard_neighbor_size.begin(),shard_neighbor_size.end(),0);
+    for (size_t i = 0; i < gpu_num; i++) {
+      sample_res[i].node_list = sample_nodes[i].data();
+      sample_res[i].neighbor_list = sample_neighbors[i].data();
+      sample_res[i].node_size = sample_nodes[i].size();
+      sample_res[i].neighbor_size = sample_neighbors[i].size();
+    }
+    pthread_rwlock_unlock(rw_lock);
+    if (this->status == GraphSamplerStatus::terminating) {
+      return 0;
+    }
+    callback(sample_res);
+    rounds--;
+    if (rounds > 0) {
+      for (int i = 0;
+           i < interval && this->status == GraphSamplerStatus::running; i++) {
+        std::this_thread::sleep_for(std::chrono::seconds(1));
+      }
+    }
+  }
+  return 0;
+}
+void BasicBfsGraphSampler::init(size_t gpu_num, GraphTable *graph_table,
+                                std::vector<std::string> args) {
+  this->gpu_num = gpu_num;
+  this->graph_table = graph_table;
+  node_num_for_each_shard = args.size() > 0 ? std::stoi(args[0]) : 10;
+  edge_num_for_each_node = args.size() > 1 ? std::stoi(args[1]) : 10;
+  rounds = args.size() > 2 ? std::stoi(args[2]) : 1;
+  interval = args.size() > 3 ? std::stoi(args[3]) : 60;
+}
+
+#endif
+
 std::vector<Node *> GraphShard::get_batch(int start, int end, int step) {
   if (start < 0) start = 0;
   std::vector<Node *> res;
@@ -38,10 +320,10 @@ std::vector<Node *> GraphShard::get_batch(int start, int end, int step) {
 
 size_t GraphShard::get_size() { return bucket.size(); }
 
-int32_t GraphTable::add_graph_node(std::vector<uint64_t> &id_list,
+int32_t GraphTable::add_graph_node(std::vector<int64_t> &id_list,
                                    std::vector<bool> &is_weight_list) {
   size_t node_size = id_list.size();
-  std::vector<std::vector<std::pair<uint64_t, bool>>> batch(task_pool_size_);
+  std::vector<std::vector<std::pair<int64_t, bool>>> batch(task_pool_size_);
   for (size_t i = 0; i < node_size; i++) {
     size_t shard_id = id_list[i] % shard_num;
     if (shard_id >= shard_end || shard_id < shard_start) {
@@ -65,9 +347,9 @@ int32_t GraphTable::add_graph_node(std::vector<uint64_t> &id_list,
   return 0;
 }
 
-int32_t GraphTable::remove_graph_node(std::vector<uint64_t> &id_list) {
+int32_t GraphTable::remove_graph_node(std::vector<int64_t> &id_list) {
   size_t node_size = id_list.size();
-  std::vector<std::vector<uint64_t>> batch(task_pool_size_);
+  std::vector<std::vector<int64_t>> batch(task_pool_size_);
   for (size_t i = 0; i < node_size; i++) {
     size_t shard_id = id_list[i] % shard_num;
     if (shard_id >= shard_end || shard_id < shard_start) continue;
@@ -98,7 +380,7 @@ void GraphShard::clear() {
 
 GraphShard::~GraphShard() { clear(); }
 
-void GraphShard::delete_node(uint64_t id) {
+void GraphShard::delete_node(int64_t id) {
   auto iter = node_location.find(id);
   if (iter == node_location.end()) return;
   int pos = iter->second;
@@ -110,7 +392,7 @@ void GraphShard::delete_node(int64_t id) {
   node_location.erase(id);
   bucket.pop_back();
 }
-GraphNode *GraphShard::add_graph_node(uint64_t id) {
+GraphNode *GraphShard::add_graph_node(int64_t id) {
   if (node_location.find(id) == node_location.end()) {
     node_location[id] = bucket.size();
     bucket.push_back(new GraphNode(id));
@@ -126,7 +408,7 @@ GraphNode *GraphShard::add_graph_node(Node *node) {
   }
   return (GraphNode *)bucket[node_location[id]];
 }
-FeatureNode *GraphShard::add_feature_node(uint64_t id) {
+FeatureNode *GraphShard::add_feature_node(int64_t id) {
   if (node_location.find(id) == node_location.end()) {
     node_location[id] = bucket.size();
     bucket.push_back(new FeatureNode(id));
@@ -134,11 +416,11 @@ FeatureNode *GraphShard::add_feature_node(int64_t id) {
   }
   return (FeatureNode *)bucket[node_location[id]];
 }
 
-void GraphShard::add_neighbor(uint64_t id, uint64_t dst_id, float weight) {
+void GraphShard::add_neighbor(int64_t id, int64_t dst_id, float weight) {
   find_node(id)->add_edge(dst_id, weight);
 }
 
-Node *GraphShard::find_node(uint64_t id) {
+Node *GraphShard::find_node(int64_t id) {
   auto iter = node_location.find(id);
   return iter == node_location.end() ? nullptr : bucket[iter->second];
 }
@@ -185,14 +467,14 @@ int32_t GraphTable::load(const std::string &path, const std::string &param) {
 }
 
 int32_t GraphTable::get_nodes_ids_by_ranges(
-    std::vector<std::pair<int, int>> ranges, std::vector<uint64_t> &res) {
+    std::vector<std::pair<int, int>> ranges, std::vector<int64_t> &res) {
   int start = 0, end, index = 0, total_size = 0;
   res.clear();
-  std::vector<std::future<std::vector<uint64_t>>> tasks;
+  std::vector<std::future<std::vector<int64_t>>> tasks;
   for (size_t i = 0; i < shards.size() && index < (int)ranges.size(); i++) {
     end = total_size + shards[i]->get_size();
     start = total_size;
-    while (start < end && index < ranges.size()) {
+    while (start < end && index < (int)ranges.size()) {
       if (ranges[index].second <= start)
         index++;
       else if (ranges[index].first >= end) {
@@ -204,7 +486,7 @@ int32_t GraphTable::get_nodes_ids_by_ranges(
         first -= total_size;
         second -= total_size;
         tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue(
-            [this, first, second, i]() -> std::vector<uint64_t> {
+            [this, first, second, i]() -> std::vector<int64_t> {
               return shards[i]->get_ids_by_range(first, second);
             }));
       }
@@ -276,6 +558,9 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) {
 }
 
 int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) {
+#ifdef PADDLE_WITH_HETERPS
+  if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get());
+#endif
   auto paths = paddle::string::split_string<std::string>(path, ";");
   int64_t count = 0;
   std::string sample_type = "random";
@@ -351,6 +636,13 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) {
   /*-----------------------
   relocate the duplicate nodes to make them distributed evenly among threads.
 */
+  if (!use_duplicate_nodes) {
+#ifdef PADDLE_WITH_HETERPS
+    if (gpups_mode) pthread_rwlock_unlock(rw_lock.get());
+#endif
+
+    return 0;
+  }
   for (auto &shard : extra_shards) {
     auto bucket = shard->get_bucket();
     for (size_t i = 0; i < bucket.size(); i++) {
@@ -360,13 +652,13 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) {
   int size = extra_nodes_to_thread_index.size();
   if (size == 0) return 0;
   std::vector<int> index;
-  for (int i = 0; i < used.size(); i++) index.push_back(i);
+  for (int i = 0; i < (int)used.size(); i++) index.push_back(i);
   sort(index.begin(), index.end(),
       [&](int &a, int &b) { return used[a] < used[b]; });
 
   std::vector<int> alloc(index.size(), 0), has_alloc(index.size(), 0);
   int t = 1, aim = 0, mod = 0;
-  for (; t < used.size(); t++) {
+  for (; t < (int)used.size(); t++) {
     if ((used[index[t]] - used[index[t - 1]]) * t >= size) {
       break;
     } else {
@@ -380,7 +672,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) {
     if (t - x <= mod) alloc[index[x]]++;
     alloc[index[x]] -= used[index[x]];
   }
-  std::vector<uint64_t> vec[index.size()];
+  std::vector<int64_t> vec[index.size()];
   for (auto p : extra_nodes_to_thread_index) {
     has_alloc[p.second]++;
     vec[p.second].push_back(p.first);
@@ -395,7 +687,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) {
              has_alloc[index[right]] - alloc[index[right]]);
     has_alloc[index[left]] += x;
     has_alloc[index[right]] -= x;
-    uint64_t id;
+    int64_t id;
     while (x--) {
       id = vec[index[right]].back();
       vec[index[right]].pop_back();
@@ -424,10 +716,13 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) {
     delete extra_shards[i];
     extra_shards[i] = extra_shards_copy[i];
   }
+#ifdef PADDLE_WITH_HETERPS
+  if (gpups_mode) pthread_rwlock_unlock(rw_lock.get());
+#endif
   return 0;
 }
 
-Node *GraphTable::find_node(uint64_t id) {
+Node *GraphTable::find_node(int64_t id) {
   size_t shard_id = id % shard_num;
   if (shard_id >= shard_end || shard_id < shard_start) {
     if (use_duplicate_nodes == false || extra_nodes_to_thread_index.size() == 0)
@@ -443,7 +738,7 @@ Node *GraphTable::find_node(int64_t id) {
   Node *node = shards[index]->find_node(id);
   return node;
 }
-uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) {
+uint32_t GraphTable::get_thread_pool_index(int64_t node_id) {
   if (use_duplicate_nodes == false || extra_nodes_to_thread_index.size() == 0)
     return node_id % shard_num % shard_num_per_server % task_pool_size_;
   size_t src_shard_id = node_id % shard_num;
@@ -456,8 +751,7 @@ uint32_t GraphTable::get_thread_pool_index(int64_t node_id) {
   return src_shard_id % shard_num_per_server % task_pool_size_;
 }
 
-uint32_t GraphTable::get_thread_pool_index_by_shard_index(
-    uint64_t shard_index) {
+uint32_t GraphTable::get_thread_pool_index_by_shard_index(int64_t shard_index) {
   return shard_index % shard_num_per_server % task_pool_size_;
 }
 
@@ -484,7 +778,7 @@ int32_t GraphTable::random_sample_nodes(int sample_size,
                                         std::unique_ptr<char[]> &buffer,
                                         int &actual_size) {
   int total_size = 0;
-  for (int i = 0; i < shards.size(); i++) {
+  for (int i = 0; i < (int)shards.size(); i++) {
    total_size += shards[i]->get_size();
   }
   if (sample_size > total_size) sample_size = total_size;
@@ -537,16 +831,16 @@ int32_t GraphTable::random_sample_nodes(int sample_size,
     }
   }
   for (auto &pair : first_half) second_half.push_back(pair);
-  std::vector<uint64_t> res;
+  std::vector<int64_t> res;
   get_nodes_ids_by_ranges(second_half, res);
-  actual_size = res.size() * sizeof(uint64_t);
+  actual_size = res.size() * sizeof(int64_t);
   buffer.reset(new char[actual_size]);
   char *pointer = buffer.get();
   memcpy(pointer, res.data(), actual_size);
   return 0;
 }
 int32_t GraphTable::random_sample_neighbors(
-    uint64_t *node_ids, int sample_size,
+    int64_t *node_ids, int sample_size,
     std::vector<std::shared_ptr<char>> &buffers,
     std::vector<int> &actual_sizes, bool need_weight) {
   size_t node_num = buffers.size();
@@ -560,10 +854,10 @@ int32_t GraphTable::random_sample_neighbors(
     seq_id[index].emplace_back(idx);
     id_list[index].emplace_back(node_ids[idx], sample_size, need_weight);
   }
-  for (int i = 0; i < seq_id.size(); i++) {
+  for (int i = 0; i < (int)seq_id.size(); i++) {
     if (seq_id[i].size() == 0) continue;
     tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int {
-      uint64_t node_id;
+      int64_t node_id;
       std::vector<std::pair<SampleKey, SampleResult>> r;
       LRUResponse response = LRUResponse::blocked;
       if (use_cache) {
@@ -576,7 +870,7 @@ int32_t GraphTable::random_sample_neighbors(
         std::vector<SampleKey> sample_keys;
         auto &rng = _shards_task_rng_pool[i];
         for (size_t k = 0; k < id_list[i].size(); k++) {
-          if (index < r.size() &&
+          if (index < (int)r.size() &&
              r[index].first.node_key == id_list[i][k].node_key) {
             idx = seq_id[i][k];
             actual_sizes[idx] = r[index].second.actual_size;
@@ -597,7 +891,7 @@ int32_t GraphTable::random_sample_neighbors(
           res.size() * (need_weight ? (Node::id_size + Node::weight_size)
                                     : Node::id_size);
       int offset = 0;
-      uint64_t id;
+      int64_t id;
       float weight;
       char *buffer_addr = new char[actual_size];
       if (response == LRUResponse::ok) {
@@ -632,13 +926,13 @@ int32_t GraphTable::random_sample_neighbors(
   return 0;
 }
 
-int32_t GraphTable::get_node_feat(const std::vector<uint64_t> &node_ids,
+int32_t GraphTable::get_node_feat(const std::vector<int64_t> &node_ids,
                                   const std::vector<std::string> &feature_names,
                                   std::vector<std::vector<std::string>> &res) {
   size_t node_num = node_ids.size();
   std::vector<std::future<int>> tasks;
   for (size_t idx = 0; idx < node_num; ++idx) {
-    uint64_t node_id = node_ids[idx];
+    int64_t node_id = node_ids[idx];
     tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue(
         [&, idx, node_id]() -> int {
           Node *node = find_node(node_id);
@@ -646,7 +940,8 @@ int32_t GraphTable::get_node_feat(const std::vector<int64_t> &node_ids,
           if (node == nullptr) {
            return 0;
           }
-          for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) {
+          for (int feat_idx = 0; feat_idx < (int)feature_names.size();
+               ++feat_idx) {
             const std::string &feature_name = feature_names[feat_idx];
             if (feat_id_map.find(feature_name) != feat_id_map.end()) {
               // res[feat_idx][idx] =
@@ -665,19 +960,20 @@ int32_t GraphTable::get_node_feat(const std::vector<int64_t> &node_ids,
 }
 
 int32_t GraphTable::set_node_feat(
-    const std::vector<uint64_t> &node_ids,
+    const std::vector<int64_t> &node_ids,
     const std::vector<std::string> &feature_names,
     const std::vector<std::vector<std::string>> &res) {
   size_t node_num = node_ids.size();
   std::vector<std::future<int>> tasks;
   for (size_t idx = 0; idx < node_num; ++idx) {
-    uint64_t node_id = node_ids[idx];
+    int64_t node_id = node_ids[idx];
     tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue(
         [&, idx, node_id]() -> int {
           size_t index = node_id % this->shard_num - this->shard_start;
           auto node = shards[index]->add_feature_node(node_id);
           node->set_feature_size(this->feat_name.size());
-          for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) {
+          for (int feat_idx = 0; feat_idx < (int)feature_names.size();
+               ++feat_idx) {
             const std::string &feature_name = feature_names[feat_idx];
             if (feat_id_map.find(feature_name) != feat_id_map.end()) {
               node->set_feature(feat_id_map[feature_name], res[feat_idx][idx]);
@@ -771,35 +1067,68 @@ int32_t GraphTable::pull_graph_list(int start, int total_size,
   return 0;
 }
 
-int32_t GraphTable::get_server_index_by_id(uint64_t id) {
+int32_t GraphTable::get_server_index_by_id(int64_t id) {
   return id % shard_num / shard_num_per_server;
 }
+int32_t GraphTable::initialize(const TableParameter &config,
+                               const FsClientParameter &fs_config) {
+  LOG(INFO) << "in graphTable initialize";
+  _config = config;
+  if (initialize_accessor() != 0) {
+    LOG(WARNING) << "Table accessor initialize failed";
+    return -1;
+  }
 
-int32_t GraphTable::initialize() {
+  if (_afs_client.initialize(fs_config) != 0) {
+    LOG(WARNING) << "Table fs_client initialize failed";
+    // return -1;
+  }
+  auto graph = config.graph_parameter();
+  shard_num = _config.shard_num();
+  LOG(INFO) << "in graphTable initialize over";
+  return initialize(graph);
+}
+int32_t GraphTable::initialize(const GraphParameter &graph) {
+#ifdef PADDLE_WITH_HETERPS
+  if (graph.gpups_mode()) {
+    gpups_mode = true;
+    if (shard_num == 0) {
+      shard_num = graph.gpups_mode_shard_num();
+      server_num = 1;
+      _shard_idx = 0;
+    }
+    auto *sampler =
+        CREATE_PSCORE_CLASS(GraphSampler, graph.gpups_graph_sample_class());
+    auto slices =
+        string::split_string<std::string>(graph.gpups_graph_sample_args(), ",");
+    std::cout << "slices" << std::endl;
+    for (auto x : slices) std::cout << x << std::endl;
+    sampler->init(graph.gpu_num(), this, slices);
+    graph_sampler.reset(sampler);
+  }
+#endif
+  task_pool_size_ = graph.task_pool_size();
   _shards_task_pool.resize(task_pool_size_);
   for (size_t i = 0; i < _shards_task_pool.size(); ++i) {
     _shards_task_pool[i].reset(new ::ThreadPool(1));
     _shards_task_rng_pool.push_back(paddle::framework::GetCPURandomEngine(0));
   }
-  server_num = _shard_num;
-  // VLOG(0) << "in init graph table server num = " << server_num;
-  /*
-  _shard_num is actually server number here
-  when a server initialize its tables, it sets tables' _shard_num to server_num,
-  and _shard_idx to server
-  rank
-  */
-  auto common = _config.common();
-
-  this->table_name = common.table_name();
-  this->table_type = common.name();
+  auto graph_feature = graph.graph_feature();
+  // this->table_name = common.table_name();
+  // this->table_type = common.name();
+  this->table_name = graph.table_name();
+  this->table_type = graph.table_type();
   VLOG(0) << " init graph table type " << this->table_type << " table name "
           << this->table_name;
-  int feat_conf_size = static_cast<int>(common.attributes().size());
+  // int feat_conf_size = static_cast<int>(common.attributes().size());
+  int feat_conf_size = static_cast<int>(graph_feature.name().size());
   for (int i = 0; i < feat_conf_size; i++) {
-    auto &f_name = common.attributes()[i];
-    auto &f_shape = common.dims()[i];
-    auto &f_dtype = common.params()[i];
+    // auto &f_name = common.attributes()[i];
+    // auto &f_shape = common.dims()[i];
+    // auto &f_dtype = common.params()[i];
+    auto &f_name = graph_feature.name()[i];
+    auto &f_shape = graph_feature.shape()[i];
+    auto &f_dtype = graph_feature.dtype()[i];
     this->feat_name.push_back(f_name);
     this->feat_shape.push_back(f_shape);
     this->feat_dtype.push_back(f_dtype);
@@ -807,8 +1136,6 @@ int32_t GraphTable::initialize() {
     VLOG(0) << "init graph table feat conf name:" << f_name
             << " shape:" << f_shape << " dtype:" << f_dtype;
   }
-
-  shard_num = _config.shard_num();
   VLOG(0) << "in init graph table shard num = " << shard_num << " shard_idx"
           << _shard_idx;
   shard_num_per_server = sparse_local_shard_num(shard_num, server_num);
@@ -826,5 +1153,6 @@ int32_t GraphTable::initialize() {
   return 0;
 }
+
 }  // namespace distributed
 };  // namespace paddle
diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h
index c76a62248c8fcab677d3afd8b3985700ca5f2f33..7946569525cc4bb1351046632dfe5894611c4b67 100644
--- a/paddle/fluid/distributed/ps/table/common_graph_table.h
+++ b/paddle/fluid/distributed/ps/table/common_graph_table.h
@@ -38,10 +38,14 @@
 #include
 #include "paddle/fluid/distributed/ps/table/accessor.h"
 #include "paddle/fluid/distributed/ps/table/common_table.h"
+#include "paddle/fluid/distributed/ps/table/graph/class_macro.h"
 #include "paddle/fluid/distributed/ps/table/graph/graph_node.h"
 #include "paddle/fluid/string/string_helper.h"
 #include "paddle/phi/core/utils/rw_lock.h"
 
+#ifdef PADDLE_WITH_HETERPS
+#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h"
+#endif
 namespace paddle {
 namespace distributed {
 class GraphShard {
@@ -51,37 +55,37 @@ class GraphShard {
   ~GraphShard();
   std::vector<Node *> &get_bucket() { return bucket; }
   std::vector<Node *> get_batch(int start, int end, int step);
-  std::vector<uint64_t> get_ids_by_range(int start, int end) {
-    std::vector<uint64_t> res;
+  std::vector<int64_t> get_ids_by_range(int start, int end) {
+    std::vector<int64_t> res;
     for (int i = start; i < end && i < (int)bucket.size(); i++) {
       res.push_back(bucket[i]->get_id());
     }
     return res;
   }
*add_graph_node(uint64_t id); + GraphNode *add_graph_node(int64_t id); GraphNode *add_graph_node(Node *node); - FeatureNode *add_feature_node(uint64_t id); - Node *find_node(uint64_t id); - void delete_node(uint64_t id); + FeatureNode *add_feature_node(int64_t id); + Node *find_node(int64_t id); + void delete_node(int64_t id); void clear(); - void add_neighbor(uint64_t id, uint64_t dst_id, float weight); - std::unordered_map &get_node_location() { + void add_neighbor(int64_t id, int64_t dst_id, float weight); + std::unordered_map &get_node_location() { return node_location; } private: - std::unordered_map node_location; + std::unordered_map node_location; std::vector bucket; }; enum LRUResponse { ok = 0, blocked = 1, err = 2 }; struct SampleKey { - uint64_t node_key; + int64_t node_key; size_t sample_size; bool is_weighted; - SampleKey(uint64_t _node_key, size_t _sample_size, bool _is_weighted) + SampleKey(int64_t _node_key, size_t _sample_size, bool _is_weighted) : node_key(_node_key), sample_size(_sample_size), is_weighted(_is_weighted) {} @@ -300,7 +304,7 @@ class ScaledLRU { node_size += lru_pool[i].node_size - lru_pool[i].remove_count; } - if (node_size <= size_t(1.1 * size_limit) + 1) return 0; + if ((size_t)node_size <= size_t(1.1 * size_limit) + 1) return 0; if (pthread_rwlock_wrlock(&rwlock) == 0) { // VLOG(0)<"in shrink\n"; global_count = 0; @@ -308,9 +312,9 @@ class ScaledLRU { global_count += lru_pool[i].node_size - lru_pool[i].remove_count; } // VLOG(0)<<"global_count "< size_limit) { + if ((size_t)global_count > size_limit) { size_t remove = global_count - size_limit; - for (int i = 0; i < lru_pool.size(); i++) { + for (size_t i = 0; i < lru_pool.size(); i++) { lru_pool[i].total_diff = 0; lru_pool[i].remove_count += 1.0 * (lru_pool[i].node_size - lru_pool[i].remove_count) / @@ -352,9 +356,69 @@ class ScaledLRU { friend class RandomSampleLRU; }; +#ifdef PADDLE_WITH_HETERPS +enum GraphSamplerStatus { waiting = 0, running = 1, terminating = 2 }; +class GraphTable; +class GraphSampler { + public: + GraphSampler() { + status = GraphSamplerStatus::waiting; + thread_pool.reset(new ::ThreadPool(1)); + callback = [](std::vector &res) { + return; + }; + } + virtual int run_graph_sampling() = 0; + virtual int start_graph_sampling() { + if (status != GraphSamplerStatus::waiting) { + return -1; + } + std::promise prom; + std::future fut = prom.get_future(); + graph_sample_task_over = thread_pool->enqueue([&prom, this]() { + prom.set_value(0); + status = GraphSamplerStatus::running; + return run_graph_sampling(); + }); + return fut.get(); + } + virtual void init(size_t gpu_num, GraphTable *graph_table, + std::vector args) = 0; + virtual void set_graph_sample_callback( + std::function &)> + callback) { + this->callback = callback; + } + + virtual int end_graph_sampling() { + if (status == GraphSamplerStatus::running) { + status = GraphSamplerStatus::terminating; + return graph_sample_task_over.get(); + } + return -1; + } + virtual GraphSamplerStatus get_graph_sampler_status() { return status; } + + protected: + std::function &)> + callback; + std::shared_ptr<::ThreadPool> thread_pool; + GraphSamplerStatus status; + std::future graph_sample_task_over; + std::vector sample_res; +}; +#endif + class GraphTable : public SparseTable { public: - GraphTable() { use_cache = false; } + GraphTable() { + use_cache = false; + shard_num = 0; +#ifdef PADDLE_WITH_HETERPS + gpups_mode = false; +#endif + rw_lock.reset(new pthread_rwlock_t()); + } virtual ~GraphTable(); virtual int32_t pull_graph_list(int 
start, int size, std::unique_ptr<char[]> &buffer,
@@ -362,7 +426,7 @@ class GraphTable : public SparseTable {
                           int step);
 
   virtual int32_t random_sample_neighbors(
-      uint64_t *node_ids, int sample_size,
+      int64_t *node_ids, int sample_size,
       std::vector<std::shared_ptr<char>> &buffers,
       std::vector<int> &actual_sizes, bool need_weight);
 
@@ -370,9 +434,11 @@ class GraphTable : public SparseTable {
                                       int &actual_sizes);
 
   virtual int32_t get_nodes_ids_by_ranges(
-      std::vector<std::pair<int, int>> ranges, std::vector<uint64_t> &res);
-  virtual int32_t initialize();
-
+      std::vector<std::pair<int, int>> ranges, std::vector<int64_t> &res);
+  virtual int32_t initialize() { return 0; }
+  virtual int32_t initialize(const TableParameter &config,
+                             const FsClientParameter &fs_config);
+  virtual int32_t initialize(const GraphParameter &config);
   int32_t load(const std::string &path, const std::string &param);
   int32_t load_graph_split_config(const std::string &path);
 
@@ -380,13 +446,13 @@ class GraphTable : public SparseTable {
 
   int32_t load_nodes(const std::string &path, std::string node_type);
 
-  int32_t add_graph_node(std::vector<uint64_t> &id_list,
+  int32_t add_graph_node(std::vector<int64_t> &id_list,
                          std::vector<bool> &is_weight_list);
 
-  int32_t remove_graph_node(std::vector<uint64_t> &id_list);
+  int32_t remove_graph_node(std::vector<int64_t> &id_list);
 
-  int32_t get_server_index_by_id(uint64_t id);
-  Node *find_node(uint64_t id);
+  int32_t get_server_index_by_id(int64_t id);
+  Node *find_node(int64_t id);
 
   virtual int32_t pull_sparse(float *values,
                               const PullSparseValue &pull_value) {
@@ -407,16 +473,27 @@ class GraphTable : public SparseTable {
     return 0;
   }
   virtual int32_t initialize_shard() { return 0; }
-  virtual uint32_t get_thread_pool_index_by_shard_index(uint64_t shard_index);
-  virtual uint32_t get_thread_pool_index(uint64_t node_id);
+  virtual int32_t set_shard(size_t shard_idx, size_t server_num) {
+    _shard_idx = shard_idx;
+    /*
+      _shard_num is not used in graph_table; the assignment below is only
+      kept to stay compatible with the base class table.
+ */ + _shard_num = server_num; + this->server_num = server_num; + return 0; + } + virtual uint32_t get_thread_pool_index_by_shard_index(int64_t shard_index); + virtual uint32_t get_thread_pool_index(int64_t node_id); virtual std::pair parse_feature(std::string feat_str); - virtual int32_t get_node_feat(const std::vector &node_ids, + virtual int32_t get_node_feat(const std::vector &node_ids, const std::vector &feature_names, std::vector> &res); virtual int32_t set_node_feat( - const std::vector &node_ids, + const std::vector &node_ids, const std::vector &feature_names, const std::vector> &res); @@ -433,11 +510,25 @@ class GraphTable : public SparseTable { } return 0; } - +#ifdef PADDLE_WITH_HETERPS + virtual int32_t start_graph_sampling() { + return this->graph_sampler->start_graph_sampling(); + } + virtual int32_t end_graph_sampling() { + return this->graph_sampler->end_graph_sampling(); + } + virtual int32_t set_graph_sample_callback( + std::function &)> + callback) { + graph_sampler->set_graph_sample_callback(callback); + return 0; + } +// virtual GraphSampler *get_graph_sampler() { return graph_sampler.get(); } +#endif protected: std::vector shards, extra_shards; size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num; - const int task_pool_size_ = 24; + int task_pool_size_ = 24; const int random_sample_nodes_ranges = 3; std::vector feat_name; @@ -450,11 +541,61 @@ class GraphTable : public SparseTable { std::vector> _shards_task_pool; std::vector> _shards_task_rng_pool; std::shared_ptr> scaled_lru; - std::unordered_set extra_nodes; - std::unordered_map extra_nodes_to_thread_index; + std::unordered_set extra_nodes; + std::unordered_map extra_nodes_to_thread_index; bool use_cache, use_duplicate_nodes; mutable std::mutex mutex_; + std::shared_ptr rw_lock; +#ifdef PADDLE_WITH_HETERPS + // paddle::framework::GpuPsGraphTable gpu_graph_table; + bool gpups_mode; + // std::shared_ptr<::ThreadPool> graph_sample_pool; + std::shared_ptr graph_sampler; + REGISTER_GRAPH_FRIEND_CLASS(2, CompleteGraphSampler, BasicBfsGraphSampler) +#endif +}; + +#ifdef PADDLE_WITH_HETERPS +REGISTER_PSCORE_REGISTERER(GraphSampler); +class CompleteGraphSampler : public GraphSampler { + public: + CompleteGraphSampler() {} + ~CompleteGraphSampler() {} + // virtual pthread_rwlock_t *export_rw_lock(); + virtual int run_graph_sampling(); + virtual void init(size_t gpu_num, GraphTable *graph_table, + std::vector args_); + + protected: + GraphTable *graph_table; + std::vector> sample_nodes; + std::vector> sample_neighbors; + // std::vector sample_res; + // std::shared_ptr random; + int gpu_num; +}; + +class BasicBfsGraphSampler : public GraphSampler { + public: + BasicBfsGraphSampler() {} + ~BasicBfsGraphSampler() {} + // virtual pthread_rwlock_t *export_rw_lock(); + virtual int run_graph_sampling(); + virtual void init(size_t gpu_num, GraphTable *graph_table, + std::vector args_); + + protected: + GraphTable *graph_table; + // std::vector> sample_nodes; + std::vector> sample_nodes; + std::vector> sample_neighbors; + size_t gpu_num; + int node_num_for_each_shard, edge_num_for_each_node; + int rounds, interval; + std::vector>> + sample_neighbors_map; }; +#endif } // namespace distributed }; // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/depends/initializers.h b/paddle/fluid/distributed/ps/table/depends/initializers.h index 5ac0c08f97d76f6bc1cb77f1f6cd0da77be2385f..f46e659a88babb07918d02f1e05859829895f2bf 100644 --- a/paddle/fluid/distributed/ps/table/depends/initializers.h +++ 
b/paddle/fluid/distributed/ps/table/depends/initializers.h @@ -23,6 +23,7 @@ #include "gflags/gflags.h" #include "paddle/fluid/framework/generator.h" + #include "paddle/fluid/operators/truncated_gaussian_random_op.h" namespace paddle { @@ -117,13 +118,9 @@ class TruncatedGaussianInitializer : public Initializer { seed_ = static_cast(std::stoi(attrs[1])); mean_ = std::stof(attrs[2]); std_ = std::stof(attrs[3]); - auto normal_cdf = [](float x) { - return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; - }; - float a_normal_cdf = normal_cdf((-2.0 - mean_) / std_); - float b_normal_cdf = normal_cdf((2.0 - mean_) / std_); - std::uniform_real_distribution dist_(2.0 * a_normal_cdf - 1.0, - 2.0 * b_normal_cdf - 1.0); + + std::uniform_real_distribution dist_( + std::numeric_limits::min(), 1.0); random_engine_ = framework::GetCPURandomEngine(seed_); } diff --git a/paddle/fluid/distributed/ps/table/graph/class_macro.h b/paddle/fluid/distributed/ps/table/graph/class_macro.h new file mode 100644 index 0000000000000000000000000000000000000000..bf59dbacb253707efdc527a23232fcb6c11554b4 --- /dev/null +++ b/paddle/fluid/distributed/ps/table/graph/class_macro.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#define DECLARE_GRAPH_FRIEND_CLASS(a) friend class a; +#define DECLARE_1_FRIEND_CLASS(a, ...) DECLARE_GRAPH_FRIEND_CLASS(a) +#define DECLARE_2_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_1_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_3_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_2_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_4_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_3_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_5_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_4_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_6_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_5_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_7_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_6_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_8_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_7_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_9_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_8_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_10_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_9_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_11_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_10_FRIEND_CLASS(__VA_ARGS__) +#define REGISTER_GRAPH_FRIEND_CLASS(n, ...) 
\ + DECLARE_##n##_FRIEND_CLASS(__VA_ARGS__) diff --git a/paddle/fluid/distributed/ps/table/graph/graph_edge.cc b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc index d1961b655d8829716b392c24ad6f1139089eb80d..004a536e8e56c28151986d56833a5708999e297c 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_edge.cc +++ b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc @@ -17,11 +17,11 @@ namespace paddle { namespace distributed { -void GraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { +void GraphEdgeBlob::add_edge(int64_t id, float weight = 1) { id_arr.push_back(id); } -void WeightedGraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { +void WeightedGraphEdgeBlob::add_edge(int64_t id, float weight = 1) { id_arr.push_back(id); weight_arr.push_back(weight); } diff --git a/paddle/fluid/distributed/ps/table/graph/graph_edge.h b/paddle/fluid/distributed/ps/table/graph/graph_edge.h index 3dfe5a6f357a7cd7d79834a20b6411995665f4fa..5fc785fe25682c8ff8de6606581cf7a13ae52999 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_edge.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_edge.h @@ -24,19 +24,20 @@ class GraphEdgeBlob { GraphEdgeBlob() {} virtual ~GraphEdgeBlob() {} size_t size() { return id_arr.size(); } - virtual void add_edge(uint64_t id, float weight); - uint64_t get_id(int idx) { return id_arr[idx]; } + virtual void add_edge(int64_t id, float weight); + int64_t get_id(int idx) { return id_arr[idx]; } virtual float get_weight(int idx) { return 1; } + std::vector& export_id_array() { return id_arr; } protected: - std::vector id_arr; + std::vector id_arr; }; class WeightedGraphEdgeBlob : public GraphEdgeBlob { public: WeightedGraphEdgeBlob() {} virtual ~WeightedGraphEdgeBlob() {} - virtual void add_edge(uint64_t id, float weight); + virtual void add_edge(int64_t id, float weight); virtual float get_weight(int idx) { return weight_arr[idx]; } protected: diff --git a/paddle/fluid/distributed/ps/table/graph/graph_node.h b/paddle/fluid/distributed/ps/table/graph/graph_node.h index b838c2c1258d84fec8c4a25f5855209d5b428d4c..c6c594036d4fc94b296c0801b05c05801beb4fc0 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_node.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_node.h @@ -48,6 +48,7 @@ class Node { virtual void set_feature(int idx, std::string str) {} virtual void set_feature_size(int size) {} virtual int get_feature_size() { return 0; } + virtual size_t get_neighbor_size() { return 0; } protected: uint64_t id; @@ -70,6 +71,7 @@ class GraphNode : public Node { } virtual uint64_t get_neighbor_id(int idx) { return edges->get_id(idx); } virtual float get_neighbor_weight(int idx) { return edges->get_weight(idx); } + virtual size_t get_neighbor_size() { return edges->size(); } protected: Sampler *sampler; diff --git a/paddle/fluid/distributed/ps/table/table.cc b/paddle/fluid/distributed/ps/table/table.cc index fa8169da07ab7fdf7ed28c840f062741913a8702..fc2ea56e95d7721fdba10e8499c22ca98bbd4c3a 100644 --- a/paddle/fluid/distributed/ps/table/table.cc +++ b/paddle/fluid/distributed/ps/table/table.cc @@ -37,6 +37,8 @@ REGISTER_PSCORE_CLASS(Table, CommonDenseTable); REGISTER_PSCORE_CLASS(Table, CommonSparseTable); #ifdef PADDLE_WITH_HETERPS REGISTER_PSCORE_CLASS(Table, SSDSparseTable); +REGISTER_PSCORE_CLASS(GraphSampler, CompleteGraphSampler); +REGISTER_PSCORE_CLASS(GraphSampler, BasicBfsGraphSampler); #endif REGISTER_PSCORE_CLASS(Table, SparseGeoTable); REGISTER_PSCORE_CLASS(Table, BarrierTable); diff --git a/paddle/fluid/distributed/test/CMakeLists.txt 
b/paddle/fluid/distributed/test/CMakeLists.txt index 2223334ccc442f5e53805ac8c078df07155565a8..cb46c38d4de4b7546af3e3f9e973ee2accba1921 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -24,6 +24,9 @@ cc_test(graph_node_test SRCS graph_node_test.cc DEPS graph_py_service scope serv set_source_files_properties(graph_node_split_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(graph_node_split_test SRCS graph_node_split_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) +set_source_files_properties(graph_table_sample_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(graph_table_sample_test SRCS graph_table_sample_test.cc DEPS scope server communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) + set_source_files_properties(feature_value_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(feature_value_test SRCS feature_value_test.cc DEPS ${COMMON_DEPS} boost table) diff --git a/paddle/fluid/distributed/test/graph_node_split_test.cc b/paddle/fluid/distributed/test/graph_node_split_test.cc index 9949dce4e933b03da4260c34b3beaf2b7bcdc4f1..a2f495de3c953a418f6e9c57a0535264eb401e65 100644 --- a/paddle/fluid/distributed/test/graph_node_split_test.cc +++ b/paddle/fluid/distributed/test/graph_node_split_test.cc @@ -236,7 +236,7 @@ void RunGraphSplit() { sleep(2); std::map> dense_regions; dense_regions.insert( - std::pair>(0, {})); + std::pair>(0, {})); auto regions = dense_regions[0]; RunClient(dense_regions, 0, pserver_ptr_->get_service()); @@ -250,16 +250,16 @@ void RunGraphSplit() { worker_ptr_->load(0, std::string(edge_file_name), std::string("e>")); srand(time(0)); pull_status.wait(); - std::vector> _vs; + std::vector> _vs; std::vector> vs; pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 10240001024), 4, _vs, vs, true); + 0, std::vector(1, 10240001024), 4, _vs, vs, true); pull_status.wait(); ASSERT_EQ(0, _vs[0].size()); _vs.clear(); vs.clear(); pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 97), 4, _vs, vs, true); + 0, std::vector(1, 97), 4, _vs, vs, true); pull_status.wait(); ASSERT_EQ(3, _vs[0].size()); std::remove(edge_file_name); diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index 22c2d1e60992e2955824f004fbb89ea6c22da823..565d51379d5a8519de241deea192ffbdbfa49fd0 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -48,10 +48,10 @@ namespace distributed = paddle::distributed; void testSampleNodes( std::shared_ptr& worker_ptr_) { - std::vector ids; + std::vector ids; auto pull_status = worker_ptr_->random_sample_nodes(0, 0, 6, ids); - std::unordered_set s; - std::unordered_set s1 = {37, 59}; + std::unordered_set s; + std::unordered_set s1 = {37, 59}; pull_status.wait(); for (auto id : ids) s.insert(id); ASSERT_EQ(true, s.size() == s1.size()); @@ -106,14 +106,14 @@ void testFeatureNodeSerializeFloat64() { void testSingleSampleNeighboor( std::shared_ptr& worker_ptr_) { - std::vector> vs; + std::vector> vs; std::vector> vs1; auto pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 37), 4, vs, vs1, true); + 0, std::vector(1, 37), 4, vs, vs1, true); pull_status.wait(); - std::unordered_set s; - std::unordered_set s1 = {112, 45, 145}; + std::unordered_set s; + std::unordered_set s1 = {112, 45, 145}; for 
(auto g : vs[0]) { s.insert(g); } @@ -126,7 +126,7 @@ void testSingleSampleNeighboor( vs.clear(); vs1.clear(); pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 96), 4, vs, vs1, true); + 0, std::vector(1, 96), 4, vs, vs1, true); pull_status.wait(); s1 = {111, 48, 247}; for (auto g : vs[0]) { @@ -147,30 +147,30 @@ void testAddNode( std::shared_ptr& worker_ptr_) { worker_ptr_->clear_nodes(0); int total_num = 270000; - uint64_t id; - std::unordered_set id_set; + int64_t id; + std::unordered_set id_set; for (int i = 0; i < total_num; i++) { while (id_set.find(id = rand()) != id_set.end()) ; id_set.insert(id); } - std::vector id_list(id_set.begin(), id_set.end()); + std::vector id_list(id_set.begin(), id_set.end()); std::vector weight_list; auto status = worker_ptr_->add_graph_node(0, id_list, weight_list); status.wait(); - std::vector ids[2]; + std::vector ids[2]; for (int i = 0; i < 2; i++) { auto sample_status = worker_ptr_->random_sample_nodes(0, i, total_num, ids[i]); sample_status.wait(); } - std::unordered_set id_set_check(ids[0].begin(), ids[0].end()); + std::unordered_set id_set_check(ids[0].begin(), ids[0].end()); for (auto x : ids[1]) id_set_check.insert(x); ASSERT_EQ(id_set.size(), id_set_check.size()); for (auto x : id_set) { ASSERT_EQ(id_set_check.find(x) != id_set_check.end(), true); } - std::vector remove_ids; + std::vector remove_ids; for (auto p : id_set_check) { if (remove_ids.size() == 0) remove_ids.push_back(p); @@ -187,7 +187,7 @@ void testAddNode( worker_ptr_->random_sample_nodes(0, i, total_num, ids[i]); sample_status.wait(); } - std::unordered_set id_set_check1(ids[0].begin(), ids[0].end()); + std::unordered_set id_set_check1(ids[0].begin(), ids[0].end()); for (auto x : ids[1]) id_set_check1.insert(x); ASSERT_EQ(id_set_check1.size(), id_set_check.size()); for (auto x : id_set_check1) { @@ -196,14 +196,14 @@ void testAddNode( } void testBatchSampleNeighboor( std::shared_ptr& worker_ptr_) { - std::vector> vs; + std::vector> vs; std::vector> vs1; - std::vector v = {37, 96}; + std::vector v = {37, 96}; auto pull_status = worker_ptr_->batch_sample_neighbors(0, v, 4, vs, vs1, false); pull_status.wait(); - std::unordered_set s; - std::unordered_set s1 = {112, 45, 145}; + std::unordered_set s; + std::unordered_set s1 = {112, 45, 145}; for (auto g : vs[0]) { s.insert(g); } @@ -417,7 +417,7 @@ void RunBrpcPushSparse() { std::map> dense_regions; dense_regions.insert( - std::pair>(0, {})); + std::pair>(0, {})); auto regions = dense_regions[0]; RunClient(dense_regions, 0, pserver_ptr_->get_service()); @@ -427,14 +427,14 @@ void RunBrpcPushSparse() { worker_ptr_->load(0, std::string(edge_file_name), std::string("e>")); srand(time(0)); pull_status.wait(); - std::vector> _vs; + std::vector> _vs; std::vector> vs; testSampleNodes(worker_ptr_); sleep(5); testSingleSampleNeighboor(worker_ptr_); testBatchSampleNeighboor(worker_ptr_); pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 10240001024), 4, _vs, vs, true); + 0, std::vector(1, 10240001024), 4, _vs, vs, true); pull_status.wait(); ASSERT_EQ(0, _vs[0].size()); paddle::distributed::GraphTable* g = @@ -445,14 +445,14 @@ void RunBrpcPushSparse() { while (round--) { vs.clear(); pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 37), 1, _vs, vs, false); + 0, std::vector(1, 37), 1, _vs, vs, false); pull_status.wait(); for (int i = 0; i < ttl; i++) { - std::vector> vs1; + std::vector> vs1; std::vector> vs2; pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 37), 
1, vs1, vs2, false); + 0, std::vector(1, 37), 1, vs1, vs2, false); pull_status.wait(); ASSERT_EQ(_vs[0].size(), vs1[0].size()); @@ -540,7 +540,7 @@ void RunBrpcPushSparse() { // Test Pull by step - std::unordered_set count_item_nodes; + std::unordered_set count_item_nodes; // pull by step 2 for (int test_step = 1; test_step < 4; test_step++) { count_item_nodes.clear(); @@ -558,18 +558,18 @@ void RunBrpcPushSparse() { ASSERT_EQ(count_item_nodes.size(), 12); } - std::pair>, std::vector> res; + std::pair>, std::vector> res; res = client1.batch_sample_neighbors( - std::string("user2item"), std::vector(1, 96), 4, true, false); + std::string("user2item"), std::vector(1, 96), 4, true, false); ASSERT_EQ(res.first[0].size(), 3); - std::vector node_ids; + std::vector node_ids; node_ids.push_back(96); node_ids.push_back(37); res = client1.batch_sample_neighbors(std::string("user2item"), node_ids, 4, true, false); ASSERT_EQ(res.first[1].size(), 1); - std::vector nodes_ids = client2.random_sample_nodes("user", 0, 6); + std::vector nodes_ids = client2.random_sample_nodes("user", 0, 6); ASSERT_EQ(nodes_ids.size(), 2); ASSERT_EQ(true, (nodes_ids[0] == 59 && nodes_ids[1] == 37) || (nodes_ids[0] == 37 && nodes_ids[1] == 59)); diff --git a/paddle/fluid/distributed/test/graph_table_sample_test.cc b/paddle/fluid/distributed/test/graph_table_sample_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..65455028247ddf7d310040ecae0018b619f75bf1 --- /dev/null +++ b/paddle/fluid/distributed/test/graph_table_sample_test.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
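The new test below feeds the table with tab-separated text files: each edge line is "src\tdst\tweight" and each node line is "node_type\tid" followed by feature entries. A minimal, self-contained sketch of how one edge line decomposes (the actual parsing inside GraphTable's load path may differ):

```cpp
// Standalone illustration of the edges.txt line format used by this test.
#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>

int main() {
  std::string line = "37\t45\t0.34";  // source id, destination id, weight
  std::istringstream ss(line);
  int64_t src_id = 0, dst_id = 0;
  float weight = 0.0f;
  ss >> src_id >> dst_id >> weight;  // tabs are treated as whitespace
  std::cout << src_id << " -> " << dst_id << " (w=" << weight << ")\n";
  return 0;
}
```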
+ +#include +#include // NOLINT +#include +#include +#include +#include // NOLINT +#include +#include +#include "google/protobuf/text_format.h" + +#include +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/ps/service/env.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/table/common_graph_table.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace memory = paddle::memory; +namespace distributed = paddle::distributed; + +std::vector edges = { + std::string("37\t45\t0.34"), std::string("37\t145\t0.31"), + std::string("37\t112\t0.21"), std::string("96\t48\t1.4"), + std::string("96\t247\t0.31"), std::string("96\t111\t1.21"), + std::string("59\t45\t0.34"), std::string("59\t145\t0.31"), + std::string("59\t122\t0.21"), std::string("97\t48\t0.34"), + std::string("97\t247\t0.31"), std::string("97\t111\t0.21")}; +// odd id:96 48 122 112 +char edge_file_name[] = "edges.txt"; + +std::vector nodes = { + std::string("user\t37\ta 0.34\tb 13 14\tc hello\td abc"), + std::string("user\t96\ta 0.31\tb 15 10\tc 96hello\td abcd"), + std::string("user\t59\ta 0.11\tb 11 14"), + std::string("user\t97\ta 0.11\tb 12 11"), + std::string("item\t45\ta 0.21"), + std::string("item\t145\ta 0.21"), + std::string("item\t112\ta 0.21"), + std::string("item\t48\ta 0.21"), + std::string("item\t247\ta 0.21"), + std::string("item\t111\ta 0.21"), + std::string("item\t46\ta 0.21"), + std::string("item\t146\ta 0.21"), + std::string("item\t122\ta 0.21"), + std::string("item\t49\ta 0.21"), + std::string("item\t248\ta 0.21"), + std::string("item\t113\ta 0.21")}; +char node_file_name[] = "nodes.txt"; + +void prepare_file(char file_name[], std::vector data) { + std::ofstream ofile; + ofile.open(file_name); + for (auto x : data) { + ofile << x << std::endl; + } + + ofile.close(); +} + +void testGraphSample() { +#ifdef PADDLE_WITH_HETERPS + ::paddle::distributed::GraphParameter table_proto; + table_proto.set_gpups_mode(true); + table_proto.set_gpups_mode_shard_num(127); + table_proto.set_gpu_num(2); + + distributed::GraphTable graph_table, graph_table1; + graph_table.initialize(table_proto); + prepare_file(edge_file_name, edges); + graph_table.load(std::string(edge_file_name), std::string("e>")); + std::vector res; + std::promise prom; + std::future fut = prom.get_future(); + graph_table.set_graph_sample_callback( + [&res, &prom](std::vector &res0) { + res = res0; + prom.set_value(0); + }); + graph_table.start_graph_sampling(); + fut.get(); + graph_table.end_graph_sampling(); + ASSERT_EQ(2, res.size()); + // 37 59 97 + for (int i = 0; i < (int)res[1].node_size; i++) { + std::cout << res[1].node_list[i].node_id << std::endl; + } + ASSERT_EQ(3, res[1].node_size); + + ::paddle::distributed::GraphParameter table_proto1; + table_proto1.set_gpups_mode(true); + table_proto1.set_gpups_mode_shard_num(127); + table_proto1.set_gpu_num(2); + table_proto1.set_gpups_graph_sample_class("BasicBfsGraphSampler"); + 
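Both test cases block on the asynchronous sampler through the same promise/future handshake: the callback copies the result out and fulfills the promise, and fut.get() parks the test thread until then. A generic, stripped-down sketch of that pattern (an int payload standing in for the GpuPsCommGraph results used here):

```cpp
// Minimal promise/future handshake, as used by the test around this point.
#include <functional>
#include <future>
#include <iostream>
#include <vector>

int main() {
  std::vector<int> result;
  std::promise<int> prom;
  std::future<int> fut = prom.get_future();

  // Stand-in for set_graph_sample_callback; in the real test this fires on
  // the sampler's worker thread once sampling finishes.
  std::function<void(std::vector<int> &)> callback =
      [&result, &prom](std::vector<int> &res) {
        result = res;       // copy the sample result out
        prom.set_value(0);  // wake up the waiting thread
      };

  std::vector<int> produced = {37, 59, 97};
  callback(produced);  // the "worker" runs inline for simplicity
  fut.get();           // returns once set_value has been called
  std::cout << "got " << result.size() << " ids\n";
  return 0;
}
```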
table_proto1.set_gpups_graph_sample_args("5,5,1,1"); + graph_table1.initialize(table_proto1); + graph_table1.load(std::string(edge_file_name), std::string("e>")); + std::vector res1; + std::promise prom1; + std::future fut1 = prom1.get_future(); + graph_table1.set_graph_sample_callback( + [&res1, &prom1](std::vector &res0) { + res1 = res0; + prom1.set_value(0); + }); + graph_table1.start_graph_sampling(); + fut1.get(); + graph_table1.end_graph_sampling(); + // distributed::BasicBfsGraphSampler *sampler1 = + // (distributed::BasicBfsGraphSampler *)graph_table1.get_graph_sampler(); + // sampler1->start_graph_sampling(); + // std::this_thread::sleep_for (std::chrono::seconds(1)); + // std::vector res1;// = + // sampler1->fetch_sample_res(); + ASSERT_EQ(2, res1.size()); + // odd id:96 48 122 112 + for (int i = 0; i < (int)res1[0].node_size; i++) { + std::cout << res1[0].node_list[i].node_id << std::endl; + } + ASSERT_EQ(4, res1[0].node_size); +#endif +} + +TEST(testGraphSample, Run) { testGraphSample(); } diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 3a2ec403c0a59aaa23decc72fb9581b5a7f78343..9c4089af092e418d6845864671124917c6498cf1 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -24,7 +24,7 @@ #include "paddle/fluid/platform/errors.h" #include "glog/logging.h" - +DECLARE_bool(retain_grad_for_all_tensor); namespace egr { static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, @@ -39,8 +39,8 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, } std::vector> GradNodeAccumulation:: -operator()( - const std::vector>& grads) { +operator()(const std::vector>& grads, + bool create_graph) { VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation"; PADDLE_ENFORCE(grads.size() == 1, paddle::platform::errors::Fatal( @@ -62,7 +62,7 @@ operator()( grad_out = grads[0][0]; } - if (!weak_grad_.expired()) { + if (!weak_grad_.expired() && FLAGS_retain_grad_for_all_tensor) { auto grad = weak_grad_.lock(); CopyOrAddTensor(grad.get(), grad_out); } diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index 07fa40165167ce2352018c0e1b1cb08222d5a181..a91a0b6e34c0d9440e3645d1a6982748c4315962 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -35,8 +35,15 @@ class GradNodeAccumulation : public GradNodeBase { // Functor: perform backward computations virtual std::vector> operator()( - const std::vector>& grads) - override; + const std::vector>& grads, + bool create_graph = false) override; + + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } std::string name() { return "GradNodeAccumulation"; } diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 5a2595b9103e4d49845fa8938ee3577b6b3f3f06..0bc998a03a80b7b8a1e486ad68f1575c130d2c1b 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -145,8 +145,8 @@ void GradNodeScale::SetTensorWrappers_X( void GradNodeScale::SetAttributes_scale(float scale) { scale_ = scale; } std::vector> GradNodeScale:: 
-operator()( - const std::vector>& grads) { +operator()(const std::vector>& grads, + bool create_graph) { // 1. Check Output Size PADDLE_ENFORCE( ((grads.size() == 1) && (grads[0].size() == 1)), diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h index 247fde6ed1f869542969b068cdae9f59cedd732a..e263f73a6b8a4a1f9ce23d9b5ca383fd6828016b 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h @@ -39,8 +39,15 @@ class GradNodeScale : public GradNodeBase { // Functor: perform backward computations virtual std::vector> operator()( - const std::vector>& grads) - override; + const std::vector>& grads, + bool create_graph = false) override; + + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } void SetTensorWrappers_X( const std::vector& tensors); diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index bf838b27615028167e35a8e85e7636dd4c834016..d9f201dc9f1e8b9a0296288917b82f3e2903330e 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -2074,7 +2074,8 @@ static std::string GenerateGradNodeCCContents( const char* GRAD_FUNCTION_TEMPLATE = "std::vector> " "GradNode%s::operator()(const " - "std::vector>& grads) {\n%s\n}"; + "std::vector>& grads, " + "bool create_graph) {\n%s\n}"; std::string grad_function_str = paddle::string::Sprintf( GRAD_FUNCTION_TEMPLATE, fwd_op_type, generated_grad_function_body); @@ -2109,18 +2110,28 @@ static std::string GenerateGradNodeHeaderContents( "\n" " virtual std::vector> " "operator()(const " - "std::vector>& grads) " + "std::vector>& grads, const " + "bool create_graph = false) " "override;\n" "\n" + " void ClearTensorWrappers() override { \n" + "%s\n" + " is_tensor_wrappers_cleared = true;\n" + " }\n" " std::string name() override { return \" GradNode%s \"; } \n " "\n" " // SetX, SetY, ...\n" "%s\n" " // SetAttrMap\n" "%s\n" + " bool IsTensorWrappersCleared() override { \n" + " return is_tensor_wrappers_cleared;\n" + " }\n" " private:\n" " // TensorWrappers\n" "%s\n" + " bool is_tensor_wrappers_cleared = false;\n" + "\n" " // Attribute Map\n" "%s\n" "};"; @@ -2154,6 +2165,7 @@ static std::string GenerateGradNodeHeaderContents( std::string set_tensor_wrappers_str = ""; std::string tensor_wrapper_members_str = ""; + std::string clear_tensor_wrappers_str = ""; for (const auto& iter : op_base_infos) { const std::map& grad_ins_fwd_slotname_map = iter.GetGradInsFwdSlotnameMap(); @@ -2185,6 +2197,13 @@ static std::string GenerateGradNodeHeaderContents( SET_TENSOR_WRAPPER_BODY_TEMPLATE, tensor_wrapper_name, struct_tensor_wrapper_name); + const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = + "for (auto tw: %s) {\n" + " tw.clear();\n" + " }\n"; + clear_tensor_wrappers_str += paddle::string::Sprintf( + CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name); + } else { const char* ATTR_TENSOR_WRAPPER_ARG_TEMPLATE = "const paddle::experimental::Tensor& %s"; @@ -2197,10 +2216,14 @@ static std::string GenerateGradNodeHeaderContents( TENSOR_WRAPPER_MEMBER_TEMPLATE, struct_tensor_wrapper_name); const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE = - "%s = egr::TensorWrapper(%s, %s 
/*full_reserved*/);"; + "%s = egr::TensorWrapper(%s, %s /*full_reserved*/);\n"; tensor_wrapper_body_str = paddle::string::Sprintf( SET_TENSOR_WRAPPER_BODY_TEMPLATE, struct_tensor_wrapper_name, tensor_wrapper_name, full_reserved_str); + + const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = " %s.clear();\n"; + clear_tensor_wrappers_str += paddle::string::Sprintf( + CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name); } std::string full_reserved_signature_str = "bool full_reserved"; const char* SET_TENSOR_WRAPPER_TEMPLATE = @@ -2215,8 +2238,8 @@ static std::string GenerateGradNodeHeaderContents( std::string grad_node_str = paddle::string::Sprintf( GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, op_type, op_type, - op_type, op_type, set_tensor_wrappers_str, set_attr_map_str, - tensor_wrapper_members_str, attr_members_str); + op_type, clear_tensor_wrappers_str, op_type, set_tensor_wrappers_str, + set_attr_map_str, tensor_wrapper_members_str, attr_members_str); return grad_node_str; } diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 380edb9164e4f40fb03755bf5d17f70a0ff7cb53..9dccba034598bbfef205b6eb85ed5e149ba6d040 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -213,7 +213,8 @@ def ParseYamlArgs(string): default_value = m.group(3).split("=")[1].strip() if len( m.group(3).split("=")) > 1 else None - assert arg_type in yaml_types_mapping.keys(), arg_type + assert arg_type in yaml_types_mapping.keys( + ), f"The argument type {arg_type} in yaml config is not supported in yaml_types_mapping." arg_type = yaml_types_mapping[arg_type] arg_name = RemoveSpecialSymbolsInName(arg_name) @@ -248,7 +249,8 @@ def ParseYamlReturns(string): else: ret_type = ret.strip() - assert ret_type in yaml_types_mapping.keys(), ret_type + assert ret_type in yaml_types_mapping.keys( + ), f"The return type {ret_type} in yaml config is not supported in yaml_types_mapping." 
ret_type = yaml_types_mapping[ret_type] assert "Tensor" in ret_type @@ -477,6 +479,7 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, # SetTensorWrapper Methods & TensorWrapper Members set_tensor_wrapper_methods_str = "" tensor_wrapper_members_str = "" + clear_tensor_wrapper_str = "" for tname, (ttype, is_fwd_input, _) in backward_fwd_input_map.items(): if tname in no_need_buffer_set: no_need_buffer = "true" @@ -498,6 +501,13 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, """ tensor_wrapper_members_str += PLAIN_TENSOR_MEMBER_TEMPLATE.format( tensor_wrapper_name) + + CLEAR_TENSOR_WRAPPERS_TEMPLATE = """ + {}.clear(); +""" + clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format( + tensor_wrapper_name) + else: assert IsVectorTensorType(ttype) SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = """ @@ -515,6 +525,15 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, """ tensor_wrapper_members_str += VECTOR_TENSOR_MEMBER_TEMPLATE.format( tensor_wrapper_name) + + CLEAR_TENSOR_WRAPPERS_TEMPLATE = """ + for (auto tw: {}) { + tw.clear(); + }; +""" + clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format( + tensor_wrapper_name) + # End: SetTensorWrapper Methods & TensorWrapper Members # SetAttributes & Attribute Members @@ -523,7 +542,7 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, for aname, atype, default_val, _ in backward_attrs_list: saved_attr_name = GetSavedName(aname) SET_ATTR_METHOD_TEMPLATE = """ - void SetAttribute{}({} {}) {{ + void SetAttribute{}({} {}) {{ {} = {}; }} """ @@ -554,25 +573,37 @@ class {} : public egr::GradNodeBase {{ ~{}() override = default; virtual std::vector> operator()( - const std::vector>& grads) override; + const std::vector>& grads, bool create_graph = false) override; std::string name() override {{ return \" {} \"; }} + + void ClearTensorWrappers() override {{ + {} + is_tensor_wrappers_cleared = true; + }} + // SetTensorWrapperX, SetTensorWrapperY, ... 
{}
   // SetAttributes
 {}
+
+  bool IsTensorWrappersCleared() override {{
+    return is_tensor_wrappers_cleared;
+  }}
  private:
    // TensorWrappers
 {}
+  bool is_tensor_wrappers_cleared = false;
+
    // Attributes
 {}
 }};
 """
     node_declaration_str = NODE_DECLARATION_TEMPLATE.format(
         grad_node_name, grad_node_name, grad_node_name, grad_node_name,
-        grad_node_name, set_tensor_wrapper_methods_str,
-        set_attribute_methods_str, tensor_wrapper_members_str,
-        attribute_members_str)
+        grad_node_name, clear_tensor_wrapper_str,
+        set_tensor_wrapper_methods_str, set_attribute_methods_str,
+        tensor_wrapper_members_str, attribute_members_str)
 
     return node_declaration_str
@@ -636,7 +667,7 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map,
     grad_api_namespace = f"paddle::experimental"
 
     FUNCTION_TEMPLATE = """
-std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads) {{
+std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph) {{
     // Call grad_api function
     auto grad_api_returns = {}::{}({});
     {}
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
index b4b19c52a348be960758a50926803c9ed669eef6..aba3e227ab4b3c52f423ea581a502589fa93f416 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
@@ -25,7 +25,7 @@ atype_to_parsing_function = {
     "std::string": "CastPyArg2String",
     "int64_t": "CastPyArg2Long",
     "float": "CastPyArg2Float",
-    "string": "CastPyArg2String",
+    "std::string": "CastPyArg2String",
     "std::vector<bool>": "CastPyArg2Booleans",
     "std::vector<int>": "CastPyArg2Ints",
     "std::vector<int64_t>": "CastPyArg2Longs",
diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc
index 1987d024d8f3e34121f54962c45f0f8c1e91b723..75ddfb92275524eece120e6f2aae4f41a3e67701 100644
--- a/paddle/fluid/eager/backward.cc
+++ b/paddle/fluid/eager/backward.cc
@@ -39,12 +39,21 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
   // Copy nodes
   std::queue<GradNodeBase*> queue = init_queue;
   std::unordered_set<GradNodeBase*> visited;
+  size_t potential_startup_ops_cnt = queue.size();
+  size_t cnt = 0;
 
   // Visit each node exactly once in any order
   while (!queue.empty()) {
     GradNodeBase* node = queue.front();
     queue.pop();
+    if (cnt < potential_startup_ops_cnt) {
+      if (!node_in_degree_map.count(node)) {
+        node_in_degree_map[node] = 0;
+      }
+      cnt += 1;
+    }
+
     if (visited.count(node)) {
       continue;
     }
@@ -76,23 +85,248 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
   return node_in_degree_map;
 }
 
-void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
-                 const std::vector<paddle::experimental::Tensor>& grad_tensors,
-                 bool retain_graph) {
-  paddle::platform::RecordEvent backward_record_event(
-      "backward", paddle::platform::TracerEventType::Operator, 1);
+// Remove the nodes that do not need to be stored in
+// potential_stop_nodes / potential_startup_nodes
+void UpdateGraphInfo(
+    std::unordered_map<GradNodeBase*, AutogradMeta*>*
+        target_nodes_inputmeta_map,
+    std::unordered_map<GradNodeBase*, std::unordered_set<GradNodeBase*>>*
+        depending_nodes,
+    std::unordered_set<GradNodeBase*>* potential_stop_nodes,
+    std::unordered_set<GradNodeBase*>* potential_startup_nodes) {
+  // Update potential_stop_nodes according to depending_nodes,
+  // making sure the path from root to target_node stays valid
+  std::unordered_set<GradNodeBase*> _startup_ops;
+  VLOG(6) << "Running in UpdateGraphInfo";
+  std::queue<GradNodeBase*> queue;
+  for (auto& target_nodes_inputmeta_pair : *target_nodes_inputmeta_map) {
+    queue.emplace(target_nodes_inputmeta_pair.first);
+  }
+
+  while (!queue.empty()) {
+    auto* target_node = queue.front();
+    queue.pop();
+    if (!(*depending_nodes)[target_node].empty()) {
+      auto preceding_nodes = (*depending_nodes)[target_node];
+      for (auto pre_nodes : preceding_nodes) {
+        queue.emplace(pre_nodes);
+        if (potential_stop_nodes->find(pre_nodes) !=
+            potential_stop_nodes->end()) {
+          potential_stop_nodes->erase(pre_nodes);
+        }
+      }
+    } else {  // startup_ops have no preceding nodes
+      VLOG(6) << "Emplace _startup_ops";
+      _startup_ops.emplace(target_node);
+    }
+  }
+  // Purify potential_startup_nodes again, removing the potential startup
+  // nodes that cannot reach the input target nodes
+  if (!_startup_ops.empty()) {
+    std::unordered_set<GradNodeBase*> potential_startup_nodes_to_be_erased;
+    for (auto node : *potential_startup_nodes) {
+      if (_startup_ops.count(node) == 0) {
+        VLOG(6) << "Set up potential_startup_nodes_to_be_erased";
+        potential_startup_nodes_to_be_erased.emplace(node);
+      }
+    }
+    if (!potential_startup_nodes_to_be_erased.empty()) {
+      for (auto node : potential_startup_nodes_to_be_erased) {
+        VLOG(6) << "Erase nodes in potential_startup_nodes_to_be_erased";
+        potential_startup_nodes->erase(node);
+      }
+    }
+  }
+}
+
+// Get the graph info between the input target grad nodes and the outputs;
+// record depending_nodes, potential_stop_nodes and potential_startup_nodes
+void GetGraphInfoBetweenTargets(
+    const std::queue<GradNodeBase*>& init_queue,
+    std::unordered_map<GradNodeBase*, AutogradMeta*>*
+        input_target_nodes_inputmeta_map,
+    std::unordered_map<GradNodeBase*, std::unordered_set<GradNodeBase*>>*
+        depending_nodes,
+    std::unordered_set<GradNodeBase*>* potential_stop_nodes,
+    std::unordered_set<GradNodeBase*>* potential_startup_nodes) {
+  if (input_target_nodes_inputmeta_map->empty()) return;
+
+  VLOG(6) << "Running in GetGraphInfoBetweenTargets";
+
+  // Calculate in_degree for each node
+  std::unordered_map<GradNodeBase*, int> node_in_degree_map;
+
+  // Copy nodes
+  std::queue<GradNodeBase*> queue = init_queue;
+  std::unordered_set<GradNodeBase*> visited;
+
+  // Visit each node exactly once in any order
+  while (!queue.empty()) {
+    GradNodeBase* node = queue.front();
+    queue.pop();
+
+    if (visited.count(node)) {
+      continue;
+    }
+    visited.insert(node);
+
+    // Check whether this node is an input target node; if it is, all of
+    // its next nodes will be marked as potential stop nodes
+    bool is_potential_stop_nodes =
+        input_target_nodes_inputmeta_map->count(node);
+
+    // Find and append next nodes
+    const std::vector<std::vector<Edge>>& edges = node->GetEdges();
+    for (const auto& edge_list : edges) {
+      for (const Edge& edge : edge_list) {
+        GradNodeBase* next_node = edge.GetMutableGradNode().get();
+
+        // The next node can be nullptr if it is a leaf tensor with no
+        // AccumulationNode attached, or it can originate from
+        // dispensable inputs
+        if (!next_node) continue;
+
+        // if the current node is an input target node, all of its
+        // next_nodes are inserted into potential_stop_nodes
+        if (is_potential_stop_nodes) {
+          potential_stop_nodes->emplace(next_node);
+        }
+
+        // Update in_degree
+        if (!node_in_degree_map.count(next_node))
+          node_in_degree_map[next_node] = 0;
+        node_in_degree_map[next_node]++;
+        // Record depending relationship
+        (*depending_nodes)[next_node].emplace(node);
+        queue.push(next_node);
+      }
+    }
+  }
+  // Update the graph info, removing some stop nodes from
+  // potential_stop_nodes
+  UpdateGraphInfo(input_target_nodes_inputmeta_map, depending_nodes,
+                  potential_stop_nodes, potential_startup_nodes);
+}
+
+void GetTargetNodesInfo(
+    const std::vector<paddle::experimental::Tensor>& inputs,
+    std::unordered_map<GradNodeBase*, AutogradMeta*>*
+        target_nodes_inputmeta_map) {
+  VLOG(6) << "Running in GetTargetNodesInfo";
+  if (!inputs.empty()) {
+    VLOG(6) << "Inputs are not empty";
+    size_t num_inputs = inputs.size();
+    for (size_t i = 0; i < num_inputs; i++) {
+      AutogradMeta* auto_grad_meta =
+          EagerUtils::unsafe_autograd_meta(inputs[i]);
+      auto target_node = auto_grad_meta->GetMutableGradNode().get();
+
+      PADDLE_ENFORCE_NOT_NULL(target_node,
+                              paddle::platform::errors::Fatal(
+                                  "There is no grad op for input:%d or its "
+                                  "stop_gradient=True",
+                                  i));
+      (*target_nodes_inputmeta_map)[target_node] = auto_grad_meta;
+    }
+  }
+}
+
+std::vector<paddle::experimental::Tensor> GetResults(
+    const std::vector<paddle::experimental::Tensor>& inputs,
+    std::unordered_map<GradNodeBase*, paddle::experimental::Tensor>*
+        results_map,
+    bool allow_unused, bool create_graph) {
+  VLOG(6) << "Running in GetResults";
+  if (inputs.empty()) return {};
+
+  std::vector<paddle::experimental::Tensor> results;
+  results.reserve(inputs.size());
+
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    auto& input = inputs[i];
+    AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(input);
+    auto target_node = auto_grad_meta->GetMutableGradNode().get();
+
+    auto iter = results_map->find(target_node);
+    if (iter != results_map->end()) {
+      // set StopGradient = !create_graph
+      AutogradMeta* tensor_auto_grad_meta =
+          EagerUtils::autograd_meta(&(iter->second));
+      tensor_auto_grad_meta->SetStopGradient(!create_graph);
+      results.emplace_back(iter->second);
+    } else {
+      PADDLE_ENFORCE_EQ(allow_unused, true,
+                        paddle::platform::errors::InvalidArgument(
+                            "The %d-th input does not appear in the backward "
+                            "graph. Please check the input variable or set "
+                            "allow_unused=True to get None result.",
+                            i));
+      results.emplace_back();
+    }
+  }
+  return results;
+}
+
+// Enforce that the GradNode still has its TensorWrappers as input
+void EnforceGradNodeHasInput(GradNodeBase* node) {
+  VLOG(6) << "Running in EnforceGradNodeHasInput";
+  PADDLE_ENFORCE_NE(
+      node->IsTensorWrappersCleared(), true,
+      paddle::platform::errors::Fatal(
+          "The TensorWrappers of %s do not exist. This may be because:\n"
+          "You calculate backward twice for the same subgraph without "
+          "setting retain_graph=True. Please set retain_graph=True in the "
+          "first backward/grad call.\n",
+          node->name()));
+}
+
+// Purify potential_startup_nodes: remove the nodes that are the same as
+// input_target_nodes
+void PurifyPotentialStartUpNodes(
+    std::unordered_set<GradNodeBase*>* potential_startup_nodes,
+    std::unordered_map<GradNodeBase*, AutogradMeta*>*
+        input_target_nodes_inputmeta_map) {
+  VLOG(6) << "Running in PurifyPotentialStartUpNodes";
+  if (input_target_nodes_inputmeta_map->empty()) return;
+  std::unordered_set<GradNodeBase*> potential_startup_nodes_to_be_erased;
+  for (auto startup_op : *potential_startup_nodes) {
+    auto iter = input_target_nodes_inputmeta_map->find(startup_op);
+    if (iter != input_target_nodes_inputmeta_map->end()) {
+      potential_startup_nodes_to_be_erased.emplace(iter->first);
+    }
+  }
+  if (!potential_startup_nodes_to_be_erased.empty()) {
+    for (auto nodes : potential_startup_nodes_to_be_erased) {
+      potential_startup_nodes->erase(nodes);
+    }
+  }
+}
+
+std::vector<paddle::experimental::Tensor> RunBackward(
+    const std::vector<paddle::experimental::Tensor>& tensors,  // output
+    const std::vector<paddle::experimental::Tensor>& grad_tensors,
+    bool retain_graph, bool create_graph = false,
+    const std::vector<paddle::experimental::Tensor>& inputs = {},
+    bool allow_unused = false,
+    const std::vector<paddle::experimental::Tensor>& no_grad_vars = {}) {
   VLOG(6) << "Start Backward";
 
   // *Gradient Hook should happen at node-level
   // *Inplace version check should perform at node-level
   // *Cross-batch accumulation happens at forward pass
 
+  std::unordered_map<GradNodeBase*, AutogradMeta*>
+      no_grad_var_nodes_inputmeta_map;
+  // Get no_grad_vars's GradNodes and InputMeta info
+  GetTargetNodesInfo(no_grad_vars, &no_grad_var_nodes_inputmeta_map);
+
   /* --- Initialization --- */
   // 1. Init queue with starting nodes
   // 2. Prepare initial input buffers
   std::queue<GradNodeBase*> queue;
   std::unordered_map<GradNodeBase*, std::unique_ptr<GradTensorHolder>>
       node_input_buffers_dict;
+  std::unordered_set<GradNodeBase*> potential_startup_nodes;
 
   for (size_t i = 0; i < tensors.size(); i++) {
     const paddle::experimental::Tensor& tensor = tensors[i];
@@ -132,8 +366,17 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
             "size = 0 or same size as tensors"));
     // Feed given tensor if it's provided
     VLOG(6) << "Fill grad input tensor " << i << "with give grad tensor";
-    node_input_buffers_dict[grad_node]->add(
-        input_info.first, input_info.second, grad_tensors[i]);
+
+    if (grad_tensors[i].is_initialized()) {
+      // Deep copy
+      paddle::experimental::Tensor tmp_tensor;
+      tmp_tensor.copy_(grad_tensors[i], grad_tensors[i].inner_place(), true);
+      node_input_buffers_dict[grad_node]->add(input_info.first,
+                                              input_info.second, tmp_tensor);
+    } else {
+      node_input_buffers_dict[grad_node]->add(
+          input_info.first, input_info.second, grad_tensors[i]);
+    }
 
   } else {
     VLOG(6) << "Fill grad input tensor " << i << " with 1.0";
@@ -146,8 +389,9 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
           input_info.first, input_info.second, tensor, true /*fill_one=true*/);
     }
 
-    // Prepare queue
+    // Prepare queue, potential startup_nodes
     queue.push(grad_node);
+    potential_startup_nodes.emplace(grad_node);
   }
 
   VLOG(6) << "Update In degree Map for backward";
@@ -155,25 +399,74 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
   std::unordered_map<GradNodeBase*, int> node_in_degree_map =
       getInDegreeMap(queue);
 
+  // Get the inputs' GradNodes and InputMeta info
+  std::unordered_map<GradNodeBase*, AutogradMeta*>
+      input_target_nodes_inputmeta_map;
+  GetTargetNodesInfo(inputs, &input_target_nodes_inputmeta_map);
+
+  // Purify potential_startup_ops, removing those nodes that are the same as
+  // input_target_nodes
+  PurifyPotentialStartUpNodes(&potential_startup_nodes,
+                              &input_target_nodes_inputmeta_map);
+
+  // Get the graph info between the input target grad nodes and the outputs;
+  // record the depending_nodes and potential_stop_nodes
+  std::unordered_map<GradNodeBase*,
+                     std::unordered_set<GradNodeBase*> /* father node */>
+      depending_nodes;
+  std::unordered_set<GradNodeBase*> potential_stop_nodes;
+  // std::unordered_set<GradNodeBase*> startup_ops;
+
+  GetGraphInfoBetweenTargets(queue, &input_target_nodes_inputmeta_map,
+                             &depending_nodes, &potential_stop_nodes,
+                             &potential_startup_nodes);
+
+  // ready_queue stores all startup nodes
+  std::queue<GradNodeBase*> ready_queue;
+  // a startup op's in-degree should be 0
+  for (auto node : potential_startup_nodes) {
+    if (node_in_degree_map[node] == 0) {
+      ready_queue.emplace(node);
+    }
+  }
+
+  VLOG(1) << " startup_ops' size is :" << ready_queue.size();
+
+  std::unordered_map<GradNodeBase*, paddle::experimental::Tensor> results_map;
+
+  // ready_queue is empty only when 1. the inputs equal the outputs, or
+  // 2. the inputs cannot reach the outputs.
+  if (ready_queue.size() == 0) {
+    for (auto input_target_node : input_target_nodes_inputmeta_map) {
+      // out rank_info of forward op
+      auto rank_info = input_target_node.second->OutRankInfo();
+      if (node_input_buffers_dict[input_target_node.first]) {
+        auto& target_result =
+            node_input_buffers_dict[input_target_node.first]
+                ->Buffers()[rank_info.first][rank_info.second];
+        // save the target result
+        results_map[input_target_node.first] = target_result;
+      }
+    }
+  }
+
   /* --- Topological Visit --- */
   // 1. Pop queue
   // 2. Run node
+  //    |- Check and capture target result
   //    |- node(grads)
   //    |- Prepare for next node
   // 3. 
Update queue VLOG(6) << "Run Backward"; - while (!queue.empty()) { - GradNodeBase* node = queue.front(); + while (!ready_queue.empty()) { + GradNodeBase* node = ready_queue.front(); + VLOG(6) << "Running GradNode:" << node->name(); + ready_queue.pop(); paddle::platform::RecordEvent node_record_event( std::string(typeid(*node).name()) + " grad_node", paddle::platform::TracerEventType::Operator, 1); - if (queue.size() > 1 && node_in_degree_map[node] != 0) { - queue.pop(); - continue; - } - queue.pop(); // Run node: This is where Hook happens PADDLE_ENFORCE( node_input_buffers_dict.count(node), @@ -184,10 +477,45 @@ void RunBackward(const std::vector& tensors, std::unique_ptr node_input_buffer = std::move(node_input_buffers_dict[node]); + // get target grad_var from node_input_buffer by inputmeta + if (input_target_nodes_inputmeta_map.find(node) != + input_target_nodes_inputmeta_map.end()) { + VLOG(6) << "Get target result by by inputmeta"; + // out rank_info of forward op + auto rank_info = input_target_nodes_inputmeta_map[node]->OutRankInfo(); + // rank_info is a pair, first means slot_id, second means rank. + auto& target_result = + node_input_buffer->Buffers()[rank_info.first][rank_info.second]; + // save the target result + results_map[node] = target_result; + } + + // no_grad_vars + if (no_grad_var_nodes_inputmeta_map.find(node) != + no_grad_var_nodes_inputmeta_map.end()) { + VLOG(6) << "Change the input buffer[slot][rank] by Zeros"; + auto rank_info = no_grad_var_nodes_inputmeta_map[node]->OutRankInfo(); + node_input_buffer->SetBufferSlotRankZeros(rank_info.first, + rank_info.second); + } + + VLOG(6) << "Running GradNode:" << node->name(); + + // check input + EnforceGradNodeHasInput(node); + VLOG(6) << "Run Backward Kernel with GradTensorHolder"; // Run Pre Backward Node and get outputs std::vector> grad_output_tensors = - (*node)(node_input_buffer->Buffers()); + (*node)(node_input_buffer->Buffers(), create_graph); + + // retain_grad or not + if (!retain_graph) { + VLOG(6) + << "retain_graph is false, need to clear the TensorWrapper of nodes."; + node->ClearTensorWrappers(); + } + // TODO(jiabin): Should we erase it or find a more efficient way. node_input_buffers_dict.erase(node); @@ -252,18 +580,44 @@ void RunBackward(const std::vector& tensors, // Update queue node_in_degree_map[next_node]--; + PADDLE_ENFORCE( node_in_degree_map[next_node] >= 0, paddle::platform::errors::Fatal( "Detected in-degree value smaller than zero. 
For Node: %s" "Node's in-degree cannot be negative", next_node->name())); - if (node_in_degree_map[next_node] == 0) { - queue.emplace(std::move(next_node)); + + bool is_potential_stop_node = potential_stop_nodes.count(next_node); + + if (node_in_degree_map[next_node] == 0 && !is_potential_stop_node) { + ready_queue.emplace(std::move(next_node)); } } } } + + return GetResults(inputs, &results_map, allow_unused, create_graph); } +void Backward( + const std::vector& tensors, // output + const std::vector& grad_tensors, + bool retain_graph) { + VLOG(6) << "Run in Backward"; + paddle::platform::RecordEvent backward_record_event( + "backward", paddle::platform::TracerEventType::Operator, 1); + RunBackward(tensors, grad_tensors, retain_graph); +} + +std::vector Grad( + const std::vector& tensors, // output + const std::vector& inputs, + const std::vector& grad_tensors, + bool retain_graph, bool create_graph, bool only_inputs, bool allow_unused, + const std::vector& no_grad_vars) { + VLOG(6) << "Run in Grad"; + return RunBackward(tensors, grad_tensors, retain_graph, create_graph, inputs, + allow_unused, no_grad_vars); +} } // namespace egr diff --git a/paddle/fluid/eager/backward.h b/paddle/fluid/eager/backward.h index 2856d9fb87f34b1066bb59eb38bcaee786d2a260..bebe664838e6c1f98219ceee6e6733b49c319b3c 100644 --- a/paddle/fluid/eager/backward.h +++ b/paddle/fluid/eager/backward.h @@ -19,12 +19,20 @@ namespace egr { -// run_backward(): +// Backward(): // tensors corresponds to those lived in the backward graph // each grad_tensors[i] keeps the value for its corresponding tensors[i] -void RunBackward(const std::vector &tensors, - const std::vector &grad_tensors, - bool retain_graph = false); +void Backward(const std::vector& tensors, + const std::vector& grad_tensors, + bool retain_graph = false); + +std::vector Grad( + const std::vector& tensors, + const std::vector& inputs, + const std::vector& grad_tensors = {}, + bool retain_graph = false, bool create_graph = false, + bool only_inputs = false, bool allow_unused = false, + const std::vector& no_grad_vars = {}); // Reserved for gradient() diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc index 48ac8c8358afd68cee9d22b8ea0a4e8fd7c3c92e..72af1cc4b068679e72ae6bdc5e09fab8f56bac04 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.cc +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -20,8 +20,8 @@ namespace egr { std::vector> RunCustomOpNode:: -operator()( - const std::vector>& grads) { +operator()(const std::vector>& grads, + bool create_graph) { paddle::CustomOpKernelContext ctx; auto grad_inputs_name = paddle::framework::OpMetaInfoHelper::GetInputs( egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]); diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.h b/paddle/fluid/eager/custom_operator/custom_operator_node.h index e5ddef9c062149282d790a5fd6bf31b25a20cf5a..6ece2658575c795856438904c2716d61f0985879 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.h +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.h @@ -37,8 +37,8 @@ class RunCustomOpNode : public GradNodeBase { // Functor: perform backward computations virtual std::vector> operator()( - const std::vector>& grads) - override; + const std::vector>& grads, + bool create_graph) override; std::string name() { return paddle::string::Sprintf("RunCustomOpNode: %s_grad", op_type_); @@ -62,6 +62,12 @@ class RunCustomOpNode : public 
GradNodeBase { return res; } + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } + void SetAttrs(const std::vector& attr) { attrs_ = attr; } public: diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 16513f05e0777a8e57f54c925d68867dda656612..168e1bcca77ca85eb6fa90a23350d1f62f63dc8e 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -95,8 +95,12 @@ class GradNodeBase { * is better choice to fit this format. * **/ virtual std::vector> operator()( - const std::vector>& grads) = 0; + const std::vector>& grads, + bool create_graph = false) = 0; + virtual void ClearTensorWrappers() = 0; + + virtual bool IsTensorWrappersCleared() = 0; /** * AddEdges is designed to set input tensors' backward Node as current * node's Edges. diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index 69fc7df2f1420382735cf59fbe85f7e2207d0f77..163d25e85ce8c085087331c6e3273075aed5e5f4 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -21,6 +21,11 @@ namespace egr { +void GradTensorHolder::SetBufferSlotRankZeros(size_t slot_id, size_t rank) { + buffer_[slot_id][rank] = + paddle::experimental::zeros_like(buffer_[slot_id][rank]); +} + void GradTensorHolder::add(size_t slot_id, size_t rank, const paddle::experimental::Tensor& t, bool fill_one) { diff --git a/paddle/fluid/eager/grad_tensor_holder.h b/paddle/fluid/eager/grad_tensor_holder.h index d66a81fe8285980bad4159d5414985dc9c744549..9059b403607461cc980a58d345fe1542aa4b1903 100644 --- a/paddle/fluid/eager/grad_tensor_holder.h +++ b/paddle/fluid/eager/grad_tensor_holder.h @@ -56,6 +56,8 @@ class GradTensorHolder { return buffer_; } + void SetBufferSlotRankZeros(size_t slot_id, size_t rank); + private: std::vector> buffer_; }; diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 31aaa93c41643f565836c536d7001c01d2a0826d..0e11444b81526de1904b72fc983814314d834a45 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -98,6 +98,8 @@ class TensorWrapper { } } + void clear() { intermidiate_tensor_.reset(); } + private: bool full_reserved_ = false; std::pair out_rank_info_; diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc index 1683f4ed5fbe5e4b014e9b369e0231d149c187f1..c8b2d22dcf95139db47704be86a6f64554f7c0ba 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc @@ -17,6 +17,14 @@ #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy_sr, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy_sr, GPU, ALL_LAYOUT); +#endif namespace eager_test { using AbstractAutogradMeta = paddle::experimental::AbstractAutogradMeta; @@ -151,5 +159,50 @@ TEST(EagerVariable, Constructor) { CHECK_EQ(dt3_tmp_ptr[1], 10.0f); t4.reset(); CHECK(t4.defined() == false); + + VLOG(6) << "Check Tensor Copy_"; + std::vector rows = {1, 2}; + std::vector dims = {2}; + paddle::experimental::Tensor 
t7(std::make_shared(rows, 2)); + std::dynamic_pointer_cast(t7.impl()) + ->mutable_value() + ->Resize(phi::make_ddim(dims)); + auto* dt7_tmp_ptr = std::dynamic_pointer_cast(t7.impl()) + ->mutable_value() + ->mutable_data(paddle::platform::CPUPlace()); + dt7_tmp_ptr[0] = 6.0f; + dt7_tmp_ptr[1] = 11.0f; + + paddle::experimental::Tensor t8; + paddle::experimental::Tensor t5; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + paddle::experimental::Tensor t6; + paddle::experimental::Tensor t9; + VLOG(6) << "Check Tensor Copy_ Selected Rows"; + t8.copy_(t7, paddle::platform::CUDAPlace(0), true); + t9.copy_(t8, paddle::platform::CPUPlace(), true); + auto* dt9_tmp_ptr = std::dynamic_pointer_cast(t9.impl()) + ->value() + .data(); + CHECK_EQ(dt9_tmp_ptr[0], 6.0f); + CHECK_EQ(dt9_tmp_ptr[1], 11.0f); + CHECK_EQ(std::dynamic_pointer_cast(t9.impl())->height(), + 2); + + VLOG(6) << "Check Tensor Copy_ Dense Tensor"; + t5.copy_(t3, paddle::platform::CUDAPlace(0), true); + t6.copy_(t5, paddle::platform::CPUPlace(), true); + auto* dt6_tmp_ptr = + std::dynamic_pointer_cast(t6.impl())->data(); + CHECK_EQ(dt6_tmp_ptr[0], 5.0f); + CHECK_EQ(dt6_tmp_ptr[1], 10.0f); +#else + t5.copy_(t3, paddle::platform::CPUPlace(), true); + auto* dt5_tmp_ptr = + std::dynamic_pointer_cast(t5.impl())->data(); + CHECK_EQ(dt5_tmp_ptr[0], 5.0f); + CHECK_EQ(dt5_tmp_ptr[1], 10.0f); +#endif + VLOG(6) << "Finish"; } diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h index 535c93ac53b1751d9634476e47f32dc0cbe22708..0b167203735d65683b0f978fa34fe7f457aae4f2 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h @@ -32,8 +32,8 @@ class GradTestNode : public egr::GradNodeBase { GradTestNode() : GradNodeBase() { val_ = 1.0; } std::string name() override { return "GradTestNode"; } std::vector> operator()( - const std::vector>& grads) - override { + const std::vector>& grads, + bool create_graph = false) override { val_ = std::dynamic_pointer_cast(grads[0][0].impl()) ->data()[0]; phi::DenseTensorMeta meta = @@ -49,6 +49,11 @@ class GradTestNode : public egr::GradNodeBase { std::vector> res = {{et1}}; return res; } + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } float val_; }; } // namespace eager_test diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc index 769bd7f687f4584d44bbfa30b73611a3128289bf..887ea3e3acfd50a15206f3e84ab45e16707f80af 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc @@ -58,7 +58,7 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor, } std::vector target_tensors = {input_tensor}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); if (accuracy_check) { // Examine Forward Grad (w.r.t max_num_runs = 10) @@ -80,7 +80,7 @@ void benchmark_eager_matmul(const paddle::experimental::Tensor& X, } std::vector target_tensors = {input_tensor0}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); if (accuracy_check) { // Examine Forward Grad (w.r.t max_num_runs = 2) @@ -106,7 +106,7 @@ void benchmark_eager_intermediate_matmul(const paddle::experimental::Tensor& X, } std::vector target_tensors = 
{input_tensor0}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); if (accuracy_check) { // Examine Forward Grad (w.r.t max_num_runs = 2) @@ -137,7 +137,7 @@ void benchmark_eager_intermediate_mlp( reduce_sum_dygraph_function(input0, {{"reduce_all", true}}); std::vector target_tensors = {Out}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); if (accuracy_check) { std::unordered_map result = diff --git a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt index c65ad4641cf2206cc0f97d91f1fb24e50b7b63cd..52dba6b9218c7be8a29ae1aff619facd25a6f3b6 100644 --- a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt @@ -5,6 +5,7 @@ cc_test(test_egr_task_backward SRCS backward_test.cc DEPS ${eager_deps} ${fluid_ cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) cc_test(test_egr_task_cross_batch SRCS cross_batch_accumulation_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) cc_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) +cc_test(test_egr_task_grad SRCS grad_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) cc_test(test_egr_task_hook_intermidiate SRCS hook_test_intermidiate.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} dygraph_node) diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index 0c894ed267fcdd08d44d4df08bfaf0554874aebf..87f8f6eca1f88fe9a54583ee19586dd75c7e231e 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -33,6 +33,7 @@ #include "paddle/phi/core/kernel_registry.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); namespace egr { @@ -79,7 +80,7 @@ TEST(Backward, SingleNodeEmptyGrad) { } std::vector outs = {target_tensor}; // Run Backward - RunBackward(outs, {}); + Backward(outs, {}); // Check Output Value eager_test::CompareGradTensorWithValue(leaf_tensor, 5.0); @@ -138,7 +139,7 @@ TEST(Backward, SingleNodeCustomGrad) { } // Run Backward - RunBackward(target_tensors, grad_tensors); + Backward(target_tensors, grad_tensors); // Check Output Value eager_test::CompareGradTensorWithValue(leaf_tensor, 50.0); @@ -211,7 +212,7 @@ TEST(Backward, LinearNodes) { } // Use Empty Grad Tensor - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); // Check Output Value eager_test::CompareGradTensorWithValue(leaf_tensor, 50.0); @@ -315,7 +316,7 @@ TEST(Backward, WithAccumulation) { node2_ptr->AddEdges(&res2, 0); } - RunBackward(target_tensors, grad_tensors); + Backward(target_tensors, grad_tensors); eager_test::CompareGradTensorWithValue(leaf_tensor, 2500.0); } diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index 36594f1aac8cdb131bb77f1396dca19a0c2e8cc0..8b0759c17ed3712079e8954df60e35afaaf02a9e 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -71,12 +71,12 @@ TEST(CrossBatchAccumulation, SingleScaleNode) { std::vector res = {meta}; scale_node_ptr->AddEdges(&res, 0); - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); 
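An aside on the backward.cc hunk above: the reworked ready queue enqueues a node only once its in-degree drops to zero and it is not a potential stop node, i.e. not one of the inputs requested from Grad(). A self-contained toy sketch of that rule in plain C++, with made-up node names rather than Paddle types:

#include <cstdio>
#include <map>
#include <queue>
#include <set>
#include <vector>

int main() {
  std::map<char, std::vector<char>> edges = {{'a', {'b'}}, {'b', {'c'}}};
  std::map<char, int> in_degree = {{'a', 0}, {'b', 1}, {'c', 1}};
  std::set<char> potential_stop_nodes = {'b'};  // as if Grad(..., inputs={b})

  std::queue<char> ready_queue;
  ready_queue.push('a');
  while (!ready_queue.empty()) {
    char node = ready_queue.front();
    ready_queue.pop();
    std::printf("visit %c\n", node);
    for (char next : edges[node]) {
      // The new rule from backward.cc: a node becomes ready only when its
      // in-degree hits zero AND it is not a potential stop node.
      if (--in_degree[next] == 0 && !potential_stop_nodes.count(next)) {
        ready_queue.push(next);
      }
    }
  }
  // Prints only "visit a": traversal halts in front of the requested input
  // 'b', so nothing beyond it runs; its gradient is captured instead.
  return 0;
}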
eager_test::CompareGradTensorWithValue(target_tensor, 1.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 5.0); - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 1.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 10.0); diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc index f7fa642ea8dd17d20816e74c9bfb4cd92b184b4a..882695e98d109e09340223e21322a02d1b48c6ea 100644 --- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -86,7 +86,7 @@ TEST(FwdBwdJoint, SingleNode) { std::vector outs = {out}; // 4. Run Backward - RunBackward(outs, {}); + Backward(outs, {}); VLOG(7) << "Target Grad is: " << std::static_pointer_cast( @@ -137,7 +137,7 @@ TEST(FwdBwdJoint, LinearNodes) { std::vector outs = {out1}; // 4. Run Backward - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 10.0); @@ -203,7 +203,7 @@ TEST(FwdBwdJoint, BranchedNodes) { // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 30.0); @@ -260,7 +260,7 @@ TEST(FwdBwdJoint, GradientHook) { // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad // leaf grad @@ -318,13 +318,13 @@ TEST(FwdBwdJoint, CrossBatchAccumulation) { // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 30.0); // Cross Batch Accumulation - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 60.0); @@ -356,7 +356,7 @@ TEST(FwdBwdJoint, SingleNodeCUDA) { std::vector outs = {out}; // 4. Run Backward - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 2.0); @@ -412,7 +412,7 @@ TEST(FwdBwdJoint, BranchedNodesCUDA) { // TODO(jiabin): fix this with add functor // 4. 
Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 30.0); diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc index 2a5ad53204a6201149bec0b3dac0fa3baf441f2e..49e517dc9b3f3271ef26dfbece46f799ef805c57 100644 --- a/paddle/fluid/eager/tests/task_tests/generated_test.cc +++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc @@ -57,7 +57,7 @@ TEST(Generated, Sigmoid) { std::vector target_tensors = {output_tensor}; VLOG(6) << "Runing Backward"; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); VLOG(6) << "Finish Backward"; eager_test::CompareGradTensorWithValue(tensor, 0.25); @@ -89,7 +89,7 @@ TEST(Generated, Matmul_v2) { eager_test::CompareTensorWithValue(output_tensor, 96); std::vector target_tensors = {output_tensor}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 2.0 * 20); eager_test::CompareGradTensorWithValue(Y, 3.0 * 4); @@ -120,7 +120,7 @@ TEST(Generated, ElementwiseAdd) { eager_test::CompareTensorWithValue(output_tensor, 5); std::vector target_tensors = {output_tensor}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 1.0); eager_test::CompareGradTensorWithValue(Y, 1.0); @@ -128,6 +128,6 @@ TEST(Generated, ElementwiseAdd) { } // namespace egr -USE_OP(sigmoid); +USE_OP_ITSELF(sigmoid); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(matmul_v2); diff --git a/paddle/fluid/eager/tests/task_tests/grad_test.cc b/paddle/fluid/eager/tests/task_tests/grad_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6b03799c48659c579938df6efc0f7cf57bbc0bec --- /dev/null +++ b/paddle/fluid/eager/tests/task_tests/grad_test.cc @@ -0,0 +1,339 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
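Before the new file's body, a quick cross-check of the numbers its tests assert. The expected gradients are plain chain-rule products of the scale attributes, on the (implied) assumption that each GradNodeScale multiplies its incoming gradient by scale:

#include <cassert>

int main() {
  assert(1.0 * 5.0 == 5.0);            // SingleNodeEmptyGrad: seed 1.0, scale 5.0
  assert(10.0 * 5.0 == 50.0);          // SingleNodeCustomGrad: seed 10.0, scale 5.0
  assert(1.0 * 5.0 * 10.0 == 50.0);    // LinearNodes: Node0 (5.0) then Node1 (10.0)
  assert((5.0 * 5.0 + 10.0 * 10.0) * 20.0 == 2500.0);  // WithAccumulation
  return 0;
}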
+ +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" +#include "paddle/fluid/eager/api/utils/tensor_utils.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/backward.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/tests/test_utils.h" + +#include "paddle/fluid/eager/api/all.h" + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); +namespace egr { + +TEST(Grad, SingleNodeEmptyGrad) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor (output) + paddle::experimental::Tensor output_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + + // Create input tensor + const paddle::experimental::Tensor leaf_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/); + + { + // Create Scale Node + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta + node0_ptr->SetDefaultGradInOutMeta(); + + // Set output_tensor's GradNode, OutRank and StopGradient properties + AutogradMeta* auto_grad_meta = EagerUtils::autograd_meta(&output_tensor); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + + // Get autograd_meta from input tensor + AutogradMeta* auto_grad_meta1 = + EagerUtils::unsafe_autograd_meta(leaf_tensor); + + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); + + // Set input tensor's GradNode, OutRank and StopGradient properties + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); + + // grad_node Add Edges + std::vector res = {auto_grad_meta1}; + node0_ptr->AddEdges(&res, 0); + } + std::vector outs = {output_tensor}; + + // Run Grad + auto result = Grad(outs, {leaf_tensor}, {}); + // Check Output Value + eager_test::CompareTensorWithValue(result[0], 5.0); +} + +TEST(Grad, SingleNodeCustomGrad) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + std::vector target_tensors; + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor)); + + std::vector grad_tensors; + // Create Grad Tensor + paddle::experimental::Tensor grad_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/); + grad_tensors.emplace_back(std::move(grad_tensor)); + + paddle::experimental::Tensor
leaf_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/); + + { + // Create Scale Node + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta + node0_ptr->SetDefaultGradInOutMeta(); + + // Connect Tensor and Node via AutoGradMeta + AutogradMeta* auto_grad_meta = + EagerUtils::autograd_meta(&(target_tensors[0])); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); + + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); + std::vector res = {auto_grad_meta1}; + node0_ptr->AddEdges(&res, 0); + } + + auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors); + + // Check Output Value + eager_test::CompareTensorWithValue(result[0], 50.0); +} + +/* +Node1 + | +Node0 + | + { } // empty grad tensor +*/ +TEST(Grad, LinearNodes) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Target Tensor + std::vector target_tensors; + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor)); + + paddle::experimental::Tensor leaf_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/); + { + // Create Node0 + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta for node0 + node0_ptr->SetDefaultGradInOutMeta(); + + // Create Node1 + auto node1_ptr = std::make_shared(1, 1); + node1_ptr->SetAttributes_scale(10.0 /*scale*/); + + // Set grad in/out meta for node1 + node1_ptr->SetDefaultGradInOutMeta(); + + // Connect Input Tensor and Node0 via AutoGradMeta + AutogradMeta* auto_grad_meta = + EagerUtils::autograd_meta(&(target_tensors[0])); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + // Connect Node0 -> Node1 via Edge + auto meta0 = egr::AutogradMeta(); + meta0.SetStopGradient(false); + meta0.SetSingleOutRankWithSlot(0, 0); + meta0.SetGradNode(node1_ptr); + std::vector res0 = {&meta0}; + node0_ptr->AddEdges(&res0, 0); + + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); + + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + + auto_grad_meta1->SetStopGradient(false); + std::vector res1 = {auto_grad_meta1}; + node1_ptr->AddEdges(&res1, 0); + } + + // Use Empty Grad Tensor + auto result = Grad(target_tensors, {leaf_tensor}, {}); + + // Check Output Value + 
eager_test::CompareTensorWithValue(result[0], 50.0); +} + +/* + Node2 + | | +Node0 Node1 + | | + in0 in1 +*/ +TEST(Grad, WithAccumulation) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + std::vector target_tensors; + paddle::experimental::Tensor tensor0 = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + paddle::experimental::Tensor tensor1 = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor0)); + target_tensors.emplace_back(std::move(tensor1)); + + // Create Grad Tensor + std::vector grad_tensors; + paddle::experimental::Tensor grad_tensor0 = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/); + paddle::experimental::Tensor grad_tensor1 = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/); + grad_tensors.emplace_back(std::move(grad_tensor0)); + grad_tensors.emplace_back(std::move(grad_tensor1)); + + paddle::experimental::Tensor leaf_tensor; + { + // Create Node0 + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + node0_ptr->SetDefaultGradInOutMeta(); + + // Create Node1 + auto node1_ptr = std::make_shared(1, 1); + node1_ptr->SetAttributes_scale(10.0 /*scale*/); + node1_ptr->SetDefaultGradInOutMeta(); + // Create Node2 + auto node2_ptr = std::make_shared(1, 1); + node2_ptr->SetAttributes_scale(20.0 /*scale*/); + node2_ptr->SetDefaultGradInOutMeta(); + // Connect Inp0 and Node0 via AutoGradMeta + AutogradMeta* auto_grad_meta0 = + EagerUtils::autograd_meta(&(target_tensors[0])); + auto_grad_meta0->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta0->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta0->SetStopGradient(false); + // Connect Inp1 and Node1 via AutoGradMeta + AutogradMeta* auto_grad_meta1 = + EagerUtils::autograd_meta(&(target_tensors[1])); + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(node1_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); + + // Connect Node0 -> Node2 via Edge + auto meta0 = egr::AutogradMeta(); + meta0.SetStopGradient(false); + meta0.SetSingleOutRankWithSlot(0, 0); + meta0.SetGradNode(node2_ptr); + std::vector res0 = {&meta0}; + node0_ptr->AddEdges(&res0, 0); + + // Connect Node1 -> Node2 via Edge + auto meta1 = egr::AutogradMeta(); + meta1.SetStopGradient(false); + meta1.SetSingleOutRankWithSlot(0, 0); + meta1.SetGradNode(node2_ptr); + std::vector res1 = {&meta1}; + node1_ptr->AddEdges(&res1, 0); + + AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta2); + + auto_grad_meta2->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta2->SetSingleOutRankWithSlot(0, 0); + + auto_grad_meta2->SetStopGradient(false); + std::vector res2 = {auto_grad_meta2}; + node2_ptr->AddEdges(&res2, 0); + } + + auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors); + + 
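+  // Expected value: grad_tensor0 (5.0) is scaled by Node0 (x5.0) and
+  // grad_tensor1 (10.0) by Node1 (x10.0); both branches accumulate into
+  // Node2 (x20.0), so the leaf gradient is (5*5 + 10*10) * 20 = 2500.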
eager_test::CompareTensorWithValue(result[0], 2500.0); +} + +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index d546df4ed087a99a28096a5336fab3826991534a..2c53fc89f650e36f1435c7e1e805453fe7822cf2 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -132,7 +132,7 @@ TEST(RetainGrad, HookBeforeRetainGrad) { leaf_tensor); // result: 4.0*5.0 + 3.0 = 23.0 } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 4.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 23.0); @@ -199,7 +199,7 @@ TEST(RetainGrad, HookAfterRetainGrad) { leaf_tensor, std::make_shared(hook_function)); } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 1.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 23.0); } diff --git a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc index 56813c498d2410caa452da7a334c393b230c65bf..b86865e2d126fbfc0b00495a6e3208932ac6de39 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc @@ -108,7 +108,7 @@ void test_sigmoid(bool is_remove_gradient_hook) { } VLOG(6) << "Runing Backward"; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); VLOG(6) << "Finish Backward"; eager_test::CompareGradTensorWithValue( @@ -166,7 +166,7 @@ void test_elementwiseAdd(bool is_remove_gradient_hook) { grad_node_tmp->RemoveGradientHook(hook_id); } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 1.0); eager_test::CompareGradTensorWithValue( @@ -224,7 +224,7 @@ void test_matmul(bool is_remove_gradient_hook) { grad_node_tmp->RemoveGradientHook(hook_id); } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 2.0 * 20); eager_test::CompareGradTensorWithValue( @@ -255,6 +255,6 @@ TEST(Hook_intermidiate, Matmul_v2) { } } // namespace egr -USE_OP(sigmoid); +USE_OP_ITSELF(sigmoid); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(matmul_v2); diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index d99624e49324853d513a20a725c1a3d12b6aaab5..4eaa64d3ac659ca0ec76083b70855d8b6b241556 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -370,8 +370,8 @@ class GradNodeRunProgram : public egr::GradNodeBase { ~GradNodeRunProgram() override = default; // Functor: perform backward computations virtual std::vector> operator()( - const std::vector> &grads) - override { + const std::vector> &grads, + bool create_graph) override { VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram"; PADDLE_ENFORCE_EQ( grads.size(), 1, @@ -415,6 +415,12 @@ class GradNodeRunProgram : public egr::GradNodeBase { // return {x_grad, details::DereferenceTensors(params_grad_ptr)}; } + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } + // SetAttrMap void SetAttrMap(const paddle::framework::AttributeMap &attrs) { attrs_ = attrs; diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt 
b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 17346f5fd939324e6c2d709fb09be2cb65669429..2b8b4b3ff9573f601f8da3092c18433a49a93869 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -10,8 +10,9 @@ IF(WITH_GPU) nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS}) nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) - nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm) + nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table) nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps) + nv_test(test_cpu_graph_sample SRCS test_cpu_graph_sample.cu DEPS graph_gpu_ps) ENDIF() IF(WITH_ROCM) hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h new file mode 100644 index 0000000000000000000000000000000000000000..235f7a226ad17649960d1e72d7907e8013e406fe --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h @@ -0,0 +1,120 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifdef PADDLE_WITH_HETERPS +namespace paddle { +namespace framework { +struct GpuPsGraphNode { + int64_t node_id; + int neighbor_size, neighbor_offset; + // this node's neighbor is stored on [neighbor_offset,neighbor_offset + + // neighbor_size) of int64_t *neighbor_list; +}; + +struct GpuPsCommGraph { + int64_t *neighbor_list; + GpuPsGraphNode *node_list; + int neighbor_size, node_size; + // the size of neighbor array and graph_node_list array + GpuPsCommGraph() + : neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {} + GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_, + int neighbor_size_, int node_size_) + : neighbor_list(neighbor_list_), + node_list(node_list_), + neighbor_size(neighbor_size_), + node_size(node_size_) {} +}; + +/* +suppose we have a graph like this + +0----3-----5----7 + \ |\ |\ + 17 8 9 1 2 + +we save the nodes in arbitrary order, +in this example,the order is +[0,5,1,2,7,3,8,9,17] +let us name this array u_id; +we record each node's neighbors: +0:3,17 +5:3,7 +1:7 +2:7 +7:1,2,5 +3:0,5,8,9 +8:3 +9:3 +17:0 + +by concatenating each node's neighbor_list in the order we save the node id. 
+we get [3,17,3,7,7,7,1,2,5,0,5,8,9,3,3,0] +this is the neighbor_list of GpuPsCommGraph +given this neighbor_list and the order to save node id, +we know, +node 0's neighbors are in the range [0,1] of neighbor_list +node 5's neighbors are in the range [2,3] of neighbor_list +node 1's neighbors are in the range [4,4] of neighbor_list +node 2:[5,5] +node 7:[6,6] +node 3:[9,12] +node 8:[13,13] +node 9:[14,14] +node 17:[15,15] +... +by the above information, +we generate a node_list:GpuPsGraphNode *graph_node_list in GpuPsCommGraph +of size 9, +where node_list[i].id = u_id[i] +then we have: +node_list[0]-> node_id:0, neighbor_size:2, neighbor_offset:0 +node_list[1]-> node_id:5, neighbor_size:2, neighbor_offset:2 +node_list[2]-> node_id:1, neighbor_size:1, neighbor_offset:4 +node_list[3]-> node_id:2, neighbor_size:1, neighbor_offset:5 +node_list[4]-> node_id:7, neighbor_size:3, neighbor_offset:6 +node_list[5]-> node_id:3, neighbor_size:4, neighbor_offset:9 +node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13 +node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14 +node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15 +*/ +struct NeighborSampleResult { + int64_t *val; + int *actual_sample_size, sample_size, key_size; + NeighborSampleResult(int _sample_size, int _key_size) + : sample_size(_sample_size), key_size(_key_size) { + actual_sample_size = NULL; + val = NULL; + }; + ~NeighborSampleResult() { + if (val != NULL) cudaFree(val); + if (actual_sample_size != NULL) cudaFree(actual_sample_size); + } +}; + +struct NodeQueryResult { + int64_t *val; + int actual_sample_size; + NodeQueryResult() { + val = NULL; + actual_sample_size = 0; + }; + ~NodeQueryResult() { + if (val != NULL) cudaFree(val); + } +}; +} +}; +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index a6508bf96c00f835da4aee79503f16fa5451e794..b8f9f0bfec9b2a0bf6b6fb1e122e40b3eaa90fa8 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -14,114 +14,25 @@ #pragma once #include "heter_comm.h" +#include "paddle/fluid/distributed/ps/table/common_graph_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" #include "paddle/fluid/platform/enforce.h" #ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { -struct GpuPsGraphNode { - int64_t node_id; - int neighbor_size, neighbor_offset; - // this node's neighbor is stored on [neighbor_offset,neighbor_offset + - // neighbor_size) of int64_t *neighbor_list; -}; - -struct GpuPsCommGraph { - int64_t *neighbor_list; - GpuPsGraphNode *node_list; - int neighbor_size, node_size; - // the size of neighbor array and graph_node_list array - GpuPsCommGraph() - : neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {} - GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_, - int neighbor_size_, int node_size_) - : neighbor_list(neighbor_list_), - node_list(node_list_), - neighbor_size(neighbor_size_), - node_size(node_size_) {} -}; - -/* -suppose we have a graph like this -0----3-----5----7 - \ |\ |\ - 17 8 9 1 2 - -we save the nodes in arbitrary order, -in this example,the order is -[0,5,1,2,7,3,8,9,17] -let us name this array u_id; -we record each node's neighbors: -0:3,17 -5:3,7 -1:7 -2:7 -7:1,2,5 -3:0,5,8,9 -8:3 -9:3 -17:0 - -by concatenating each node's neighbor_list in the order we save the node id. 
-we get [3,17,3,7,7,7,1,2,5,0,5,8,9,3,3,0] -this is the neighbor_list of GpuPsCommGraph -given this neighbor_list and the order to save node id, -we know, -node 0's neighbors are in the range [0,1] of neighbor_list -node 5's neighbors are in the range [2,3] of neighbor_list -node 1's neighbors are in the range [4,4] of neighbor_list -node 2:[5,5] -node 7:[6,6] -node 3:[9,12] -node 8:[13,13] -node 9:[14,14] -node 17:[15,15] -... -by the above information, -we generate a node_list:GpuPsGraphNode *graph_node_list in GpuPsCommGraph -of size 9, -where node_list[i].id = u_id[i] -then we have: -node_list[0]-> node_id:0, neighbor_size:2, neighbor_offset:0 -node_list[1]-> node_id:5, neighbor_size:2, neighbor_offset:2 -node_list[2]-> node_id:1, neighbor_size:1, neighbor_offset:4 -node_list[3]-> node_id:2, neighbor_size:1, neighbor_offset:5 -node_list[4]-> node_id:7, neighbor_size:3, neighbor_offset:6 -node_list[5]-> node_id:3, neighbor_size:4, neighbor_offset:9 -node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13 -node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14 -node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15 -*/ -struct NeighborSampleResult { - int64_t *val; - int *actual_sample_size, sample_size, key_size; - NeighborSampleResult(int _sample_size, int _key_size) - : sample_size(_sample_size), key_size(_key_size) { - actual_sample_size = NULL; - val = NULL; - }; - ~NeighborSampleResult() { - if (val != NULL) cudaFree(val); - if (actual_sample_size != NULL) cudaFree(actual_sample_size); - } -}; - -struct NodeQueryResult { - int64_t *val; - int actual_sample_size; - NodeQueryResult() { - val = NULL; - actual_sample_size = 0; - }; - ~NodeQueryResult() { - if (val != NULL) cudaFree(val); - } -}; class GpuPsGraphTable : public HeterComm { public: GpuPsGraphTable(std::shared_ptr resource) : HeterComm(1, resource) { load_factor_ = 0.25; + rw_lock.reset(new pthread_rwlock_t()); + cpu_table_status = -1; + } + ~GpuPsGraphTable() { + if (cpu_table_status != -1) { + end_graph_sampling(); + } } void build_graph_from_cpu(std::vector &cpu_node_list); NodeQueryResult *graph_node_sample(int gpu_id, int sample_size); @@ -134,9 +45,19 @@ class GpuPsGraphTable : public HeterComm { int *h_right, int64_t *src_sample_res, int *actual_sample_size); + int init_cpu_table(const paddle::distributed::GraphParameter &graph); + int load(const std::string &path, const std::string ¶m); + virtual int32_t end_graph_sampling() { + return cpu_graph_table->end_graph_sampling(); + } private: std::vector gpu_graph_list; + std::shared_ptr cpu_graph_table; + std::shared_ptr rw_lock; + mutable std::mutex mutex_; + std::condition_variable cv_; + int cpu_table_status; }; } }; diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h index 839c7e5468c6c6938c6b4cda3dd879c7366e7d6e..16a6857ae96eecaaa06b92b9912387f22612f53e 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h @@ -14,6 +14,7 @@ #pragma once #ifdef PADDLE_WITH_HETERPS +//#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" namespace paddle { namespace framework { /* @@ -45,6 +46,33 @@ __global__ void neighbor_sample_example(GpuPsCommGraph graph, int* index, } } +int GpuPsGraphTable::init_cpu_table( + const paddle::distributed::GraphParameter& graph) { + cpu_graph_table.reset(new paddle::distributed::GraphTable); + cpu_table_status = 
cpu_graph_table->initialize(graph); + if (cpu_table_status != 0) return cpu_table_status; + std::function&)> callback = + [this](std::vector& res) { + pthread_rwlock_wrlock(this->rw_lock.get()); + this->clear_graph_info(); + this->build_graph_from_cpu(res); + pthread_rwlock_unlock(this->rw_lock.get()); + cv_.notify_one(); + }; + cpu_graph_table->set_graph_sample_callback(callback); + return cpu_table_status; +} + +int GpuPsGraphTable::load(const std::string& path, const std::string& param) { + int status = cpu_graph_table->load(path, param); + if (status != 0) { + return status; + } + std::unique_lock lock(mutex_); + cpu_graph_table->start_graph_sampling(); + cv_.wait(lock); + return 0; +} /* comment 1 @@ -68,6 +96,7 @@ __global__ void neighbor_sample_example(GpuPsCommGraph graph, int* index, that's what fill_dvals does. */ + void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( int gpu_id, int gpu_num, int sample_size, int* h_left, int* h_right, int64_t* src_sample_res, int* actual_sample_size) { @@ -258,7 +287,7 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, auto d_shard_keys = memory::Alloc(place, len * sizeof(int64_t)); int64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); - auto d_shard_vals = memory::Alloc(place, len * sizeof(int64_t)); + auto d_shard_vals = memory::Alloc(place, sample_size * len * sizeof(int64_t)); int64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int)); int* d_shard_actual_sample_size_ptr = diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 2cf702969f99a02cd2b89d69c94f42b265d46135..f85ed330dc8ea4eb4199b6ab006ac54be1b30b0d 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifdef PADDLE_WITH_HETERPS +//#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" #include namespace paddle { diff --git a/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu b/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu new file mode 100644 index 0000000000000000000000000000000000000000..8c7ea10b26565a4181230f6150272babd315105f --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu @@ -0,0 +1,108 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
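Before the test itself, a host-only C++ sketch of the CSR layout documented in gpu_graph_node.h above. It rebuilds the worked example from that comment and checks two of the stated offsets; Node here is an illustrative stand-in for GpuPsGraphNode, not the CUDA-side type:

#include <cassert>
#include <cstdint>
#include <vector>

struct Node {  // mirrors GpuPsGraphNode: id plus a CSR slice of neighbor_list
  int64_t node_id;
  int neighbor_size, neighbor_offset;
};

int main() {
  // Save order u_id and per-node neighbor lists, copied from the comment.
  std::vector<int64_t> u_id = {0, 5, 1, 2, 7, 3, 8, 9, 17};
  std::vector<std::vector<int64_t>> nbrs = {
      {3, 17}, {3, 7}, {7}, {7}, {1, 2, 5}, {0, 5, 8, 9}, {3}, {3}, {0}};

  std::vector<int64_t> neighbor_list;  // concatenation in save order
  std::vector<Node> node_list;
  for (size_t i = 0; i < u_id.size(); ++i) {
    node_list.push_back({u_id[i], static_cast<int>(nbrs[i].size()),
                         static_cast<int>(neighbor_list.size())});
    neighbor_list.insert(neighbor_list.end(), nbrs[i].begin(), nbrs[i].end());
  }

  // Matches the worked example: node 3 occupies [9,12], node 17 starts at 15.
  assert(node_list[5].node_id == 3 && node_list[5].neighbor_offset == 9);
  assert(node_list[8].node_id == 17 && node_list[8].neighbor_offset == 15);
  assert(neighbor_list.size() == 16);
  return 0;
}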
+ +#include +#include +#include +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" +#include "paddle/fluid/platform/cuda_device_guard.h" + +using namespace paddle::framework; +void prepare_file(char file_name[], std::vector data) { + std::ofstream ofile; + ofile.open(file_name); + for (auto x : data) { + ofile << x << std::endl; + } + + ofile.close(); +} +char edge_file_name[] = "edges.txt"; +TEST(TEST_FLEET, graph_sample) { + std::vector edges; + int gpu_count = 3; + std::vector dev_ids; + dev_ids.push_back(0); + dev_ids.push_back(1); + dev_ids.push_back(2); + + std::shared_ptr resource = + std::make_shared(dev_ids); + resource->enable_p2p(); + GpuPsGraphTable g(resource); + int node_count = 10; + std::vector> neighbors(node_count); + int ind = 0; + int64_t node_id = 0; + // std::vector graph_list(gpu_count); + while (ind < node_count) { + int neighbor_size = ind + 1; + while (neighbor_size--) { + edges.push_back(std::to_string(ind) + "\t" + std::to_string(node_id) + + "\t1.0"); + node_id++; + } + ind++; + } + /* + gpu 0: + 0,3,6,9 + gpu 1: + 1,4,7 + gpu 2: + 2,5,8 + + query(2,6) returns nodes [6,9,1,4,7,2] + */ + ::paddle::distributed::GraphParameter table_proto; + table_proto.set_gpups_mode(true); + table_proto.set_gpups_mode_shard_num(127); + table_proto.set_gpu_num(3); + table_proto.set_gpups_graph_sample_class("BasicBfsGraphSampler"); + table_proto.set_gpups_graph_sample_args("5,5,1,1"); + prepare_file(edge_file_name, edges); + g.init_cpu_table(table_proto); + g.load(std::string(edge_file_name), std::string("e>")); + /* + node x's neighbor list = [(1+x)*x/2,(1+x)*x/2 + 1,.....,(1+x)*x/2 + x] + so node 6's neighbors are [21,22...,27] + node 7's neighbors are [28,29,..35] + node 0's neighbors are [0] + query([7,0,6],sample_size=3) should return [28,29,30,0,x,x,21,22,23] + 6 --index-->2 + 0 --index--->0 + 7 --index-->2 + */ + int64_t cpu_key[3] = {7, 0, 6}; + void *key; + cudaMalloc((void **)&key, 3 * sizeof(int64_t)); + cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); + auto neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 3, 3); + int64_t *res = new int64_t[9]; + cudaMemcpy(res, neighbor_sample_res->val, 72, cudaMemcpyDeviceToHost); + std::sort(res, res + 3); + std::sort(res + 6, res + 9); + int64_t expected_sample_val[] = {28, 29, 30, 0, -1, -1, 21, 22, 23}; + for (int i = 0; i < 9; i++) { + if (expected_sample_val[i] != -1) { + ASSERT_EQ(res[i], expected_sample_val[i]); + } + } + delete[] res; + delete neighbor_sample_res; +} diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index b1d7059f311cd370a40e83d7b0016d5af8cdb163..2babecc6ddf933e19b9d704ee7515f56f7431839 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -78,6 +78,11 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext { return var_types[0] == proto::VarType::SELECTED_ROWS; } + bool IsDenseTensorVectorInput(const std::string& name) const override { + auto var_types = ctx_.GetInputsVarType(name); + return var_types[0] == proto::VarType::LOD_TENSOR_ARRAY; + } + bool IsDenseTensorOutput(const std::string& name) const override { auto var_types = ctx_.GetOutputsVarType(name); return 
var_types[0] == proto::VarType::LOD_TENSOR; @@ -125,9 +130,14 @@ class CompatMetaTensor : public phi::MetaTensor { return var->Get().dims(); } else if (var->IsType()) { return var->Get().dims(); + } else if (var->IsType()) { + // use tensor array size as dims + auto& tensor_array = var->Get(); + return phi::make_ddim({static_cast(tensor_array.size())}); } else { PADDLE_THROW(platform::errors::Unimplemented( - "Currently, only can get dims from DenseTensor or SelectedRows.")); + "Currently, only can get dims from DenseTensor or SelectedRows or " + "DenseTensorArray.")); } } else { auto* var = BOOST_GET_CONST(VarDesc*, var_); @@ -144,6 +154,10 @@ class CompatMetaTensor : public phi::MetaTensor { return var->Get().dtype(); } else if (var->IsType()) { return var->Get().dtype(); + } else if (var->IsType()) { + // NOTE(chenweihang): do nothing + // Unsupported get dtype from LoDTensorArray now + return phi::DataType::UNDEFINED; } else { PADDLE_THROW(platform::errors::Unimplemented( "Currently, only can get dtype from DenseTensor or SelectedRows.")); @@ -157,7 +171,19 @@ class CompatMetaTensor : public phi::MetaTensor { DataLayout layout() const override { if (is_runtime_) { auto* var = BOOST_GET_CONST(Variable*, var_); - return var->Get().layout(); + if (var->IsType()) { + return var->Get().layout(); + } else if (var->IsType()) { + return var->Get().layout(); + } else if (var->IsType()) { + // NOTE(chenweihang): do nothing + // Unsupported get layout from LoDTensorArray now + return phi::DataLayout::UNDEFINED; + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Currently, only can get layout from DenseTensor or " + "SelectedRows.")); + } } else { // NOTE(chenweihang): do nothing // Unsupported get layout for VarDesc now @@ -174,6 +200,16 @@ class CompatMetaTensor : public phi::MetaTensor { } else if (var->IsType()) { auto* tensor = var->GetMutable()->mutable_value(); phi::DenseTensorUtils::GetMutableMeta(tensor)->dims = dims; + } else if (var->IsType()) { + auto* tensor_array = var->GetMutable(); + // Note: Here I want enforce `tensor_array->size() == 0UL`, because + // inplace using on LoDTensorArray is dangerous, but the unittest + // `test_list` contains this behavior + PADDLE_ENFORCE_EQ(dims.size(), 1UL, + platform::errors::InvalidArgument( + "LoDTensorArray can only have one dimension.")); + // only set the array size for LoDTensorArray input + tensor_array->resize(dims[0]); } else { PADDLE_THROW(platform::errors::Unimplemented( "Currently, only can set dims from DenseTensor or SelectedRows.")); @@ -193,6 +229,9 @@ class CompatMetaTensor : public phi::MetaTensor { } else if (var->IsType()) { auto* tensor = var->GetMutable()->mutable_value(); phi::DenseTensorUtils::GetMutableMeta(tensor)->dtype = dtype; + } else if (var->IsType()) { + // NOTE(chenweihang): do nothing + // Unsupported set dtype for LoDTensorArray now } else { PADDLE_THROW(platform::errors::Unimplemented( "Currently, only can set dtype from DenseTensor or SelectedRows.")); @@ -206,10 +245,20 @@ class CompatMetaTensor : public phi::MetaTensor { void set_layout(DataLayout layout) override { if (is_runtime_) { auto* var = BOOST_GET(Variable*, var_); - LoDTensor* tensor = var->GetMutable(); - phi::DenseTensorUtils::GetMutableMeta( - static_cast(tensor)) - ->layout = layout; + if (var->IsType()) { + auto* tensor = var->GetMutable(); + phi::DenseTensorUtils::GetMutableMeta(tensor)->layout = layout; + } else if (var->IsType()) { + auto* tensor = var->GetMutable()->mutable_value(); + 
phi::DenseTensorUtils::GetMutableMeta(tensor)->layout = layout; + } else if (var->IsType()) { + // NOTE(chenweihang): do nothing + // Unsupported set dtype for LoDTensorArray now + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Currently, only can set layout from DenseTensor or " + "SelectedRows.")); + } } else { // NOTE(chenweihang): do nothing // Unsupported set layout for VarDesc now @@ -251,9 +300,7 @@ class CompatMetaTensor : public phi::MetaTensor { void share_meta(const MetaTensor& meta_tensor) override { share_dims(meta_tensor); set_dtype(meta_tensor.dtype()); - // VarDesc doesn't contains layout, so we cannot share layout - // set_layout(meta_tensor.layout()); - + set_layout(meta_tensor.layout()); // special case: share lod of LoDTensor share_lod(meta_tensor); } @@ -442,6 +489,51 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, attr_name, infershape_input.size())); } } + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + auto& attr = attr_reader.GetAttr(attr_name); + if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector when " + "construct InferMetaContext.", + attr_names[i])); + } } else if (ctx->HasAttr(attr_name)) { // Emplace Back Attr according to the type of attr. 
auto& attr = attr_reader.GetAttr(attr_name); diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 623c8a048c2417ab51772c55b681031d9bcfd925..7aaaef712a6e9186058b579d1c69b0cfb201d899 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -97,6 +97,7 @@ pass_library(layer_norm_fuse_pass inference) pass_library(add_support_int8_pass inference) pass_library(matmul_scale_fuse_pass inference) pass_library(gpu_cpu_map_matmul_to_mul_pass inference) +pass_library(mixed_precision_configure_pass inference) pass_library(generate_pass DEPS pass_desc_proto) target_link_libraries(generate_pass pass_desc_proto) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 18068e22b7f3c31d59636bc7ab6a234e109d5ee6..164a13d1560f4d0008c2bdb5a56d8ad6f875157b 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2052,18 +2052,19 @@ PDNode *patterns::Pool::operator()() { return output_var; } -PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) { - auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) - ->assert_is_op("elementwise_add"); - - x_var->AsInput()->assert_is_op_input("elementwise_add", "X"); - y_var->AsInput()->assert_is_op_input("elementwise_add", "Y"); - auto out_var = pattern->NewNode(elementwise_add_out_repr()) +PDNode *patterns::Elementwise::operator()(PDNode *x_var, PDNode *y_var, + const std::string elementwise_type) { + auto elementwise_op = + pattern->NewNode(elementwise_op_repr())->assert_is_op(elementwise_type); + + x_var->AsInput()->assert_is_op_input(elementwise_type, "X"); + y_var->AsInput()->assert_is_op_input(elementwise_type, "Y"); + auto out_var = pattern->NewNode(elementwise_out_repr()) ->AsOutput() - ->assert_is_op_output("elementwise_add", "Out"); + ->assert_is_op_output(elementwise_type, "Out"); - elementwise_add_op->LinksFrom({x_var, y_var}); - elementwise_add_op->LinksTo({out_var}); + elementwise_op->LinksFrom({x_var, y_var}); + elementwise_op->LinksTo({out_var}); return out_var; } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 062d2f9dedce65f6e16b70f0b201a4ca63b0531a..17c70ace301d39db6fcf14d01c11baab0dc7d403 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1016,20 +1016,20 @@ struct Pool : public PatternBase { PATTERN_DECL_NODE(pool_output); }; -// ElementwiseAdd used in residual connections. -// y_var is used and convolution output. -// The operator is removed, when residual -// connection fusion is on. 
-struct ElementwiseAdd : public PatternBase { - ElementwiseAdd(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "elementwise_add") {} - - PDNode* operator()(PDNode* x_var, PDNode* y_var); - - PATTERN_DECL_NODE(elementwise_add_op); - PATTERN_DECL_NODE(elementwise_add_x); - PATTERN_DECL_NODE(elementwise_add_y); - PATTERN_DECL_NODE(elementwise_add_out); +// Elementwise ops +// Forward pass for element-wise operators (add, mul) +// elementwise_mul_out is the result of the operator +struct Elementwise : public PatternBase { + Elementwise(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elementwise") {} + + PDNode* operator()(PDNode* x_var, PDNode* y_var, + const std::string elementwise_type); + + PATTERN_DECL_NODE(elementwise_op); + PATTERN_DECL_NODE(elementwise_x); + PATTERN_DECL_NODE(elementwise_y); + PATTERN_DECL_NODE(elementwise_out); }; // Transpose op diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc index 1b2a62695fb135925d43a3341aaacdf956da8da3..9fc6de3c8c1725707edd9f3b9f8de87706c16cc9 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc @@ -73,8 +73,10 @@ static void ShareVarInfoToCinnLaunch( varinfo_maps.at(cinn_launch_op->GetScopeIdx()); // collect all MemOptVarInfos of external variables - // that would be eager deleted after the cinn_launch subgraph executed, - // and store them as attribute of the subgraph + // that were eager deleted after the cinn_launch subgraph executed, + // and we will delete them in advance among eager_deletion_ops + // inside cinn_launch subgraph, so store them as attribute of the subgraph + // to pass to the inner eager_deletion_ops. for (const auto& var_name : vars_to_delete) { auto it = src_varinfo_map.find(var_name); PADDLE_ENFORCE_NE(it, src_varinfo_map.end(), @@ -82,6 +84,8 @@ static void ShareVarInfoToCinnLaunch( "MemOptVarInfo of var[%s] not found", var_name)); dst_varinfo_map.emplace(var_name, it->second); } + // skip running of the followed eager_deletion_op + followed_eager_deletion_op->SetSkipRunning(true); } static void TakeVarInfoFromMainGraph( diff --git a/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc b/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..4aa59d9196b1b4d73fffa8f1b2a9bba08d6091be --- /dev/null +++ b/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc @@ -0,0 +1,149 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
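A compact sketch of the convention the new pass below applies when it wraps a blacklisted (fp16-unsupported) op with casts: the temporary variable is named <old>_cast.tmp_<suffix>, and in_dtype/out_dtype carry the proto::VarType values 4 (FP16) and 5 (FP32) that the pass's own comments cite. Plain C++ with illustrative names only:

#include <cstdio>
#include <string>

int main() {
  // Toy stand-in for update_cast_desc: build the cast output name and the
  // dtype pair used on the *input* side of an fp16-unsupported op.
  std::string old_name = "conv2d_0.tmp_0";  // illustrative variable name
  int suffix = 0;
  std::string new_name = old_name + "_cast.tmp_" + std::to_string(suffix);
  const int kFP16 = 4, kFP32 = 5;  // proto::VarType numeric values
  std::printf("cast %s -> %s (in_dtype=%d, out_dtype=%d)\n", old_name.c_str(),
              new_name.c_str(), kFP16, kFP32);
  // The output side of the op uses the reverse pair (5 -> 4), as cast_output
  // in the pass does, so downstream fp16 ops keep receiving fp16 inputs.
  return 0;
}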
+
+#include "paddle/fluid/framework/ir/mixed_precision_configure_pass.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void MixedPrecisionConfigurePass::InsertCastOps(
+    Graph* graph, const StringSet& blacklist) const {
+  VLOG(3) << "Insert the cast op before and after the kernel that does not "
+             "support fp16 precision";
+
+  auto update_cast_desc = [&](
+      framework::OpDesc& desc, const std::string& x_name,
+      const std::string& out_name, const int in_dtype, const int out_dtype) {
+    desc.SetType("cast");
+    desc.SetInput("X", {x_name});
+    desc.SetOutput("Out", {out_name});
+    desc.SetAttr("in_dtype", in_dtype);
+    desc.SetAttr("out_dtype", out_dtype);
+    desc.SetAttr("use_mkldnn", false);
+    desc.SetAttr("with_quant_attr", false);
+    desc.Flush();
+  };
+
+  auto cast_input = [&](Graph* graph, Node* op_node,
+                        const StringSet& cast_list) {
+    auto inlinks = op_node->inputs;
+    for (auto* pre_node : inlinks) {
+      if (pre_node->IsVar()) {
+        const auto is_persistable = pre_node->Var()->Persistable();
+        const auto is_float =
+            pre_node->Var()->GetDataType() == proto::VarType::FP16 ||
+            pre_node->Var()->GetDataType() == proto::VarType::FP32 ||
+            pre_node->Var()->GetDataType() == proto::VarType::FP64;
+        if (!is_persistable && is_float) {
+          int suffix = 0;
+          for (auto* pre_node_input : pre_node->inputs) {
+            if (!pre_node_input->IsOp()) continue;
+            const auto& type = pre_node_input->Op()->Type();
+            if (!cast_list.count(type) && type != "cast") {
+              std::string old_name = pre_node->Name();
+              std::string new_name =
+                  old_name + "_cast.tmp_" + std::to_string(suffix);
+              suffix++;
+
+              framework::OpDesc new_op_desc(op_node->Op()->Block());
+              // 4 for fp16, 5 for fp32
+              update_cast_desc(new_op_desc, old_name, new_name, 4, 5);
+              auto* new_op = graph->CreateOpNode(&new_op_desc);
+
+              VarDesc out_var(new_name);
+              out_var.SetPersistable(false);
+              auto* node_var = graph->CreateVarNode(&out_var);
+
+              op_node->Op()->RenameInput(old_name, new_name);
+              IR_NODE_LINK_TO(pre_node, new_op);
+              IR_NODE_LINK_TO(new_op, node_var);
+              IR_NODE_LINK_TO(node_var, op_node);
+            }
+          }
+        }
+      }
+    }
+  };
+
+  auto cast_output = [&](Graph* graph, Node* op_node,
+                         const StringSet& cast_list) {
+    auto outlinks = op_node->outputs;
+    for (auto* next_node : outlinks) {
+      if (next_node->IsVar()) {
+        const auto is_persistable = next_node->Var()->Persistable();
+        const auto is_float =
+            next_node->Var()->GetDataType() == proto::VarType::FP16 ||
+            next_node->Var()->GetDataType() == proto::VarType::FP32 ||
+            next_node->Var()->GetDataType() == proto::VarType::FP64;
+        if (!is_persistable && is_float) {
+          int suffix = 0;
+          for (auto* next_node_output : next_node->outputs) {
+            if (!next_node_output->IsOp()) continue;
+
+            const auto& type = next_node_output->Op()->Type();
+            if (!cast_list.count(type) && type != "cast") {
+              std::string old_name = next_node->Name();
+              std::string new_name =
+                  old_name + "_cast.tmp_" + std::to_string(suffix);
+              suffix++;
+
+              framework::OpDesc new_op_desc(op_node->Op()->Block());
+              // 4 for fp16, 5 for fp32
+              update_cast_desc(new_op_desc, old_name, new_name, 5, 4);
+              auto* new_op = graph->CreateOpNode(&new_op_desc);
+
+              VarDesc out_var(new_name);
+              out_var.SetPersistable(false);
+              auto* node_var = graph->CreateVarNode(&out_var);
+
+              next_node_output->Op()->RenameInput(old_name, new_name);
+              IR_NODE_LINK_TO(next_node, new_op);
+              IR_NODE_LINK_TO(new_op, node_var);
+              IR_NODE_LINK_TO(node_var, next_node_output);
+            }
+          }
+        }
+      }
+    }
+  };
+
+  for (auto* op_node :
+       ir::TopologyVarientSort(*graph, static_cast<ir::SortKind>(0))) {
+    if (!op_node->IsOp() || op_node->Op()->Type() == "feed" ||
+        op_node->Op()->Type() == "fetch")
+      continue;
+
+    const auto& type = op_node->Op()->Type();
+    if (blacklist.count(type)) {
+      cast_input(graph, op_node, blacklist);
+      cast_output(graph, op_node, blacklist);
+    }
+  }
+}
+
+void MixedPrecisionConfigurePass::ApplyImpl(Graph* graph) const {
+  const auto blacklist =
+      Get<std::unordered_set<std::string>>("gpu_fp16_disabled_op_types");
+  InsertCastOps(graph, blacklist);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(mixed_precision_configure_pass,
+              paddle::framework::ir::MixedPrecisionConfigurePass);
diff --git a/paddle/fluid/framework/ir/mixed_precision_configure_pass.h b/paddle/fluid/framework/ir/mixed_precision_configure_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..fc5a612ecb833d2a5117a2dab58747d21226df8d
--- /dev/null
+++ b/paddle/fluid/framework/ir/mixed_precision_configure_pass.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+using StringSet = std::unordered_set<std::string>;
+
+class MixedPrecisionConfigurePass : public FusePassBase {
+ public:
+  MixedPrecisionConfigurePass() = default;
+  virtual ~MixedPrecisionConfigurePass() {}
+
+ protected:
+  void ApplyImpl(Graph* graph) const override;
+
+ private:
+  void InsertCastOps(Graph* graph, const StringSet& blacklist) const;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
index 0f3f37320b026a7100bd050c1a01b6683765a44f..fc2758c27345032c1ad0831b4ee0016fa84b3f5c 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
@@ -145,10 +145,10 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX(
   patterns::Conv conv_pattern{pattern, name_scope};
   auto conv_output = conv_pattern();
 
-  patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope};
-  elementwise_add_pattern(
-      conv_output,
-      pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr()));
+  patterns::Elementwise elementwise_pattern{pattern, name_scope};
+  elementwise_pattern(
+      conv_output, pattern->NewNode(elementwise_pattern.elementwise_y_repr()),
+      "elementwise_add");
   conv_output->AsIntermediate();
 
   int found_conv_as_x_count = 0;
@@ -160,16 +160,16 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX(
     GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
-
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_identity, elementwise_add_y, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_identity, elementwise_y, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); - if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_op, *elementwise_op) != FUSE_MKLDNN) return; - if (!IsReachable(g, elementwise_add_identity, conv_output)) return; + if (!IsReachable(g, elementwise_identity, conv_output)) return; if (HasFusedActivation(conv_op)) return; @@ -179,14 +179,14 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( return; } - conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()}); - conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + conv_op->Op()->SetInput("ResidualData", {elementwise_identity->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); conv_op->Op()->SetAttr("fuse_residual_connection", true); - GraphSafeRemoveNodes(g, {conv_output, elementwise_add_op}); + GraphSafeRemoveNodes(g, {conv_output, elementwise_op}); - IR_NODE_LINK_TO(elementwise_add_identity, conv_op); - IR_NODE_LINK_TO(conv_op, elementwise_add_out); + IR_NODE_LINK_TO(elementwise_identity, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_out); found_conv_as_x_count++; }; @@ -212,10 +212,10 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( patterns::Conv conv_pattern{pattern, name_scope}; auto conv_output = conv_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; - elementwise_add_pattern( - pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()), - conv_output); + patterns::Elementwise elementwise_pattern{pattern, name_scope}; + elementwise_pattern( + pattern->NewNode(elementwise_pattern.elementwise_x_repr()), conv_output, + "elementwise_add"); conv_output->AsIntermediate(); int found_conv_as_y_count = 0; @@ -227,16 +227,16 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_x, elementwise_x, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); - if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_op, *elementwise_op) != FUSE_MKLDNN) return; - if (!IsReachable(g, elementwise_add_x, conv_output)) return; + if (!IsReachable(g, elementwise_x, conv_output)) return; if (HasFusedActivation(conv_op)) return; @@ -246,14 +246,14 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( return; } - conv_op->Op()->SetInput("ResidualData", {elementwise_add_x->Name()}); - conv_op->Op()->SetOutput("Output", 
{elementwise_add_out->Name()}); + conv_op->Op()->SetInput("ResidualData", {elementwise_x->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); conv_op->Op()->SetAttr("fuse_residual_connection", true); - GraphSafeRemoveNodes(g, {conv_output, elementwise_add_op}); + GraphSafeRemoveNodes(g, {conv_output, elementwise_op}); - IR_NODE_LINK_TO(elementwise_add_x, conv_op); - IR_NODE_LINK_TO(conv_op, elementwise_add_out); + IR_NODE_LINK_TO(elementwise_x, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_out); found_conv_as_y_count++; }; @@ -282,8 +282,8 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( patterns::Conv conv_y_pattern{pattern, name_scope}; auto conv_y_output = conv_y_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; - elementwise_add_pattern(conv_x_output, conv_y_output); + patterns::Elementwise elementwise_pattern{pattern, name_scope}; + elementwise_pattern(conv_x_output, conv_y_output, "elementwise_add"); conv_x_output->AsIntermediate(); conv_y_output->AsIntermediate(); @@ -301,10 +301,10 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( GET_IR_NODE_FROM_SUBGRAPH(conv_y_filter, conv_filter, conv_y_pattern); GET_IR_NODE_FROM_SUBGRAPH(conv_y_output, conv_output, conv_y_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); if (!IsCompat(subgraph, g)) { LOG(WARNING) @@ -312,8 +312,8 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( return; } - if (FindFuseOption(*conv_x_op, *elementwise_add_op) != FUSE_MKLDNN) return; - if (FindFuseOption(*conv_y_op, *elementwise_add_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_x_op, *elementwise_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_y_op, *elementwise_op) != FUSE_MKLDNN) return; Node* projection_node; Node* residual_conv_op; @@ -333,14 +333,14 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( if (HasFusedActivation(residual_conv_op)) return; residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()}); - residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + residual_conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); residual_conv_op->Op()->SetAttr("fuse_residual_connection", true); - GraphSafeRemoveNodes(g, {residual_conv_output, elementwise_add_op}); + GraphSafeRemoveNodes(g, {residual_conv_output, elementwise_op}); IR_NODE_LINK_TO(projection_node, residual_conv_op); - IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out); + IR_NODE_LINK_TO(residual_conv_op, elementwise_out); found_projection_conv_count++; }; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 371482b5343d638f005aa8e0700680b6ac00d6ec..f4358fb243f20bc9b024ef6b02768773fa995f45 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -807,74 +807,74 @@ void CPUQuantizePass::QuantizeMatmul(Graph* graph) const { PrettyLogDetail("--- quantized %d matmul ops", quantize_matmul_count); } -void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const { +void CPUQuantizePass::QuantizeElementwise( + Graph* graph, 
const std::string elementwise_type) const { GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope_}; + patterns::Elementwise elementwise_pattern{pattern, name_scope_}; - elementwise_add_pattern( - pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()), - pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr())); + elementwise_pattern( + pattern->NewNode(elementwise_pattern.elementwise_x_repr()), + pattern->NewNode(elementwise_pattern.elementwise_y_repr()), + elementwise_type); - int quantize_elementwise_add_count = 0; + int quantize_elementwise_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(4) << "Quantize elementwise_add op"; - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); + VLOG(4) << "Quantize " + elementwise_type + " op"; + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); // skip if should not be quantized - if (!platform::HasOpINT8DataType(elementwise_add_op->Op())) { - LogQuantizationDisabled(elementwise_add_op); + if (!platform::HasOpINT8DataType(elementwise_op->Op())) { + LogQuantizationDisabled(elementwise_op); return; } - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_x, elementwise_x, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_y, elementwise_y, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); if (!AreScalesPresentForNodes( - {elementwise_add_x, elementwise_add_y, elementwise_add_out})) { - LogCannotQuantizeOp(elementwise_add_op, + {elementwise_x, elementwise_y, elementwise_out})) { + LogCannotQuantizeOp(elementwise_op, "No scale available for the operator"); return; } bool is_x_unsigned{false}, is_y_unsigned{false}; - auto input_x_scale = - GetScaleValueForNode(elementwise_add_x, &is_x_unsigned); - auto input_y_scale = - GetScaleValueForNode(elementwise_add_y, &is_y_unsigned); + auto input_x_scale = GetScaleValueForNode(elementwise_x, &is_x_unsigned); + auto input_y_scale = GetScaleValueForNode(elementwise_y, &is_y_unsigned); // TODO(sfraczek): add support for different signness if (is_x_unsigned != is_y_unsigned) { - LogCannotQuantizeOp(elementwise_add_op, - "ElementwiseAdd inputs must be of the same type."); + LogCannotQuantizeOp(elementwise_op, + "Elementwise inputs must be of the same type."); return; } - QuantizeInput(g, elementwise_add_op, elementwise_add_x, "X", input_x_scale, + QuantizeInput(g, elementwise_op, elementwise_x, "X", input_x_scale, is_x_unsigned, "Scale_x"); - QuantizeInput(g, elementwise_add_op, elementwise_add_y, "Y", input_y_scale, + QuantizeInput(g, elementwise_op, elementwise_y, "Y", input_y_scale, is_y_unsigned, "Scale_y"); bool is_output_unsigned{false}; auto output_scale = - GetScaleValueForNode(elementwise_add_out, &is_output_unsigned); + GetScaleValueForNode(elementwise_out, &is_output_unsigned); - DequantizeOutput(g, elementwise_add_op, elementwise_add_out, "Out", - output_scale, is_output_unsigned, "Scale_out"); + DequantizeOutput(g, elementwise_op, elementwise_out, "Out", output_scale, + is_output_unsigned, "Scale_out"); - ++quantize_elementwise_add_count; + 
++quantize_elementwise_count; }; gpd(graph, handler); - AddStatis(quantize_elementwise_add_count); + AddStatis(quantize_elementwise_count); - PrettyLogDetail("--- quantized %d elementwise_add ops", - quantize_elementwise_add_count); + PrettyLogDetail("--- quantized %d %s ops", quantize_elementwise_count, + elementwise_type); } void CPUQuantizePass::QuantizeFusionGru(Graph* graph) const { @@ -1146,7 +1146,8 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeFc(graph); QuantizeReshape(graph); QuantizeMatmul(graph); - QuantizeElementwiseAdd(graph); + QuantizeElementwise(graph, "elementwise_add"); + QuantizeElementwise(graph, "elementwise_mul"); QuantizeFusionGru(graph); QuantizeMultiGru(graph); QuantizeFusionLSTM(graph); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index 412c4e40a01d50b73f72076f3a0424081d633247..3a286264e41ffe1c329ba3971d777ce4fbc05b5e 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -57,7 +57,8 @@ class CPUQuantizePass : public FusePassBase { void QuantizeTranspose(Graph* graph) const; void QuantizeReshape(Graph* graph) const; void QuantizeMatmul(Graph* graph) const; - void QuantizeElementwiseAdd(Graph* graph) const; + void QuantizeElementwise(Graph* graph, + const std::string elementwise_type) const; void QuantizeFusionGru(Graph* graph) const; void QuantizeMultiGru(Graph* graph) const; void QuantizeFusionLSTM(Graph* graph) const; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 889417b78c8641060b8ad89219749d8400558c6a..22000865948d629a5933ad0319e41dab71433fff 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -90,7 +90,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetAttr("Scale_x", 1.0f); op->SetAttr("Scale_y", 1.0f); op->SetAttr("Scale_out", 1.0f); - } else if (type == "elementwise_add") { + } else if (type == "elementwise_add" || type == "elementwise_mul") { op->SetInput("X", {inputs[0]}); if (inputs.size() > 1) op->SetInput("Y", {inputs[1]}); op->SetOutput("Out", {outputs[0]}); @@ -167,7 +167,8 @@ void CheckScales(const OpDesc* op, float scale, float shift) { scale); scale_names.push_back("Scale_in"); scale_names.push_back("Scale_out"); - } else if (type == "matmul" || type == "elementwise_add") { + } else if (type == "matmul" || type == "elementwise_add" || + type == "elementwise_mul") { scale_names.push_back("Scale_x"); scale_names.push_back("Scale_y"); scale_names.push_back("Scale_out"); @@ -546,46 +547,77 @@ TEST(CpuQuantizePass, matmul_not_quantized) { expected_operators, added_nodes, 1.0f); } -static const std::initializer_list variable_names_elementwise_add = - {"a", "b", "c", "d", "e", "f"}; +static const std::initializer_list variable_names_elementwise = { + "a", "b", "c", "d", "e", "f"}; -ProgramDesc BuildProgramDescElementwiseAdd() { +ProgramDesc BuildProgramDescElementwise(const std::string elementwise_type, + const std::string elementwise_name) { ProgramDesc prog; - for (auto& v : variable_names_elementwise_add) { + for (auto& v : variable_names_elementwise) { prog.MutableBlock(0)->Var(v); } SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true); SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true); - SetOp(&prog, "elementwise_add", 
"ElementwiseAdd", {"b", "d"}, {"e"}, true, + SetOp(&prog, elementwise_type, elementwise_name, {"b", "d"}, {"e"}, true, "int8"); SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32"); return prog; } -TEST(CpuQuantizePass, elementwise_add) { +void TestElementwise(const std::string elementwise_type, + const std::string elementwise_name) { // 2 Quant + 2 IN + 1 DeQuant + 1 OUT int added_nodes = 6; std::unordered_map expected_operators = { - {"elementwise_add", 1}, {"quantize", 2}, {"dequantize", 3}}; - MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, - expected_operators, added_nodes, SCALE * S8_MAX); + {elementwise_type, 1}, {"quantize", 2}, {"dequantize", 3}}; + MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + variable_names_elementwise, expected_operators, added_nodes, + SCALE * S8_MAX); } -TEST(CpuQuantizePass, elementwise_add_output_scale_missing) { +void TestElementwiseOutputScaleMissing(const std::string elementwise_type, + const std::string elementwise_name) { int added_nodes = 0; std::unordered_map expected_operators = { - {"elementwise_add", 1}, {"quantize", 0}, {"dequantize", 2}}; - MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, - expected_operators, added_nodes, 1.f, 1.f, "e"); + {elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}}; + MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + variable_names_elementwise, expected_operators, added_nodes, 1.f, + 1.f, "e"); } -TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) { +void TestElementwiseUnsignedAndSignedInput(const std::string elementwise_type, + const std::string elementwise_name) { int added_nodes = 0; std::unordered_map expected_operators = { - {"elementwise_add", 1}, {"quantize", 0}, {"dequantize", 2}}; - MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, - expected_operators, added_nodes, 1.f, 1.f, "", "b"); + {elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}}; + MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + variable_names_elementwise, expected_operators, added_nodes, 1.f, + 1.f, "", "b"); +} + +TEST(CpuQuantizePass, elementwise_add) { + TestElementwise("elementwise_add", "ElementwiseAdd"); +} + +TEST(CpuQuantizePass, elementwise_add_output_scale_missing) { + TestElementwiseOutputScaleMissing("elementwise_add", "ElementwiseAdd"); +} + +TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) { + TestElementwiseUnsignedAndSignedInput("elementwise_add", "ElementwiseAdd"); +} + +TEST(CpuQuantizePass, elementwise_mul) { + TestElementwise("elementwise_mul", "ElementwiseMul"); +} + +TEST(CpuQuantizePass, elementwise_mul_output_scale_missing) { + TestElementwiseOutputScaleMissing("elementwise_mul", "ElementwiseMul"); +} + +TEST(CpuQuantizePass, elementwise_mul_unsigned_and_signed_input) { + TestElementwiseUnsignedAndSignedInput("elementwise_mul", "ElementwiseMul"); } const std::vector churn_out_vars(ProgramDesc* prog, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc index 5f74b61ee86aad10880f3a67d8250026a6e9ac18..3b883dac9782af8350b3e22d2954e21789a1a120 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc @@ -26,10 +26,10 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Marks operators which are to be 
quantized."; std::unordered_set supported_op_types = std::unordered_set( - {"concat", "conv2d", "depthwise_conv2d", "elementwise_add", "fc", - "matmul", "nearest_interp", "nearest_interp_v2", "pool2d", - "prior_box", "reshape2", "transpose2", "fusion_gru", "fusion_lstm", - "multi_gru", "slice"}); + {"concat", "conv2d", "depthwise_conv2d", "elementwise_add", + "elementwise_mul", "fc", "matmul", "nearest_interp", + "nearest_interp_v2", "pool2d", "prior_box", "reshape2", "transpose2", + "fusion_gru", "fusion_lstm", "multi_gru", "slice"}); const auto& excluded_ids_list = Get>("quantize_excluded_op_ids"); const auto& op_types_list = diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index eadb00b9e88e14075c46a53c711fd43774f26581..28e1145db42123b9dacfa9e359e08476d16ab4c0 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -31,7 +31,7 @@ USE_OP(slice); USE_OP(concat); USE_OP(matmul); USE_OP_ITSELF(elementwise_add); -USE_OP(sigmoid); +USE_OP_ITSELF(sigmoid); USE_OP_ITSELF(tanh); USE_OP(elementwise_mul); USE_OP(softmax_with_cross_entropy); @@ -47,7 +47,7 @@ USE_OP(square); USE_OP(transpose2_grad); USE_OP(concat_grad); USE_OP_ITSELF(elementwise_mul_grad); -USE_OP(sigmoid_grad); +USE_OP_ITSELF(sigmoid_grad); USE_OP_ITSELF(tanh_grad); USE_OP(sum); USE_OP(slice_grad); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index ad01adf1a25b9d65c6ca85bc7e7a40d4b1fd0198..ec28c98d5986d96109332db488fd48fc20834bfb 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2103,16 +2103,25 @@ void OperatorWithKernel::BuildPhiKernelContext( auto* var = ins_vector[offset]; if (var->IsType()) { tensor_in = &(var->Get()); + pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); } else if (var->IsType()) { tensor_in = &(var->Get()); + pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); + } else if (var->IsType()) { + paddle::SmallVector tensor_vector; + auto& tensor_array = var->Get(); + for (auto& t : tensor_array) { + tensor_vector.emplace_back(&t); + } + pt_kernel_context->EmplaceBackInputsWithoutSetRange(tensor_vector); + end_idx += tensor_array.size() - 1; } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported input `%s` type when call pt kernel.", framework::ToTypeName(var->Type()))); } - - pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); } + // Note: here cannot deal with vector input pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i); } VLOG(4) << "Done inputs"; @@ -2140,22 +2149,33 @@ void OperatorWithKernel::BuildPhiKernelContext( for (size_t offset = 0; offset < outs_vector.size(); ++offset) { phi::TensorBase* tensor_out = nullptr; auto* var = outs_vector[offset]; - if (var) { if (var->template IsType()) { tensor_out = var->template GetMutable(); + pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } else if (var->template IsType()) { tensor_out = var->template GetMutable(); + pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); + } else if (var->template IsType()) { + paddle::SmallVector tensor_vector; + auto* tensor_array = + var->template GetMutable(); + // Note: If the input LoDTensorArray size is 0, the output + // LoDTensorArray is also 0 + for (auto& t : *tensor_array) { + tensor_vector.emplace_back(&t); + } + 
pt_kernel_context->EmplaceBackOutputsWithoutSetRange(tensor_vector); + end_idx += tensor_array->size() - 1; } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported output `%s` type when call pt kernel.", framework::ToTypeName(var->Type()))); } + } else { + pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } - - pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } - pt_kernel_context->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } VLOG(4) << "Done outputs"; diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 1a1171f1dba4d794796ef1421fe386f60a0e587d..6f68c261d2b24dd66a70734d29d448e8927631e9 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -483,6 +483,10 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext { return ctx_.InputVar(name)->IsType(); } + bool IsDenseTensorVectorInput(const std::string& name) const override { + return ctx_.InputVar(name)->IsType(); + } + bool IsDenseTensorOutput(const std::string& name) const override { return ctx_.OutputVar(name)->IsType(); } diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 12aa13bbacc3bae5d690323f45817f95762c376c..499cf4d8ad6d82dd554fa4f5bbcf39833fed0eab 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -423,7 +423,7 @@ void TensorAdd(const VarType& src, VarType* dst) { } if (data_type == framework::proto::VarType::BF16) { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) return TensorAddImpl( src_tensor, dst_tensor, place); #else diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 16f2df79246f782ead9cc3177679674d98c3d1a9..f70f44878e361bf72c35ae5ae346c47869198eb5 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -289,14 +289,23 @@ void BuildDygraphPhiKernelContext( auto& var = ins_vector[offset]->Var(); if (var.template IsType()) { tensor_in = &(var.template Get()); + kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); } else if (var.template IsType()) { tensor_in = &(var.template Get()); + kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); + } else if (var.template IsType()) { + paddle::SmallVector tensor_vector; + auto& tensor_array = var.template Get(); + for (auto& t : tensor_array) { + tensor_vector.emplace_back(&t); + } + kernel_ctx->EmplaceBackInputsWithoutSetRange(tensor_vector); + end_idx += tensor_array.size() - 1; } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported input `%s` type when call pt kernel.", framework::ToTypeName(var.Type()))); } - kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); } kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); } @@ -326,16 +335,27 @@ void BuildDygraphPhiKernelContext( if (var) { if (var->template IsType()) { tensor_out = var->template GetMutable(); + kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } else if (var->template IsType()) { tensor_out = var->template GetMutable(); + kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); + } else if (var->template IsType()) { + paddle::SmallVector tensor_vector; + auto* tensor_array = + var->template GetMutable(); + for (auto& t : *tensor_array) { + tensor_vector.emplace_back(&t); + } + 
kernel_ctx->EmplaceBackOutputsWithoutSetRange(tensor_vector); + end_idx += tensor_array->size() - 1; } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported output `%s` type when call pt kernel.", framework::ToTypeName(var->Type()))); } + } else { + kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } - - kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index a5c32164bf1a28687ea6f8cc53427db67560c307..74e8ca3f229c6b7093e29cb53c0ce15e0b15d6a9 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -188,6 +188,9 @@ struct Argument { DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(use_fc_padding, UseFcPadding, bool); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); + DECL_ARGUMENT_FIELD(use_gpu_fp16, UseGPUFp16, bool); + DECL_ARGUMENT_FIELD(gpu_fp16_disabled_op_types, GpuFp16DisabledOpTypes, + std::unordered_set); // Usually use for trt dynamic shape. // TRT will select the best kernel according to opt shape diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 796c86a3ad1efe45dd8a00139b92c2642676a811..287c896e49bf254d70a5c79c818a39f913472f2f 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -189,6 +189,10 @@ void IRPassManager::CreatePasses(Argument *argument, new int(argument->dlnne_min_subgraph_size())); pass->Set("program", new framework::ProgramDesc *(&argument->main_program())); + } else if (pass_name == "mixed_precision_configure_pass") { + pass->Set("gpu_fp16_disabled_op_types", + new std::unordered_set( + argument->gpu_fp16_disabled_op_types())); } if (pass_name == "lite_subgraph_pass") { bool lite_enable_int8 = diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index daa18d8c78bf875ebcc6571bf955a7f634948e4f..614eea24a0e2ee9d4fabd68a9374fa7c44b63ad7 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" #include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/enforce.h" @@ -65,6 +66,26 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) { #else +void IrParamsSyncAmongDevicesPass::GetVarNameToOpTypeMap( + const framework::ir::Graph &graph, + std::unordered_map *var_name_op_type_map) { + std::vector node_list = + framework::ir::TopologyVarientSort( + graph, static_cast(0)); + for (auto *op_node : node_list) { + if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || + op_node->Op()->Type() == "fetch") + continue; + + for (auto *pre_node : op_node->inputs) { + if (pre_node->IsVar() && pre_node->Var()->Persistable()) { + var_name_op_type_map->insert(std::pair( + pre_node->Var()->Name(), op_node->Op()->Type())); + } + } + } +} + void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { // The parameters are on the cpu, therefore, 
synchronization is not necessary.
   if (!argument->use_gpu()) return;
@@ -102,6 +123,16 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
   if (with_dynamic_shape) {
     reserve_cpu_weights = true;
   }
+
+  bool mixed_precision_mode =
+      argument->Has("use_gpu_fp16") && argument->use_gpu_fp16();
+  std::unordered_map<std::string, std::string> var_name_op_type_map{};
+  std::unordered_set<std::string> blacklist{};
+  if (mixed_precision_mode) {
+    GetVarNameToOpTypeMap(graph, &var_name_op_type_map);
+    blacklist = argument->gpu_fp16_disabled_op_types();
+  }
+
   for (auto &var_name : all_vars) {
     if (std::count(repetitive_params.begin(), repetitive_params.end(),
                    var_name)) {
@@ -117,18 +148,29 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
         var->IsType<phi::SelectedRows>()) {
       auto *t = var->GetMutable<framework::LoDTensor>();
 
-      platform::CPUPlace cpu_place;
-      framework::LoDTensor temp_tensor;
-      temp_tensor.Resize(t->dims());
-      temp_tensor.mutable_data<float>(cpu_place);
-
-      // Copy the parameter data to a tmp tensor.
-      paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor);
-      // Reallocation the space on GPU
-      t->clear();
-
-      // Copy parameter data to newly allocated GPU space.
-      paddle::framework::TensorCopySync(temp_tensor, place, t);
+      bool is_float = t->dtype() == paddle::experimental::DataType::FLOAT32 ||
+                      t->dtype() == paddle::experimental::DataType::FLOAT64;
+      if (mixed_precision_mode &&
+          !blacklist.count(var_name_op_type_map[var_name]) && is_float) {
+        framework::Tensor half_tensor;
+        half_tensor.set_type(paddle::experimental::DataType::FLOAT16);
+        half_tensor.Resize(t->dims());
+        auto *half_data =
+            half_tensor.mutable_data<platform::float16>(platform::CPUPlace());
+        for (int i = 0; i < t->numel(); i++) {
+          auto *data = t->mutable_data<float>(platform::CPUPlace());
+          half_data[i] = static_cast<platform::float16>(data[i]);
+        }
+        t->clear();
+        paddle::framework::TensorCopySync(half_tensor, place, t);
+      } else {
+        platform::CPUPlace cpu_place;
+        framework::LoDTensor temp_tensor;
+        temp_tensor.Resize(t->dims());
+        paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor);
+        t->clear();
+        paddle::framework::TensorCopySync(temp_tensor, place, t);
+      }
     }
   }
 }
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
index d5e98ec886e65f829a1496b1431f23aad6c4bc4c..f8209f051d53444435ed8c65b400f08bf8627553 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
@@ -38,7 +38,12 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass {
 #ifdef PADDLE_WITH_ASCEND_CL
   void CopyParamsToNpu(Argument *argument);
 #else
-  void CopyParamsToGpu(Argument *argument);
+
+  void GetVarNameToOpTypeMap(
+      const framework::ir::Graph& graph,
+      std::unordered_map<std::string, std::string>* var_name_op_type_map);
+
+  void CopyParamsToGpu(Argument* argument);
 #endif
 };
 
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 41c01d3b7e261314d8dc6b852f5b2a597421fe48..d08d28a3f623389790e63d45e13584a8d0db6adc 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -83,6 +83,7 @@ void AnalysisConfig::SetModel(const std::string &prog_file_path,
 
   Update();
 }
+
 void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
                                   int device_id) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
@@ -97,12 +98,26 @@ void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
   Update();
 }
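For reference, the experimental fp16 switch added to this file (Exp_EnableUseGpuFp16, defined a few lines below) is driven from user code roughly as follows; a minimal sketch mirroring the tester in analysis_predictor_tester.cc, with a hypothetical model path and an illustrative blacklist entry:

  paddle_infer::Config config;
  config.SetModel("./mobilenet_v1");   // hypothetical model directory
  config.EnableUseGpu(100, 0);         // 100 MB initial pool, device 0
  config.SwitchIrOptim();              // required: the rewrite is an IR pass
  // Everything runs in fp16 except op types named in the blacklist, which
  // mixed_precision_configure_pass keeps in fp32 via inserted cast ops.
  config.Exp_EnableUseGpuFp16({"softmax"});
  auto predictor = paddle_infer::CreatePredictor(config);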
+ void AnalysisConfig::DisableGpu() { use_gpu_ = false; Update(); } +void AnalysisConfig::Exp_EnableUseGpuFp16( + std::unordered_set op_list) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + use_gpu_fp16_ = true; + gpu_fp16_disabled_op_types_.insert(op_list.begin(), op_list.end()); +#else + LOG(ERROR) << "Please compile with gpu to Exp_EnableUseGpuFp16()"; + use_gpu_fp16_ = false; +#endif + + Update(); +} + void AnalysisConfig::DisableFCPadding() { use_fc_padding_ = false; @@ -213,6 +228,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(use_cudnn_); CP_MEMBER(gpu_device_id_); CP_MEMBER(memory_pool_init_size_mb_); + CP_MEMBER(use_gpu_fp16_); + CP_MEMBER(gpu_fp16_disabled_op_types_); CP_MEMBER(enable_memory_optim_); // TensorRT related. @@ -573,6 +590,20 @@ void AnalysisConfig::Update() { #endif } + if (use_gpu_fp16_) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (!enable_ir_optim_) { + LOG(ERROR) << "Exp_EnableUseGpuFp16() only works when IR optimization is " + "enabled."; + } else if (!use_gpu()) { + LOG(ERROR) + << "Exp_EnableUseGpuFp16() only works when use_gpu is enabled."; + } else { + pass_builder()->Exp_EnableUseGpuFp16(); + } +#endif + } + if (use_mkldnn_) { #ifdef PADDLE_WITH_MKLDNN if (!enable_ir_optim_) { @@ -669,6 +700,8 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << params_file_; ss << use_gpu_; + ss << use_gpu_fp16_; + for (auto &item : gpu_fp16_disabled_op_types_) ss << item; ss << use_fc_padding_; ss << gpu_device_id_; ss << xpu_device_id_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 871ed596a3ee9d6362b03e99ca10313765826a51..a7caa3e369f80a954f36226c070ff1f7bd822a2b 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -50,8 +50,7 @@ #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/utils/string/split.h" -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" @@ -374,8 +373,7 @@ static void DisablePrepareDataOpt( } bool AnalysisPredictor::PrepareExecutor() { -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { VLOG(3) << "use_dist_model is enabled, will init FleetExecutor."; return PrepareFleetExecutor(); @@ -393,8 +391,7 @@ bool AnalysisPredictor::PrepareExecutor() { return true; } -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) bool AnalysisPredictor::PrepareFleetExecutor() { VLOG(3) << "AnalysisPredictor::PrepareFleetExecutor()"; if (config_.dist_config().nranks() > 1 && !CommInit()) { @@ -872,6 +869,11 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_); } + if (config_.gpu_fp16_enabled()) { + argument_.SetUseGPUFp16(true); + argument_.SetGpuFp16DisabledOpTypes(config_.gpu_fp16_disabled_op_types_); + } + if (config_.lite_engine_enabled()) { argument_.SetCpuMathLibraryNumThreads( 
config_.cpu_math_library_num_threads()); @@ -1189,8 +1191,7 @@ std::vector AnalysisPredictor::GetOutputNames() { std::unique_ptr AnalysisPredictor::GetInputTensor( const std::string &name) { framework::Scope *scope; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { scope = scope_.get(); } else { @@ -1239,8 +1240,7 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( std::unique_ptr AnalysisPredictor::GetOutputTensor( const std::string &name) { framework::Scope *scope; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { scope = scope_.get(); } else { @@ -1287,8 +1287,7 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( } bool AnalysisPredictor::ZeroCopyRun() { -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { VLOG(3) << "ZeroCopyRun will use the fleet executor."; inference::Timer timer; diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 21a7e9658bbeeb16d4cbff6364aaef68edcae16d..d9992f3fbef9d6ed626410ae5b9fc881b0772aa8 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -18,8 +18,7 @@ #include #include #include -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #endif #include "paddle/fluid/framework/naive_executor.h" @@ -395,8 +394,7 @@ class AnalysisPredictor : public PaddlePredictor { void StatisticShapeRangeInfo(); void CollectShapeRangeInfo(); -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) // fleet exe related /// @@ -488,8 +486,7 @@ class AnalysisPredictor : public PaddlePredictor { std::map>> shape_info_; static int clone_num_; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) // fleet executor related distributed::FleetExecutorDesc executor_desc_; std::shared_ptr fleet_exe_; diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 2c6e8f4f1a4d9ea0dfba8f400c7d3782a5e2c32d..ecb5eaf982548c44eb97fde7e2b7365c9b0e9fc2 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -375,6 +375,19 @@ TEST(AnalysisPredictor, enable_onnxruntime) { ASSERT_TRUE(!config.use_onnxruntime()); } +TEST(AnalysisPredictor, exp_enable_use_gpu_fp16) { + AnalysisConfig config; + config.SwitchIrOptim(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + config.EnableUseGpu(100, 0); + config.Exp_EnableUseGpuFp16(); + ASSERT_TRUE(config.gpu_fp16_enabled()); +#else + config.DisableGpu(); +#endif + LOG(INFO) << config.Summary(); +} + } // namespace paddle namespace paddle_infer { @@ 
-434,6 +447,19 @@ TEST(Predictor, EnableONNXRuntime) { auto predictor = CreatePredictor(config); } +TEST(Predictor, Exp_EnableUseGpuFp16) { + Config config; + config.SetModel(FLAGS_dirname); + config.SwitchIrOptim(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + config.EnableUseGpu(100, 0); + config.Exp_EnableUseGpuFp16(); +#else + config.DisableGpu(); +#endif + auto predictor = CreatePredictor(config); +} + TEST(Tensor, CpuShareExternalData) { Config config; config.SetModel(FLAGS_dirname); diff --git a/paddle/fluid/inference/api/details/CMakeLists.txt b/paddle/fluid/inference/api/details/CMakeLists.txt index 4341fb0a9ccd8822151d4660f5a0c22901e47122..b2cfb060dd32559f6157fc456c7399736fc9fe51 100644 --- a/paddle/fluid/inference/api/details/CMakeLists.txt +++ b/paddle/fluid/inference/api/details/CMakeLists.txt @@ -14,7 +14,11 @@ # cc_library(reset_tensor_array SRCS reset_tensor_array.cc DEPS lod_tensor scope) -cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce) +if (WITH_ONNXRUNTIME) + cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce onnxruntime) +else (WITH_ONNXRUNTIME) + cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce) +endif (WITH_ONNXRUNTIME) cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc) cc_test(zero_copy_tensor_test SRCS zero_copy_tensor_test.cc DEPS paddle_inference_api) diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 18b1d09f0e8a7c4be9862991060a4706ee7cde7e..66dec0157d98e776b38ec8af81a0c006bc732bf4 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -22,12 +22,22 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" #include "paddle/phi/core/allocator.h" +#ifdef PADDLE_WITH_ONNXRUNTIME +#include "paddle/fluid/inference/api/onnxruntime_predictor.h" +#endif namespace paddle_infer { using float16 = paddle::platform::float16; void Tensor::Reshape(const std::vector &shape) { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + shape_.assign(shape.begin(), shape.end()); + return; + } +#endif + PADDLE_ENFORCE_EQ( name_.empty(), false, paddle::platform::errors::PreconditionNotMet( @@ -123,6 +133,11 @@ T *Tensor::data(PlaceType *place, int *size) const { } DataType Tensor::type() const { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + return dtype_; + } +#endif EAGER_GET_TENSOR(paddle::framework::LoDTensor); auto type = paddle::framework::TransToProtoVarType(tensor->dtype()); if (type == paddle::framework::proto::VarType::FP32) { @@ -145,6 +160,13 @@ PlaceType Tensor::place() const { return place_; } template void Tensor::CopyFromCpu(const T *data) { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + ORTCopyFromCpu(data); + return; + } +#endif + EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_GE(tensor->numel(), 0, paddle::platform::errors::PreconditionNotMet( @@ -382,6 +404,13 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, template void Tensor::CopyToCpu(T *data) const { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + ORTCopyToCpu(data); + return; + } +#endif + CopyToCpuImpl(data, nullptr, nullptr, nullptr); } @@ -489,12 +518,7 @@ template PD_INFER_DECL uint8_t *Tensor::mutable_data(PlaceType place); template PD_INFER_DECL int8_t *Tensor::mutable_data(PlaceType place); template PD_INFER_DECL 
float16 *Tensor::mutable_data(PlaceType place); -Tensor::Tensor(void *scope) : scope_{scope} { - PADDLE_ENFORCE_NOT_NULL(scope_, - paddle::platform::errors::PreconditionNotMet( - "The `scope` can not be nullptr. It should be " - "set to the pointer of scope.")); -} +Tensor::Tensor(void *scope) : scope_{scope} {} template void *Tensor::FindTensor() const { @@ -513,6 +537,26 @@ void *Tensor::FindTensor() const { } std::vector Tensor::shape() const { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + std::vector shape; + // input handle + if (idx_ < 0) { + shape.assign(shape_.begin(), shape_.end()); + } else { // output handle + auto binding = binding_.lock(); + PADDLE_ENFORCE_NOT_NULL(binding, + paddle::platform::errors::PreconditionNotMet( + "output tensor [%s] no binding ptr", name_)); + std::vector outputs = binding->GetOutputValues(); + Ort::Value &value = outputs[idx_]; + auto info = value.GetTensorTypeAndShapeInfo(); + auto ort_shape = info.GetShape(); + shape.assign(ort_shape.begin(), ort_shape.end()); + } + return shape; + } +#endif EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_NOT_NULL( tensor_, paddle::platform::errors::PreconditionNotMet( @@ -573,4 +617,99 @@ void Tensor::SetPlace(PlaceType place, int device) { device_ = device; } +#ifdef PADDLE_WITH_ONNXRUNTIME +void Tensor::SetOrtMark(bool is_ort_tensor) { is_ort_tensor_ = is_ort_tensor; } + +void Tensor::SetOrtBinding(const std::shared_ptr binding) { + binding_ = binding; +} + +Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, float *data, + size_t size, const int64_t *shape, size_t shape_len) { + return Ort::Value::CreateTensor(memory_info, data, size, shape, + shape_len); +} + +Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int64_t *data, + size_t size, const int64_t *shape, size_t shape_len) { + return Ort::Value::CreateTensor(memory_info, data, size, shape, + shape_len); +} + +Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int32_t *data, + size_t size, const int64_t *shape, size_t shape_len) { + return Ort::Value::CreateTensor(memory_info, data, size, shape, + shape_len); +} + +Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, uint8_t *data, + size_t size, const int64_t *shape, size_t shape_len) { + return Ort::Value::CreateTensor(memory_info, data, size, shape, + shape_len); +} + +Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int8_t *data, + size_t size, const int64_t *shape, size_t shape_len) { + return Ort::Value::CreateTensor(memory_info, data, size, shape, + shape_len); +} + +Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, float16 *data, + size_t size, const int64_t *shape, size_t shape_len) { + return Ort::Value::CreateTensor(memory_info, static_cast(data), + size * sizeof(float16), shape, shape_len, + ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16); +} + +template +void Tensor::ORTCopyFromCpu(const T *data) { + auto binding = binding_.lock(); + PADDLE_ENFORCE_NOT_NULL(binding, + paddle::platform::errors::PreconditionNotMet( + "input tensor [%s] no binding ptr", name_)); + const char *device_name = place_ == PlaceType::kCPU ? 
"Cpu" : "Cuda"; + Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, device_, + OrtMemTypeDefault); + size_t size = std::accumulate(begin(shape_), end(shape_), 1UL, + std::multiplies()); + auto ort_value = GetOrtVaule(memory_info, const_cast(data), size, + shape_.data(), shape_.size()); + binding->BindInput(name_.c_str(), ort_value); +} + +template +void Tensor::ORTCopyToCpu(T *data) const { + auto binding = binding_.lock(); + PADDLE_ENFORCE_NOT_NULL(binding, + paddle::platform::errors::PreconditionNotMet( + "output tensor [%s] no binding ptr", name_)); + std::vector outputs = binding->GetOutputValues(); + Ort::Value &value = outputs[idx_]; + auto info = value.GetTensorTypeAndShapeInfo(); + size_t size = info.GetElementCount() * sizeof(T); + + if (place_ == PlaceType::kCPU) { + std::memcpy(static_cast(data), value.GetTensorData(), size); + } else { + paddle::memory::Copy(paddle::platform::CPUPlace(), + static_cast(data), + paddle::platform::CUDAPlace(device_), + value.GetTensorData(), size, nullptr); + } +} + +template void Tensor::ORTCopyFromCpu(const float *data); +template void Tensor::ORTCopyFromCpu(const int64_t *data); +template void Tensor::ORTCopyFromCpu(const int32_t *data); +template void Tensor::ORTCopyFromCpu(const uint8_t *data); +template void Tensor::ORTCopyFromCpu(const int8_t *data); +template void Tensor::ORTCopyFromCpu(const float16 *data); + +template void Tensor::ORTCopyToCpu(float *data) const; +template void Tensor::ORTCopyToCpu(int32_t *data) const; +template void Tensor::ORTCopyToCpu(uint8_t *data) const; +template void Tensor::ORTCopyToCpu(int8_t *data) const; +template void Tensor::ORTCopyToCpu(float16 *data) const; +#endif + } // namespace paddle_infer diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.cc b/paddle/fluid/inference/api/onnxruntime_predictor.cc index ee82da139d8f39c26002763c4a4835050c48fc99..bd9de252a0962bc27a23b949b428d8f18f96190f 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.cc +++ b/paddle/fluid/inference/api/onnxruntime_predictor.cc @@ -25,11 +25,7 @@ #include #include "paddle/fluid//platform/device/gpu/gpu_types.h" -#include "paddle/fluid/framework/feed_fetch_method.h" -#include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/var_type_traits.h" -#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/framework/version.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" @@ -45,24 +41,23 @@ namespace paddle { -framework::proto::VarType::Type ConvertONNXType( - ONNXTensorElementDataType type) { +paddle_infer::DataType ConvertONNXType(ONNXTensorElementDataType type) { switch (type) { case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: - return framework::proto::VarType::FP32; - // case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: - // return DataType::FP16; + return paddle_infer::DataType::FLOAT32; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: + return paddle_infer::DataType::FLOAT16; case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: - return framework::proto::VarType::INT8; + return paddle_infer::DataType::INT8; case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: - return framework::proto::VarType::INT32; + return paddle_infer::DataType::INT32; case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: - return framework::proto::VarType::INT64; + return paddle_infer::DataType::INT64; case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: - return framework::proto::VarType::UINT8; + return 
paddle_infer::DataType::UINT8; default: LOG(ERROR) << "unsupported ONNX Tensor Type: " << static_cast(type); - return framework::proto::VarType::FP32; + return paddle_infer::DataType::FLOAT32; } } @@ -87,13 +82,12 @@ bool ONNXRuntimePredictor::Init() { VLOG(3) << "ONNXRuntime Predictor::init()"; // Now ONNXRuntime only suuport CPU + const char *device_name = config_.use_gpu() ? "Cuda" : "Cpu"; if (config_.use_gpu()) { place_ = paddle::platform::CUDAPlace(config_.gpu_device_id()); } else { place_ = paddle::platform::CPUPlace(); } - scope_.reset(new paddle::framework::Scope()); - sub_scope_ = &scope_->NewScope(); std::string onnx_proto; paddle2onnx::Export(config_.prog_file(), config_.params_file(), &onnx_proto, @@ -125,13 +119,12 @@ bool ONNXRuntimePredictor::Init() { "generated."; } session_ = {env_, onnx_proto.data(), onnx_proto.size(), session_options}; + binding_ = std::make_shared(session_); - auto memory_info = - Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); + Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, + place_.GetDeviceId(), OrtMemTypeDefault); Ort::Allocator allocator(session_, memory_info); - framework::proto::VarType::Type proto_type = - framework::proto::VarType::LOD_TENSOR; size_t n_inputs = session_.GetInputCount(); for (size_t i = 0; i < n_inputs; ++i) { auto input_name = session_.GetInputName(i, allocator); @@ -141,8 +134,6 @@ bool ONNXRuntimePredictor::Init() { ONNXTensorElementDataType data_type = type_info.GetTensorTypeAndShapeInfo().GetElementType(); input_desc_.emplace_back(ONNXDesc{input_name, shape, data_type}); - auto *ptr = scope_->Var(input_name); - framework::InitializeVariable(ptr, proto_type); allocator.Free(input_name); } @@ -155,11 +146,13 @@ bool ONNXRuntimePredictor::Init() { ONNXTensorElementDataType data_type = type_info.GetTensorTypeAndShapeInfo().GetElementType(); output_desc_.emplace_back(ONNXDesc{output_name, shape, data_type}); - auto *ptr = scope_->Var(output_name); - framework::InitializeVariable(ptr, proto_type); + + Ort::MemoryInfo out_memory_info(device_name, OrtDeviceAllocator, + place_.GetDeviceId(), OrtMemTypeDefault); + binding_->BindOutput(output_name, out_memory_info); + allocator.Free(output_name); } - return true; } @@ -216,15 +209,26 @@ std::vector ONNXRuntimePredictor::GetOutputNames() { return output_names; } +bool ONNXRuntimePredictor::FindONNXDesc(const std::string &name, + bool is_input) { + if (is_input) { + for (auto i : input_desc_) + if (i.name == name) return true; + } else { + for (auto i : output_desc_) + if (i.name == name) return true; + } + return false; +} + std::unique_ptr ONNXRuntimePredictor::GetInputTensor( const std::string &name) { - PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name), - platform::errors::PreconditionNotMet( - "The in variable named %s is not found in the " - "scope of the ONNXPredictor.", - name)); - std::unique_ptr res( - new ZeroCopyTensor(static_cast(scope_.get()))); + PADDLE_ENFORCE_EQ(FindONNXDesc(name, true), true, + platform::errors::PreconditionNotMet( + "The in variable named %s is not found in the " + "ONNXPredictor.", + name)); + std::unique_ptr res(new ZeroCopyTensor(nullptr)); res->input_or_output_ = true; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -233,18 +237,19 @@ std::unique_ptr ONNXRuntimePredictor::GetInputTensor( auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); } + res->SetOrtMark(true); + res->SetOrtBinding(binding_); return res; } std::unique_ptr ONNXRuntimePredictor::GetOutputTensor( const 
 std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetOutputTensor(
     const std::string &name) {
-  PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name),
-                          platform::errors::PreconditionNotMet(
-                              "The out variable named %s is not found in the "
-                              "scope of the ONNXPredictor.",
-                              name));
-  std::unique_ptr<ZeroCopyTensor> res(
-      new ZeroCopyTensor(static_cast<void *>(scope_.get())));
+  PADDLE_ENFORCE_EQ(FindONNXDesc(name, false), true,
+                    platform::errors::PreconditionNotMet(
+                        "The out variable named %s is not found in the "
+                        "ONNXPredictor.",
+                        name));
+  std::unique_ptr<ZeroCopyTensor> res(new ZeroCopyTensor(nullptr));
   res->input_or_output_ = false;
   res->SetName(name);
   if (platform::is_cpu_place(place_)) {
@@ -253,46 +258,18 @@ std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetOutputTensor(
     auto gpu_place = place_;
     res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
   }
+  res->SetOrtMark(true);
+  res->SetOrtBinding(binding_);
+  int size = output_desc_.size();
+  for (int i = 0; i < size; ++i)
+    if (output_desc_[i].name == name) {
+      res->idx_ = i;
+      res->dtype_ = ConvertONNXType(output_desc_[i].dtype);
+      break;
+    }
   return res;
 }
 
-Ort::Value ONNXRuntimePredictor::GetOrtValue(const ONNXDesc &desc,
-                                             const char *device_name) {
-  Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator,
-                              place_.GetDeviceId(), OrtMemTypeDefault);
-  auto *var = scope_->FindVar(desc.name);
-  auto *tensor = var->GetMutable<framework::LoDTensor>();
-  size_t size =
-      tensor->numel() *
-      framework::SizeOfType(framework::TransToProtoVarType(tensor->dtype()));
-  std::vector<int64_t> shape = phi::vectorize(tensor->dims());
-  return Ort::Value::CreateTensor(memory_info,
-                                  static_cast<void *>(tensor->data()), size,
-                                  shape.data(), shape.size(), desc.dtype);
-}
-
-void ONNXRuntimePredictor::AsTensor(const Ort::Value &value,
-                                    const ONNXDesc &desc) {
-  auto info = value.GetTensorTypeAndShapeInfo();
-
-  auto *var = scope_->FindVar(desc.name);
-  auto *tensor = var->GetMutable<framework::LoDTensor>();
-  tensor->Resize(phi::make_ddim(info.GetShape()));
-  auto dtype = ConvertONNXType(info.GetElementType());
-  auto *ptr = tensor->mutable_data(place_, dtype);
-
-  if (platform::is_cpu_place(place_)) {
-    std::memcpy(ptr, const_cast<void *>(value.GetTensorData<void>()),
-                tensor->numel() * framework::SizeOfType(dtype));
-  } else {
-    auto src_place = place_;
-    auto dst_place = place_;
-    memory::Copy(dst_place, ptr, src_place,
-                 const_cast<void *>(value.GetTensorData<void>()),
-                 tensor->numel() * framework::SizeOfType(dtype));
-  }
-}
-
 bool ONNXRuntimePredictor::Run(const std::vector<PaddleTensor> &inputs,
                                std::vector<PaddleTensor> *output_data,
                                int batch_size) {
@@ -302,31 +279,7 @@ bool ONNXRuntimePredictor::Run(const std::vector<PaddleTensor> &inputs,
 
 bool ONNXRuntimePredictor::ZeroCopyRun() {
   try {
-    Ort::IoBinding binding(session_);
-    std::vector<Ort::Value> inputs;
-    std::vector<Ort::Value> outputs;
-    Ort::RunOptions options;
-
-    inputs.reserve(input_desc_.size());
-    const char *device_name = config_.use_gpu() ?
"Cuda" : "Cpu"; - for (auto desc : input_desc_) { - inputs.push_back(GetOrtValue(desc, device_name)); - binding.BindInput(desc.name.c_str(), inputs.back()); - } - - // TODO(heliqi): Optimization —— move to Init() - for (auto desc : output_desc_) { - Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, - place_.GetDeviceId(), OrtMemTypeDefault); - binding.BindOutput(desc.name.c_str(), memory_info); - } - - session_.Run({}, binding); - - outputs = binding.GetOutputValues(); - for (size_t i = 0; i < output_desc_.size(); ++i) { - AsTensor(outputs[i], output_desc_[i]); - } + session_.Run({}, *(binding_.get())); } catch (const std::exception &e) { LOG(ERROR) << e.what(); return false; @@ -345,9 +298,9 @@ uint64_t ONNXRuntimePredictor::TryShrinkMemory() { } ONNXRuntimePredictor::~ONNXRuntimePredictor() { - if (sub_scope_) { - scope_->DeleteScope(sub_scope_); - } + binding_->ClearBoundInputs(); + binding_->ClearBoundOutputs(); + memory::Release(place_); } diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.h b/paddle/fluid/inference/api/onnxruntime_predictor.h index 7fb07aa97bd2746773192456ddeba941a24e8906..d01756e4b96b132e3f9c3815e96f612433616ff2 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.h +++ b/paddle/fluid/inference/api/onnxruntime_predictor.h @@ -94,9 +94,8 @@ class ONNXRuntimePredictor : public PaddlePredictor { /// \param[in] AnalysisConfig config /// explicit ONNXRuntimePredictor(const AnalysisConfig &config) - : config_(config) { + : config_(config), env_(ORT_LOGGING_LEVEL_WARNING, "onnx") { predictor_id_ = inference::GetUniqueId(); - env_ = Ort::Env(ORT_LOGGING_LEVEL_INFO, "onnx"); } /// /// \brief Destroy the ONNXRuntime Predictor object @@ -177,30 +176,17 @@ class ONNXRuntimePredictor : public PaddlePredictor { /// std::unique_ptr Clone() override; - std::shared_ptr scope_; - private: /// - /// \brief get the Ort Value(input Tensor). - /// - /// \param[in] desc ONNXDesce(name、shape、dtype) - /// - /// \param[in] device_name "cpu" or "gpu" of device - /// - /// \return get a Ort::Value - /// - Ort::Value GetOrtValue(const ONNXDesc &desc, const char *device_name); - - /// - /// \brief Ort::Value to Paddle::ZeroCopyTensor. + /// \brief Whether to find in/out by name. /// - /// \param[in] value Ort::Value(output Tensor) + /// \param[in] name input or output name /// - /// \param[in] desc a ONNXDesce(name、shape、dtype) + /// \param[in] is_input input(true) or output(false) /// - /// \return get a Ort::Value + /// \return Whether to find by name /// - void AsTensor(const Ort::Value &value, const ONNXDesc &desc); + bool FindONNXDesc(const std::string &name, bool is_input); private: AnalysisConfig config_; @@ -208,9 +194,9 @@ class ONNXRuntimePredictor : public PaddlePredictor { // ONNXRuntime Ort::Env env_; Ort::Session session_{nullptr}; + std::shared_ptr binding_; platform::Place place_; - framework::Scope *sub_scope_{nullptr}; std::vector input_desc_; std::vector output_desc_; int predictor_id_; diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 7b765e3fa8a24ef1b81b68da8ba12dd8e5577572..bdfe0e46e9ca4519c294a181cda6b8c4b87a6b9b 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -253,6 +253,19 @@ struct PD_INFER_DECL AnalysisConfig { /// /// void DisableGpu(); + /// + /// \brief Enable GPU fp16 precision computation, in experimental state. + /// + /// \param op_list The operator type list. 
+  ///
+  void Exp_EnableUseGpuFp16(std::unordered_set<std::string> op_list = {});
+  ///
+  /// \brief A boolean state telling whether the GPU fp16 precision is turned
+  /// on.
+  ///
+  /// \return bool Whether the GPU fp16 precision is turned on.
+  ///
+  bool gpu_fp16_enabled() const { return use_gpu_fp16_; }
 
   ///
   /// \brief Turn on XPU.
@@ -859,6 +872,9 @@ struct PD_INFER_DECL AnalysisConfig {
   int gpu_device_id_{0};
   uint64_t memory_pool_init_size_mb_{100};  // initial size is 100MB.
   bool thread_local_stream_{false};
+  bool use_gpu_fp16_{false};
+  std::unordered_set<std::string> gpu_fp16_disabled_op_types_{
+      "conv2d_fusion", "conv2d", "roll", "strided_slice"};
 
   bool use_cudnn_{false};
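From the user's side, the experimental fp16 switch is driven through the same AnalysisConfig. The sketch below is a guess at the intended usage: the model paths are hypothetical, and the extra "softmax" entry assumes op_list names ops to keep in fp32 (merged into gpu_fp16_disabled_op_types_ above), which the header alone does not spell out.

// Editor's sketch (not part of the patch).
#include "paddle_inference_api.h"

void BuildFp16Config() {
  paddle_infer::Config config("model.pdmodel", "model.pdiparams");
  config.EnableUseGpu(100 /*memory pool MB*/, 0 /*device id*/);
  config.Exp_EnableUseGpuFp16({"softmax"});  // assumption: extra ops kept in fp32
  if (config.gpu_fp16_enabled()) {
    auto predictor = paddle_infer::CreatePredictor(config);
    // ... run as usual; the disabled op types stay in fp32.
  }
}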
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 22d9dedb32ebfcc229e0034cc5cf6092907dc8df..95975d8f2a892e709e5591135f96fbff07eb62e3 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -172,6 +172,40 @@ void GpuPassStrategy::EnableCUDNN() { use_cudnn_ = true; }
 
+void GpuPassStrategy::Exp_EnableUseGpuFp16() {
+  passes_.assign({
+    "is_test_pass",                               //
+    "simplify_with_basic_ops_pass",               //
+    "conv_bn_fuse_pass",                          //
+    "conv_eltwiseadd_bn_fuse_pass",               //
+    "embedding_eltwise_layernorm_fuse_pass",      //
+    "multihead_matmul_fuse_pass_v2",              //
+    "gpu_cpu_squeeze2_matmul_fuse_pass",          //
+    "gpu_cpu_reshape2_matmul_fuse_pass",          //
+    "gpu_cpu_flatten2_matmul_fuse_pass",          //
+    "gpu_cpu_map_matmul_v2_to_mul_pass",          //
+    "gpu_cpu_map_matmul_v2_to_matmul_pass",       //
+    "gpu_cpu_map_matmul_to_mul_pass",             //
+    // "fc_fuse_pass",                            //
+    "fc_elementwise_layernorm_fuse_pass",         //
+#if CUDNN_VERSION >= 7100  // To run conv_fusion, the version of cudnn must be
+                           // guaranteed at least v7
+// cudnn8.0 has memory leak problem in conv + eltwise + act, so we
+// disable the pass.
+#if !(CUDNN_VERSION >= 8000 && CUDNN_VERSION < 8100)
+    "conv_elementwise_add_act_fuse_pass",         //
+    "conv_elementwise_add2_act_fuse_pass",        //
+#endif
+    "conv_elementwise_add_fuse_pass",             //
+#endif                                            //
+    "transpose_flatten_concat_fuse_pass",         //
+    "mixed_precision_configure_pass",             //
+    "runtime_context_cache_pass"                  //
+  });
+
+  use_gpu_fp16_ = true;
+}
+
 void GpuPassStrategy::EnableMKLDNN() {
   LOG(ERROR) << "GPU not support MKLDNN yet";
 }
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index 351cf71e5ca7493928dfd81d776d847463f3b7bf..02290ed33ff1cd4f72d707d6f9d23f16e05c321b 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -125,6 +125,9 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
   /// \brief Enable the use of cuDNN kernel.
   virtual void EnableCUDNN() {}
 
+  /// \brief Enable the use of the gpu fp16 kernel.
+  virtual void Exp_EnableUseGpuFp16() {}
+
   /// \brief Enable the use of MKLDNN.
   /// The MKLDNN control exists in both CPU and GPU mode, because there can
   /// still be some CPU kernels running in GPU mode.
@@ -140,6 +143,10 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
   /// \return A bool variable implying whether we are in gpu mode.
   bool use_gpu() const { return use_gpu_; }
 
+  /// \brief Check if we are using the gpu fp16 kernel.
+  /// \return A bool variable implying whether we are in gpu fp16 mode.
+  bool use_gpu_fp16() const { return use_gpu_fp16_; }
+
   /// \brief Check if we are using xpu.
   /// \return A bool variable implying whether we are in xpu mode.
   bool use_xpu() const { return use_xpu_; }
@@ -162,6 +169,7 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
   bool use_npu_{false};
   bool use_ipu_{false};
   bool use_mkldnn_{false};
+  bool use_gpu_fp16_{false};
   /// \endcond
 };
@@ -223,6 +231,9 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
   /// \brief Enable the use of cuDNN kernel.
   void EnableCUDNN() override;
 
+  /// \brief Enable the use of the gpu fp16 kernel.
+  void Exp_EnableUseGpuFp16() override;
+
   /// \brief Not supported in GPU mode yet.
   void EnableMKLDNN() override;
@@ -238,6 +249,7 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
  protected:
   /// \cond Protected
   bool use_cudnn_{false};
+  bool use_gpu_fp16_{false};
   /// \endcond
 };
diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h
index 5a98d109aed79cc5bcefdc01b47a166bdf9c01d9..2afe2d32e2f60e47136b1e2f002b0e98c9b17cd2 100644
--- a/paddle/fluid/inference/api/paddle_tensor.h
+++ b/paddle/fluid/inference/api/paddle_tensor.h
@@ -18,6 +18,11 @@
 
 #include "paddle_infer_declare.h"  // NOLINT
 
+#ifdef PADDLE_WITH_ONNXRUNTIME
+#include "onnxruntime_c_api.h"    // NOLINT
+#include "onnxruntime_cxx_api.h"  // NOLINT
+#endif
+
 namespace paddle_infer {
 
 /// \brief Experimental.
@@ -175,6 +180,23 @@ class PD_INFER_DECL Tensor {
   PlaceType place_;
   int device_;
 
+#ifdef PADDLE_WITH_ONNXRUNTIME
+  bool is_ort_tensor_{false};
+  std::vector<int64_t> shape_;
+  std::weak_ptr<Ort::IoBinding> binding_;
+  int idx_{-1};
+
+  void SetOrtMark(bool is_ort_tensor);
+
+  void SetOrtBinding(const std::shared_ptr<Ort::IoBinding> binding);
+
+  template <typename T>
+  void ORTCopyFromCpu(const T* data);
+
+  template <typename T>
+  void ORTCopyToCpu(T* data) const;
+#endif
+
   friend class paddle_infer::contrib::TensorUtils;
 #if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST)
   friend class paddle_infer::InferApiTesterUtils;
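One design note on the binding_ member just added to Tensor: the predictor owns the Ort::IoBinding through a shared_ptr, while each tensor handle holds only a weak_ptr, so a tensor that outlives its predictor fails the lock() check in ORTCopyFromCpu/ORTCopyToCpu instead of dereferencing freed ONNX Runtime state. A minimal sketch of the pattern, with stand-in types (Binding/Handle are not Paddle names):

// Editor's sketch (not part of the patch).
#include <cassert>
#include <memory>

struct Binding { /* stands in for the predictor-owned Ort::IoBinding */ };

struct Handle {
  std::weak_ptr<Binding> binding_;
  bool Use() {
    auto b = binding_.lock();  // promotes to shared_ptr iff still alive
    if (!b) return false;      // owner already destroyed: fail safely
    /* ... use *b ... */
    return true;
  }
};

int main() {
  Handle h;
  {
    auto owner = std::make_shared<Binding>();  // the predictor's shared_ptr
    h.binding_ = owner;
    assert(h.Use());   // owner alive: lock succeeds
  }
  assert(!h.Use());    // owner gone: lock fails, no dangling access
  return 0;
}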
diff --git a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc
index 67e7c78b62e9d212b5c1738403361d77d7a3925b..496e8932a690dbcd87001da4f7e017fc86d6bff5 100644
--- a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc
@@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/layer_norm_op.h"
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h"
diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
index 1946f9e28388e3ab6d1d580d0f7d91c1ef3e604f..1ad82df41737c4093d0b5518c754ed85c505b8be 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
@@ -53,6 +53,6 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); }
 }  // namespace paddle
 
 USE_OP_ITSELF(relu);
-USE_OP(sigmoid);
+USE_OP_ITSELF(sigmoid);
 USE_OP_ITSELF(tanh);
 USE_OP(relu6);
diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu
index 861e98e4437564bfe5fae2a575741beb1d8823de..67d44184a76d0552b667c6d5a3d9466582e33558 100644
--- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu
@@ -17,7 +17,7 @@
 #include
 #include "glog/logging.h"
 #include "paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h"
-#include "paddle/fluid/operators/layer_norm_op.h"
+#include "paddle/phi/kernels/layer_norm_kernel.h"
 
 namespace paddle {
 namespace inference {
@@ -83,7 +83,7 @@ int LayerNormPlugin::enqueue(int batch_size, const void *const *inputs,
   cudaMemcpyAsync(bias_d, bias_.data(), sizeof(float) * feature_size,
                   cudaMemcpyHostToDevice, stream);
-  paddle::operators::LayerNormDirectCUDAFunctor<float> layer_norm;
+  phi::LayerNormDirectCUDAFunctor<float> layer_norm;
   layer_norm(stream, input, input_shape, bias_d, scale_d, output, mean_d,
              variance_d, begin_norm_axis, eps);
   return cudaGetLastError() != cudaSuccess;
@@ -177,7 +177,7 @@ int LayerNormPluginDynamic::enqueue(
   cudaMemcpyAsync(bias_d, bias_.data(), sizeof(float) * feature_size,
                   cudaMemcpyHostToDevice, stream);
-  paddle::operators::LayerNormDirectCUDAFunctor<float> layer_norm;
+  phi::LayerNormDirectCUDAFunctor<float> layer_norm;
   layer_norm(stream, input, input_shape, bias_d, scale_d, output, mean_d,
              variance_d, begin_norm_axis, eps);
 } else {
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index c835cf8ea148064648352bb5c6fbd533b02acda0..845d0ed073b32cc136ec6b9d76c9e3073d7b051a 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -1492,6 +1492,10 @@ REGISTER_ACTIVATION_OP(softshrink, SoftShrink, SoftShrinkFunctor,
 REGISTER_ACTIVATION_OP(tanh_shrink, TanhShrink, TanhShrinkFunctor,
                        TanhShrinkGradFunctor);
 REGISTER_ACTIVATION_OP(silu, Silu, SiluFunctor, SiluGradFunctor);
+REGISTER_ACTIVATION_OP(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,
+                       HardSigmoidGradFunctor);
+REGISTER_ACTIVATION_OP(logsigmoid, LogSigmoid, LogSigmoidFunctor,
+                       LogSigmoidGradFunctor);
 
 /* ==========================    sigmoid register  =============================
  */
@@ -1526,30 +1530,6 @@ REGISTER_OPERATOR(sigmoid_triple_grad,
                   ops::SigmoidTripleGradFunctor<float>::FwdDeps()>,
                   ops::ActivationTripleGradOpInplaceInferer);
 
-// Register Sigmoid/GradSigmoid Kernels
-REGISTER_ACTIVATION_CPU_KERNEL(sigmoid, Sigmoid, SigmoidFunctor,
-                               SigmoidGradFunctor);
-
-// Register DoubleGrad Kernel
-REGISTER_OP_CPU_KERNEL(
-    sigmoid_grad_grad,
-    ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
-                                 ops::SigmoidGradGradFunctor<float>>,
-    ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
-                                 ops::SigmoidGradGradFunctor<double>>,
-    ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
-                                 ops::SigmoidGradGradFunctor<plat::float16>>);
-
-// Register TripleGrad Kernel
-REGISTER_OP_CPU_KERNEL(
-
sigmoid_triple_grad, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>); - /* ========================================================================== */ /* ========================== tanh register ============================= */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 4f197b95b21742e4af0889aa230f58821bf542ba..f1984af6e15eac6682bd341f470727b899e82f3a 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -238,15 +238,6 @@ struct BaseActivationFunctor { AttrPair GetAttrs() { return AttrPair(); } }; -// sigmoid(x) = 1 / (1 + exp(-x)) -template -struct SigmoidFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = static_cast(1) / (static_cast(1) + (-x).exp()); - } -}; - #define USE_PHI_FUNCTOR(name) \ template \ using name##Functor = phi::funcs::name##Functor; \ @@ -285,160 +276,15 @@ USE_PHI_FUNCTOR(TanhShrink) USE_PHI_FUNCTOR(Silu) USE_PHI_FUNCTOR(ELU) USE_PHI_DOUBLE_GRAD_FUNCTOR(ELU) +USE_PHI_FUNCTOR(Sigmoid) +USE_PHI_DOUBLE_GRAD_FUNCTOR(Sigmoid) +USE_PHI_TRIPLE_GRAD_FUNCTOR(Sigmoid) +USE_PHI_FUNCTOR(LogSigmoid) +USE_PHI_FUNCTOR(HardSigmoid) template using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor; -template -struct SigmoidGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * out * (static_cast(1) - out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -/* - Out - DOut -> SigmoidGradGrad -> DOutNew - DDX DDOut - - DDOut = (1-Out)*Out*DDX - DOutNew = (1-2*Out)*DOut*DDX -*/ -template -struct SigmoidGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - framework::Tensor* dOutNew, framework::Tensor* ddOut) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad")); - - if (dOutNew) { - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad")); - auto dout_new = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad")); - dout_new.device(*d) = - (static_cast(1) - static_cast(2) * out) * dout * ddx; - } - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad")); - ddout.device(*d) = (static_cast(1) - out) * out * ddx; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -/* - Out - DOut D_Dout - DDx -> SigmoidTripleGrad -> D_DDx - D_DDout d_OutNew - D_Dout_new - - D_Dout = (1-2*Out)*DDx*D_Dout_new - D_DDx = (1-Out)*Out*D_DDout + (1-2*Out)*DOut*D_Dout_new - D_OutNew = (DDx-2*Out*DDx)*D_DDout - 2*DOut*DDx*D_Dout_new - - Out, DDX, DOut, D_DDOut, D_DOut_New // input - D_OutNew, D_DOut, D_DDx // output -*/ -template -struct SigmoidTripleGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - const framework::Tensor* d_DDOut, - const framework::Tensor* 
d_dOut_New, - framework::Tensor* d_d_Out, framework::Tensor* d_Out_New, - framework::Tensor* d_DDx) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidTripleGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidTripleGrad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidTripleGrad")); - auto d_ddOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "SigmoidTripleGrad")); - auto d_dOutNew = framework::EigenVector::Flatten(GET_DATA_SAFELY( - d_dOut_New, "Input", "D_DOut_New", "SigmoidTripleGrad")); - - if (d_Out_New) { - auto d_OutNew = framework::EigenVector::Flatten(GET_DATA_SAFELY( - d_Out_New, "Output", "D_OutNew", "SigmoidTripleGrad")); - d_OutNew.device(*d) = (ddx - static_cast(2) * out * ddx) * d_ddOut - - static_cast(2) * dout * ddx * d_dOutNew; - } - if (d_d_Out) { - auto d_dOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "SigmoidTripleGrad")); - d_dOut.device(*d) = - (static_cast(1) - static_cast(2) * out) * ddx * d_dOutNew; - } - if (d_DDx) { - auto d_ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "SigmoidTripleGrad")); - d_ddx.device(*d) = - (static_cast(1) - out) * out * d_ddOut + - (static_cast(1) - static_cast(2) * out) * dout * d_dOutNew; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -// Originally: logsigmoid(x) = -log (1 + exp(-x)) -// For numerical stability, we can use the log-sum-exp trick: -// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ -// We can rewrite the above equation as: -// out = -log( exp(0) + exp(-x)) [since exp(0) = 1] -// = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0))) -// = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) - exp(max(-x, 0)) * exp(-x - -// max(-x, 0))) -// = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) -// = -log( exp(max(-x, 0)) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))) -// -// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0)) -// + exp(-x - max(-x, 0)))) -template -struct LogSigmoidFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) - out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log()); - } -}; - -// Originally: f' = exp(-x) / (1 + exp(-x)) -// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + -// exp(-x - max(-x, 0))) -template -struct LogSigmoidGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) - dx.device(d) = - dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - // exp(x) = e^x template struct ExpFunctor : public BaseActivationFunctor { @@ -1101,43 +947,6 @@ struct STanhGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct HardSigmoidFunctor : public BaseActivationFunctor { - float slope; - float offset; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"slope", &slope}, {"offset", &offset}}; - } - - template - void 
operator()(Device d, X x, Out out) const { - auto temp = x * static_cast(slope) + static_cast(offset); - out.device(d) = - temp.cwiseMax(static_cast(0)).cwiseMin(static_cast(1)); - } -}; - -template -struct HardSigmoidGradFunctor : public BaseActivationFunctor { - float slope; - float offset; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"slope", &slope}, {"offset", &offset}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * - ((out > static_cast(0)) * (out < static_cast(1))) - .template cast() * - static_cast(slope); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - template struct SwishFunctor : public BaseActivationFunctor { float beta; @@ -1365,211 +1174,6 @@ inline void ExtractDoubleGradTensorWithInputDOut( } } -template -class SigmoidDoubleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *Out, *ddX, *dOut; - framework::Tensor *dOutNew, *ddOut; - Out = ddX = dOut = nullptr; - dOutNew = ddOut = nullptr; - // extract ddx(input) and out(input) - ddX = ctx.Input("DDX"); - Out = ctx.Input("Out"); - PADDLE_ENFORCE_NOT_NULL( - ddX, platform::errors::NotFound( - "Cannot get input Variable ddX, variable name = %s", - ctx.InputName("DDX"))); - PADDLE_ENFORCE_NOT_NULL( - Out, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("Out"))); - // set output ddout - ddOut = ctx.Output("DDOut"); - // extract dOut(intput) - dOut = ctx.Input("DOut"); - PADDLE_ENFORCE_NOT_NULL( - dOut, platform::errors::NotFound( - "Cannot get input Variable dOut, variable name = %s", - ctx.InputName("DOut"))); - dOutNew = ctx.Output("DOutNew"); - if (dOutNew) dOutNew->mutable_data(Out->dims(), ctx.GetPlace()); - if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); - auto& place = ctx.template device_context(); - Functor functor; - functor(place, Out, ddX, dOut, dOutNew, ddOut); - } -}; - -// Out, DDX, DOut, D_DDOut, D_DOut_New // input -// D_OutNew, D_DOut, D_DDx // output -template -class SigmoidTripleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew; - framework::Tensor *d_OutNew, *d_dOut, *d_ddx; - Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr; - d_OutNew = d_dOut = d_ddx = nullptr; - - // extract ddx(input), out(input), dOut(input), d_ddOut(input), - // d_dOutNew(input) - ddX = ctx.Input("DDX"); - Out = ctx.Input("Out"); - dOut = ctx.Input("DOut"); - d_ddOut = ctx.Input("D_DDOut"); - d_dOutNew = ctx.Input("D_DOut_New"); - - PADDLE_ENFORCE_NOT_NULL( - ddX, platform::errors::NotFound( - "Cannot get input Variable ddX, variable name = %s", - ctx.InputName("DDX"))); - PADDLE_ENFORCE_NOT_NULL( - Out, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("Out"))); - PADDLE_ENFORCE_NOT_NULL( - dOut, platform::errors::NotFound( - "Cannot get input Variable dOut, variable name = %s", - ctx.InputName("DOut"))); - PADDLE_ENFORCE_NOT_NULL( - d_ddOut, platform::errors::NotFound( - "Cannot get input Variable d_ddOut, variable name = %s", - ctx.InputName("D_DDOut"))); - PADDLE_ENFORCE_NOT_NULL( - d_dOutNew, - platform::errors::NotFound( - "Cannot get input Variable d_dOutNew, 
variable name = %s", - ctx.InputName("D_DOutNew"))); - - // set output d_OutNew、d_dOut、d_ddx - d_dOut = ctx.Output("D_DOut"); - d_OutNew = ctx.Output("D_OutNew"); - d_ddx = ctx.Output("D_DDx"); - - if (d_dOut) d_dOut->mutable_data(Out->dims(), ctx.GetPlace()); - if (d_OutNew) d_OutNew->mutable_data(Out->dims(), ctx.GetPlace()); - if (d_ddx) d_ddx->mutable_data(ddX->dims(), ctx.GetPlace()); - auto& place = ctx.template device_context(); - Functor functor; - functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew, // input - d_dOut, d_OutNew, d_ddx); // output - } -}; - -template -class TanhDoubleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *Out, *ddX, *dOut; - framework::Tensor *dOutNew, *ddOut; - Out = ddX = dOut = nullptr; - dOutNew = ddOut = nullptr; - - // extract ddx(input) and out(input) - auto ddx_var = ctx.InputVar("DDX"); - auto out_var = ctx.InputVar("Out"); - PADDLE_ENFORCE_NOT_NULL( - ddx_var, platform::errors::NotFound( - "Cannot get input Variable ddx, variable name = %s", - ctx.InputName("DDX"))); - PADDLE_ENFORCE_NOT_NULL( - out_var, platform::errors::NotFound( - "Cannot get input Variable out, variable name = %s", - ctx.InputName("Out"))); - ddX = ctx.Input("DDX"); - Out = ctx.Input("Out"); - - // set output ddout - auto ddout_var = ctx.OutputVar("DDOut"); - if (ddout_var) { - ddOut = ctx.Output("DDOut"); - } - - // extract dOut(intput) - auto dout_var = ctx.InputVar("DOut"); - PADDLE_ENFORCE_NOT_NULL( - dout_var, platform::errors::NotFound( - "Cannot get input Variable dout_var, variable name = %s", - ctx.InputName("DOut"))); - dOut = ctx.Input("DOut"); - - // set output dout_new - auto dout_new_var = ctx.OutputVar("DOutNew"); - if (dout_new_var) { - dOutNew = ctx.Output("DOutNew"); - } - - if (dOutNew) dOutNew->mutable_data(Out->dims(), ctx.GetPlace()); - if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); - auto& place = ctx.template device_context(); - Functor functor; - functor(place, Out, ddX, dOut, dOutNew, ddOut); - } -}; - -template -class TanhTripeGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew; - framework::Tensor *d_OutNew, *d_dOut, *d_ddx; - Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr; - d_OutNew = d_dOut = d_ddx = nullptr; - - // extract ddx(input), out(input), dOut(input), d_ddOut(input), - // d_dOutNew(input) - ddX = ctx.Input("DDX"); - Out = ctx.Input("Out"); - dOut = ctx.Input("DOut"); - d_ddOut = ctx.Input("D_DDOut"); - d_dOutNew = ctx.Input("D_DOut_New"); - - PADDLE_ENFORCE_NOT_NULL( - ddX, platform::errors::NotFound( - "Cannot get input Variable ddX, variable name = %s", - ctx.InputName("DDX"))); - PADDLE_ENFORCE_NOT_NULL( - Out, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("Out"))); - PADDLE_ENFORCE_NOT_NULL( - dOut, platform::errors::NotFound( - "Cannot get input Variable dOut, variable name = %s", - ctx.InputName("DOut"))); - PADDLE_ENFORCE_NOT_NULL( - d_ddOut, platform::errors::NotFound( - "Cannot get input Variable d_ddOut, variable name = %s", - ctx.InputName("D_DDOut"))); - PADDLE_ENFORCE_NOT_NULL( - d_dOutNew, - platform::errors::NotFound( - "Cannot get input Variable d_dOutNew, variable name = %s", - ctx.InputName("D_DOutNew"))); - - // set output 
d_OutNew、d_dOut、d_ddx - d_dOut = ctx.Output("D_DOut"); - d_OutNew = ctx.Output("D_OutNew"); - d_ddx = ctx.Output("D_DDx"); - - if (d_dOut) d_dOut->mutable_data(Out->dims(), ctx.GetPlace()); - if (d_OutNew) d_OutNew->mutable_data(Out->dims(), ctx.GetPlace()); - if (d_ddx) d_ddx->mutable_data(ddX->dims(), ctx.GetPlace()); - auto& place = ctx.template device_context(); - Functor functor; - functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew, // input - d_dOut, d_OutNew, d_ddx); // output - } -}; - template class SquareDoubleGradKernel : public framework::OpKernel { @@ -1952,7 +1556,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor { } // namespace paddle #define FOR_EACH_ACTIVATION_OP(__macro) \ - __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ __macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \ __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ @@ -1965,8 +1568,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor { __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \ __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor); \ __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor); \ - __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, \ - HardSigmoidGradFunctor); \ __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ __macro(mish, Mish, MishFunctor, MishGradFunctor); \ __macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor); diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index 22613cbe2a2b2cb2eb491142a58172a8a5235c59..7c1b288080162e2a5bf847a795fc640ab5e5e4e1 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -15,72 +15,11 @@ limitations under the License. */ #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/kernels/funcs/activation_functor.h" + namespace paddle { namespace operators { -template -struct CudaSigmoidFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // sigmoid(x) = 1 / (1 + exp(-x)) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(one / (one + exp(-x))); - } -}; - -template -struct CudaSigmoidGradFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // dx = dout * out * (1 - out) - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return dout * out * (one - out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct CudaLogSigmoidFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - - // logsigmoid(x) = log(1 / (1 + exp(-x))) - // For numerical stability, - // logsigmoid(x) = - // - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - MPType temp = x > zero ? 
zero : -x; - return static_cast(-temp - log(exp(-temp) + exp(-x - temp))); - } -}; - -template -struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - - // dx = dout * exp(-x) / (1 + exp(-x)) - // For numerical stability: - // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x, - // 0))) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - MPType temp1 = x > zero ? zero : -x; - MPType temp2 = exp(-x - temp1); - return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaCeilFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -549,49 +488,6 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor { } }; -template -struct CudaHardSigmoidFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - T one = static_cast(1.0f); - float slope; - float offset; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"slope", &slope}, {"offset", &offset}}; - } - - // hard_sigmoid(x) = 0, when x <= -3 - // 1, when x >= 3 - // x * slope + offset, otherwise - __device__ __forceinline__ T operator()(const T x) const { - T temp = x * static_cast(slope) + static_cast(offset); - T temp_max = temp > zero ? temp : zero; - T temp_min = temp_max < one ? temp_max : one; - return temp_min; - } -}; - -template -struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - T one = static_cast(1.0f); - float slope; - float offset; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"slope", &slope}, {"offset", &offset}}; - } - - // dx = (out > 0 && out < 1) ? dout * slope : 0 - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return (out > zero && out < one) ? 
dout * static_cast(slope) : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - template struct CudaSwishFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -874,6 +770,9 @@ USE_PHI_FUNCTOR(CudaSoftShrink) USE_PHI_FUNCTOR(CudaTanhShrink) USE_PHI_FUNCTOR(CudaSilu) USE_PHI_FUNCTOR(CudaELU) +USE_PHI_FUNCTOR(CudaSigmoid) +USE_PHI_FUNCTOR(CudaLogSigmoid) +USE_PHI_FUNCTOR(CudaHardSigmoid) template using CudaELUGradNegativeAlphaFunctor = @@ -952,35 +851,6 @@ REGISTER_OP_CUDA_KERNEL( ops::CELUGradGradFunctor>); /* ========================================================================== */ -/* =========================== sigmoid register ============================ - */ -REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, - CudaSigmoidGradFunctor); - -REGISTER_OP_CUDA_KERNEL( - sigmoid_grad_grad, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>); - -REGISTER_OP_CUDA_KERNEL( - sigmoid_triple_grad, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel< - plat::CUDADeviceContext, - ops::SigmoidTripleGradFunctor>); -/* ========================================================================== */ - /* =========================== sqrt register ============================= */ REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, CudaSqrtGradFunctor); @@ -1118,8 +988,6 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ #define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \ - __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, \ - CudaLogSigmoidGradFunctor); \ __macro(softshrink, SoftShrink, CudaSoftShrinkFunctor, \ CudaSoftShrinkGradFunctor); \ __macro(ceil, Ceil, CudaCeilFunctor, CudaZeroGradFunctor); \ @@ -1139,8 +1007,6 @@ REGISTER_OP_CUDA_KERNEL( CudaTanhShrinkGradFunctor); \ __macro(hard_shrink, HardShrink, CudaHardShrinkFunctor, \ CudaHardShrinkGradFunctor); \ - __macro(hard_sigmoid, HardSigmoid, CudaHardSigmoidFunctor, \ - CudaHardSigmoidGradFunctor); \ __macro(swish, Swish, CudaSwishFunctor, CudaSwishGradFunctor); \ __macro(mish, Mish, CudaMishFunctor, CudaMishGradFunctor); \ __macro(hard_swish, HardSwish, CudaHardSwishFunctor, \ @@ -1148,63 +1014,221 @@ REGISTER_OP_CUDA_KERNEL( FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL) #ifdef PADDLE_WITH_XPU_KP -#define REGISTER_ACTIVATION_XPU_KERNEL(act_type, op_name, functor, \ - grad_functor) \ - REGISTER_OP_KERNEL( \ - act_type, KP, plat::XPUPlace, \ - ops::ActivationCudaKernel>); \ - REGISTER_OP_KERNEL(act_type##_grad, KP, plat::XPUPlace, \ - ops::ActivationGradCudaKernel>); - -REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, - CudaLeakyReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, - CudaSigmoidGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(exp, Exp, CudaExpFunctor, CudaExpGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(reciprocal, Reciprocal, CudaReciprocalFunctor, - CudaReciprocalGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(softplus, Softplus, CudaSoftplusFunctor, - CudaSoftplusGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, HardSwish, CudaHardSwishFunctor, - CudaHardSwishGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(elu, Elu, CudaELUFunctor, 
CudaELUGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(celu, Celu, CudaCELUFunctor, - CudaCELUGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, - CudaSqrtGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(square, Square, CudaSquareFunctor, - CudaSquareGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(silu, Silu, CudaSiluFunctor, - CudaSiluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, - CudaLogSigmoidGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(softshrink, SoftShrink, CudaSoftShrinkFunctor, - CudaSoftShrinkGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(ceil, Ceil, CudaCeilFunctor, - CudaZeroGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(floor, Floor, CudaFloorFunctor, - CudaZeroGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(log1p, Log1p, CudaLog1pFunctor, - CudaLog1pGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(brelu, BRelu, CudaBReluFunctor, - CudaBReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(soft_relu, SoftRelu, CudaSoftReluFunctor, - CudaSoftReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(softsign, Softsign, CudaSoftsignFunctor, - CudaSoftsignGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(relu6, Relu6, CudaRelu6Functor, - CudaRelu6GradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(hard_shrink, HardShrink, CudaHardShrinkFunctor, - CudaHardShrinkGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(hard_sigmoid, HardSigmoid, - CudaHardSigmoidFunctor, - CudaHardSigmoidGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(swish, Swish, CudaSwishFunctor, - CudaSwishGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(thresholded_relu, ThresholdedRelu, - CudaThresholdedReluFunctor, - CudaThresholdedReluGradFunctor); +REGISTER_OP_KERNEL( + brelu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + brelu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(ceil, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + ceil_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(celu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + celu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(elu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + elu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(exp, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + exp_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(floor, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + floor_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + hard_shrink, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + hard_shrink_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + hard_sigmoid, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + hard_sigmoid_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(hard_swish, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + hard_swish_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + leaky_relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + leaky_relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(log, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + log_grad, KP, plat::XPUPlace, + 
ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(log1p, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + log1p_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + logsigmoid, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + logsigmoid_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + reciprocal, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + reciprocal_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(relu6, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + relu6_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(sigmoid, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + sigmoid_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(silu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + silu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(soft_relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + soft_relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(softplus, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + softplus_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + softshrink, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + softshrink_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(softsign, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + softsign_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(sqrt, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + sqrt_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(square, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + square_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(swish, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + swish_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + thresholded_relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + thresholded_relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); #endif // PADDLE_WITH_XPU_KP diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc index 684ac5bafd0ef430f8424614104a865b3cbe29c6..ea6614cbfbdf874df029ab349f4373f27e5c8e21 100644 --- a/paddle/fluid/operators/assign_op.cc +++ b/paddle/fluid/operators/assign_op.cc @@ -16,6 +16,9 @@ limitations under the License. 
*/ #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { class OpDesc; @@ -36,26 +39,6 @@ class AssignOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - if (ctx->HasInput("X")) { - auto type = ctx->GetInputsVarType("X")[0]; - if (type == framework::proto::VarType::SELECTED_ROWS || - type == framework::proto::VarType::LOD_TENSOR) { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("X", /*->*/ "Out"); - } - } else if (type == framework::proto::VarType::LOD_TENSOR_ARRAY) { - if (ctx->IsRuntime()) { - // The runtime output shape is determined in kernel. - return; - } else { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - } - } - } - } - protected: framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, const framework::Tensor &tensor, @@ -91,24 +74,6 @@ class AssignInferVarType : public framework::VarTypeInference { } }; -class AssignKernel { - public: - void operator()(const framework::ExecutionContext &ctx) const { - auto *x = ctx.InputVar("X"); - if (x == nullptr) { - return; - } - PADDLE_ENFORCE_EQ( - ctx.HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of assign_op is not found.")); - auto *out = ctx.OutputVar("Out"); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(ctx.GetPlace()); - - framework::VisitVarType(*x, AssignFunctor(out, dev_ctx)); - } -}; - class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -147,23 +112,11 @@ DECLARE_INPLACE_OP_INFERER(AssignOpInplaceInferer, {"X", "Out"}); namespace ops = paddle::operators; namespace plat = paddle::platform; + +DECLARE_INFER_SHAPE_FUNCTOR(assign, AssignInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(assign, ops::AssignOp, ops::AssignGradMaker, ops::AssignGradMaker, ops::AssignOpProtoMaker, ops::AssignOpInplaceInferer, - ops::AssignInferVarType); - -REGISTER_OP_CPU_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double, - ops::AssignKernel, int, ops::AssignKernel, - int64_t, ops::AssignKernel, uint8_t, - ops::AssignKernel, bool, ops::AssignKernel, - plat::float16, ops::AssignKernel, plat::bfloat16, - ops::AssignKernel); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double, - ops::AssignKernel, int, ops::AssignKernel, - int64_t, ops::AssignKernel, uint8_t, - ops::AssignKernel, bool, ops::AssignKernel, - plat::float16, ops::AssignKernel); -#endif + ops::AssignInferVarType, AssignInferShapeFunctor); diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc index b452dea8536dd98d6d4060d5224e39daf9137c50..b91eb50646feca30046915248d45ee2e91cabc39 100644 --- a/paddle/fluid/operators/assign_op_npu_test.cc +++ b/paddle/fluid/operators/assign_op_npu_test.cc @@ -29,7 +29,7 @@ limitations under the License. 
*/ namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(assign); +USE_OP_ITSELF(assign); USE_OP_DEVICE_KERNEL(assign, NPU); template diff --git a/paddle/fluid/operators/determinant_op.cc b/paddle/fluid/operators/determinant_op.cc index 68083c759859b420bd6e60496614234a96519935..6959b5cf811069cc66321d2129a2b69d4e922f09 100644 --- a/paddle/fluid/operators/determinant_op.cc +++ b/paddle/fluid/operators/determinant_op.cc @@ -13,6 +13,10 @@ // limitations under the License. #include "paddle/fluid/operators/determinant_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,11 +24,6 @@ namespace operators { class DeterminantOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "determinant"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "determinant"); - } }; class DeterminantOpMaker : public framework::OpProtoAndCheckerMaker { @@ -44,19 +43,6 @@ class DeterminantGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", - "DeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "DeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", - framework::GradVarName("Out"), "DeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Input")), "Output", - framework::GradVarName("Input"), "DeterminantGradOp"); - - ctx->SetOutputDim(framework::GradVarName("Input"), - ctx->GetInputDim("Input")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -162,11 +148,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(SlogDeterminantGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(determinant, DeterminantInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(determinant, ops::DeterminantOp, ops::DeterminantOpMaker, ops::DeterminantGradOpMaker, - ops::DeterminantGradOpMaker); + ops::DeterminantGradOpMaker, + DeterminantInferShapeFunctor); -REGISTER_OPERATOR(determinant_grad, ops::DeterminantGradOp) +DECLARE_INFER_SHAPE_FUNCTOR(determinant_grad, DeterminantGradInferShapeFunctor, + PD_INFER_META(phi::GeneralUnaryGradInferMeta)); +REGISTER_OPERATOR(determinant_grad, ops::DeterminantGradOp, + DeterminantGradInferShapeFunctor); REGISTER_OPERATOR(slogdeterminant, ops::SlogDeterminantOp, ops::SlogDeterminantOpMaker, diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h index e6de0ee3548b7442ac5e059331502cac441020e5..a1fe8a25665ec84b38a535f541a2cbe33d0a7fcf 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -22,6 +22,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include 
"paddle/phi/kernels/funcs/diag_functor.h" @@ -30,7 +31,6 @@ #include "paddle/phi/kernels/funcs/unsqueeze.h" #include "paddle/phi/kernels/impl/determinant_grad_kernel_impl.h" #include "paddle/phi/kernels/impl/determinant_kernel_impl.h" -#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index 5e4c83e1a45ebdb96a0e764cfa2d3997442ae1ea..6daf05a9d778dfb194225f59321ffc3eb40235db 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -21,13 +21,13 @@ #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/slice.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" -#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index a995877778e4770ea8ae64c051a71b31c1fb1e29..c28abb916b7a7d59d5a1974bed63e43b2f32ef2c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -27,7 +27,7 @@ limitations under the License. */ // only can include the headers in paddle/phi/include dirs #include "paddle/phi/kernels/elementwise_grad_kernel.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #endif namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 58a3123c7e332f50b0830577436528f1e8df1cdf..6f4aba93d56e2a8227a8578067ac934d41243fb6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/platform/cpu_info.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index 838df2e1625912dad127b672228f9cc64eb7cec3..f9347d281043ecc63acdb8ca2fb0a18dae4adc47 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,100 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto& dev_ctx = - ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - auto tz = phi::vectorize(dout->dims()); - memory::data_type dout_type = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(dout->dtype())); - platform::ReorderMKLDNNHandler handler( - tz, framework::TransToProtoVarType(dout->dtype()), dout_type, - onednn_engine); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - auto reorder_src_memory_p = handler.AcquireSrcMemory( - dout->format(), platform::to_void_cast(dout->data())); - - if (dx) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace()); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); - astream.wait(); - - dx->set_layout(DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } - - if (dy) { - // Direct copy - if (dout->dims() == dy->dims()) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace()); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, - *reorder_dst_memory_p); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } else { - // Broadcasting - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy)); - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - reduction_p->execute(astream, {{DNNL_ARG_SRC, *reorder_src_memory_p}, - {DNNL_ARG_DST, *dy_memory_p}}); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - } - } - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; REGISTER_OP_KERNEL( @@ -116,6 +24,8 @@ REGISTER_OP_KERNEL( ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) -REGISTER_OP_KERNEL(elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::EltwiseAddMKLDNNGradKernel, - ops::EltwiseAddMKLDNNGradKernel) +REGISTER_OP_KERNEL( + elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + 
ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc index 367d602f5902e816a468d43ccfa009fe35a045fc..c68aa8d3d1b46c9013c6fe6a12510f0cdb744682 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc @@ -1,146 +1,28 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" - -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseDivMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - - auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - if (dx) { - // dx = dout / y - - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_div, axis, mkldnn_engine, ctx.GetPlace(), - dout, y, dx, 1.0f, 1.0f, 1.0f); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_y_memory = handler.AcquireSecondSrcMemory(y); - const auto dst_dx_memory = handler.AcquireDstMemory(dx); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_y_memory}, - {DNNL_ARG_DST, *dst_dx_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*dst_dx_memory)); - } - - if (dy) { - // dy = -dout * out / y - - platform::BinaryMKLDNNHandler y_handler( - dnnl::algorithm::binary_div, axis, mkldnn_engine, ctx.GetPlace(), y, - y, nullptr, 1.0f, 1.0f, 1.0f); - - const auto y_memory = y_handler.AcquireSrcMemory(y); - - dnnl::post_ops po; - po.append_binary(dnnl::algorithm::binary_div, y_memory->get_desc()); - - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, out, nullptr, -1.0f, 1.0f, 1.0f, po); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_out_memory = handler.AcquireSecondSrcMemory(out); - - // If broadcasting is in use then let's write to temporary - // buffer allocated by oneDNN - const auto dst_dy_memory = (dout->dims() == dy->dims()) - ? 
handler.AcquireDstMemory(dy) - : handler.AcquireDstMemory(); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_out_memory}, - {DNNL_ARG_DST, *dst_dy_memory}, - {DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, *y_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dy->set_layout(framework::DataLayout::kMKLDNN); - - // Reduction is needed for broadcasting scenario - if (dout->dims() != dy->dims()) { - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, mkldnn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy)); - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - - // As source we use mem object with results from binary operation - reduction_p->execute(astream, {{DNNL_ARG_SRC, *dst_dy_memory}, - {DNNL_ARG_DST, *dy_memory_p}}); - astream.wait(); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - - } else { - dy->set_format(platform::GetMKLDNNFormat(*dst_dy_memory)); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -// TODO(piotrekobi) add int8, uint8 support -REGISTER_OP_KERNEL(elementwise_div, MKLDNN, paddle::platform::CPUPlace, - ops::EltwiseMKLDNNKernel, - ops::EltwiseMKLDNNKernel) - -REGISTER_OP_KERNEL(elementwise_div_grad, MKLDNN, paddle::platform::CPUPlace, - ops::EltwiseDivMKLDNNGradKernel, - ops::EltwiseDivMKLDNNGradKernel) +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
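The EltwiseDivMKLDNNGradKernel removed above expressed, through binary primitives and a binary_div post-op, the quotient-rule gradients for out = x / y: dx = dout / y and dy = -dout * out / y (equivalently -dout * x / y^2); the -1.0f scale passed to BinaryMKLDNNHandler carries the minus sign. A same-shape scalar reference follows as a hedged sketch (div_grad_reference is an illustrative name, not part of the patch):

```cpp
#include <cstddef>
#include <vector>

// Element-wise reference for the divide backward pass that the removed
// EltwiseDivMKLDNNGradKernel built from oneDNN primitives:
//   out = x / y,  dx = dout / y,  dy = -dout * out / y  (= -dout * x / y^2).
// Same-shape tensors only; the broadcast case additionally sums dy over
// the broadcast axes, as in the add/sub sketch above.
void div_grad_reference(const std::vector<float>& y,
                        const std::vector<float>& out,
                        const std::vector<float>& dout,
                        std::vector<float>* dx, std::vector<float>* dy) {
  dx->resize(dout.size());
  dy->resize(dout.size());
  for (std::size_t i = 0; i < dout.size(); ++i) {
    (*dx)[i] = dout[i] / y[i];
    (*dy)[i] = -dout[i] * out[i] / y[i];
  }
}
```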
+ +#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(elementwise_div, MKLDNN, paddle::platform::CPUPlace, + ops::EltwiseMKLDNNKernel, + ops::EltwiseMKLDNNKernel) + +REGISTER_OP_KERNEL( + elementwise_div_grad, MKLDNN, paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index ad8fd317013908e8908dff8bea3440e24779454e..761b401ca9a2e535e1badfee834ef9ee98a07aae 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -15,20 +15,35 @@ #pragma once #include #include -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/framework/data_layout_transform.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace operators { -using framework::DataLayout; -using framework::Tensor; using dnnl::memory; using dnnl::primitive; using dnnl::stream; +using framework::DataLayout; +using framework::Tensor; + +inline std::vector CalculateBroadcastedDims(const Tensor* x, + const Tensor* y) { + const auto src_tz = phi::vectorize(x->dims()); + const auto dst_tz = phi::vectorize(y->dims()); + + size_t j = 0; + std::vector dst_tz_ex(src_tz.size(), 1); + for (size_t i = 0; i < src_tz.size(); ++i) { + dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++]; + if (j == dst_tz.size()) break; + } + + return dst_tz_ex; +} template class EltwiseMKLDNNKernel : public framework::OpKernel { @@ -103,7 +118,7 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { // operation. const bool reuse_x_memopry = x->numel() == z->numel() && x->IsSharedBufferWith(*z); - std::shared_ptr dst_memory = nullptr; + std::shared_ptr dst_memory; if (reuse_x_memopry) { dst_memory = src_x_memory; // NOTE(chenfeiyu): when the output reuses memory from other tensor rather @@ -135,19 +150,193 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { } }; -inline std::vector CalculateBroadcastedDims(const Tensor* x, - const Tensor* y) { - const auto src_tz = phi::vectorize(x->dims()); - const auto dst_tz = phi::vectorize(y->dims()); +template +class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); + using Tensor = framework::Tensor; - size_t j = 0; - std::vector dst_tz_ex(src_tz.size(), 1); - for (size_t i = 0; i < src_tz.size(); ++i) { - dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 
1 : dst_tz[j++]; - if (j == dst_tz.size()) break; - } + auto& dev_ctx = + ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); - return dst_tz_ex; -} + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* dout = ctx.Input(framework::GradVarName("Out")); + + int axis = ctx.Attr("axis"); + + auto tz = phi::vectorize(dout->dims()); + auto proto_type_dout = framework::TransToProtoVarType(dout->dtype()); + + platform::ReorderMKLDNNHandler reorder_handler( + tz, proto_type_dout, framework::ToMKLDNNDataType(proto_type_dout), + onednn_engine); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + dout->format(), platform::to_void_cast(dout->data())); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + + if (dx) { + std::shared_ptr dst_memory; + + // elementwise_add & elementwise_sub + if (BINARY_OP == dnnl::algorithm::binary_add || + BINARY_OP == dnnl::algorithm::binary_sub) { + dst_memory = reorder_handler.AcquireDstMemory(dx, dout->format(), + ctx.GetPlace()); + auto reorder_p = + reorder_handler.AcquireReorder(dst_memory, reorder_src_memory_p); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); + + reorder_p->execute(astream, *reorder_src_memory_p, *dst_memory); + } + + // elementwise_mul & elementwise_div + else { + platform::BinaryMKLDNNHandler binary_handler( + BINARY_OP, axis, onednn_engine, ctx.GetPlace(), dout, y, dx, 1.0f, + 1.0f, 1.0f); + + const auto src_dout_memory = binary_handler.AcquireSrcMemory(dout); + const auto src_y_memory = binary_handler.AcquireSecondSrcMemory(y); + dst_memory = binary_handler.AcquireDstMemory(dx); + + const auto binary_prim = binary_handler.AcquireForwardPrimitive(); + + const std::unordered_map args = { + {DNNL_ARG_SRC_0, *src_dout_memory}, + {DNNL_ARG_SRC_1, *src_y_memory}, + {DNNL_ARG_DST, *dst_memory}}; + + binary_prim->execute(astream, args); + } + astream.wait(); + + dx->set_layout(framework::DataLayout::kMKLDNN); + dx->set_format(platform::GetMKLDNNFormat(*dst_memory)); + } + + if (dy) { + dnnl::primitive_attr broadcast_reduction_attr; + std::shared_ptr broadcast_src_memory; + std::shared_ptr dst_memory; + + // elementwise_add & elementwise_sub + if (BINARY_OP == dnnl::algorithm::binary_add || + BINARY_OP == dnnl::algorithm::binary_sub) { + if (dout->dims() == dy->dims()) { + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + dy, dout->format(), ctx.GetPlace()); + + dnnl::primitive_attr reorder_attr; + std::vector scales(1); + scales[0] = (BINARY_OP == dnnl::algorithm::binary_add) ? 
1 : -1; + reorder_attr.set_output_scales(0, scales); + auto reorder_p = std::make_shared( + *(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); + reorder_p->execute(astream, *reorder_src_memory_p, + *reorder_dst_memory_p); + + dst_memory = reorder_dst_memory_p; + } else { + broadcast_src_memory = reorder_src_memory_p; + } + } + + // elementwise_mul & elementwise_div + else { + std::unordered_map args; + std::shared_ptr binary_prim; + std::shared_ptr post_op_memory; + std::shared_ptr src_0_memory; + std::shared_ptr src_1_memory; + + platform::BinaryMKLDNNHandler binary_handler( + dnnl::algorithm::binary_mul, axis, onednn_engine, ctx.GetPlace(), + dout, x, nullptr, 1.0f, 1.0f, 1.0f); + + src_1_memory = binary_handler.AcquireSecondSrcMemory(x); + + if (BINARY_OP == dnnl::algorithm::binary_div) { + platform::BinaryMKLDNNHandler post_op_binary_handler( + dnnl::algorithm::binary_div, axis, onednn_engine, ctx.GetPlace(), + y, y, nullptr, 1.0f, 1.0f, 1.0f); + + post_op_memory = post_op_binary_handler.AcquireSrcMemory(y); + + dnnl::post_ops po; + po.append_binary(dnnl::algorithm::binary_div, + post_op_memory->get_desc()); + + binary_handler = platform::BinaryMKLDNNHandler( + dnnl::algorithm::binary_mul, axis, onednn_engine, ctx.GetPlace(), + dout, out, nullptr, -1.0f, 1.0f, 1.0f, po); + + src_1_memory = binary_handler.AcquireSecondSrcMemory(out); + } + + src_0_memory = binary_handler.AcquireSrcMemory(dout); + + const auto dst_dy_memory = (dout->dims() == dy->dims()) + ? binary_handler.AcquireDstMemory(dy) + : binary_handler.AcquireDstMemory(); + + binary_prim = binary_handler.AcquireForwardPrimitive(); + args = {{DNNL_ARG_SRC_0, *src_0_memory}, + {DNNL_ARG_SRC_1, *src_1_memory}, + {DNNL_ARG_DST, *dst_dy_memory}}; + + if (BINARY_OP == dnnl::algorithm::binary_div) + args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, + *post_op_memory}); + + binary_prim->execute(astream, args); + broadcast_src_memory = dst_dy_memory; + dst_memory = dst_dy_memory; + } + astream.wait(); + dy->set_layout(DataLayout::kMKLDNN); + + if (dout->dims() != dy->dims()) { + // Broadcasting + if (BINARY_OP == dnnl::algorithm::binary_sub) { + dnnl::post_ops po; + po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0); + broadcast_reduction_attr.set_post_ops(po); + } + + platform::ReductionMKLDNNHandler reduction_handler( + dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, + ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy), + broadcast_reduction_attr); + dst_memory = reduction_handler.AcquireDstMemory(dy); + + auto reduction_p = reduction_handler.AcquireForwardPrimitive(); + + reduction_p->execute(astream, { + {DNNL_ARG_SRC, *broadcast_src_memory}, + {DNNL_ARG_DST, *dst_memory}, + }); + astream.wait(); + dy->set_format(platform::GetMKLDNNFormat(dst_memory->get_desc().reshape( + phi::vectorize(dy->dims())))); + } else { + dy->set_format(platform::GetMKLDNNFormat(*dst_memory)); + } + } + } +}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index c03794012ff3b793684222c62f423edd6e8637f1..0ef5c5e628ce62084305fc95e66862a15822ecb3 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ 
-1,127 +1,19 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - - auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - if (dx) { - // dx = dout*y - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, y, dx, 1.0f, 1.0f, 1.0f); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_y_memory = handler.AcquireSecondSrcMemory(y); - const auto dst_dx_memory = handler.AcquireDstMemory(dx); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_y_memory}, - {DNNL_ARG_DST, *dst_dx_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*dst_dx_memory)); - } - - if (dy) { - // dy = dout*x - // Handler is having nullptr passed instead of output tensor as - // we want Dst buffer to be allocated by oneDNN not to use Tensor - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, x, nullptr, 1.0f, 1.0f, 1.0f); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_x_memory = handler.AcquireSecondSrcMemory(x); - - // If broadcasting is in use then let's write to temporary - // buffer allocated by oneDNN - const auto dst_dy_memory = (dout->dims() == dy->dims()) - ? 
handler.AcquireDstMemory(dy) - : handler.AcquireDstMemory(); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_x_memory}, - {DNNL_ARG_DST, *dst_dy_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dy->set_layout(framework::DataLayout::kMKLDNN); - - // Reduction is needed for broadcasting scenario - if (dout->dims() != dy->dims()) { - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, mkldnn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy)); - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - // As source we use mem object with results from binary operation - reduction_p->execute(astream, {{DNNL_ARG_SRC, *dst_dy_memory}, - {DNNL_ARG_DST, *dy_memory_p}}); - astream.wait(); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - - } else { - dy->set_format(platform::GetMKLDNNFormat(*dst_dy_memory)); - } - } - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; REGISTER_OP_KERNEL( @@ -132,6 +24,8 @@ REGISTER_OP_KERNEL( ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) -REGISTER_OP_KERNEL(elementwise_mul_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::EltwiseMulMKLDNNGradKernel, - ops::EltwiseMulMKLDNNGradKernel) +REGISTER_OP_KERNEL( + elementwise_mul_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc index 3c799008a2abcf3fc59da7b759c9d43f3e940e8e..510373831eb6db5c7ffed6e8e58cbfb0ae268a50 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc @@ -1,5 +1,4 @@ - -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,113 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
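The multiply gradient kernel removed just above is the product rule: for out = x * y, dx = dout * y and dy = dout * x, each computed as a dnnl::algorithm::binary_mul primitive, with dy reduced to y's shape when broadcasting was used. A minimal same-shape reference sketch (mul_grad_reference is an illustrative name, not part of the patch):

```cpp
#include <cstddef>
#include <vector>

// Reference for the multiply backward pass (product rule) that the removed
// EltwiseMulMKLDNNGradKernel ran as two binary_mul primitives:
//   out = x * y  =>  dx = dout * y,  dy = dout * x.
// Same-shape case; with broadcasting, dy is then summed to y's shape,
// which is what ReductionMKLDNNHandler does in the kernel.
void mul_grad_reference(const std::vector<float>& x,
                        const std::vector<float>& y,
                        const std::vector<float>& dout,
                        std::vector<float>* dx, std::vector<float>* dy) {
  dx->resize(dout.size());
  dy->resize(dout.size());
  for (std::size_t i = 0; i < dout.size(); ++i) {
    (*dx)[i] = dout[i] * y[i];
    (*dy)[i] = dout[i] * x[i];
  }
}
```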
-#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseSubMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto& dev_ctx = - ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - auto tz = phi::vectorize(dout->dims()); - memory::data_type dout_type = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(dout->dtype())); - platform::ReorderMKLDNNHandler handler( - tz, framework::TransToProtoVarType(dout->dtype()), dout_type, - onednn_engine); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - auto reorder_src_memory_p = handler.AcquireSrcMemory( - dout->format(), platform::to_void_cast(dout->data())); - - if (dx) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace()); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - - reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); - astream.wait(); - - dx->set_layout(DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } - - if (dy) { - // Direct copy - if (dout->dims() == dy->dims()) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace()); - - dnnl::primitive_attr reorder_attr; - std::vector scales = {-1}; - reorder_attr.set_output_scales(0, scales); - auto reorder_p = std::make_shared( - *(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, - *reorder_dst_memory_p); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } else { - // Broadcasting - - dnnl::post_ops po; - po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0); - dnnl::primitive_attr attr; - attr.set_post_ops(po); - - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy), attr); - - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - - reduction_p->execute(astream, { - {DNNL_ARG_SRC, *reorder_src_memory_p}, - {DNNL_ARG_DST, *dy_memory_p}, - }); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - } - } - } -}; - -} // namespace operators -} // namespace paddle namespace ops = paddle::operators; @@ -131,6 +24,8 @@ REGISTER_OP_KERNEL( ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) 
-REGISTER_OP_KERNEL(elementwise_sub_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::EltwiseSubMKLDNNGradKernel, - ops::EltwiseSubMKLDNNGradKernel) +REGISTER_OP_KERNEL( + elementwise_sub_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc index 97a35a34f23e96707269482e29da13a15538cdca..9361edd43bf15ac0eee4a4de618027af79b78b56 100755 --- a/paddle/fluid/operators/expand_as_v2_op.cc +++ b/paddle/fluid/operators/expand_as_v2_op.cc @@ -12,7 +12,9 @@ limitations under the License. */ #include "paddle/fluid/operators/expand_as_v2_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -22,27 +24,6 @@ using framework::Tensor; class ExpandAsV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandAsV2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ExpandAsV2"); - auto x_dims = ctx->GetInputDim("X"); - auto target_shape = ctx->Attrs().Get>("target_shape"); - PADDLE_ENFORCE_GE( - target_shape.size(), static_cast(x_dims.size()), - platform::errors::InvalidArgument( - "The rank of target_shape must be greater than or equal " - "to the rank of Input(X). But received Input(X): input " - "rank %u; received target_shape: rank %u.", - x_dims.size(), target_shape.size())); - PADDLE_ENFORCE_LE(target_shape.size(), MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of target_shape must be less than or equal " - "to %d. 
But received: rank %u.", - MAX_RANK_SUPPORTED, target_shape.size())); - ctx->SetOutputDim("Out", phi::make_ddim(target_shape)); - } }; class ExpandAsV2OpMaker : public framework::OpProtoAndCheckerMaker { @@ -116,9 +97,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandAsV2GradNoNeedBufVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(expand_as_v2, ExpandAsInferShapeFunctor, + PD_INFER_META(phi::ExpandAsInferMeta)); REGISTER_OPERATOR(expand_as_v2, ops::ExpandAsV2Op, ops::ExpandAsV2OpMaker, ops::ExpandAsV2GradOpMaker, - ops::ExpandAsV2GradOpMaker); + ops::ExpandAsV2GradOpMaker, + ExpandAsInferShapeFunctor); REGISTER_OPERATOR(expand_as_v2_grad, ops::ExpandAsV2GradOp, ops::ExpandAsV2GradNoNeedBufVarsInferer); diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 9f7e4fb8d5749cf6bd54ed3e3bf9699199c0d3e6..70597be393c35e6939b83d86ce2f9be8f2c36805 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -28,13 +28,14 @@ __global__ void FindAbsMaxKernel(const T* in, const int n, T* out) { extern __shared__ char* shared_max_data_tmp[]; auto shared_max_data = reinterpret_cast(shared_max_data_tmp); if (gridDim.x > 1) { - shared_max_data[tid] = T(0); + T local_max_data = T(0); for (int i = bid; i < n; i += blockDim.x * gridDim.x) { T tmp = abs(in[i]); - if (tmp > shared_max_data[tid]) { - shared_max_data[tid] = tmp; + if (tmp > local_max_data) { + local_max_data = tmp; } } + shared_max_data[tid] = local_max_data; } else { if (bid < n) { shared_max_data[tid] = abs(in[bid]); @@ -83,13 +84,14 @@ __global__ void FindChannelAbsMaxKernelQuantAxis0(const T* in, const int n, int channel_size = n / c; const T* in_c = in + blockIdx.x * channel_size; extern __shared__ T shared_max_data[]; - shared_max_data[tid] = T(0); + T local_max_data = T(0); for (int i = tid; i < channel_size; i += blockDim.x) { T tmp = fabs(in_c[i]); - if (tmp > shared_max_data[tid]) { - shared_max_data[tid] = tmp; + if (tmp > local_max_data) { + local_max_data = tmp; } } + shared_max_data[tid] = local_max_data; __syncthreads(); for (int i = blockDim.x / 2; i > 0; i >>= 1) { if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) { @@ -113,13 +115,14 @@ __global__ void FindChannelAbsMaxKernelQuantAxis1(const T* in, const int n, int tid = threadIdx.x; int bid = blockIdx.x; const T* in_current = in + tid * cout_wh_size + bid * wh_size; - shared_max_data[tid] = T(0); + T local_max_data = T(0); for (int i = 0; i < wh_size; i++) { T tmp = fabs(in_current[i]); - if (tmp > shared_max_data[tid]) { - shared_max_data[tid] = tmp; + if (tmp > local_max_data) { + local_max_data = tmp; } } + shared_max_data[tid] = local_max_data; __syncthreads(); int len = blockDim.x; @@ -404,6 +407,19 @@ struct FindRangeAbsMaxFunctor { } }; +template +__global__ void FindMovingAverageAbsMaxKernel(const T* in_state, + const T* in_accum, + const T* cur_scale, const T rate, + T* out_state, T* out_accum, + T* out_scale) { + T state = rate * (*in_state) + T(1.0f); + T accum = rate * (*in_accum) + (*cur_scale); + *out_state = state; + *out_accum = accum; + *out_scale = accum / state; +} + template struct FindRangeAbsMaxFunctor; template @@ -415,29 +431,14 @@ struct FindMovingAverageAbsMaxFunctor { framework::Tensor* out_accum, framework::Tensor* out_scale) { const auto gpu_place = ctx.GetPlace(); - T accum; - T state; - T scale; - memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data(), - sizeof(T), 
ctx.stream()); - memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data(), - sizeof(T), ctx.stream()); - memory::Copy(platform::CPUPlace(), &scale, gpu_place, cur_scale, sizeof(T), - ctx.stream()); - ctx.Wait(); - T rate_t = static_cast(rate); - state = rate_t * state + static_cast(1.0); - accum = rate_t * accum + scale; - scale = accum / state; - - memory::Copy(gpu_place, out_accum->mutable_data(gpu_place), - platform::CPUPlace(), &accum, sizeof(T), ctx.stream()); - memory::Copy(gpu_place, out_state->mutable_data(gpu_place), - platform::CPUPlace(), &state, sizeof(T), ctx.stream()); - memory::Copy(gpu_place, out_scale->mutable_data(gpu_place), - platform::CPUPlace(), &scale, sizeof(T), ctx.stream()); - ctx.Wait(); + T* out_state_data = out_state->mutable_data(gpu_place); + T* out_accum_data = out_accum->mutable_data(gpu_place); + T* out_scale_data = out_scale->mutable_data(gpu_place); + + FindMovingAverageAbsMaxKernel<<<1, 1, 0, ctx.stream()>>>( + in_state.data(), in_accum.data(), cur_scale, rate_t, + out_state_data, out_accum_data, out_scale_data); } }; diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index 18c7187fc8e64c9fed8a86a984954b5420c1e5b5..a9b72a9cdf397f026f6ce24d83cc13066a3fd000 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -25,14 +25,16 @@ limitations under the License. */ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" #include "paddle/fluid/string/printf.h" +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/layer_norm_kernel.h" namespace framework = paddle::framework; namespace platform = paddle::platform; namespace memory = paddle::memory; USE_OP_ITSELF(dropout); -USE_OP(layer_norm); +USE_OP_ITSELF(layer_norm); template using CudnnDataType = platform::CudnnDataType; @@ -136,18 +138,23 @@ void LayerNorm(const std::vector> &scale, const platform::CUDADeviceContext &ctx) { framework::Scope scope; auto place = ctx.GetPlace(); + paddle::optional scale_opt = paddle::none; if (scale.size() > 0) { auto var_scale = scope.Var("Scale"); auto tensor_scale = var_scale->GetMutable(); framework::TensorFromVector(scale, ctx, tensor_scale); tensor_scale->Resize({cols}); + scale_opt = *tensor_scale; } + paddle::optional bias_opt = paddle::none; if (bias.size() > 0) { auto var_bias = scope.Var("Bias"); auto tensor_bias = var_bias->GetMutable(); framework::TensorFromVector(bias, ctx, tensor_bias); tensor_bias->Resize({cols}); + + bias_opt = *tensor_bias; } auto var_x = scope.Var("X"); @@ -157,20 +164,19 @@ void LayerNorm(const std::vector> &scale, auto var_y = scope.Var("Y"); auto tensor_y = var_y->GetMutable(); + tensor_y->Resize({rows, cols}); auto var_mean = scope.Var("Mean"); auto tensor_mean = var_mean->GetMutable(); + tensor_mean->Resize({rows}); auto var_variance = scope.Var("Variance"); auto tensor_variance = var_variance->GetMutable(); - - framework::AttributeMap attrs; - attrs.insert({"epsilon", epsilon}); - - auto op = framework::OpRegistry::CreateOp( - "layer_norm", {{"X", {"X"}}, {"Scale", {"Scale"}}, {"Bias", {"Bias"}}}, - {{"Y", {"Y"}}, {"Mean", {"Mean"}}, {"Variance", {"Variance"}}}, attrs); - op->Run(scope, place); + tensor_variance->Resize({rows}); + ctx.Wait(); + phi::LayerNormKernel(static_cast(ctx), *tensor_x, + scale_opt, bias_opt, 1e-5, 1, false, tensor_y, + tensor_mean, tensor_variance); 
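The FindMovingAverageAbsMaxFunctor rewrite above trades three device-to-host copies, a host-side scalar update, three host-to-device copies, and two ctx.Wait() synchronizations for one single-thread kernel launched on the same stream, so the update stays asynchronous. The update itself is an exponentially weighted moving average: state = rate * state + 1, accum = rate * accum + cur_scale, scale = accum / state. A host-side C++ sketch of the arithmetic follows (the struct and function names are illustrative, not part of the patch):

```cpp
// Scalar update performed by the new single-thread
// FindMovingAverageAbsMaxKernel, shown here as plain host C++:
//   state <- rate * state + 1
//   accum <- rate * accum + cur_scale
//   scale <- accum / state
// i.e. `scale` is an exponentially weighted moving average of the
// per-step abs-max, normalized by the accumulated weight `state`.
struct MovingAverageAbsMaxState {
  float state = 0.f;  // accumulated weight
  float accum = 0.f;  // accumulated (weighted) abs-max
  float scale = 0.f;  // current estimate: accum / state
};

inline void UpdateMovingAverageAbsMax(MovingAverageAbsMaxState* s,
                                      float rate, float cur_abs_max) {
  s->state = rate * s->state + 1.0f;
  s->accum = rate * s->accum + cur_abs_max;
  s->scale = s->accum / s->state;
}
```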
framework::TensorToVector(*tensor_y, ctx, y); framework::TensorToVector(*tensor_mean, ctx, means); framework::TensorToVector(*tensor_variance, ctx, vars); diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu index 032440d7f0478dc087e3ba38274f2a31a9a66a23..c7e1f4a5463fe11b9fa96f147b71004140130399 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu @@ -198,7 +198,6 @@ struct TestFusedLayernormResidualDropoutBias { residual_vec[i * cols + j] + out2[i * cols + j]; } } - LayerNorm(scale_vec, layernorm_bias_vec, correct_out, &correct_means, &correct_vars, &correct_layernorm_out, epsilon, rows, cols, *ctx); diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 6ee9582dacde372886075bb7c5619c6bc1b99c98..f6d3fd898469113dcffce76a84e4c292603707c6 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -15,9 +15,13 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -27,43 +31,6 @@ using Tensor = framework::Tensor; class GridSampleOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GridSampler"); - OP_INOUT_CHECK(ctx->HasInput("Grid"), "Input", "Grid", "GridSampler"); - OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output", "GridSampler"); - - auto x_dims = ctx->GetInputDim("X"); - auto grid_dims = ctx->GetInputDim("Grid"); - PADDLE_ENFORCE_EQ(x_dims.size(), 4, - platform::errors::InvalidArgument( - "Input(X) of GridSampleOp should be 4-D Tensor, but " - "received X dimension size(%d)", - x_dims.size())); - PADDLE_ENFORCE_EQ(grid_dims.size(), 4, - platform::errors::InvalidArgument( - "Input(Grid) of GridSampleOp should be 4-D Tensor, " - "but received X dimension size(%d)", - grid_dims.size())); - if (ctx->IsRuntime() || grid_dims[3] > 0) { - PADDLE_ENFORCE_EQ( - grid_dims[3], 2, - platform::errors::InvalidArgument( - "Input(Grid) dimension[3] should be 2, but received %d", - grid_dims[3])); - } - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - grid_dims[0], x_dims[0], - platform::errors::InvalidArgument( - "Input(X) and Input(Grid) dimension[0] should be equal, but " - "received X dimension[0](%d) != Grid dimension[0](%d)", - x_dims[0], grid_dims[0])); - } - - ctx->SetOutputDim("Output", - {x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]}); - ctx->ShareLoD("X", "Output"); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -173,18 +140,6 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker { class GridSampleOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", - 
framework::GradVarName("X"), "grid_sampler"); - auto input_dims = ctx->GetInputDim("X"); - auto grid_dims = ctx->GetInputDim("Grid"); - if (ctx->HasOutput(framework::GradVarName("X"))) { - ctx->SetOutputDim(framework::GradVarName("X"), input_dims); - } - if (ctx->HasOutput(framework::GradVarName("Grid"))) { - ctx->SetOutputDim(framework::GradVarName("Grid"), grid_dims); - } - } protected: framework::OpKernelType GetExpectedKernelType( @@ -224,10 +179,16 @@ class GridSampleGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(grid_sampler, GridSamplerInferShapeFunctor, + PD_INFER_META(phi::GridSampleBaseInferMeta)); REGISTER_OPERATOR(grid_sampler, ops::GridSampleOp, ops::GridSampleOpMaker, ops::GridSampleGradMaker, - ops::GridSampleGradMaker); -REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad); + ops::GridSampleGradMaker, + GridSamplerInferShapeFunctor); +DECLARE_INFER_SHAPE_FUNCTOR(grid_sampler_grad, GridSamplerGradInferShapeFunctor, + PD_INFER_META(phi::GeneralBinaryGradInferMeta)); +REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad, + GridSamplerGradInferShapeFunctor); REGISTER_OP_VERSION(grid_sampler) .AddCheckpoint( diff --git a/paddle/fluid/operators/kron_op.cc b/paddle/fluid/operators/kron_op.cc index 68d0c7978b4e45f216abd5fa5c4be93f788e8f04..60390016d66e3addf0ead14f6b9209511324961c 100644 --- a/paddle/fluid/operators/kron_op.cc +++ b/paddle/fluid/operators/kron_op.cc @@ -17,7 +17,9 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -26,27 +28,6 @@ class KronOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "kron"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "kron"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "kron"); - - auto dim_x = ctx->GetInputDim("X"); - auto dim_y = ctx->GetInputDim("Y"); - auto rank_x = dim_x.size(); - auto rank_y = dim_y.size(); - auto rank = (rank_x > rank_y) ? rank_x : rank_y; - - std::vector dim_out; - dim_out.reserve(rank); - for (int i = 0; i < rank; i++) { - int64_t dim_xi = (i < rank - rank_x) ? 1 : dim_x.at(i - (rank - rank_x)); - int64_t dim_yi = (i < rank - rank_y) ? 1 : dim_y.at(i - (rank - rank_y)); - dim_out.push_back(dim_xi == -1 || dim_yi == -1 ? 
-1 : dim_xi * dim_yi); - } - ctx->SetOutputDim("Out", phi::make_ddim(dim_out)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -173,7 +154,10 @@ class KronGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(kron, KronInferShapeFunctor, + PD_INFER_META(phi::KronInferMeta)); REGISTER_OPERATOR(kron, ops::KronOp, ops::KronOpMaker, ops::KronGradOpMaker, - ops::KronGradOpMaker); + ops::KronGradOpMaker, + KronInferShapeFunctor); REGISTER_OPERATOR(kron_grad, ops::KronGradOp); diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index 412ae3c49b5f3cc9fc2422aa220af324e6d99b69..c0a4b88fc76fd0d648b289e0d2f13536523f02d8 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -758,12 +758,14 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel( */ template -void ln_bwd_1024_kernel_driver( - const platform::CUDADeviceContext &dev_ctx, const int rows, const int cols, - float epsilon, const T *x_ptr, const ScaleT *scale_ptr, const U *mean_ptr, - const U *var_ptr, const T *dout_ptr, T *dx_ptr, ScaleT *dscale_ptr, - ScaleT *dbias_ptr, const MaskType *mask_ptr = nullptr, - T factor = static_cast(0), T *d_dropout_src_ptr = nullptr) { +void ln_bwd_1024_kernel_driver(const phi::GPUContext &dev_ctx, const int rows, + const int cols, float epsilon, const T *x_ptr, + const ScaleT *scale_ptr, const U *mean_ptr, + const U *var_ptr, const T *dout_ptr, T *dx_ptr, + ScaleT *dscale_ptr, ScaleT *dbias_ptr, + const MaskType *mask_ptr = nullptr, + T factor = static_cast(0), + T *d_dropout_src_ptr = nullptr) { auto stream = dev_ctx.stream(); if (cols == 1024) { // step-1: compute dx and reduced part results of dscale and dbias. @@ -1334,8 +1336,7 @@ static void LayerNormBackward( const U *mean, const U *var, T *d_x, LayerNormScaleBiasT *d_scale, LayerNormScaleBiasT *d_bias, float epsilon, - int64_t batch_size, int64_t feature_size, - const platform::CUDADeviceContext &dev_ctx) { + int64_t batch_size, int64_t feature_size, const phi::GPUContext &dev_ctx) { auto stream = dev_ctx.stream(); #ifdef __HIPCC__ const int kMaxBlockDim = 256; diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc index e7d676479be0cc1176fa27c477bd35a5d6787cd3..224ab748dab6cdf8be246c4b400b4e55b6faf675 100644 --- a/paddle/fluid/operators/layer_norm_op.cc +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/layer_norm_op.h" - #include #include +#include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -278,10 +277,3 @@ REGISTER_OPERATOR(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker, ops::LayerNormGradOpMaker); REGISTER_OPERATOR(layer_norm_grad, ops::LayerNormGradOp, ops::LayerNormGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - layer_norm, ops::LayerNormKernel, - ops::LayerNormKernel); -REGISTER_OP_CPU_KERNEL( - layer_norm_grad, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel); diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu deleted file mode 100644 index dfe73d3727132ae9b8f71e2a415ef5193f303493..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/layer_norm_op.cu +++ /dev/null @@ -1,289 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/operators/layer_norm_kernel.cu.h" -#include "paddle/fluid/operators/layer_norm_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -template -void LayerNormDirectCUDAFunctor::operator()(gpuStream_t stream, - const T *input, - std::vector input_shape, - const T *bias, const T *scale, - T *output, T *mean, T *variance, - int begin_norm_axis, float eps) { - const auto x_dims = phi::make_ddim(input_shape); - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int64_t batch_size = static_cast(matrix_dim[0]); - int64_t feature_size = static_cast(matrix_dim[1]); - switch (GetDesiredBlockDim(feature_size)) { - FIXED_BLOCK_DIM_CASE( - LayerNormForward<<>>( - input, scale, bias, output, mean, variance, eps, feature_size)); - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Product from begin_norm_axis to end in layer_norm must be larger " - "than 1")); - break; - } -} - -template -class LayerNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - using U = LayerNormParamType; - const float epsilon = ctx.Attr("epsilon"); - auto *scale = ctx.Input("Scale"); - auto *bias = ctx.Input("Bias"); - auto *x = ctx.Input("X"); - - auto *y = ctx.Output("Y"); - auto *mean = ctx.Output("Mean"); - auto *var = ctx.Output("Variance"); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - - const auto x_dims = x->dims(); - auto *x_data = x->data(); - auto *y_data = y->mutable_data(ctx.GetPlace()); - auto *mean_data = mean->mutable_data(ctx.GetPlace()); - auto *var_data = var->mutable_data(ctx.GetPlace()); - - auto *void_scale_data = (scale == nullptr ? nullptr : scale->data()); - auto *void_bias_data = (bias == nullptr ? 
nullptr : bias->data()); - - framework::proto::VarType::Type x_dtype = - framework::TransToProtoVarType(x->dtype()); - framework::proto::VarType::Type scale_bias_dtype; - if (void_scale_data != nullptr) { - scale_bias_dtype = framework::TransToProtoVarType(scale->dtype()); - if (void_bias_data != nullptr) { - PADDLE_ENFORCE_EQ(scale_bias_dtype, - framework::TransToProtoVarType(bias->dtype()), - platform::errors::InvalidArgument( - "Thie Scale and Bias of layer_norm op " - "should have the same data type.")); - } - } else { - scale_bias_dtype = (void_bias_data != nullptr - ? framework::TransToProtoVarType(bias->dtype()) - : x_dtype); - } - - bool is_scale_bias_same_dtype_with_x = x_dtype == scale_bias_dtype; - if (!is_scale_bias_same_dtype_with_x) { - PADDLE_ENFORCE_EQ(scale_bias_dtype, - framework::DataTypeTrait::DataType(), - platform::errors::InvalidArgument( - "Unsupported data type of Scale and Bias: %s", - framework::DataTypeToString(scale_bias_dtype))); - } - - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int64_t batch_size = static_cast(matrix_dim[0]); - int64_t feature_size = static_cast(matrix_dim[1]); - - auto stream = ctx.cuda_device_context().stream(); - -#define PADDLE_LAUNCH_LAYERNORM_FWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ - do { \ - switch (GetDesiredBlockDim(feature_size)) { \ - FIXED_BLOCK_DIM_CASE( \ - LayerNormForward<<< \ - batch_size, kBlockDim, 0, stream>>>( \ - x_data, static_cast(void_scale_data), \ - static_cast(void_bias_data), y_data, \ - mean_data, var_data, epsilon, feature_size)); \ - default: \ - PADDLE_THROW(platform::errors::InvalidArgument( \ - "Product from begin_norm_axis to end must be larger than 1")); \ - break; \ - } \ - } while (0) - -#ifdef PADDLE_WITH_CUDA - bool can_call_1024_kernel = false; - if (feature_size == 1024 && scale != nullptr && bias != nullptr) { - can_call_1024_kernel = true; - } - if (can_call_1024_kernel) { - const int WARPS_M = 4; - const int WARPS_N = 1; - const int THREADS_PER_WARP = 32; - const int BYTES_PER_LDG = 16; - const int VecSize = BYTES_PER_LDG / sizeof(T); - - const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; - const int ROWS_PER_CTA = WARPS_M; - - const int grid = static_cast( - std::ceil(batch_size / static_cast(ROWS_PER_CTA))); - if (is_scale_bias_same_dtype_with_x) { - ln_fwd_1024_kernel<<>>( - batch_size, feature_size, epsilon, x_data, - static_cast(void_scale_data), - static_cast(void_bias_data), mean_data, var_data, - y_data); - } else { - ln_fwd_1024_kernel<<>>( - batch_size, feature_size, epsilon, x_data, - static_cast(void_scale_data), - static_cast(void_bias_data), mean_data, var_data, - y_data); - } - } else { -#endif - if (is_scale_bias_same_dtype_with_x) { - PADDLE_LAUNCH_LAYERNORM_FWD(T, true); - } else { - PADDLE_LAUNCH_LAYERNORM_FWD(U, false); - } -#ifdef PADDLE_WITH_CUDA - } -#endif - -#undef PADDLE_LAUNCH_LAYERNORM_FWD - } -}; - -template -class LayerNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - using U = LayerNormParamType; - const float epsilon = ctx.Attr("epsilon"); - // d_x, d_scale, d_bias may be nullptr - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - auto *x = ctx.Input("X"); - auto *mean = ctx.Input("Mean"); - auto *var = ctx.Input("Variance"); - auto *scale = ctx.Input("Scale"); - auto *bias = ctx.Input("Bias"); - auto *d_y = 
ctx.Input(framework::GradVarName("Y")); - - const auto &x_dims = x->dims(); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int64_t batch_size = static_cast(matrix_dim[0]); - int64_t feature_size = static_cast(matrix_dim[1]); - - auto *x_data = x->data(); - auto *d_y_data = d_y->data(); - - auto *mean_data = mean->data(); - auto *var_data = var->data(); - - auto *d_x_data = - (d_x == nullptr ? nullptr : d_x->mutable_data(ctx.GetPlace())); - - framework::proto::VarType::Type x_dtype = - framework::TransToProtoVarType(x->dtype()); - framework::proto::VarType::Type scale_bias_dtype; - if (scale != nullptr) { - scale_bias_dtype = framework::TransToProtoVarType(scale->dtype()); - } else { - // FIXME(zengjinle): do not find a better way to get the right - // data type of the d_scale and d_bias if scale == nullptr. - auto *bias = ctx.Input("Bias"); - if (bias != nullptr) { - scale_bias_dtype = framework::TransToProtoVarType(bias->dtype()); - } else { - scale_bias_dtype = x_dtype; - } - } - -#define PADDLE_LAUNCH_LAYERNORM_BWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ - do { \ - auto *scale_data = \ - (scale == nullptr ? nullptr : scale->data()); \ - auto *d_scale_data = \ - (d_scale == nullptr ? nullptr : d_scale->mutable_data( \ - ctx.GetPlace())); \ - auto *d_bias_data = \ - (d_bias == nullptr ? nullptr : d_bias->mutable_data( \ - ctx.GetPlace())); \ - auto *d_x_data = \ - (d_x == nullptr ? nullptr : d_x->mutable_data(ctx.GetPlace())); \ - LayerNormBackward( \ - x_data, d_y_data, scale_data, mean_data, var_data, d_x_data, \ - d_scale_data, d_bias_data, epsilon, batch_size, feature_size, \ - ctx.cuda_device_context()); \ - } while (0) - - if (scale_bias_dtype == x_dtype) { - PADDLE_LAUNCH_LAYERNORM_BWD(T, true); - } else { - PADDLE_LAUNCH_LAYERNORM_BWD(U, false); - } - -#undef PADDLE_LAUNCH_LAYERNORM_BWD - } -}; - -template class LayerNormDirectCUDAFunctor; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_CUDA_KERNEL( - layer_norm, - ops::LayerNormKernel, - ops::LayerNormKernel); -REGISTER_OP_CUDA_KERNEL( - layer_norm_grad, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel); -#elif CUDNN_VERSION_MIN(8, 1, 0) -REGISTER_OP_CUDA_KERNEL( - layer_norm, - ops::LayerNormKernel, - ops::LayerNormKernel, - ops::LayerNormKernel, - ops::LayerNormKernel); -REGISTER_OP_CUDA_KERNEL( - layer_norm_grad, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel); -#else -REGISTER_OP_CUDA_KERNEL( - layer_norm, - ops::LayerNormKernel, - ops::LayerNormKernel, - ops::LayerNormKernel); -REGISTER_OP_CUDA_KERNEL( - layer_norm_grad, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel); -#endif diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h deleted file mode 100644 index 9d70b7cf707437136bf358d31ea6fd4cc0f2a534..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/layer_norm_op.h +++ /dev/null @@ -1,374 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \ - !defined(__OSX__) -#include "paddle/fluid/operators/jit/kernels.h" -#endif -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace platform { -class CPUDeviceContext; -class CUDADeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { - -// Wrap RowwiseMean and ColwiseMean. -// Reuse the cpu codes and replace the gpu codes with cublas_gemv, which is -// significantly faster. Unlike the RowwiseMean and ColwiseMean, the -// implementation only considers 2D. -template -struct RowwiseMean2D { - RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx); - - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* vec); -}; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class RowwiseMean2D { - public: - RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx) - : left_(left), right_(right) { - framework::DDim ones_dim({right_}); - divisor_.mutable_data(ones_dim, dev_ctx.GetPlace()); - phi::funcs::set_constant(dev_ctx, &divisor_, 1.0 / right); - } - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { - phi::funcs::GetBlas(context).GEMV( - false, left_, right_, 1., input.data(), divisor_.data(), 0., - out->data()); - } - - private: - int left_; - int right_; - framework::Tensor divisor_; -}; -#endif - -template -class RowwiseMean2D { - public: - RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx) {} - - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { - row_mean_(context, input, out); - } - - private: - phi::funcs::RowwiseMean row_mean_; -}; - -template -struct ColwiseSum2D { - ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx); - - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* vec); -}; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class ColwiseSum2D { - public: - ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx) - : left_(left), right_(right) { - framework::DDim ones_dim({left_}); - divisor_.mutable_data(ones_dim, dev_ctx.GetPlace()); - phi::funcs::set_constant(dev_ctx, &divisor_, 1.0); - } - - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { - phi::funcs::GetBlas(context).GEMV( - true, left_, right_, 1., input.data(), divisor_.data(), 0., - out->data()); - } - - private: - int left_; - int right_; - framework::Tensor divisor_; -}; -#endif - -template -class ColwiseSum2D { - public: - ColwiseSum2D(int left, int right, 
const platform::DeviceContext& dev_ctx) {} - - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { - col_wise_(context, input, out); - } - - private: - phi::funcs::ColwiseSum col_wise_; -}; - -template -struct SubAndSquareFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); } -}; - -template -struct DivAndSqrtFunctor { - explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; } - inline HOSTDEVICE T operator()(T a, T b) const { - return a / (sqrt(b + epsilon_)); - } - - private: - T epsilon_; -}; - -template -struct MulInvVarFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { - return a * std::sqrt(1.0 / b); - } -}; - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DataLayout = framework::DataLayout; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class LayerNormDirectCUDAFunctor { - public: - void operator()(gpuStream_t stream, const T* input, - std::vector input_shape, const T* bias, const T* scale, - T* output, T* mean, T* variance, int begin_norm_axis, - float eps); -}; -#endif - -template -class LayerNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto x = *ctx.Input("X"); - - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("Mean"); - auto* var = ctx.Output("Variance"); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - - const auto x_dims = x.dims(); - - y->mutable_data(ctx.GetPlace()); - mean->mutable_data(ctx.GetPlace()); - var->mutable_data(ctx.GetPlace()); - - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int left = static_cast(matrix_dim[0]); - int right = static_cast(matrix_dim[1]); - framework::DDim matrix_shape({left, right}); - - x.Resize(matrix_shape); - Tensor out; - out.ShareDataWith(*y); - out.Resize(matrix_shape); - -#if defined(PADDLE_WITH_CUDA) || defined(_WIN32) || defined(__APPLE__) || \ - defined(__OSX__) - auto& dev_ctx = ctx.template device_context(); - RowwiseMean2D row_mean(left, right, ctx.device_context()); - - // get mean - row_mean(dev_ctx, x, mean); - - // get variance - ElementwiseComputeEx, DeviceContext, T>( - ctx, &x, mean, /*axis*/ 0, SubAndSquareFunctor(), &out); - row_mean(dev_ctx, out, var); - - // get x_norm - ElementwiseComputeEx, DeviceContext, T>( - ctx, &x, mean, /*axis*/ 0, SubFunctor(), &out); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &out, var, /*axis*/ 0, - DivAndSqrtFunctor(static_cast(epsilon)), &out); - - if (scale) { - ElementwiseComputeEx, DeviceContext, T>( - ctx, &out, scale, /*axis*/ 1, MulFunctor(), &out); - } - if (bias) { - ElementwiseComputeEx, DeviceContext, T>( - ctx, &out, bias, /*axis*/ 1, AddFunctor(), &out); - } -#else - PADDLE_ENFORCE_EQ(mean->numel(), left, - platform::errors::InvalidArgument( - "mean's length (%d) is not equal with expected (%d).", - mean->numel(), left)); - PADDLE_ENFORCE_EQ(var->numel(), left, - platform::errors::InvalidArgument( - "var's length (%d) is not equal with expected (%d).", - var->numel(), left)); - if (scale) { - PADDLE_ENFORCE_EQ( - scale->numel(), right, - platform::errors::InvalidArgument( - "scale's length (%d) is not equal with expected (%d).", - scale->numel(), right)); - } - if (bias) { - PADDLE_ENFORCE_EQ( - bias->numel(), right, - platform::errors::InvalidArgument( - "bias's 
length (%d) is not equal with expected (%d).", - bias->numel(), right)); - } - - auto ker = - jit::KernelFuncs, platform::CPUPlace>::Cache() - .At(right); - ker(x.data(), out.data(), mean->data(), var->data(), - scale ? scale->data() : nullptr, bias ? bias->data() : nullptr, - static_cast(left), static_cast(epsilon), right); -#endif - } -}; - -template -class LayerNormGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - auto x = *ctx.Input("X"); - auto* mean = ctx.Input("Mean"); - auto* var = ctx.Input("Variance"); - auto* scale = ctx.Input("Scale"); - auto d_y = *ctx.Input(framework::GradVarName("Y")); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - - // init output - auto* d_x = ctx.Output(framework::GradVarName("X")); - auto* d_scale = ctx.Output(framework::GradVarName("Scale")); - auto* d_bias = ctx.Output(framework::GradVarName("Bias")); - - const auto& x_dims = x.dims(); - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int left = static_cast(matrix_dim[0]); - int right = static_cast(matrix_dim[1]); - framework::DDim matrix_shape({left, right}); - - d_y.Resize(matrix_shape); - auto& dev_ctx = ctx.template device_context(); - ColwiseSum2D colwise_sum(left, right, - ctx.device_context()); - - Tensor temp; - Tensor temp_norm; - if (d_scale || d_x) { - x.Resize(matrix_shape); - temp.mutable_data(matrix_shape, ctx.GetPlace()); - - temp_norm.mutable_data(matrix_shape, ctx.GetPlace()); - // get x_norm - ElementwiseComputeEx, DeviceContext, T>( - ctx, &x, mean, /*axis*/ 0, SubFunctor(), &temp_norm); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp_norm, var, /*axis*/ 0, - DivAndSqrtFunctor(static_cast(epsilon)), &temp_norm); - } - - if (d_bias) { - d_bias->mutable_data(ctx.GetPlace()); - colwise_sum(dev_ctx, d_y, d_bias); - } - if (d_scale) { - d_scale->mutable_data(ctx.GetPlace()); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp_norm, &d_y, /*axis*/ 0, MulFunctor(), &temp); - colwise_sum(dev_ctx, temp, d_scale); - } - - if (d_x) { - framework::DDim vec_shape({left}); - d_x->mutable_data(ctx.GetPlace()); - auto dx_dim = d_x->dims(); - Tensor temp_vec; - temp_vec.mutable_data(vec_shape, ctx.GetPlace()); - - RowwiseMean2D row_mean(left, right, - ctx.device_context()); - - if (d_scale) { - // dy_dx - ElementwiseComputeEx, DeviceContext, T>( - ctx, &d_y, scale, /*axis*/ 1, MulFunctor(), &temp); - framework::TensorCopy(temp, ctx.GetPlace(), ctx.device_context(), d_x); - - // dy_dmean_dx - row_mean(dev_ctx, temp, &temp_vec); - ElementwiseComputeEx, DeviceContext, T>( - ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor(), d_x); - - // dy_var_dx - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp, &temp_norm, /*axis*/ 0, MulFunctor(), &temp); - } else { - // dy_dx - framework::TensorCopy(d_y, ctx.GetPlace(), ctx.device_context(), d_x); - - // dy_dmean_dx - row_mean(dev_ctx, d_y, &temp_vec); - ElementwiseComputeEx, DeviceContext, T>( - ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor(), d_x); - - // dy_var_dx - ElementwiseComputeEx, DeviceContext, T>( - ctx, &d_y, &temp_norm, /*axis*/ 0, MulFunctor(), &temp); - } - // dy_var_dx - row_mean(dev_ctx, temp, &temp_vec); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp_norm, &temp_vec, /*axis*/ 0, MulFunctor(), &temp); - ElementwiseComputeEx, DeviceContext, T>( - ctx, d_x, &temp, /*axis*/ 0, SubFunctor(), d_x); - - ElementwiseComputeEx, DeviceContext, T>( - ctx, d_x, var, /*axis*/ 0, - 
DivAndSqrtFunctor(static_cast(epsilon)), d_x); - d_x->Resize(dx_dim); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/layer_norm_op_npu.cc b/paddle/fluid/operators/layer_norm_op_npu.cc index c88880b43fff9fccd9764f145fba8ca4c61343c7..3c7e5bf9593e0ae2b3d8c04db1467c3b8fd1e174 100644 --- a/paddle/fluid/operators/layer_norm_op_npu.cc +++ b/paddle/fluid/operators/layer_norm_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/layer_norm_op_xpu.cc b/paddle/fluid/operators/layer_norm_op_xpu.cc index 0480a354c8bd8fdb81c95a576f57e9a12019ffc9..3b21a55f8df0dbb532729cf5cbca4c7362223b9c 100644 --- a/paddle/fluid/operators/layer_norm_op_xpu.cc +++ b/paddle/fluid/operators/layer_norm_op_xpu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index 6e2ac4617da4df8e4ebaf92d4193ef8b3368b97a..2414ae68438fd4e3cff94d60f400063b72116714 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -18,9 +18,9 @@ limitations under the License. */ #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/set_value_op.h" #include "paddle/fluid/operators/svd_helper.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/funcs/tril_triu_compute.h" -#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/triangular_solve_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc index 812c55cdd5055186d7fd83a2057d88256f3b34a3..2e82b47e8da1c6eb6f4a05fc4f7f356110f9fff1 100644 --- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/phi/common/data_type.h" namespace paddle { namespace operators { @@ -139,7 +140,7 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { layer_norm_p->execute(astream, args); astream.wait(); - y->set_layout(DataLayout::kMKLDNN); + y->set_layout(phi::DataLayout::kMKLDNN); y->set_format(platform::GetMKLDNNFormat(*dst_memory)); } }; diff --git a/paddle/fluid/operators/mode_op.cc b/paddle/fluid/operators/mode_op.cc index c7fb92cd5107cee12e0995948e320ef3ed616f4d..9c16ccb138f7da56568ce6224dc30deb5bbccb7f 100644 --- a/paddle/fluid/operators/mode_op.cc +++ b/paddle/fluid/operators/mode_op.cc @@ -12,10 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
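The layer_norm_op.h deleted above is the old fluid CPU/GPU header: it flattens X to a [left, right] matrix at begin_norm_axis, takes a per-row mean and variance, then normalizes and applies the optional per-column scale and bias. The GPU RowwiseMean2D even reuses a cuBLAS GEMV against a constant 1/right vector instead of a custom reduction, and the non-CUDA CPU path dispatches to a JIT LayerNorm kernel. A minimal standalone sketch of the same forward math, using a hypothetical LayerNormRef helper rather than the Paddle API:

#include <cmath>
#include <vector>

// Reference LayerNorm over a [left, right] matrix: each of the `left` rows is
// normalized across its `right` elements, matching the deleted kernel's
// flatten_to_2d(x_dims, begin_norm_axis) layout. The a / sqrt(b + eps) step
// below is what DivAndSqrtFunctor computes. Hypothetical helper, not Paddle.
void LayerNormRef(const std::vector<float>& x, int left, int right,
                  const std::vector<float>& scale,  // size `right`, may be empty
                  const std::vector<float>& bias,   // size `right`, may be empty
                  float epsilon, std::vector<float>* y,
                  std::vector<float>* mean, std::vector<float>* var) {
  y->assign(x.size(), 0.f);
  mean->assign(left, 0.f);
  var->assign(left, 0.f);
  for (int i = 0; i < left; ++i) {
    float m = 0.f;
    for (int j = 0; j < right; ++j) m += x[i * right + j];
    m /= right;
    float v = 0.f;  // variance is stored without epsilon, as in the kernel
    for (int j = 0; j < right; ++j) {
      float d = x[i * right + j] - m;
      v += d * d;
    }
    v /= right;
    (*mean)[i] = m;
    (*var)[i] = v;
    const float inv_std = 1.f / std::sqrt(v + epsilon);
    for (int j = 0; j < right; ++j) {
      float out = (x[i * right + j] - m) * inv_std;
      if (!scale.empty()) out *= scale[j];
      if (!bias.empty()) out += bias[j];
      (*y)[i * right + j] = out;
    }
  }
}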
*/ -#include "paddle/fluid/operators/mode_op.h" #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { @@ -23,43 +27,6 @@ class ModeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "mode"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "mode"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "mode"); - - auto input_dims = ctx->GetInputDim("X"); - const int& dim_size = input_dims.size(); - int axis = static_cast(ctx->Attrs().Get("axis")); - PADDLE_ENFORCE_EQ( - (axis < dim_size) && (axis >= (-1 * dim_size)), true, - paddle::platform::errors::InvalidArgument( - "the axis of ModeOp must be [-%d, %d), but you set axis is %d", - dim_size, dim_size, axis)); - PADDLE_ENFORCE_GE(input_dims.size(), 1, - paddle::platform::errors::InvalidArgument( - "input of ModeOp must have >= 1d shape")); - if (axis < 0) axis += dim_size; - bool keepdim = ctx->Attrs().Get("keepdim"); - std::vector dimvec; - for (int64_t i = 0; i < axis; i++) { - dimvec.emplace_back(input_dims[i]); - } - if (keepdim) { - dimvec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < dim_size; i++) { - dimvec.emplace_back(input_dims[i]); - } - framework::DDim dims = phi::make_ddim(dimvec); - PADDLE_ENFORCE_GE(input_dims.size(), 1, platform::errors::InvalidArgument( - "input shape should >= 1d")); - ctx->SetOutputDim("Out", dims); - ctx->SetOutputDim("Indices", dims); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -138,18 +105,11 @@ class ModeGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(mode, ModeInferShapeFunctor, + PD_INFER_META(phi::ModeInferMeta)); REGISTER_OPERATOR(mode, ops::ModeOp, ops::ModeOpMaker, ops::ModeGradOpMaker, - ops::ModeGradOpMaker); -REGISTER_OP_CPU_KERNEL(mode, - ops::ModeCPUKernel, - ops::ModeCPUKernel, - ops::ModeCPUKernel, - ops::ModeCPUKernel); - + ops::ModeGradOpMaker, + ModeInferShapeFunctor); REGISTER_OPERATOR(mode_grad, ops::ModeOpGrad); -REGISTER_OP_CPU_KERNEL( - mode_grad, ops::ModeGradCPUKernel, - ops::ModeGradCPUKernel, - ops::ModeGradCPUKernel, - ops::ModeGradCPUKernel); diff --git a/paddle/fluid/operators/mode_op.cu b/paddle/fluid/operators/mode_op.cu deleted file mode 100644 index 2bacda8afb0eb340c4c8d4068f3013e2adbc7f91..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/mode_op.cu +++ /dev/null @@ -1,232 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/mode_op.h" -#include "paddle/fluid/operators/top_k_function_cuda.h" - -namespace paddle { -namespace operators { - -int ComputeBlockSize(int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; -} - -template -void getModebySort(const platform::CUDADeviceContext& ctx, - const framework::Tensor* input_tensor, - const int64_t num_cols, const int64_t num_rows, - T* out_tensor, int64_t* indices_tensor) { - framework::Tensor input_tmp; - framework::TensorCopy(*input_tensor, ctx.GetPlace(), &input_tmp); - T* input_tmp_data = input_tmp.mutable_data(ctx.GetPlace()); - input_tmp.Resize(phi::make_ddim({num_rows, num_cols})); - thrust::device_ptr out_tensor_ptr(out_tensor); - thrust::device_ptr indices_tensor_ptr(indices_tensor); - - for (int64_t i = 0; i < num_rows; ++i) { - T* begin = input_tmp_data + num_cols * i; - T* end = input_tmp_data + num_cols * (i + 1); - thrust::device_vector indices_data(num_cols); - thrust::sequence(thrust::device, indices_data.begin(), - indices_data.begin() + num_cols); - thrust::sort_by_key(thrust::device, begin, end, indices_data.begin()); - int unique = 1 + thrust::inner_product(thrust::device, begin, end - 1, - begin + 1, 0, thrust::plus(), - thrust::not_equal_to()); - thrust::device_vector keys_data(unique); - thrust::device_vector cnts_data(unique); - thrust::reduce_by_key(thrust::device, begin, end, - thrust::constant_iterator(1), keys_data.begin(), - cnts_data.begin()); - auto it = thrust::max_element(thrust::device, cnts_data.begin(), - cnts_data.begin() + unique); - T mode = keys_data[it - cnts_data.begin()]; - int64_t counts = cnts_data[it - cnts_data.begin()]; - auto pos = thrust::find(thrust::device, begin, end, mode); - int64_t index = indices_data[pos - begin + counts - 1]; - out_tensor_ptr[i] = static_cast(mode); - indices_tensor_ptr[i] = static_cast(index); - } -} - -template -class ModeOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = static_cast(ctx.Attr("axis")); - bool keepdim = static_cast(ctx.Attr("keepdim")); - - // get the input dims - const auto& in_dims = input->dims(); - // calcluate the real axis - if (axis < 0) axis += in_dims.size(); - - auto out_dims = output->dims(); - - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - getModebySort(dev_ctx, input, input_width, input_height, output_data, - indices_data); - } else { - std::vector trans_axis; - for (int i = 0; i < axis; i++) 
{ - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(axis); - - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dim); - indices->Resize(tmp_out_dim); - } - - framework::DDim trans_shape(in_dims); - framework::DDim trans_out_shape(in_dims); - for (int i = 0; i < trans_axis.size(); i++) { - trans_shape[i] = in_dims[trans_axis[i]]; - trans_out_shape[i] = in_dims[trans_axis[i]]; - } - trans_out_shape[in_dims.size() - 1] = 1; - - // second step, tranpose the input - framework::Tensor trans_input; - trans_input.mutable_data(trans_shape, ctx.GetPlace()); - int ndims = trans_axis.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - TransCompute(ndims, dev_ctx, *input, - &trans_input, trans_axis); - framework::Tensor trans_ind; - int64_t* trans_ind_data = - trans_ind.mutable_data(trans_out_shape, ctx.GetPlace()); - framework::Tensor trans_out; - T* trans_out_data = - trans_out.mutable_data(trans_out_shape, ctx.GetPlace()); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); - const int64_t input_width = trans_shape[trans_shape.size() - 1]; - getModebySort(dev_ctx, &trans_input, input_width, input_height, - trans_out_data, trans_ind_data); - // last step, tranpose back the indices and output - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans_axis); - TransCompute(ndims, dev_ctx, trans_out, - output, trans_axis); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class ModeOpGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = context.Attr("axis"); - - const auto& in_dims = x->dims(); - auto out_dims = indices->dims(); - - if (axis < 0) axis += in_dims.size(); - // allocate the cuda memory for the x_grad - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - const T* out_grad_data = out_grad->data(); - const int64_t* indices_data = indices->data(); - - int pre, n, post; - GetDims(in_dims, axis, &pre, &n, &post); - - // calcluate the block and grid num - auto& dev_ctx = context.cuda_device_context(); - int block_size = ComputeBlockSize(post); - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); - int grid_size = std::min(max_blocks, pre); - AssignGradWithAxis<<>>( - out_grad_data, indices_data, x_grad_data, pre, post, n, 1); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - mode, ops::ModeOpCUDAKernel, - ops::ModeOpCUDAKernel, - ops::ModeOpCUDAKernel, - ops::ModeOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - mode_grad, - 
ops::ModeOpGradCUDAKernel, - ops::ModeOpGradCUDAKernel, - ops::ModeOpGradCUDAKernel, - ops::ModeOpGradCUDAKernel); diff --git a/paddle/fluid/operators/mode_op.h b/paddle/fluid/operators/mode_op.h deleted file mode 100644 index 76d356ed16eb3f81b10d541230f49b73fd836543..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/mode_op.h +++ /dev/null @@ -1,317 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { - -template -static void getMode(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, Type* t_indices) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(i, j), j)); - } - } - std::sort(col_vec.begin(), col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - T mode = 0; - int64_t indice = 0; - int64_t cur_freq = 0; - int64_t max_freq = 0; - for (int64_t i = 0; i < input_width; ++i) { - ++cur_freq; - if (i == input_width - 1 || (col_vec[i + 1].first != col_vec[i].first)) { - if (cur_freq > max_freq) { - max_freq = cur_freq; - mode = col_vec[i].first; - indice = col_vec[i].second; - } - cur_freq = 0; - } - } - t_out[i] = mode; - t_indices[i] = indice; - } -} - -template -static void ModeAssign(const Type& input_height, const Type& input_width, - const int& input_dim, const framework::Tensor* input, - const framework::Tensor* indices, T* output_data) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - auto e_indices = framework::EigenVector::Flatten(*indices); - output_data[i * input_width + e_indices(0)] = e_input(0); - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = - framework::EigenMatrix::Reshape(*indices, input_dim - 1); - output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); - } - } -} - -template -class ModeCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto* indices = context.Output("Indices"); - const auto& in_dims = 
input->dims(); - bool keepdim = static_cast(context.Attr("keepdim")); - - // axis < 0, cacluate the real axis - int axis = static_cast(context.Attr("axis")); - if (axis < 0) axis += in_dims.size(); - - T* output_data = output->mutable_data(context.GetPlace()); - int64_t* indices_data = indices->mutable_data(context.GetPlace()); - auto out_dims = output->dims(); - // if axis is not the last dim, transpose it to the last dim, do the - // calculation, - // then tranpose it back to orginal axis. - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - getMode(input_height, input_width, in_dims.size(), input, - output_data, indices_data); - } else { - std::vector trans_axis; - for (int i = 0; i < axis; i++) { - trans_axis.emplace_back(i); - } - trans_axis.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(axis); - - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dim); - indices->Resize(tmp_out_dim); - } - - // get the trans input_dims, out_dims - framework::DDim trans_shape(in_dims); - framework::DDim trans_out_shape(in_dims); - - for (size_t i = 0; i < trans_axis.size(); i++) { - trans_shape[i] = in_dims[trans_axis[i]]; - trans_out_shape[i] = in_dims[trans_axis[i]]; - } - trans_out_shape[in_dims.size() - 1] = 1; - - framework::Tensor trans_input; - trans_input.mutable_data(trans_shape, context.GetPlace()); - int ndims = trans_axis.size(); - auto& dev_context = - context.template device_context(); - - // transpose the input value - TransCompute(ndims, dev_context, *input, - &trans_input, trans_axis); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); - const int64_t input_width = trans_shape[trans_shape.size() - 1]; - framework::Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_out_shape, context.GetPlace()); - framework::Tensor tmp_indices; - auto* t_ind = tmp_indices.mutable_data(trans_out_shape, - context.GetPlace()); - - getMode(input_height, input_width, in_dims.size(), - &trans_input, t_out, t_ind); - // transpose back - TransCompute( - ndims, dev_context, tmp_indices, indices, trans_axis); - TransCompute(ndims, dev_context, tmp_out, - output, trans_axis); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class ModeGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = static_cast(context.Attr("axis")); - bool keepdim = static_cast(context.Attr("keepdim")); - - auto in_dims = x->dims(); - auto out_dims = indices->dims(); - - // axis < 0, get the real axis - axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(out_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(out_dims[i - 1]); - } - out_dims = phi::make_ddim(tmp_out_shape); - } - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - if (axis == in_dims.size() - 1) { - // allocate the memory for the input_grad - // assign the out_grad to input_grad directly - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - // init the output grad with 0, because some input elements has no grad - memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); - // Assign the output_grad to input_grad - if (keepdim) { - ModeAssign(input_height, input_width, in_dims.size(), out_grad, indices, - x_grad_data); - } else { - auto& dev_context = - context.template device_context(); - framework::Tensor out_grad_tmp; - framework::Tensor indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - ModeAssign(input_height, input_width, in_dims.size(), &out_grad_tmp, - &indices_tmp, x_grad_data); - } - } else { - // can not assign grad to input_grad, must do the transpose - std::vector trans_axis; - for (int i = 0; i < axis; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(out_dims.size() - 1); - for (int i = axis + 1; i < out_dims.size() - 1; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(axis); - framework::DDim trans_shape(out_dims); - framework::DDim trans_in_shape(in_dims); - for (size_t i = 0; i < trans_axis.size(); i++) { - trans_shape[i] = out_dims[trans_axis[i]]; - trans_in_shape[i] = in_dims[trans_axis[i]]; - } - // transpose the out_grad, indices - framework::Tensor trans_dO; - trans_dO.mutable_data(trans_shape, context.GetPlace()); - framework::Tensor trans_ind; - trans_ind.mutable_data(trans_shape, context.GetPlace()); - int ndims = trans_axis.size(); - auto& dev_context = - context.template device_context(); - - if (keepdim) { - // Do transpose - TransCompute( - ndims, dev_context, *out_grad, &trans_dO, trans_axis); - TransCompute( - ndims, dev_context, *indices, &trans_ind, trans_axis); - } else { - framework::Tensor out_grad_tmp; - framework::Tensor indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - // Do transpose - TransCompute( - ndims, dev_context, out_grad_tmp, &trans_dO, trans_axis); - TransCompute( - ndims, dev_context, indices_tmp, &trans_ind, trans_axis); - } - const int64_t input_height = phi::product( - phi::slice_ddim(trans_in_shape, 0, trans_in_shape.size() - 1)); - const int64_t input_width = trans_in_shape[trans_in_shape.size() - 1]; - - // Assign the out_grad to tranpose input_grad - framework::Tensor 
tmp_out; - T* t_out = tmp_out.mutable_data(trans_in_shape, context.GetPlace()); - memset(t_out, 0, x_grad->numel() * sizeof(T)); - - ModeAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out); - - // Transpose back - TransCompute(ndims, dev_context, tmp_out, - x_grad, trans_axis); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index a512e7dcd682b517f64e3b14e2f35c4c539ec8b4..9fd66590cb7298d62a4720ff3a8276eca49df884 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/roi_pool_op.h" #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/kernels/roi_pool_kernel.h" namespace paddle { namespace operators { @@ -57,7 +58,7 @@ class ROIPoolOp : public framework::OperatorWithKernel { "%d-dimensional LoDTensor", rois_dims.size())); PADDLE_ENFORCE_EQ( - rois_dims[1], kROISize, + rois_dims[1], phi::kROISize, platform::errors::InvalidArgument( "ROIs should be a 2-D LoDTensor with shape (num_rois, 4)" "given as [[x1, y1, x2, y2], ...]. But the second dimension of " @@ -216,16 +217,7 @@ REGISTER_OPERATOR(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, ops::ROIPoolGradMaker, ops::ROIPoolGradMaker); REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp); -REGISTER_OP_CPU_KERNEL( - roi_pool, - ops::CPUROIPoolOpKernel, - ops::CPUROIPoolOpKernel, - ops::CPUROIPoolOpKernel); -REGISTER_OP_CPU_KERNEL( - roi_pool_grad, - ops::CPUROIPoolGradOpKernel, - ops::CPUROIPoolGradOpKernel, - ops::CPUROIPoolGradOpKernel); + REGISTER_OP_VERSION(roi_pool) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu deleted file mode 100644 index b907b1114bbc0402fb253ec00610abefe83051c3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/roi_pool_op.cu +++ /dev/null @@ -1,306 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
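Both deleted mode kernels reduce the general case to "mode along the last axis of a [height, width] view", transposing first when axis is not last and transposing back afterwards. Per row, the CPU getMode sorts (value, index) pairs and scans for the longest run, while the CUDA getModebySort does the equivalent with thrust sort_by_key / reduce_by_key / max_element. A self-contained sketch of the per-row scan (the NaN ordering and the OpenMP parallel loop of the original are omitted):

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Mode of one non-empty row: sort (value, original-index) pairs, then scan
// for the most frequent run. On frequency ties the first maximal run wins
// (cur > best, not >=), and the reported index comes from the last element
// of that run, as in the deleted getMode loop. Sketch, not the Paddle kernel.
template <typename T>
std::pair<T, int64_t> RowMode(const std::vector<T>& row) {
  std::vector<std::pair<T, int64_t>> v;
  v.reserve(row.size());
  for (int64_t j = 0; j < static_cast<int64_t>(row.size()); ++j)
    v.emplace_back(row[j], j);
  std::sort(v.begin(), v.end());
  T mode = v[0].first;
  int64_t index = v[0].second;
  int64_t cur = 0, best = 0;
  for (size_t i = 0; i < v.size(); ++i) {
    ++cur;
    if (i + 1 == v.size() || v[i + 1].first != v[i].first) {
      if (cur > best) {
        best = cur;
        mode = v[i].first;
        index = v[i].second;
      }
      cur = 0;
    }
  }
  return {mode, index};
}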
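The roi_pool_op.cc hunk keeps the operator definition (now validating against phi::kROISize) but drops the CPU kernel registrations, and the deleted roi_pool_op.cu below carries the CUDA kernels: each ROI is scaled by spatial_scale, forced to at least 1x1, divided into pooled_height x pooled_width bins, and max-pooled per bin with the winning index recorded in Argmax. The bin-boundary arithmetic, isolated into a standalone sketch:

#include <algorithm>
#include <cmath>

// Boundaries of pooling bin (ph, pw) inside an ROI, as computed by the
// deleted GPUROIPoolForward / CPUROIPoolOpKernel: start = floor(p * extent /
// pooled_extent), end = ceil((p + 1) * extent / pooled_extent), shifted by
// the ROI origin and clamped into the feature map. roi_height/roi_width are
// assumed already forced to >= 1 by the caller, as in the deleted kernels.
struct Bin { int hstart, hend, wstart, wend; bool empty; };

Bin RoiBin(int ph, int pw, int roi_start_h, int roi_start_w, int roi_height,
           int roi_width, int pooled_height, int pooled_width, int height,
           int width) {
  Bin b;
  b.hstart = static_cast<int>(
      std::floor(static_cast<float>(ph) * roi_height / pooled_height));
  b.hend = static_cast<int>(
      std::ceil(static_cast<float>(ph + 1) * roi_height / pooled_height));
  b.wstart = static_cast<int>(
      std::floor(static_cast<float>(pw) * roi_width / pooled_width));
  b.wend = static_cast<int>(
      std::ceil(static_cast<float>(pw + 1) * roi_width / pooled_width));
  b.hstart = std::min(std::max(b.hstart + roi_start_h, 0), height);
  b.hend = std::min(std::max(b.hend + roi_start_h, 0), height);
  b.wstart = std::min(std::max(b.wstart + roi_start_w, 0), width);
  b.wend = std::min(std::max(b.wend + roi_start_w, 0), width);
  b.empty = (b.hend <= b.hstart) || (b.wend <= b.wstart);  // empty bin -> 0
  return b;
}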
*/ -#include -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/roi_pool_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaxinumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaxinumNumBlocks); -} - -template -__global__ void GPUROIPoolForward( - const int nthreads, const T* input_data, const T* input_rois, - const float spatial_scale, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - int* roi_batch_id_data, T* output_data, int64_t* argmax_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (size_t i = index; i < nthreads; i += offset) { - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % channels; - int n = i / pooled_width / pooled_height / channels; - - const T* offset_input_rois = input_rois + n * kROISize; - int roi_batch_ind = roi_batch_id_data[n]; - int roi_start_w = round(offset_input_rois[0] * spatial_scale); - int roi_start_h = round(offset_input_rois[1] * spatial_scale); - int roi_end_w = round(offset_input_rois[2] * spatial_scale); - int roi_end_h = round(offset_input_rois[3] * spatial_scale); - - int roi_width = max(roi_end_w - roi_start_w + 1, 1); - int roi_height = max(roi_end_h - roi_start_h + 1, 1); - - int hstart = static_cast(floor(static_cast(ph) * - static_cast(roi_height) / - static_cast(pooled_height))); - int wstart = static_cast(floor(static_cast(pw) * - static_cast(roi_width) / - static_cast(pooled_width))); - int hend = static_cast(ceil(static_cast(ph + 1) * - static_cast(roi_height) / - static_cast(pooled_height))); - int wend = static_cast(ceil(static_cast(pw + 1) * - static_cast(roi_width) / - static_cast(pooled_width))); - hstart = min(max(hstart + roi_start_h, 0), height); - hend = min(max(hend + roi_start_h, 0), height); - wstart = min(max(wstart + roi_start_w, 0), width); - wend = min(max(wend + roi_start_w, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - T maxval = is_empty ? 
0 : -std::numeric_limits::max(); - int maxidx = -1; - const T* offset_input_data = - input_data + (roi_batch_ind * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - int input_data_index = h * width + w; - if (offset_input_data[input_data_index] > maxval) { - maxval = offset_input_data[input_data_index]; - maxidx = input_data_index; - } - } - } - output_data[i] = maxval; - if (argmax_data) { - argmax_data[i] = maxidx; - } - } -} - -template -__global__ void GPUROIPoolBackward( - const int nthreads, const T* input_rois, const T* output_grad, - const int64_t* argmax_data, const int num_rois, const float spatial_scale, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, int* roi_batch_id_data, - T* input_grad) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % channels; - int n = i / pooled_width / pooled_height / channels; - - int roi_batch_ind = roi_batch_id_data[n]; - int input_offset = (roi_batch_ind * channels + c) * height * width; - int output_offset = (n * channels + c) * pooled_height * pooled_width; - const T* offset_output_grad = output_grad + output_offset; - T* offset_input_grad = input_grad + input_offset; - const int64_t* offset_argmax_data = argmax_data + output_offset; - - int argmax = offset_argmax_data[ph * pooled_width + pw]; - if (argmax != -1) { - platform::CudaAtomicAdd( - offset_input_grad + argmax, - static_cast(offset_output_grad[ph * pooled_width + pw])); - } - } -} - -template -class GPUROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto* argmax = ctx.Output("Argmax"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - auto in_stride = phi::stride(in_dims); - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - - int rois_num = rois->dims()[0]; - - if (rois_num == 0) return; - - int output_size = out->numel(); - int blocks = NumBlocks(output_size); - int threads = kNumCUDAThreads; - - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = ctx.GetPlace(); - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - int rois_batch_size = rois_num_t->numel(); - - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be the same but " - "received batch size of input(ROIs) and input(X) is %d and %d " - "respectively.", - rois_batch_size, batch_size)); - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, rois_num_list.data(), gplace, - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } 
else { - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be the same but " - "received batch size of input(ROIs) and input(X) is %d and %d " - "respectively.", - rois_batch_size, batch_size)); - - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The number of rois from input(ROIs) and its LOD " - "must be the same. Received rois %d of input(ROIs) " - "but the number of rois %d from its LOD is %d", - rois_num, rois_num_with_lod)); - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, - dev_ctx.stream()); - - GPUROIPoolForward<<>>( - output_size, in->data(), rois->data(), spatial_scale, channels, - height, width, pooled_height, pooled_width, roi_id_data, - out->mutable_data(ctx.GetPlace()), - argmax->mutable_data(ctx.GetPlace())); - } -}; - -template -class GPUROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* rois_lod = ctx.Input("RoisNum"); - auto* argmax = ctx.Input("Argmax"); - - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* x_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - int rois_num = rois->dims()[0]; - int channels = in->dims()[1]; - int height = in->dims()[2]; - int width = in->dims()[3]; - - if (x_grad) { - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - - auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = ctx.GetPlace(); - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - int rois_batch_size = rois_num_t->numel(); - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, rois_num_list.data(), gplace, - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, - dev_ctx.stream()); - - x_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, x_grad, static_cast(0)); - - int output_grad_size = out_grad->numel(); - int blocks = NumBlocks(output_grad_size); - int threads = kNumCUDAThreads; - - if (output_grad_size > 0) { - 
GPUROIPoolBackward<<>>( - output_grad_size, rois->data(), out_grad->data(), - argmax->data(), rois_num, spatial_scale, channels, height, - width, pooled_height, pooled_width, roi_id_data, - x_grad->mutable_data(ctx.GetPlace())); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - roi_pool, - ops::GPUROIPoolOpKernel, - ops::GPUROIPoolOpKernel); -REGISTER_OP_CUDA_KERNEL( - roi_pool_grad, - ops::GPUROIPoolGradOpKernel, - ops::GPUROIPoolGradOpKernel); diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h deleted file mode 100644 index a104fd49eb3e0b6d842ab6052e1181e6480a6f65..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/roi_pool_op.h +++ /dev/null @@ -1,250 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -static constexpr int kROISize = 4; - -template -class CPUROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto* argmax = ctx.Output("Argmax"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - auto in_stride = phi::stride(in_dims); - auto argmax_stride = phi::stride(argmax->dims()); - auto roi_stride = phi::stride(rois->dims()); - auto out_stride = phi::stride(out->dims()); - - const T* input_data = in->data(); - - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(ctx.GetPlace()); - - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument("The rois_batch_size and imgs " - "batch_size must be the same.")); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument("The rois_batch_size and imgs " - "batch_size must be the same.")); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num, 
rois_num_with_lod, - platform::errors::InvalidArgument("The rois_num from input " - "and lod must be the same.")); - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - - T* output_data = out->mutable_data(ctx.GetPlace()); - int64_t* argmax_data = argmax->mutable_data(ctx.GetPlace()); - - const T* rois_data = rois->data(); - for (int n = 0; n < rois_num; ++n) { - int roi_batch_id = roi_batch_id_data[n]; - int roi_start_w = round(rois_data[0] * spatial_scale); - int roi_start_h = round(rois_data[1] * spatial_scale); - int roi_end_w = round(rois_data[2] * spatial_scale); - int roi_end_h = round(rois_data[3] * spatial_scale); - - // Force malformed ROIs to be 1x1 - int roi_height = std::max(roi_end_h - roi_start_h + 1, 1); - int roi_width = std::max(roi_end_w - roi_start_w + 1, 1); - - const float bin_size_h = - static_cast(roi_height) / static_cast(pooled_height); - const float bin_size_w = - static_cast(roi_width) / static_cast(pooled_width); - - const T* batch_data = input_data + roi_batch_id * in_stride[0]; - - for (int c = 0; c < channels; ++c) { - for (int ph = 0; ph < pooled_height; ++ph) { - for (int pw = 0; pw < pooled_width; ++pw) { - // Compute pooling region for this output unit: - // start (included) = floor(ph * roi_height / pooled_height_) - // end (excluded) = ceil((ph + 1) * roi_height / pooled_height_) - int hstart = - static_cast(floor(static_cast(ph) * bin_size_h)); - int wstart = - static_cast(floor(static_cast(pw) * bin_size_w)); - int hend = - static_cast(ceil(static_cast(ph + 1) * bin_size_h)); - int wend = - static_cast(ceil(static_cast(pw + 1) * bin_size_w)); - - hstart = std::min(std::max(hstart + roi_start_h, 0), height); - hend = std::min(std::max(hend + roi_start_h, 0), height); - wstart = std::min(std::max(wstart + roi_start_w, 0), width); - wend = std::min(std::max(wend + roi_start_w, 0), width); - - const int pool_index = ph * pooled_width + pw; - - // Define an empty pooling region to be zero - bool is_empty = (hend <= hstart) || (wend <= wstart); - output_data[pool_index] = - is_empty ? 
0 : -std::numeric_limits::max(); - argmax_data[pool_index] = -1; - - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - const int index = h * width + w; - if (batch_data[index] > output_data[pool_index]) { - output_data[pool_index] = batch_data[index]; - argmax_data[pool_index] = index; - } - } - } - } - } - - batch_data += in_stride[1]; - output_data += out_stride[1]; - argmax_data += argmax_stride[1]; - } - // Increment ROI data pointer - rois_data += roi_stride[0]; - } - return; - } -}; - -template -class CPUROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* argmax = ctx.Input("Argmax"); - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - - if (in_grad) { - int rois_num = rois->dims()[0]; - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(ctx.GetPlace()); - - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - - const T* rois_data = rois->data(); - const T* out_grad_data = out_grad->data(); - const int64_t* argmax_data = argmax->data(); - T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(ctx.template device_context(), in_grad, - static_cast(0)); - - auto in_stride = phi::stride(in->dims()); - auto argmax_stride = phi::stride(argmax->dims()); - auto roi_stride = phi::stride(rois->dims()); - auto out_stride = phi::stride(out_grad->dims()); - - int channels = in->dims()[1]; - - for (int n = 0; n < rois_num; ++n) { - int roi_batch_idx = roi_batch_id_data[n]; - T* batch_grad_data = in_grad_data + roi_batch_idx * in_stride[0]; - for (int c = 0; c < channels; ++c) { - for (int ph = 0; ph < pooled_height; ++ph) { - for (int pw = 0; pw < pooled_width; ++pw) { - int pool_index = ph * pooled_width + pw; - if (argmax_data[pool_index] >= 0) { - auto index = argmax_data[pool_index]; - batch_grad_data[index] += out_grad_data[pool_index]; - } - } - } - batch_grad_data += in_stride[1]; - out_grad_data += out_stride[1]; - argmax_data += argmax_stride[1]; - } - rois_data += roi_stride[0]; - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/searchsorted_op.cc b/paddle/fluid/operators/searchsorted_op.cc index d0290795455db1546afbda80e71e79de3f1020ac..3a6fdbaa2613d1f87a84f7175d7d5b507c3479ab 100644 --- a/paddle/fluid/operators/searchsorted_op.cc +++ b/paddle/fluid/operators/searchsorted_op.cc @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
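The deleted backward kernels (CPU above, CUDA earlier) never recompute the pooling: every output-gradient element is routed to the single input element recorded in Argmax, with CudaAtomicAdd handling collisions on the GPU. The CPU form of that scatter, for one (roi, channel) slice:

#include <cstdint>
#include <vector>

// ROI max-pool backward for one (roi, channel) slice: x_grad starts zeroed,
// and each pooled cell adds its gradient to the input element recorded in
// argmax; -1 marks an empty bin that received no input. Mirrors the inner
// loops of the deleted CPUROIPoolGradOpKernel; standalone sketch.
void RoiPoolGradSlice(const std::vector<float>& out_grad,  // pooled cells
                      const std::vector<int64_t>& argmax,  // same size
                      std::vector<float>* x_grad) {        // H*W, pre-zeroed
  for (size_t p = 0; p < out_grad.size(); ++p) {
    if (argmax[p] >= 0) (*x_grad)[argmax[p]] += out_grad[p];
  }
}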
+#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,60 +23,6 @@ namespace operators { class SearchSortedOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - static bool SearchsortedDimsMatchedBeforeLastDim( - const framework::DDim& sequences_dims, - const framework::DDim& values_dims) { - if (sequences_dims.size() != values_dims.size()) { - return false; - } - const auto& sequences_dims_size = sequences_dims.size(); - for (int64_t dim = 0; dim < sequences_dims_size - 1; ++dim) { - if (sequences_dims[dim] != values_dims[dim]) { - return false; - } - } - return true; - } - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("SortedSequence"), "Input", "SortedSequence", - "searchsorted"); - OP_INOUT_CHECK(ctx->HasInput("Values"), "Input", "Values", "searchsorted"); - - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "searchsorted"); - - auto sequences_dims = ctx->GetInputDim("SortedSequence"); - auto values_dims = ctx->GetInputDim("Values"); - auto out_int32 = ctx->Attrs().Get("out_int32"); - - if (sequences_dims.size() != 1) { - PADDLE_ENFORCE_EQ( - SearchsortedDimsMatchedBeforeLastDim(sequences_dims, values_dims), - true, - platform::errors::Unavailable( - "The dimensions of sorted_sequence tensor ( %s ) and values " - "tensor ( %s ) can not match. Because the input sorted_sequence " - "tensor must be 1 dimension or the first N-1 dimensions of " - "sorted_sequence tensor and input values tensor must match. " - "Please input appropriate sorted_sequence and values again! ", - sequences_dims, values_dims)); - } - - if (out_int32) { - PADDLE_ENFORCE_LT( - sequences_dims[sequences_dims.size() - 1], - std::numeric_limits::max(), - platform::errors::Unavailable( - "The size of sorted_sequence %d exceed the maximum limit d%. " - "Because the size of sorted_sequence should be less than the " - "output maximum value for int32 bit. Please set appropriate " - "sorted_sequence to meet this requirement! ", - sequences_dims[sequences_dims.size() - 1], - std::numeric_limits::max())); - } - - ctx->SetOutputDim("Out", values_dims); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -115,4 +63,7 @@ class SearchSortedOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; -REGISTER_OPERATOR(searchsorted, ops::SearchSortedOp, ops::SearchSortedOpMaker); +DECLARE_INFER_SHAPE_FUNCTOR(searchsorted, SearchsortedInferShapeFunctor, + PD_INFER_META(phi::SearchsortedInferMeta)); +REGISTER_OPERATOR(searchsorted, ops::SearchSortedOp, ops::SearchSortedOpMaker, + SearchsortedInferShapeFunctor); diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 513ab46e9b5eebdb39faf4401d9d8b2fc387a82f..73655bcb18500e54564936eac4400a0c7b49af62 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -13,9 +13,15 @@ // limitations under the License. 
#include "paddle/fluid/operators/set_value_op.h" + #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace framework { class InferShapeContext; @@ -34,6 +40,8 @@ class CPUDeviceContext; namespace paddle { namespace operators { +using Tensor = framework::Tensor; + class SetValue : public framework::OperatorWithKernel { public: SetValue(const std::string &type, const framework::VariableNameMap &inputs, @@ -41,17 +49,6 @@ class SetValue : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "SetValue"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SetValue"); - auto in_dims = ctx->GetInputDim("Input"); - PADDLE_ENFORCE_LT( - in_dims.size(), 7, - platform::errors::InvalidArgument( - "The rank of input should be less than 7, but received %d.", - in_dims.size())); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -236,10 +233,13 @@ DECLARE_INPLACE_OP_INFERER(SetValueOpInplaceInferer, {"Input", "Out"}); namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(set_value, SetValueInferShapeFunctor, + PD_INFER_META(phi::SetValueInferMeta)); + REGISTER_OPERATOR(set_value, ops::SetValue, ops::SetValueMaker, ops::SetValueGradMaker, ops::SetValueGradMaker, - ops::SetValueOpInplaceInferer); + ops::SetValueOpInplaceInferer, SetValueInferShapeFunctor); REGISTER_OPERATOR(set_value_grad, ops::SetValueGrad); diff --git a/paddle/fluid/operators/top_k_v2_op.cc b/paddle/fluid/operators/top_k_v2_op.cc index d1add111e1d24cb711955a9aff06eb19feb35dc9..0a9ae789b01eea8a14952afe0d998005c59c0659 100644 --- a/paddle/fluid/operators/top_k_v2_op.cc +++ b/paddle/fluid/operators/top_k_v2_op.cc @@ -14,7 +14,9 @@ limitations under the License. 
*/ #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -23,56 +25,6 @@ class TopkV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "topk_v2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "topk_v2"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "topk_v2"); - - auto input_dims = ctx->GetInputDim("X"); - const int& dim_size = input_dims.size(); - int axis = static_cast(ctx->Attrs().Get("axis")); - PADDLE_ENFORCE_EQ( - (axis < dim_size) && (axis >= (-1 * dim_size)), true, - paddle::platform::errors::InvalidArgument( - "the axis of topk must be [-%d, %d), but you set axis is %d", - dim_size, dim_size, axis)); - - if (axis < 0) axis += dim_size; - - int k; - auto k_is_tensor = ctx->HasInput("K"); - if (k_is_tensor) { - k = -1; - } else { - k = static_cast(ctx->Attrs().Get("k")); - PADDLE_ENFORCE_EQ(k >= 1, true, - paddle::platform::errors::InvalidArgument( - "the attribute of k in the topk must >= 1 or be a " - "Tensor, but received %d .", - k)); - } - - PADDLE_ENFORCE_GE(input_dims.size(), 1, - paddle::platform::errors::InvalidArgument( - "input of topk must have >= 1d shape")); - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_GE( - input_dims[axis], k, - paddle::platform::errors::InvalidArgument( - "input of topk op must have >= %d columns in axis of %d", k, - axis)); - } - - framework::DDim dims = input_dims; - - dims[axis] = k; - ctx->SetOutputDim("Out", dims); - ctx->SetOutputDim("Indices", dims); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -169,8 +121,11 @@ class TopkV2GradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(top_k_v2, TopKInferShapeFunctor, + PD_INFER_META(phi::TopKInferMeta)); REGISTER_OPERATOR(top_k_v2, ops::TopkV2Op, ops::TopkV2OpMaker, ops::TopkV2GradOpMaker, - ops::TopkV2GradOpMaker); + ops::TopkV2GradOpMaker, + TopKInferShapeFunctor); REGISTER_OPERATOR(top_k_v2_grad, ops::TopkV2OpGrad); diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.h b/paddle/fluid/operators/truncated_gaussian_random_op.h index 8af6e281424eaabd8d6ea86843b3c13aa36cba47..a6ff2f686cb76bb03de8074014f82d6ff9e57bd3 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.h +++ b/paddle/fluid/operators/truncated_gaussian_random_op.h @@ -1,8 +1,11 @@ /* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
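top_k_v2_op.cc is the last InferShape-to-InferMeta move in this patch: the deleted body validated the axis, read k from the attribute unless K arrives as a tensor (deferring it to run time as -1), and set dims[axis] = k for both Out and Indices; the runtime-only check that input_dims[axis] >= k is guarded by IsRuntime and moves along with the rest into phi::TopKInferMeta. A standalone sketch of the static part:

#include <cstdint>
#include <stdexcept>
#include <vector>

// Static shape rule deleted from TopkV2Op::InferShape. When k comes in as a
// tensor it is unknown at compile time and recorded as -1, exactly as the
// deleted code did. Sketch only, not phi::TopKInferMeta's signature.
std::vector<int64_t> TopKOutDims(std::vector<int64_t> in, int axis, int k,
                                 bool k_is_tensor) {
  const int rank = static_cast<int>(in.size());
  if (rank < 1) throw std::invalid_argument("input of topk must be >= 1-D");
  if (axis < -rank || axis >= rank)
    throw std::invalid_argument("axis of topk must be in [-rank, rank)");
  if (axis < 0) axis += rank;
  if (k_is_tensor) {
    k = -1;  // resolved at run time from the K tensor
  } else if (k < 1) {
    throw std::invalid_argument("attribute k of topk must be >= 1");
  }
  in[axis] = k;  // Out and Indices both take this shape
  return in;
}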
@@ -137,9 +140,19 @@ T Erfinv(T x) { template struct TruncatedNormal { T mean, std; - TruncatedNormal(T mean, T std) : mean(mean), std(std) {} + T a_normal_cdf; + T b_normal_cdf; + TruncatedNormal(T mean, T std) : mean(mean), std(std) { + auto normal_cdf = [](T x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + a_normal_cdf = normal_cdf(-2.0); + b_normal_cdf = normal_cdf(2.0); + } + T operator()(T value) const { - return std::sqrt(2.0) * Erfinv(value) * std + mean; + auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; + return std::sqrt(2.0) * Erfinv(2 * p - 1) * std + mean; } }; diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc index 4ed0dd22ec086923bbe47af192cab8d001ae734f..261d9cee2d5cd25c510aacb280b9623f985eb1f7 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc @@ -84,13 +84,8 @@ class NPUTruncatedGaussianRandomKernel : public framework::OpKernel { Tensor cpu_tensor(tensor->dtype()); cpu_tensor.Resize(tensor->dims()); T* cpu_data = cpu_tensor.mutable_data(platform::CPUPlace()); - auto normal_cdf = [](float x) { - return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; - }; - float a_normal_cdf = normal_cdf((-2.0 - mean) / std); - float b_normal_cdf = normal_cdf((2.0 - mean) / std); - std::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, - 2.0 * b_normal_cdf - 1.0); + std::uniform_real_distribution dist(std::numeric_limits::min(), + 1.0); TruncatedNormal truncated_normal(mean, std); int64_t size = tensor->numel(); diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc index 984d9f397cc655b4cfd7e0bc211db1665252272f..803b61fbe813f85f48b71d1de7fc41eb26e4b8da 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc @@ -32,13 +32,8 @@ class XPUTruncatedGaussianRandomKernel : public framework::OpKernel { auto* tensor = context.Output("Out"); T* data = tensor->mutable_data(context.GetPlace()); - auto normal_cdf = [](float x) { - return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; - }; - float a_normal_cdf = normal_cdf((-2.0 - mean) / std); - float b_normal_cdf = normal_cdf((2.0 - mean) / std); - std::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, - 2.0 * b_normal_cdf - 1.0); + std::uniform_real_distribution dist(std::numeric_limits::min(), + 1.0); TruncatedNormal truncated_normal(mean, std); int64_t size = tensor->numel(); diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index e110432c67d395c865d934a47eaa4a803053db8b..c9e80c7b4b407456fc962f508ae441a9c07914b2 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -122,13 +122,33 @@ static PyObject* eager_api_run_backward(PyObject* self, PyObject* args, EAGER_TRY auto tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0); auto grad_tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1); - egr::RunBackward(tensors, grad_tensors, - CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2)); + egr::Backward(tensors, grad_tensors, + CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2)); Py_INCREF(Py_None); return Py_None; EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* eager_api_run_partial_grad(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto tensors = 
CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0); + auto inputs = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1); + auto grad_tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 2), 2); + auto retain_graph = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3); + auto create_graph = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4); + auto only_inputs = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 5), 5); + auto allow_unused = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 6), 6); + auto no_grad_vars = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 7), 7); + + std::vector result = + egr::Grad(tensors, inputs, grad_tensors, retain_graph, create_graph, + only_inputs, allow_unused, no_grad_vars); + VLOG(1) << " in eager_api_run_partial_grad, after runing egr::Grad"; + return ToPyObject(result, true /* return_py_none_if_not_initialize */); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY @@ -452,6 +472,9 @@ PyMethodDef variable_functions[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"run_backward", (PyCFunction)(void (*)(void))eager_api_run_backward, METH_VARARGS | METH_KEYWORDS, NULL}, + {"run_partial_grad", + (PyCFunction)(void (*)(void))eager_api_run_partial_grad, + METH_VARARGS | METH_KEYWORDS, NULL}, {"_run_custom_op", (PyCFunction)(void (*)(void))eager_api_run_costum_op, METH_VARARGS | METH_KEYWORDS, NULL}, {"tensor_copy", (PyCFunction)(void (*)(void))eager_api_tensor_copy, diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 082ec382c79cd9c98ac75db14bc552883088b885..7f8fcd351fe2a0a9712560f913a83f2cc3580395 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -226,6 +226,19 @@ static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor_method_cpu(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto cp_tensor = + self->tensor.copy_to(phi::TransToPhiBackend(phi::CPUPlace()), true); + egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true); + egr::EagerUtils::autograd_meta(&cp_tensor) + ->SetPersistable( + egr::EagerUtils::autograd_meta(&(self->tensor))->Persistable()); + return ToPyObject(cp_tensor); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* tensor_method_reconstruct_from_(TensorObject* self, PyObject* args, PyObject* kwargs) { @@ -264,7 +277,7 @@ static PyObject* tensor_method_copy_(TensorObject* self, PyObject* args, egr::EagerUtils::autograd_meta(&(src_tensor))->Persistable()); } - self->tensor.copy_(src_tensor, blocking); + self->tensor.copy_(src_tensor, self->tensor.inner_place(), blocking); VLOG(6) << "Finish Copy Tensor " << src_tensor.name() << " to " << self->tensor.name(); diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 2572866b8f5198b2414163d4198e06b54d11fedc..ff8980d727e70a41223878f22f019353f8b71972 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -96,7 +96,7 @@ int tensor_properties_set_grad(TensorObject* self, PyObject* value, "Detected NULL grad" "Please check if you have manually cleared" "the grad inside autograd_meta")); - grad->copy_(src, true); + grad->copy_(src, self->tensor.inner_place(), true); return 0; EAGER_CATCH_AND_THROW_RETURN_ZERO } diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 
795f75ce7fd6d9072d4d7a936b027a42321271b8..ecf75da080788fb4377923d4d34088f63ebd4969 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -492,20 +492,26 @@ PyObject* ToPyObject(const std::vector& value) { return result; } -PyObject* ToPyObject(const std::vector& value) { +PyObject* ToPyObject(const std::vector& value, + bool return_py_none_if_not_initialize) { PyObject* result = PyList_New((Py_ssize_t)value.size()); for (size_t i = 0; i < value.size(); i++) { - PyObject* obj = p_tensor_type->tp_alloc(p_tensor_type, 0); - if (obj) { - auto v = reinterpret_cast(obj); - new (&(v->tensor)) paddle::experimental::Tensor(); - v->tensor = value[i]; + if (!value[i].initialized() && return_py_none_if_not_initialize) { + Py_INCREF(Py_None); + PyList_SET_ITEM(result, static_cast(i), Py_None); } else { - PADDLE_THROW(platform::errors::Fatal( - "tp_alloc return null, can not new a PyObject.")); + PyObject* obj = p_tensor_type->tp_alloc(p_tensor_type, 0); + if (obj) { + auto v = reinterpret_cast(obj); + new (&(v->tensor)) paddle::experimental::Tensor(); + v->tensor = value[i]; + } else { + PADDLE_THROW(platform::errors::Fatal( + "tp_alloc return null, can not new a PyObject.")); + } + PyList_SET_ITEM(result, static_cast(i), obj); } - PyList_SET_ITEM(result, static_cast(i), obj); } return result; diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 2187555e1c3c7f64bd864e4212bfc6ebe1fb1684..1c4e2ab69a5ecba1209a11651c3c11972dff565c 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -68,7 +68,8 @@ PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); -PyObject* ToPyObject(const std::vector& value); +PyObject* ToPyObject(const std::vector& value, + bool return_py_none_if_not_initialize = false); PyObject* ToPyObject(const platform::Place& value); PyObject* ToPyObject(const framework::LoDTensor* value); PyObject* ToPyObject(const paddle::framework::proto::VarType::Type& dtype); diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index 3145a9cf7655c053c269990e00982226eae49c7a..01dae420cc6ab84edc0b0df11b0b4cf6408a87f7 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -225,7 +225,7 @@ void BindGraphPyClient(py::module* m) { .def("stop_server", &GraphPyClient::stop_server) .def("get_node_feat", [](GraphPyClient& self, std::string node_type, - std::vector node_ids, + std::vector node_ids, std::vector feature_names) { auto feats = self.get_node_feat(node_type, node_ids, feature_names); @@ -239,7 +239,7 @@ void BindGraphPyClient(py::module* m) { }) .def("set_node_feat", [](GraphPyClient& self, std::string node_type, - std::vector node_ids, + std::vector node_ids, std::vector feature_names, std::vector> bytes_feats) { std::vector> feats(bytes_feats.size()); diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index b008308e27d9afaa9d8c47290489d50a762f2a41..c8f0acd0b8a853f541a6fb8cbafe73f27688c71a 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -551,6 +551,9 @@ void BindAnalysisConfig(py::module *m) { .def("params_file", &AnalysisConfig::params_file) .def("enable_use_gpu", &AnalysisConfig::EnableUseGpu, py::arg("memory_pool_init_size_mb"), py::arg("device_id") = 0) + .def("exp_enable_use_gpu_fp16", 
&AnalysisConfig::Exp_EnableUseGpuFp16, + py::arg("gpu_fp16_disabled_op_types") = + std::unordered_set({})) .def("enable_xpu", &AnalysisConfig::EnableXpu, py::arg("l3_workspace_size") = 16 * 1024 * 1024, py::arg("locked") = false, py::arg("autotune") = true, diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index 4e273f6d551edd74ec979e6ec34aedabdb58bd10..e777a8e3ab4e6a59662ce7b4eb9a31a7409d6f56 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -3,12 +3,22 @@ if (NOT WITH_INFRT) endif() option(INFRT_WITH_PHI "Compile INFRT with PHI" ON) +option(INFRT_WITH_GPU "Compile INFRT with GPU" OFF) +option(INFRT_WITH_TRT "Compile INFRT with TensorRT" OFF) #TODO(xiaowei) remove fluid include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) if (INFRT_WITH_PHI) - add_definitions("-DINFRT_WITH_PHI") + add_definitions("-DINFRT_WITH_PHI") + + # TODO(wilber): Now Infrt gpu/trt depends on phi's components, Modify compile dependency options later. + if (INFRT_WITH_GPU) + add_definitions("-DINFRT_WITH_GPU") + if (INFRT_WITH_TRT) + add_definitions("-DINFRT_WITH_TRT") + endif() + endif() endif() # compile flags @@ -92,7 +102,6 @@ set(infrt_mlir_incs test_kernels_inc tensor_shape_inc dense_tensor_inc - pd_ops_inc pd_extra_ops_inc trt_ops_inc ) @@ -106,6 +115,9 @@ if (INFRT_WITH_PHI) endif() cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto infrt_naive) +if (INFRT_WITH_TRT) + target_link_libraries(infrt infrt_trt) +endif() cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto) add_dependencies(infrt ${infrt_mlir_incs} mlir-headers) diff --git a/paddle/infrt/backends/host/phi_allocator.h b/paddle/infrt/backends/host/phi_allocator.h index c8f97e04a1b8376efbac749fffa70d77c7b95e72..6e3bef9299162d493825f49e3962c75f2845e2d0 100644 --- a/paddle/infrt/backends/host/phi_allocator.h +++ b/paddle/infrt/backends/host/phi_allocator.h @@ -13,6 +13,10 @@ limitations under the License. */ #include "paddle/phi/core/allocator.h" +#ifdef INFRT_WITH_GPU +#include +#endif + namespace infrt { namespace backends { @@ -29,5 +33,22 @@ class CpuPhiAllocator : public phi::Allocator { } }; +#ifdef INFRT_WITH_GPU +// TODO(wilber): Just for demo test. we need a more efficient gpu allocator. +class GpuPhiAllocator : public phi::Allocator { + public: + static void deleter(phi::Allocation* ptr) { cudaFree(ptr->ptr()); } + + AllocationPtr Allocate(size_t bytes_size) { + void* ptr; + cudaMalloc(&ptr, bytes_size); + return AllocationPtr( + new phi::Allocation( + ptr, bytes_size, phi::Place(phi::AllocationType::GPU)), + deleter); + } +}; +#endif + } // namespace backends } // namespace infrt diff --git a/paddle/infrt/backends/host/phi_context.h b/paddle/infrt/backends/host/phi_context.h index 5713fdbbaf82b2ea2190d2ee1b1dc5d944f2c262..bcd63dbb39fe8c52499138423bc9b86fa5de9d57 100644 --- a/paddle/infrt/backends/host/phi_context.h +++ b/paddle/infrt/backends/host/phi_context.h @@ -13,6 +13,7 @@ limitations under the License. 
*/ #include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace infrt { namespace backends { @@ -31,5 +32,16 @@ class CpuPhiContext : public phi::CPUContext { std::unique_ptr alloc_{std::make_unique()}; }; +class GpuPhiContext : public phi::GPUContext { + public: + using Base = phi::GPUContext; + using phi::GPUContext::SetStream; + using phi::GPUContext::SetEigenDevice; + using phi::GPUContext::SetBlasHandle; + using phi::GPUContext::SetDnnHandle; + using phi::GPUContext::SetSolverHandle; + using phi::GPUContext::SetSparseHandle; +}; + } // namespace backends } // namespace infrt diff --git a/paddle/infrt/backends/tensorrt/test_trt_engine.cc b/paddle/infrt/backends/tensorrt/test_trt_engine.cc index 12cf14060e27c1d58e3fd9b14cc12b3c1f7f8907..0ab64dd51c88758d043fb9105ffbf0d109e44cc0 100644 --- a/paddle/infrt/backends/tensorrt/test_trt_engine.cc +++ b/paddle/infrt/backends/tensorrt/test_trt_engine.cc @@ -37,9 +37,9 @@ namespace infrt { namespace backends { namespace tensorrt { -const char* model_input = "model_input"; -const char* model_output = "model_output1"; -const char* model_output2 = "model_output2"; +const char* model_input = "input_0"; +const char* model_output = "output_0"; +const char* model_output2 = "output_1"; TrtUniquePtr ConstructNetwork( nvinfer1::IBuilder* builder, nvinfer1::Dims dims, bool is_static_shape) { @@ -122,27 +122,26 @@ TEST(trt, run_static) { std::unordered_map inputs; inputs.emplace(std::make_pair(model_input, &input)); - phi::DenseTensor output, output2; - std::unordered_map outputs; - outputs.emplace(std::make_pair(model_output, &output)); - outputs.emplace(std::make_pair(model_output2, &output2)); - - static_trt_engine.SetUpInference(inference_options, inputs, &outputs); + static_trt_engine.PrepareOutputHandle("output_0"); + static_trt_engine.PrepareOutputHandle("output_1"); + static_trt_engine.SetUpInference(inference_options, inputs); static_trt_engine.GetEngineInfo(); static_trt_engine.Run(context); + phi::DenseTensor* output0 = static_trt_engine.GetOutput("output_0"); + phi::DenseTensor* output1 = static_trt_engine.GetOutput("output_1"); std::vector output_data1(inference_options.batch * 1 * 28 * 28, 0); std::vector output_data2(inference_options.batch * 2 * 28 * 28, 0); paddle::memory::Copy(phi::CPUPlace(), output_data1.data(), place, - output.data(), + output0->data(), sizeof(float) * output_data1.size(), context.stream()); paddle::memory::Copy(phi::CPUPlace(), output_data2.data(), place, - output2.data(), + output1->data(), sizeof(float) * output_data2.size(), context.stream()); cudaStreamSynchronize(context.stream()); @@ -208,27 +207,27 @@ TEST(trt, run_dynamic) { context.stream()); std::unordered_map inputs; - std::unordered_map outputs; inputs.emplace(std::make_pair(model_input, &input)); - outputs.emplace(std::make_pair(model_output, &output)); - outputs.emplace(std::make_pair(model_output2, &output2)); - - engine.SetUpInference(inference_options, inputs, &outputs); + engine.PrepareOutputHandle("output_0"); + engine.PrepareOutputHandle("output_1"); + engine.SetUpInference(inference_options, inputs); engine.GetEngineInfo(); engine.Run(context); + phi::DenseTensor* output0 = engine.GetOutput("output_0"); + phi::DenseTensor* output1 = engine.GetOutput("output_1"); std::vector output_data1(inference_options.batch * 1 * 16 * 16, 0); std::vector output_data2(inference_options.batch * 2 * 16 * 16, 0); paddle::memory::Copy(phi::CPUPlace(), 
output_data1.data(), place, - output.data(), + output0->data(), sizeof(float) * output_data1.size(), context.stream()); paddle::memory::Copy(phi::CPUPlace(), output_data2.data(), place, - output2.data(), + output1->data(), sizeof(float) * output_data2.size(), context.stream()); cudaStreamSynchronize(context.stream()); diff --git a/paddle/infrt/backends/tensorrt/trt_engine.cc b/paddle/infrt/backends/tensorrt/trt_engine.cc index 232653e8c41f71fd9bb32c9eac302b047d122b66..43d356b6d6983afdca220029d34d9d5cd27da009 100644 --- a/paddle/infrt/backends/tensorrt/trt_engine.cc +++ b/paddle/infrt/backends/tensorrt/trt_engine.cc @@ -21,6 +21,7 @@ #include "paddle/phi/backends/dynload/tensorrt.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/dense_tensor.h" namespace infrt { namespace backends { @@ -235,10 +236,20 @@ bool TrtEngine::SetupNetworkAndConfig(const BuildOptions& build, return true; } +void TrtEngine::PrepareOutputHandle(const std::string& out_name) { + phi::DenseTensor t; + outputs_.emplace(out_name, t); +} + +phi::DenseTensor* TrtEngine::GetOutput(const std::string& name) { + return &outputs_[name]; +} + +size_t TrtEngine::GetOutputNum() const { return outputs_.size(); } + bool TrtEngine::SetUpInference( const InferenceOptions& inference, - const std::unordered_map& inputs, - std::unordered_map* outputs) { + const std::unordered_map& inputs) { // TODO(wilber): now only create one exec_context FreshDeviceId(); CHECK(engine_ != nullptr); @@ -252,10 +263,10 @@ bool TrtEngine::SetUpInference( bindings_.front()->AddBinding( bind_index, it.first, true, it.second, nvinfer1::DataType::kFLOAT); } - for (auto& it : *outputs) { + for (auto& it : outputs_) { const int bind_index = engine_->getBindingIndex(it.first.c_str()); bindings_.front()->AddBinding( - bind_index, it.first, false, it.second, nvinfer1::DataType::kFLOAT); + bind_index, it.first, false, &it.second, nvinfer1::DataType::kFLOAT); } return true; @@ -290,11 +301,13 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) { const int bind_index = engine_->getBindingIndex(bind.name.c_str()); std::vector ddim; auto dims = engine_->getBindingDimensions(bind_index); + CHECK_NE(runtime_batch, -1) << "runtime_batch should not be -1."; ddim.push_back(runtime_batch); for (int i = 0; i < dims.nbDims; ++i) { ddim.push_back(dims.d[i]); } bind.buffer->Resize(phi::make_ddim(ddim)); + // TODO(wilber): now only support float output. ctx.Alloc(bind.buffer, sizeof(float) * bind.buffer->numel()); buffers[bind_index] = static_cast(bind.buffer->data()); } diff --git a/paddle/infrt/backends/tensorrt/trt_engine.h b/paddle/infrt/backends/tensorrt/trt_engine.h index 3c8243e3c3838e30eb70877f8c82d623c103eaff..a26474f8cbb357d42cd6d951829bbdc24a256640 100644 --- a/paddle/infrt/backends/tensorrt/trt_engine.h +++ b/paddle/infrt/backends/tensorrt/trt_engine.h @@ -81,11 +81,17 @@ class TrtEngine { // TODO(wilber): How to support multiple execution contexts? bool SetUpInference( const InferenceOptions& inference, - const std::unordered_map& inputs, - std::unordered_map* outputs); + const std::unordered_map& inputs); void GetEngineInfo(); + void PrepareOutputHandle(const std::string& out_name); + + // TODO(wilber): The output tensor names are: output_0, output_1, ... 
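// Usage sketch for the reworked output flow (mirrors the test changes
// above): the engine now owns its output tensors instead of binding
// caller-provided DenseTensor pointers.
//
//   engine.PrepareOutputHandle("output_0");            // create named handle
//   engine.SetUpInference(inference_options, inputs);  // bind inputs/handles
//   engine.Run(context);                               // fill the handles
//   phi::DenseTensor* out = engine.GetOutput("output_0");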
+ phi::DenseTensor* GetOutput(const std::string&); + + size_t GetOutputNum() const; + private: void FreshDeviceId(); @@ -112,6 +118,7 @@ class TrtEngine { std::vector> bindings_; int device_id_{0}; bool is_dynamic_shape_{false}; + std::unordered_map outputs_; }; } // namespace tensorrt diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt index a3f2d0afafc417cc7a4cbba8a3d6bfa92c9bef00..cf3906c32e559d9fa33d0583be9adb1b2591e78b 100644 --- a/paddle/infrt/dialect/CMakeLists.txt +++ b/paddle/infrt/dialect/CMakeLists.txt @@ -7,16 +7,10 @@ gather_srcs(infrt_src SRCS dense_tensor.cc mlir_loader.cc diagnostic_utils.cc - pd_ops.cc ) mlir_tablegen_on(tensor_shape DIALECT ts) mlir_tablegen_on(dense_tensor DIALECT dt) -mlir_tablegen_on(pd_op_base DIALECT pd) -mlir_tablegen_on(pd_ops) -mlir_tablegen_on(pd_extra_ops) - -mlir_add_rewriter(rewrite) # TODO(Superjomn) add a cmake function cc_executable to ecapsulate the following code add_executable(infrtopt opt.cc) @@ -24,10 +18,10 @@ target_link_libraries(infrtopt infrt) add_executable(print-ir print_ir.cc) target_link_libraries(print-ir infrt ${mlir_libs}) -add_dependencies(print-ir pd_ops_inc) cc_test_tiny(test_infrt_mlir_loader SRCS mlir_loader_test.cc DEPS infrt ${MLIR_IR_LIBS}) add_subdirectory(infrt) +add_subdirectory(pd) add_subdirectory(tensorrt) if (INFRT_WITH_PHI) diff --git a/paddle/infrt/dialect/dense_tensor.td b/paddle/infrt/dialect/dense_tensor.td index 666c7b300af33db0c27e5b3ab8a74aa4b1591c9b..59df4e9697370e9d8db4bbc0a5d69e8ef03950a5 100644 --- a/paddle/infrt/dialect/dense_tensor.td +++ b/paddle/infrt/dialect/dense_tensor.td @@ -130,7 +130,7 @@ def TensorMapGetTensorOp : DT_Op<"tensor_map_get_tensor", [NoSideEffect]> { } def TensorMapGetSizeOp : DT_Op<"tensor_map_get_size", [NoSideEffect]> { - let summary = "ddt.tensor_map_get_size operation"; + let summary = "dt.tensor_map_get_size operation"; let description = [{ An operation that get the size of a TensorMap. @@ -141,6 +141,32 @@ def TensorMapGetSizeOp : DT_Op<"tensor_map_get_size", [NoSideEffect]> { let assemblyFormat = "`(` $map `)` attr-dict `->` type($size)"; } +def Infrt_TensorListGetTensorOp : DT_Op<"tensor_list_get_tensor", [NoSideEffect]> { + let summary = "dt.tensor_list_get_tensor operation"; + + let description = [{ + An operation that can get a tensor from a TensorList. + }]; + + let arguments = (ins + DenseTensorList:$l, + I32Attr:$id + ); + let results = (outs DenseTensor:$output); + let verifier = ?; +} + +def TensorListGetSizeOp : DT_Op<"tensor_list_get_size", [NoSideEffect]> { + let summary = "dt.tensor_list_get_size operation"; + + let description = [{ + An operation that get the size of a TensorList. + }]; + + let arguments = (ins DenseTensorList:$map); + let results = (outs I32:$size); +} + def GetTensorShapeOp : DT_Op<"get_tensor_shape", [NoSideEffect]> { let summary = "dt.get_tensor_shape operation"; diff --git a/paddle/infrt/dialect/infrt/ir/infrt_base.td b/paddle/infrt/dialect/infrt/ir/infrt_base.td index c5130e89bb13a58a0aa0cf3aeae1b00e269eb259..86cfc375330b19878528645a2e810efb797e153f 100644 --- a/paddle/infrt/dialect/infrt/ir/infrt_base.td +++ b/paddle/infrt/dialect/infrt/ir/infrt_base.td @@ -89,6 +89,13 @@ def DenseTensorMap : Infrt_Type<"DenseTensorMap"> { let parameters = (ins); } +// TODO(wilber): Add !infrt.vec type. 
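// The DenseTensorList type below backs the !infrt.tensor_list values that
// the new trt.compute op returns and that the dt.tensor_list_get_tensor /
// dt.tensor_list_get_size ops added in dense_tensor.td consume; like
// DenseTensorMap above, it is an opaque, parameterless container type.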
+def DenseTensorList : Infrt_Type<"DenseTensorList"> { + let summary = "infrt dense tensor list"; + let description = [{dense_tensor list}]; + let parameters = (ins); +} + // Type Constrait for concrete DenseTensor type. class DenseTensor : Type, diff --git a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc index 867d854ba3c9d0954dfe2d038405daf1726a2556..8966ca13c2be08f1c744a73b4beaf20b0a3c015c 100644 --- a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc +++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc @@ -90,6 +90,9 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { return LoDTensorType::get( parser.getContext(), shape, elementType, lod_level); } + if (keyword == "dense_tensor_map") { + return DenseTensorMapType::get(parser.getContext()); + } if (keyword == "dense_tensor") { // parse DenseTensor, for example: !i=Infrt.tensor llvm::StringRef target; @@ -135,6 +138,10 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { parser.getContext(), *targetType, *precisionType, *layoutType); } + if (keyword == "tensor_list") { + return infrt::DenseTensorListType::get(parser.getContext()); + } + if (keyword == "dense_tensor_map") { return DenseTensorMapType::get(parser.getContext()); } @@ -158,6 +165,10 @@ void InfrtDialect::printType(::mlir::Type type, << lod_tensor_type.getLod_level() << ">"; return; } + if (type.isa()) { + os << "dense_tensor_map"; + return; + } // print DenseTensorType, for example: !infrt.dense_tensor if (type.isa()) { @@ -168,6 +179,9 @@ void InfrtDialect::printType(::mlir::Type type, return; } + if (type.isa()) { + os << "tensor_list"; + } // print DenseTensorType, for example: !infrt.dense_tensor if (type.isa()) { os << "dense_tensor_map"; } diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td index 7ae0bbae6275fdac1ea9e98084c866aa438ecce4..3d825a9c762f4833e577125d20423a5f1d41737f 100644 --- a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td @@ -3,7 +3,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" include "paddle/infrt/dialect/infrt/ir/infrt_ops.td" -include "paddle/infrt/dialect/pd_ops.td" +include "paddle/infrt/dialect/pd/ir/pd_ops.td" def FuseTensorCastPattern : Pat< (Infrt_TensorCastOp (Infrt_TensorCastOp $arg)), diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc index 9d8ce5d8dfe399d06bfbe2f0c4b6457a8b3d61f1..eec0e0bc7c5ab624e9db7744c357b58ff5107eef 100644 --- a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc @@ -16,7 +16,7 @@ #include #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace { #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/init_dialects.cc b/paddle/infrt/dialect/init_dialects.cc index 0c5944ebf84750be8cf789552219157da3170c39..6183295cafb356e85c0fd8bf417c3fb18eb30787 100644 --- a/paddle/infrt/dialect/init_dialects.cc +++ b/paddle/infrt/dialect/init_dialects.cc @@ -20,12 +20,13 @@ #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
#include "paddle/infrt/dialect/phi/ir/phi_base.h" #include "paddle/infrt/dialect/phi/ir/phi_kernels.h" #include "paddle/infrt/dialect/tensor_shape.h" +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" namespace infrt { void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT @@ -37,7 +38,8 @@ void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT phi::PHIDenseTensorDialect, phi::PHICPUKernelDialect, phi::PHIGPUKernelDialect, - phi::PHIDialect + phi::PHIDialect, + infrt::trt::TensorRTDialect #endif >(); } diff --git a/paddle/infrt/dialect/pd/CMakeLists.txt b/paddle/infrt/dialect/pd/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f65336453fbdf82f30948aeea8dc52b0367159b --- /dev/null +++ b/paddle/infrt/dialect/pd/CMakeLists.txt @@ -0,0 +1,3 @@ +add_subdirectory(common) +add_subdirectory(ir) +add_subdirectory(pass) diff --git a/paddle/infrt/dialect/pd/common/CMakeLists.txt b/paddle/infrt/dialect/pd/common/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee1b0d4c30deb2e7fbf19aa91ec3dd3bdcd449af --- /dev/null +++ b/paddle/infrt/dialect/pd/common/CMakeLists.txt @@ -0,0 +1,4 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + ) diff --git a/paddle/infrt/dialect/pd/ir/CMakeLists.txt b/paddle/infrt/dialect/pd/ir/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..8aacfc97623c0dadc0ccb604440ce19427d860ba --- /dev/null +++ b/paddle/infrt/dialect/pd/ir/CMakeLists.txt @@ -0,0 +1,7 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + pd_ops.cc + ) +add_mlir_dialect(pd_ops pd) +mlir_tablegen_on(pd_extra_ops) diff --git a/paddle/infrt/dialect/pd_extra_ops.td b/paddle/infrt/dialect/pd/ir/pd_extra_ops.td similarity index 90% rename from paddle/infrt/dialect/pd_extra_ops.td rename to paddle/infrt/dialect/pd/ir/pd_extra_ops.td index c6d3f530455f76d0352ef5ac42297c30ce521da2..cf17db211cbe98c586423c7db050dfdc12576cff 100644 --- a/paddle/infrt/dialect/pd_extra_ops.td +++ b/paddle/infrt/dialect/pd/ir/pd_extra_ops.td @@ -4,7 +4,7 @@ include "mlir/Interfaces/InferTypeOpInterface.td" include "mlir/Interfaces/LoopLikeInterface.td" include "mlir/IR/OpBase.td" -include "paddle/infrt/dialect/pd_op_base.td" +include "paddle/infrt/dialect/pd/ir/pd_op_base.td" def PD_FusedFC : PD_Op<"FC", [NoSideEffect]> { let summary = "Computes the Fully Connected result of two tensors"; diff --git a/paddle/infrt/dialect/pd_op_base.td b/paddle/infrt/dialect/pd/ir/pd_op_base.td similarity index 96% rename from paddle/infrt/dialect/pd_op_base.td rename to paddle/infrt/dialect/pd/ir/pd_op_base.td index f6af4c83aed8bd0b7ce04c172169b036e674777b..7cab0eca45a1e7f74115f906db10a77f2eb1023b 100644 --- a/paddle/infrt/dialect/pd_op_base.td +++ b/paddle/infrt/dialect/pd/ir/pd_op_base.td @@ -8,7 +8,7 @@ include "mlir/IR/OpBase.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "paddle/infrt/dialect/infrt/ir/infrt_base.td" -def PD_Dialect : Dialect { +def Paddle_Dialect : Dialect { let name = "pd"; let description = [{ @@ -16,12 +16,12 @@ def PD_Dialect : Dialect { This dialect contains the PaddlePaddle operators. 
}]; - + let hasConstantMaterializer = 1; let cppNamespace = "mlir::pd"; } class PD_Op traits = []> : - Op; + Op; class PD_PaddleAttr : diff --git a/paddle/infrt/dialect/pd/ir/pd_ops.cc b/paddle/infrt/dialect/pd/ir/pd_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..d105aa07dd06a9a9c3aba870702b1e304a3a938a --- /dev/null +++ b/paddle/infrt/dialect/pd/ir/pd_ops.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" + +#include +#include + +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" +#include "paddle/infrt/dialect/pd/ir/pd_opsDialect.cpp.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd/ir/pd_ops.cpp.inc" // NOLINT +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd/ir/pd_extra_ops.cpp.inc" // NOLINT + +namespace mlir { +namespace pd { +void PaddleDialect::initialize() { + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/pd/ir/pd_ops.cpp.inc" // NOLINT + , +#define GET_OP_LIST +#include "paddle/infrt/dialect/pd/ir/pd_extra_ops.cpp.inc" // NOLINT + >(); +} + +mlir::Operation *PaddleDialect::materializeConstant(mlir::OpBuilder &builder, + mlir::Attribute value, + mlir::Type type, + mlir::Location loc) { + return builder.create(loc, value); +} + +void ConstantOp::build(OpBuilder &builder, + OperationState &state, + Attribute value) { + if (auto elem_attr = value.dyn_cast()) { + return ConstantOp::build(builder, state, elem_attr); + } else if (value.isa()) { + ShapedType type = RankedTensorType::get(/*shape=*/{}, value.getType()); + state.addAttribute("value", DenseElementsAttr::get(type, value)); + state.addTypes(type); + return; + } + llvm_unreachable("unsupported attribute type for building pd.constant"); +} + +LogicalResult ConstantOp::inferReturnTypes( + MLIRContext *context, + Optional location, + ValueRange operands, + DictionaryAttr attributes, + RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + inferredReturnTypes.push_back(attributes.get("value").getType()); + return success(); +} +mlir::OpFoldResult ConstantOp::fold( + ::llvm::ArrayRef operands) { + return value(); +} +} // namespace pd +} // namespace mlir diff --git a/paddle/infrt/dialect/pd/ir/pd_ops.h b/paddle/infrt/dialect/pd/ir/pd_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..8383ff6ed8201c4f8948ebaa4effaac3d783cc52 --- /dev/null +++ b/paddle/infrt/dialect/pd/ir/pd_ops.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +//===----------------------------------------------------------------------===// +// Dialect +//===----------------------------------------------------------------------===// +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" +#include "paddle/infrt/dialect/pd/ir/pd_opsDialect.h.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd/ir/pd_ops.h.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd/ir/pd_extra_ops.hpp.inc" diff --git a/paddle/infrt/dialect/pd/pass/CMakeLists.txt b/paddle/infrt/dialect/pd/pass/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..827df597b76e2ec5b4cf639c984a425f9be8b6c9 --- /dev/null +++ b/paddle/infrt/dialect/pd/pass/CMakeLists.txt @@ -0,0 +1,8 @@ + +core_gather_headers() + +gather_srcs(infrt_src SRCS + pd_op_fuse_pass.cc + ) + +mlir_add_rewriter(pd_op_fuse) diff --git a/paddle/infrt/dialect/rewrite.td b/paddle/infrt/dialect/pd/pass/pd_op_fuse.td similarity index 97% rename from paddle/infrt/dialect/rewrite.td rename to paddle/infrt/dialect/pd/pass/pd_op_fuse.td index 62e7471a390dfeee1a9ddfc15033e85db0adca2e..f5a8ea78d7d9da5cc70b50d31836b4f4933d5853 100644 --- a/paddle/infrt/dialect/rewrite.td +++ b/paddle/infrt/dialect/pd/pass/pd_op_fuse.td @@ -3,8 +3,8 @@ include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" -include "paddle/infrt/dialect/pd_ops.td" -include "paddle/infrt/dialect/pd_extra_ops.td" +include "paddle/infrt/dialect/pd/ir/pd_ops.td" +include "paddle/infrt/dialect/pd/ir/pd_extra_ops.td" //===----------------------------------------------------------------------===// // This is to fuse the composition: 'Matmul o ElementwiseAdd' into 'PD_FusedFC'. diff --git a/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..8bdf957db27d8c2b20025931a76826628feddbdd --- /dev/null +++ b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h" // NOLINT + +#include +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" + +namespace { +#include "paddle/infrt/dialect/pd/pass/pd_op_fuse.cpp.inc" // NOLINT + +/* + * PdOpFusePass. 
+ */ +struct PdOpFusePass + : public mlir::PassWrapper { + public: + ::llvm::StringRef getName() const override { return "PdOpFusePass"; } + + llvm::StringRef getArgument() const override { return "pd-op-fuse"; } + + void runOnFunction() override; +}; + +// Implementation of the PdOpFusePass. +void PdOpFusePass::runOnFunction() { + ::mlir::RewritePatternSet patterns(&getContext()); + populateWithGenerated(patterns); + (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); +} + +} // namespace + +mlir::PassRegistration infrt_op_fuse_pass; diff --git a/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..854545ab1a2638224e16a300bfccb1f953f81c77 --- /dev/null +++ b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h @@ -0,0 +1,24 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +namespace infrt { +/* + * PdOpFusePass. + */ +std::unique_ptr CreatePdOpFusePass(); + +} // namespace infrt diff --git a/paddle/infrt/dialect/pd_ops.cc b/paddle/infrt/dialect/pd_ops.cc deleted file mode 100644 index 96e9e307f2fd3f33be3d2273a7aa66c363e4beb1..0000000000000000000000000000000000000000 --- a/paddle/infrt/dialect/pd_ops.cc +++ /dev/null @@ -1,178 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/infrt/dialect/pd_ops.h" - -#include -#include - -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_extra_ops.cpp.inc" // NOLINT - -namespace mlir { -namespace pd { - -#include "paddle/infrt/dialect/rewrite.cpp.inc" // NOLINT - -PaddleDialect::PaddleDialect(MLIRContext *context) - : Dialect("pd", context, TypeID::get()) { - addOperations< -#define GET_OP_LIST -#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT - , -#define GET_OP_LIST -#include "paddle/infrt/dialect/pd_extra_ops.cpp.inc" // NOLINT - >(); -} - -mlir::Operation *PaddleDialect::materializeConstant(mlir::OpBuilder &builder, - mlir::Attribute value, - mlir::Type type, - mlir::Location loc) { - return builder.create(loc, value); -} - -void ConstantOp::build(OpBuilder &builder, - OperationState &state, - Attribute value) { - if (auto elem_attr = value.dyn_cast()) { - return ConstantOp::build(builder, state, elem_attr); - } else if (value.isa()) { - ShapedType type = RankedTensorType::get(/*shape=*/{}, value.getType()); - state.addAttribute("value", DenseElementsAttr::get(type, value)); - state.addTypes(type); - return; - } - llvm_unreachable("unsupported attribute type for building pd.constant"); -} - -LogicalResult ConstantOp::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(attributes.get("value").getType()); - return success(); -} -mlir::OpFoldResult ConstantOp::fold( - ::llvm::ArrayRef operands) { - return value(); -} -/* -LogicalResult ElementwiseAdd::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} -*/ - -void Elementwise_addOp::getCanonicalizationPatterns( - mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { - results.insert(context); -} - -/* -mlir::OpFoldResult ElementwiseAdd::fold( - llvm::ArrayRef operands) { - if (getElementTypeOrSelf(getType()).isa()) { - if (!operands[0] || !operands[1]) return {}; - DenseElementsAttr lhs = operands[0].dyn_cast(); - DenseElementsAttr rhs = operands[1].dyn_cast(); - if (!lhs || !rhs) return {}; - ShapedType type = getType().template cast(); - if (!type.hasStaticShape()) return {}; - Type etype = type.getElementType(); - if (!etype.isa()) return {}; - SmallVector values; - values.reserve(lhs.getNumElements()); - for (const auto zip : - llvm::zip(lhs.getValues(), rhs.getValues())) { - values.push_back( - std::plus()(std::get<0>(zip), std::get<1>(zip))); - } - return DenseElementsAttr::get(type, values); - } - return {}; -} - -LogicalResult ElementwiseDiv::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} - -LogicalResult ElementwiseMul::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} - -LogicalResult ElementwiseSub::inferReturnTypes( - MLIRContext *context, - Optional 
location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} - -LogicalResult MulOp::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} - -void ReluOp::getCanonicalizationPatterns( - mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { - results.insert(context); -} - -void FusedRepeatedFCRelu::getCanonicalizationPatterns( - mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { - results.insert(context); -} - -void BatchNormOp::getCanonicalizationPatterns( - mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { - results.insert(context); -}*/ - -} // namespace pd -} // namespace mlir diff --git a/paddle/infrt/dialect/pd_ops.h b/paddle/infrt/dialect/pd_ops.h deleted file mode 100644 index e6b0f30c059054189fe3a86bb112da923ad76423..0000000000000000000000000000000000000000 --- a/paddle/infrt/dialect/pd_ops.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" - -namespace mlir { -namespace pd { - -class PaddleDialect : public Dialect { - public: - explicit PaddleDialect(MLIRContext* context); - - static StringRef getDialectNamespace() { return "pd"; } - - /// A hook used to materialize constant values with the given type. - Operation* materializeConstant(OpBuilder& builder, - Attribute value, - Type type, - Location loc) override; - - Type parseType(DialectAsmParser& parser) const override { - return Dialect::parseType(parser); - } - void printType(Type type, DialectAsmPrinter& printer) const override { - Dialect::printType(type, printer); - } -}; - -} // namespace pd -} // namespace mlir - -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_ops.hpp.inc" -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_extra_ops.hpp.inc" diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td index 8c3a79498d74d3b80e1590bbc2c0530c7af6411e..1fda2d9d8886008c6415b5a1cf36d53c1500707a 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td @@ -21,8 +21,8 @@ def PHI_DenseTensorDialect : Dialect { class PDT_Op traits = []> : Op {} -class CreateDenseTensorOp - : PDT_Op<"create_dense_tensor", [NoSideEffect]> { +class CreateDenseTensorOp + : PDT_Op<"create_dense_tensor." 
# target, [NoSideEffect]> { let arguments = (ins Context:$context, I64ArrayAttr:$dims, LayoutAttr:$layout, I64ArrayAttr:$lod, PrecisionAttr:$precision); let results = (outs DenseTensor:$output); @@ -51,9 +51,11 @@ class CreateContextOp let results = (outs Context:$output); } -def PDT_CreateDenseTensorOp : CreateDenseTensorOp; +def PDT_CreateCPUDenseTensorOp : CreateDenseTensorOp<"cpu">; +def PDT_CreateGPUDenseTensorOp : CreateDenseTensorOp<"gpu">; def PDT_FillDenseTensorOp_f32 : FillDenseTensorOp; def PDT_CreateCPUContextOp : CreateContextOp<"cpu">; +def PDT_CreateGPUContextOp : CreateContextOp<"gpu">; def PDT_PrintDenseTensor : PrintDenseTensorOp; def FakeKernelOp : PDT_Op<"fake_phi_kernel"> { diff --git a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc index f9e124aba6c28695f2c0fafa91404d2d10db8eea..13cba6eeabb669cf93deb9a37d87d2ddff66e5c0 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc +++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc @@ -32,6 +32,7 @@ #include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" #include "paddle/infrt/dialect/phi/pass/proto_arg_map_context.h" #include "paddle/phi/core/compat/op_utils.h" +#include "paddle/phi/core/kernel_factory.h" #include "paddle/phi/ops/compat/signatures.h" namespace { @@ -94,42 +95,49 @@ void PhiOpConvertPass::convertStage() { // Todo: print log continue; } - - ::phi::KernelSignature kernel_sign = - ::phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)( - infrt::ProtoArgumentMappingContext(op)); - // resort input&output according to kernel_sign - ::llvm::SmallVector inputs, ori_output; - ::llvm::SmallVector output_types; - for (const std::string &str : std::get<0>(kernel_sign.args)) { - if (pd_dialect_inputs_info_map_.at(op_name).count(str) == 0) { - LOG(ERROR) << "No input info for Op " << op_name << " and argument " - << str; - return; + auto loc = getFunction().getLoc(); + builder.setInsertionPoint(op); + if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_name)) { + std::string kernel_name = phi::TransToPhiKernelName(op_name); + auto kernel_op = builder.create(loc, + op->getResultTypes(), + op->getOperands(), + kernel_name, + op->getAttrDictionary()); + op->replaceAllUsesWith(kernel_op.getResults()); + } else { + ::phi::KernelSignature kernel_sign = + ::phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)( + infrt::ProtoArgumentMappingContext(op)); + // resort input&output according to kernel_sign + ::llvm::SmallVector inputs, ori_output; + ::llvm::SmallVector output_types; + for (const std::string &str : std::get<0>(kernel_sign.args)) { + if (pd_dialect_inputs_info_map_.at(op_name).count(str) == 0) { + LOG(ERROR) << "No input info for Op " << op_name << " and argument " + << str; + return; + } + uint8_t index = pd_dialect_inputs_info_map_.at(op_name).at(str); + inputs.push_back(op->getOperands()[index]); } - uint8_t index = pd_dialect_inputs_info_map_.at(op_name).at(str); - inputs.push_back(op->getOperands()[index]); - } - for (const std::string &str : std::get<2>(kernel_sign.args)) { - if (pd_dialect_outputs_info_map_.at(op_name).count(str) == 0) { - LOG(ERROR) << "No output info for Op " << op_name << " and argument " - << str; - return; + for (const std::string &str : std::get<2>(kernel_sign.args)) { + if (pd_dialect_outputs_info_map_.at(op_name).count(str) == 0) { + LOG(ERROR) << "No output info for Op " << op_name << " and argument " + << str; + return; + } + uint8_t index = 
pd_dialect_outputs_info_map_.at(op_name).at(str); + output_types.push_back(op->getResultTypes()[index]); + ori_output.push_back(op->getResult(index)); + } + auto kernel_op = builder.create( + loc, output_types, inputs, kernel_sign.name, op->getAttrDictionary()); + for (size_t index = 0; index < ori_output.size(); ++index) { + ori_output[index].replaceAllUsesWith(kernel_op.getResult(index)); } - uint8_t index = pd_dialect_outputs_info_map_.at(op_name).at(str); - output_types.push_back(op->getResultTypes()[index]); - ori_output.push_back(op->getResult(index)); - } - - auto loc = getFunction().getLoc(); - builder.setInsertionPoint(op); - auto kernel_op = builder.create( - loc, output_types, inputs, kernel_sign.name, op->getAttrDictionary()); - for (size_t index = 0; index < ori_output.size(); ++index) { - ori_output[index].replaceAllUsesWith(kernel_op.getResult(index)); } - CHECK(op->use_empty()); op->erase(); } diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc index 64b184359700ee2625e3c61d21617619a50771e3..1cd5b5a85511fe20e8029185caf4c93d95979b72 100644 --- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc @@ -60,6 +60,10 @@ bool ProtoArgumentMappingContext::IsSelectedRowsInput( const std::string& name) const { return false; } +bool ProtoArgumentMappingContext::IsDenseTensorVectorInput( + const std::string& name) const { + return false; +} bool ProtoArgumentMappingContext::IsDenseTensorOutput( const std::string& name) const { diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h index e4e9b5c3ff8a15dbe00dc1bd57fdce1a087437d8..5cf2ef979076d697f1991ad33cd38c36dda16cab 100644 --- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include #include -#include "paddle/infrt/dialect/pd_ops_info.h" +#include "paddle/infrt/dialect/pd/common/pd_ops_info.h" #include "paddle/phi/core/compat/arg_map_context.h" namespace infrt { @@ -42,6 +42,7 @@ class ProtoArgumentMappingContext : public ::phi::ArgumentMappingContext { bool IsDenseTensorInput(const std::string& name) const override; bool IsSelectedRowsInput(const std::string& name) const override; + bool IsDenseTensorVectorInput(const std::string& name) const override; bool IsDenseTensorOutput(const std::string& name) const override; bool IsSelectedRowsOutput(const std::string& name) const override; diff --git a/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td b/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td index 46c250b05492cefe61d8e677a352a217718189b8..6467c1285f85e0c8bfca7b873ce64a09a52074ff 100644 --- a/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td +++ b/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td @@ -3,7 +3,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" include "paddle/infrt/dialect/infrt/ir/infrt_base.td" -include "paddle/infrt/dialect/pd_ops.td" +include "paddle/infrt/dialect/pd/ir/pd_ops.td" include "paddle/infrt/dialect/tensorrt/trt_ops.td" def PD2TRT_Matmul_Lower : Pat< diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc index ad6b136463a71dcc2fcd9ce2b4e2da6f68e88dd2..e22a2309cbe2d343fab4e6e918d3c5a3f98cbb4e 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc @@ -17,11 +17,12 @@ #include #include #include -#include #include #include #include +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" + namespace infrt { namespace trt { namespace { diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc index e3a7b455024c65d40ccbafb28fba9e9b0ead0369..f81179e548fd5fb15850e9b8943bce440dc3091c 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc @@ -15,7 +15,7 @@ #include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h" #include -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace infrt { namespace trt { diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc index 83bebdb6bf19bdf8f75d11d693813b8169e297a0..1e6a3e1380555ea94b0d5de9d64cdc42a27e894e 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc @@ -14,7 +14,7 @@ #include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" #include #include -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" #include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h" namespace infrt { diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc index 9f348b4122fc74033703c92459e6cfa5b3a1f3a2..2c6f08277c803fe744bbfe559f21a6b8b085b816 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc @@ -17,7 +17,7 @@ #include #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace infrt { namespace trt { diff --git 
a/paddle/infrt/dialect/tensorrt/trt_ops.cc b/paddle/infrt/dialect/tensorrt/trt_ops.cc index d5222976625a2adece9a87c8952dba10137ae9ba..415a78a6967ab6fd4e2a38380d09a5d5c64b1c2f 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.cc +++ b/paddle/infrt/dialect/tensorrt/trt_ops.cc @@ -21,6 +21,10 @@ #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h" +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" + namespace infrt { namespace trt { diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.h b/paddle/infrt/dialect/tensorrt/trt_ops.h index 78d960b5120454bdd01b779abedbe2f7ec0d5853..76768037dbdb3072976d9f6cf0cdfb4f7956bdd4 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.h +++ b/paddle/infrt/dialect/tensorrt/trt_ops.h @@ -30,7 +30,7 @@ #include #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace infrt { namespace trt { diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.td b/paddle/infrt/dialect/tensorrt/trt_ops.td index 132a1d7805bdb85af8716e384ec29357a6ff68ad..31b28a38e7cfee4eb6da68302d482218d97f8350 100755 --- a/paddle/infrt/dialect/tensorrt/trt_ops.td +++ b/paddle/infrt/dialect/tensorrt/trt_ops.td @@ -7,6 +7,8 @@ include "mlir/Interfaces/CallInterfaces.td" include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/tensorrt/trt_op_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" +include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td" def TRT_CreateEngineOp : TRT_Op<"create_engine", [SingleBlockImplicitTerminator<"::infrt::ReturnOp">]> { let summary = "trt CreateEngine Op"; @@ -14,8 +16,8 @@ def TRT_CreateEngineOp : TRT_Op<"create_engine", [SingleBlockImplicitTerminator< Describe a tensorrt subgraph. }]; let regions = (region SizedRegion<1>:$body); - let arguments = (ins Variadic:$inputs, DefaultValuedAttr:$run_once); - let results = (outs TRT_EngineType:$output); + let arguments = (ins Variadic:$inputs, DefaultValuedAttr:$run_once); + let results = (outs TRT_EngineType:$engine); } def TRT_ExecuteOp : TRT_Op<"execute", [NoSideEffect]> { @@ -23,8 +25,25 @@ def TRT_ExecuteOp : TRT_Op<"execute", [NoSideEffect]> { let description = [{ Describe a tensorrt runtime. }]; - let arguments = (ins TRT_EngineType:$engine, Variadic:$inputs); - let results = (outs Variadic:$output); + let arguments = (ins TRT_EngineType:$engine, Variadic:$inputs); + let results = (outs Variadic:$output); +} + +def TRT_EngineComputeOp : TRT_Op<"compute", [NoSideEffect]> { + let summary = "trt compute engine"; + let description = [{ + execute engine + }]; + let arguments = (ins TRT_EngineType:$engine, Context:$context); + let results = (outs DenseTensorList:$outputs); +} + +def TRT_InspectEngineOp : TRT_Op<"inspect_engine", [NoSideEffect]> { + let summary = "trt inspect engine"; + let description = [{ + Show engine + }]; + let arguments = (ins TRT_EngineType:$engine); } def TRT_ActivationOp : TRT_Op<"Activation", [NoSideEffect]> { @@ -34,11 +53,11 @@ def TRT_ActivationOp : TRT_Op<"Activation", [NoSideEffect]> { TensorRT IActivationLayer. 
}]; - let arguments = (ins TRT_Tensor:$input, SI32Attr:$activation_type, + let arguments = (ins DenseTensor:$input, SI32Attr:$activation_type, DefaultValuedAttr:$alpha, DefaultValuedAttr:$beta); - let results = (outs TRT_Tensor:$output); + let results = (outs DenseTensor:$output); } def TRT_ElementWiseOp : TRT_Op<"ElementWise", [NoSideEffect]> { @@ -48,9 +67,9 @@ def TRT_ElementWiseOp : TRT_Op<"ElementWise", [NoSideEffect]> { TensorRT IElementWiseLayer. }]; - let arguments = (ins TRT_Tensor:$input1, TRT_Tensor:$input2, SI32Attr:$elementwise_operation); + let arguments = (ins DenseTensor:$input1, DenseTensor:$input2, SI32Attr:$elementwise_operation); - let results = (outs TRT_Tensor:$output); + let results = (outs DenseTensor:$output); } def TRT_MatrixMultiplyOp : TRT_Op<"MatrixMultiply", [NoSideEffect]> { @@ -60,10 +79,10 @@ def TRT_MatrixMultiplyOp : TRT_Op<"MatrixMultiply", [NoSideEffect]> { TensorRT IMatrixMultiplyLayer. }]; - let arguments = (ins TRT_Tensor:$input1, BoolAttr:$transpose1, - TRT_Tensor:$input2, BoolAttr:$transpose2); + let arguments = (ins DenseTensor:$input1, BoolAttr:$transpose1, + DenseTensor:$input2, BoolAttr:$transpose2); - let results = (outs TRT_Tensor:$output); + let results = (outs DenseTensor:$output); } #endif // TRT_OPS diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc index 319df90d3eec133d3f02be6749e9ad379fd225fd..81bf873ddf0cf3f1a94489bd3b0b2769274b1b4a 100644 --- a/paddle/infrt/host_context/mlir_exec.cc +++ b/paddle/infrt/host_context/mlir_exec.cc @@ -33,7 +33,10 @@ #include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h" #include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h" #include "paddle/infrt/kernel/phi/registry.h" -#endif +#if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT) +#include "paddle/infrt/kernel/tensorrt/registry.h" +#endif // INFRT_WITH_GPU && INFRT_WITH_TRT +#endif // INFRT_WITH_PHI static llvm::cl::list cl_shared_libs( // NOLINT "shared_libs", llvm::cl::desc("Specify shared library with kernels."), llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated); @@ -62,6 +65,9 @@ int main(int argc, char** argv) { #ifdef INFRT_WITH_PHI kernel::RegisterPhiKernels(&registry); kernel::RegisterInferShapeLaunchers(&registry); +#if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT) + kernel::RegisterTrtKernels(&registry); +#endif // INFRT_WITH_GPU && INFRT_WITH_TRT #endif // load extra shared library diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index c613843cd1779599fbac5aea6042b26b151534e8..3d5cccb5c32694ff05d10811bbff0f068bd6bc51 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -16,12 +16,14 @@ #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -42,6 +44,13 @@ #include "paddle/infrt/host_context/value.h" #include "paddle/infrt/tensor/tensor_shape.h" +#ifdef INFRT_WITH_PHI +#ifdef INFRT_WITH_TRT +#include "paddle/infrt/kernel/tensorrt/trt_kernels.h" +#endif +#include "paddle/phi/core/dense_tensor.h" +#endif + namespace infrt { namespace host_context { @@ -277,33 +286,58 @@ bool MlirToRuntimeTranslator::EmitGeneralOp( impl_->runtime->NewOpExecutable(op->getName().getStringRef().str()); VLOG(3) << "processing general op : " << op->getName().getStringRef().str(); + // TODO(wilber): Find a more appropriate way to handle special cases.
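The special case that follows handles trt.create_engine: instead of appending operands to the op executable, the translator wraps the mlir::Operation plus the runtime symbol table into a MlirOperationWithInfrtSymbol argument, and feeds each DenseTensor block argument into the runtime keyed by its stringified operand index; CreateTrtEngine later resolves the same keys via symbol_table->GetValue(std::to_string(idx)). A minimal standalone C++ sketch of that index-keyed handshake, in which a std::map stands in for the infrt SymbolTable (that substitution is an assumption made purely for illustration):

#include <cassert>
#include <map>
#include <string>

struct Value {};  // stand-in for infrt::host_context::Value

int main() {
  // Translator side: each block-argument operand is fed in, keyed by index.
  std::map<std::string, Value*> symbol_table;
  Value v0, v1;
  symbol_table[std::to_string(0)] = &v0;
  symbol_table[std::to_string(1)] = &v1;
  // Kernel side: CreateTrtEngine resolves operand i with the same key.
  for (int i = 0; i < 2; ++i) {
    assert(symbol_table.count(std::to_string(i)) == 1);
  }
  return 0;
}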
+ if (op->getName().getStringRef() == "trt.create_engine") { +#ifdef INFRT_WITH_TRT + auto* symbols = impl_->runtime->symbol_table(); + ::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol mlir_operation; + mlir_operation.operation = op; + mlir_operation.symbol_table = symbols; + impl_->cur_op->AppendArgument(new Value(mlir_operation)); + // TODO(wilber): how to pass DenseTensor to create_engine op? Temporarily + // add a naive implementation. + for (int i = 0, e = op->getNumOperands(); i < e; ++i) { + auto operand = op->getOperand(i); + if (operand.isa()) { + mlir::BlockArgument arg = operand.dyn_cast(); + Value* arg_value = GetValue(arg); + if (arg_value->is_type()) { + impl_->runtime->FeedInArgs( + std::make_pair(std::to_string(i), ValueRef(arg_value))); + } + } + } +#else + CHECK(false) << "should not reach here"; +#endif + } else { + // process operands + for (int i = 0, e = op->getNumOperands(); i < e; i++) { + // function argument as value + auto operand = op->getOperand(i); + /// if (operand.getKind() == mlir::Value::Kind::BlockArgument) { + if (operand.isa()) { + mlir::BlockArgument arg = operand.dyn_cast(); + Value* arg_value = GetValue(arg); + impl_->cur_op->AppendArgument(arg_value); + VLOG(3) << "* op mlir operand: " << DumpToString(arg) << " " + << GetValue(arg); + continue; + } - // process operands - for (int i = 0, e = op->getNumOperands(); i < e; i++) { - // function argument as value - auto operand = op->getOperand(i); - /// if (operand.getKind() == mlir::Value::Kind::BlockArgument) { - if (operand.isa()) { - mlir::BlockArgument arg = operand.dyn_cast(); - Value* arg_value = GetValue(arg); + // normal value + Value* arg_value = GetValue(operand); + if (!arg_value) { + auto upstream_op = operand.getDefiningOp(); + arg_value = GetOpResult(upstream_op); + } + CHECK(arg_value) << "Non-existent argument value found: " + << DumpToString(operand); impl_->cur_op->AppendArgument(arg_value); - VLOG(3) << "* op mlir operand: " << DumpToString(arg) << " " - << GetValue(arg); - continue; - } - // normal value - Value* arg_value = GetValue(operand); - if (!arg_value) { - auto upstream_op = operand.getDefiningOp(); - arg_value = GetOpResult(upstream_op); + VLOG(3) << "* op mlir operand: " << DumpToString(operand) << " " + << GetValue(operand) << " vs " << arg_value; } - CHECK(arg_value) << "No-exist argument value found: " - << DumpToString(operand); - impl_->cur_op->AppendArgument(arg_value); - - VLOG(3) << "* op mlir operand: " << DumpToString(operand) << " " - << GetValue(operand) << " vs " << arg_value; } // process attributes @@ -383,33 +417,6 @@ bool MlirToRuntimeTranslator::EmitGeneralOp( impl_->cur_op->AppendAttribute(tmp[i]); } - // process results - llvm::SmallVector res_values; - for (int i = 0, e = op->getNumResults(); i < e; i++) { - auto res = op->getResult(i); - if (res.getType().isa<::infrt::DenseTensorType>()) { - auto r = impl_->value_map.try_emplace( - res, ValueRef(new Value{::phi::DenseTensor()})); - CHECK(r.second) << "Duplicate add mlir value [" << DumpToString(res) - << "]"; - res_values.push_back(r.first->second.get()); - } else { - res_values.push_back(AddValue(res)); - } - - VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res); - } - impl_->cur_op->SetResults(res_values); - -#ifdef INFRT_DEBUG - { - VLOG(3) << "check result"; - for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) { - VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i]; - } - } -#endif - // process regions, we treat regions as attribute.
auto num_regions = op->getNumRegions(); if (num_regions > 0) { @@ -438,6 +445,33 @@ bool MlirToRuntimeTranslator::EmitGeneralOp( impl_->cur_op->AppendAttribute(new Value(function)); } + // process results + llvm::SmallVector res_values; + for (int i = 0, e = op->getNumResults(); i < e; i++) { + auto res = op->getResult(i); + if (res.getType().isa<::infrt::DenseTensorType>()) { + auto r = impl_->value_map.try_emplace( + res, ValueRef(new Value{::phi::DenseTensor()})); + CHECK(r.second) << "Duplicate add mlir value [" << DumpToString(res) + << "]"; + res_values.push_back(r.first->second.get()); + } else { + res_values.push_back(AddValue(res)); + } + + VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res); + } + impl_->cur_op->SetResults(res_values); + +#ifdef INFRT_DEBUG + { + VLOG(3) << "check result"; + for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) { + VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i]; + } + } +#endif + return true; } diff --git a/paddle/infrt/host_context/paddle_mlir.cc b/paddle/infrt/host_context/paddle_mlir.cc index 18c25827b8ec5a71907e694cea4e7680b598e883..48999a23ef34cd119081810fb4baac77f5fb123b 100644 --- a/paddle/infrt/host_context/paddle_mlir.cc +++ b/paddle/infrt/host_context/paddle_mlir.cc @@ -13,15 +13,17 @@ // limitations under the License. #include "paddle/infrt/host_context/paddle_mlir.h" -#include "paddle/infrt/dialect/pd_ops_info.h" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" +#include "paddle/infrt/dialect/pd/common/pd_ops_info.h" MLIRModelGenImpl::MLIRModelGenImpl() : context_(infrt::Global::getMLIRContext()), builder_(context_) { - context_->allowUnregisteredDialects(); context_->getOrLoadDialect(); context_->getOrLoadDialect(); context_->getOrLoadDialect(); context_->getOrLoadDialect(); + context_->getOrLoadDialect<::infrt::InfrtDialect>(); module_ = mlir::ModuleOp::create(mlir::UnknownLoc::get(context_)); } @@ -55,7 +57,6 @@ mlir::ModuleOp MLIRModelGenImpl::ImportPaddleModel( UpdateModelParams(program, &mainFunc); UpdateModelOps(program); UpdateModelOutputs(program); - return module_; } @@ -171,7 +172,11 @@ void MLIRModelGenImpl::UpdateModelParams( ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(), builder_, &precision_); - mlir::Type type_ = mlir::RankedTensorType::get(dims, precision_); + mlir::Type type_ = + infrt::DenseTensorType::get(context_, + infrt::TargetType::CPU, + infrt::PrecisionType::FLOAT32, + infrt::LayoutType::NCHW); auto op = builder_.create( mlir::UnknownLoc::get(context_), type_, map, name); params_map_.insert(std::pair( @@ -197,8 +202,9 @@ void MLIRModelGenImpl::UpdateModelOutputs( llvm::SmallVector resultTypes; llvm::SmallVector attrs; + mlir::OperationState state(loc, - mlir::ReturnOp::getOperationName(), + ::infrt::ReturnOp::getOperationName(), operands, resultTypes, attrs); @@ -321,7 +327,7 @@ llvm::SmallVector MLIRModelGenImpl::GetOpAttributes( switch (type) { ATTR_IMPL_CASE(FLOAT, f, getF32FloatAttr); ATTR_IMPL_CASE(BOOLEAN, b, getBoolAttr); - ATTR_IMPL_CASE(INT, i, getI32IntegerAttr); + ATTR_IMPL_CASE(INT, i, getSI32IntegerAttr); ATTR_IMPL_CASE(LONG, l, getI64IntegerAttr); ATTR_IMPL_CASE(STRING, s, getStringAttr); diff --git a/paddle/infrt/host_context/paddle_mlir.h b/paddle/infrt/host_context/paddle_mlir.h index e825cbb5a11ea0dfcacfc2b1bbb63bf201219c9d..d5f1209b9925b6f2bb916cdd99024a5782485365 100644 --- a/paddle/infrt/host_context/paddle_mlir.h +++ 
b/paddle/infrt/host_context/paddle_mlir.h @@ -14,22 +14,22 @@ #ifndef PADDLE_INFRT_HOST_CONTEXT_PADDLE_MLIR_H_ #define PADDLE_INFRT_HOST_CONTEXT_PADDLE_MLIR_H_ +#include +#include +#include +#include +#include +#include #include #include #include -#include "llvm/Support/CommandLine.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/IR/AsmState.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/MLIRContext.h" #include "paddle/infrt/common/global.h" #include "paddle/infrt/common/string.h" #include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" - #include "paddle/infrt/dialect/init_dialects.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" #include "paddle/infrt/dialect/tensor_shape.h" #include "paddle/infrt/paddle/model_parser.h" diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index 957d852442b10620244e230a2f7704eb7fa0a33e..1f0b1dabd94d8dcf28e8e0543a8e3b12ed250704 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -24,6 +24,7 @@ #include "paddle/infrt/common/shared.h" #include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/infrt/host_context/function.h" +#include "paddle/infrt/host_context/symbol_table.h" #include "paddle/infrt/support/variant.h" #include "paddle/infrt/tensor/dense_host_tensor.h" #include "paddle/infrt/tensor/dense_tensor_view.h" @@ -41,7 +42,15 @@ #include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/meta_tensor.h" -#endif + +#ifdef INFRT_WITH_GPU +#include "paddle/phi/backends/gpu/gpu_context.h" +#endif // INFRT_WITH_GPU +#ifdef INFRT_WITH_TRT +#include "paddle/infrt/backends/tensorrt/trt_engine.h" +#include "paddle/infrt/kernel/tensorrt/trt_kernels.h" +#endif // INFRT_WITH_TRT +#endif // INFRT_WITH_PHI namespace infrt { namespace host_context { @@ -72,8 +81,13 @@ using ValueVariantType = ::phi::MetaTensor, ::phi::DenseTensor, backends::CpuPhiContext, +#ifdef INFRT_WITH_GPU + backends::GpuPhiContext, + ::phi::GPUContext, +#endif ::phi::CPUContext, std::vector, + std::vector, paddle::experimental::ScalarBase, paddle::experimental::ScalarArrayBase, std::vector, @@ -81,6 +95,10 @@ using ValueVariantType = paddle::experimental::Backend, paddle::experimental::DataLayout, paddle::experimental::DataType, +#ifdef INFRT_WITH_TRT + ::infrt::backends::tensorrt::TrtEngine, + ::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol, +#endif // INFRT_WITH_TRT #endif std::vector, std::vector, @@ -120,8 +138,18 @@ class Value : public common::Object { #ifdef INFRT_WITH_PHI explicit Value(::phi::CPUContext&& x) : data(std::move(x)) {} explicit Value(backends::CpuPhiContext&& x) : data(std::move(x)) {} +#ifdef INFRT_WITH_GPU + explicit Value(::phi::GPUContext&& x) : data(std::move(x)) {} + explicit Value(backends::GpuPhiContext&& x) : data(std::move(x)) {} +#endif explicit Value(::phi::DenseTensor&& x) : data(std::move(x)) {} explicit Value(::phi::MetaTensor&& x) : data(std::move(x)) {} +#ifdef INFRT_WITH_TRT + explicit Value(::infrt::backends::tensorrt::TrtEngine&& x) + : data(std::move(x)) {} + explicit Value(::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol x) + : data(x) {} +#endif // INFRT_WITH_TRT #endif template diff --git a/paddle/infrt/kernel/CMakeLists.txt b/paddle/infrt/kernel/CMakeLists.txt index f1cbfba1c46b33e461a7c9f08cf646625fbafb24..f20344f6f6b84ae8e63f44c7b7b83c6ba9d8d6da 100644 --- a/paddle/infrt/kernel/CMakeLists.txt 
+++ b/paddle/infrt/kernel/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectory(phi) +add_subdirectory(tensorrt) core_gather_headers() diff --git a/paddle/infrt/kernel/phi/context_kernels.cc b/paddle/infrt/kernel/phi/context_kernels.cc index 39ef172fadef9e0f6317dec192c251c6a1df6828..b27eacf9e522d2bbb8b7ffd70ad57f54e5775499 100644 --- a/paddle/infrt/kernel/phi/context_kernels.cc +++ b/paddle/infrt/kernel/phi/context_kernels.cc @@ -25,6 +25,16 @@ namespace phi { return ctx; } +#ifdef INFRT_WITH_GPU +::phi::GPUContext CreateGPUContext() { + ::phi::GPUContext context; + context.PartialInitWithoutAllocator(); + context.SetAllocator(new ::infrt::backends::GpuPhiAllocator{}); + context.PartialInitWithAllocator(); + return context; +} +#endif + } // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/phi/context_kernels.h b/paddle/infrt/kernel/phi/context_kernels.h index 3e9580b91da5724b42c72224847e45715f47dbb7..ae3f76c8fe536f96689680668cc52e4981894063 100644 --- a/paddle/infrt/kernel/phi/context_kernels.h +++ b/paddle/infrt/kernel/phi/context_kernels.h @@ -25,6 +25,10 @@ namespace phi { ::phi::CPUContext CreateCPUContext(); +#ifdef INFRT_WITH_GPU +::phi::GPUContext CreateGPUContext(); +#endif + } // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc index 777fb29ac60d9c7125898752747bbdf553f370c0..6d16b814c6b02b08e279190d5a685d65c124942d 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc @@ -15,6 +15,12 @@ #include "paddle/infrt/kernel/phi/dense_tensor_kernels.h" #include "paddle/infrt/dialect/phi/data_type.h" #include "paddle/infrt/kernel/phi/context_kernels.h" +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/common/place.h" + +#ifdef INFRT_WITH_GPU +#include +#endif namespace infrt { namespace kernel { @@ -34,26 +40,83 @@ namespace phi { {})); } +::phi::DenseTensor CreateGPUDenseTensor( + const ::phi::GPUContext& context, + host_context::Attribute> dims, + host_context::Attribute> lod, + host_context::Attribute<::infrt::LayoutType> layout, + host_context::Attribute<::infrt::PrecisionType> precision) { + return ::phi::DenseTensor( + const_cast<::phi::Allocator*>(&context.GetAllocator()), + ::phi::DenseTensorMeta(ConvertPrecisionToPhi(precision.get()), + ::phi::make_ddim(dims.get()), + ConvertLayoutToPhi(layout.get()), + {})); +} + void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, host_context::Attribute> value) { - auto place = ::phi::CPUPlace(); + auto place = dense_tensor->place(); float* a_data = dense_tensor->mutable_data(place); - for (int64_t i = 0; i < dense_tensor->numel(); ++i) { - a_data[i] = (value.get())[i]; + if (place.GetType() == ::phi::AllocationType::CPU) { + for (int64_t i = 0; i < dense_tensor->numel(); ++i) { + a_data[i] = (value.get())[i]; + } + } else if (place.GetType() == ::phi::AllocationType::GPU) { +#ifdef INFRT_WITH_GPU + // TODO(wilber): how to set the stream parameter to copy with stream. 
+ cudaMemcpy(a_data, + value.get().data(), + sizeof(float) * value.get().size(), + cudaMemcpyHostToDevice); +#endif + } else { + llvm_unreachable("temporarily not support other target."); } } void PrintDenseTensor(::phi::DenseTensor* dense_tensor) { -#define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \ - case ::phi::DataType::PHI_DATATYPE: { \ - DTYPE* data = dense_tensor->data(); \ - if (dense_tensor->numel() == 0) break; \ - std::cout << data[0]; \ - for (int64_t i = 1; i < dense_tensor->numel(); i++) { \ - std::cout << "," << data[i]; \ - } \ - break; \ +#ifndef INFRT_WITH_GPU +#define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \ + case ::phi::DataType::PHI_DATATYPE: { \ + auto place = dense_tensor->place(); \ + if (place.GetType() == ::phi::AllocationType::CPU) { \ + DTYPE* data = dense_tensor->data(); \ + if (dense_tensor->numel() == 0) break; \ + std::cout << data[0]; \ + for (int64_t i = 1; i < dense_tensor->numel(); i++) { \ + std::cout << "," << data[i]; \ + } \ + } \ + break; \ + } +#else +#define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \ + case ::phi::DataType::PHI_DATATYPE: { \ + auto place = dense_tensor->place(); \ + DTYPE* data = dense_tensor->data(); \ + if (dense_tensor->numel() == 0) break; \ + if (place.GetType() == ::phi::AllocationType::CPU) { \ + std::cout << data[0]; \ + for (int64_t i = 1; i < dense_tensor->numel(); i++) { \ + std::cout << "," << data[i]; \ + } \ + } else if (place.GetType() == ::phi::AllocationType::GPU) { \ + std::vector host_data(dense_tensor->numel(), 0); \ + cudaMemcpy(host_data.data(), \ + data, \ + sizeof(DTYPE) * dense_tensor->numel(), \ + cudaMemcpyDeviceToHost); \ + std::cout << host_data[0]; \ + for (int64_t i = 1; i < dense_tensor->numel(); i++) { \ + std::cout << "," << host_data[i]; \ + } \ + } else { \ + llvm_unreachable("temporarily not support other target."); \ + } \ + break; \ } +#endif ::phi::DDim dims = dense_tensor->dims(); std::cout << "dense_tensor: shape=shape" << dims.to_str() << "," diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h index 8cc0e39e0e4431f073ac37a7f0557f2c837dc753..47d89506e2aa615b0bc425a4c373c904d937e03f 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.h +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h @@ -30,6 +30,13 @@ namespace phi { host_context::Attribute<::infrt::LayoutType> layout, host_context::Attribute<::infrt::PrecisionType> precision); +::phi::DenseTensor CreateGPUDenseTensor( + const ::phi::GPUContext& context, + host_context::Attribute> dims, + host_context::Attribute> lod, + host_context::Attribute<::infrt::LayoutType> layout, + host_context::Attribute<::infrt::PrecisionType> precision); + void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, host_context::Attribute> values); void PrintDenseTensor(::phi::DenseTensor* dense_tensor); diff --git a/paddle/infrt/kernel/phi/registry.cc b/paddle/infrt/kernel/phi/registry.cc index 0e071418603f8390ca3283f617b06cf1fa91b94c..36d40118f16a0bd1779765064caaac6dbe414772 100644 --- a/paddle/infrt/kernel/phi/registry.cc +++ b/paddle/infrt/kernel/phi/registry.cc @@ -35,7 +35,7 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) { registry->AddKernel("phi_dt.create_context.cpu", INFRT_KERNEL(infrt::kernel::phi::CreateCPUContext)); registry->AddKernelWithAttrs( - "phi_dt.create_dense_tensor", + "phi_dt.create_dense_tensor.cpu", INFRT_KERNEL(infrt::kernel::phi::CreateDenseTensor), {"dims", "lod", "layout", "precision"}); registry->AddKernelWithAttrs( @@ -44,6 +44,15 @@ void 
RegisterPhiKernels(host_context::KernelRegistry* registry) { {"value"}); registry->AddKernel("phi_dt.print_tensor", INFRT_KERNEL(infrt::kernel::phi::PrintDenseTensor)); + +#ifdef INFRT_WITH_GPU + registry->AddKernel("phi_dt.create_context.gpu", + INFRT_KERNEL(infrt::kernel::phi::CreateGPUContext)); + registry->AddKernelWithAttrs( + "phi_dt.create_dense_tensor.gpu", + INFRT_KERNEL(infrt::kernel::phi::CreateGPUDenseTensor), + {"dims", "lod", "layout", "precision"}); +#endif } } // namespace kernel diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc index b7503aa4ef35894dda514fdb7fa4336485323094..79502f9fdfd4bd88666f61ff30bc526325b91341 100644 --- a/paddle/infrt/kernel/tensor_kernels.cc +++ b/paddle/infrt/kernel/tensor_kernels.cc @@ -25,6 +25,10 @@ #include "paddle/infrt/tensor/tensor_map.h" #include "paddle/infrt/tensor/tensor_shape.h" +#ifdef INFRT_WITH_PHI +#include "paddle/phi/core/dense_tensor.h" +#endif + namespace infrt { namespace kernel { using namespace host_context; // NOLINT @@ -62,6 +66,20 @@ DenseHostTensor TensorMapGetTensor(TensorMap map, Attribute name) { int32_t TensorMapGetSize(TensorMap map) { return map.size(); } +// TODO(wilber): Maybe we should place TensorList type in dt dialect. +#ifdef INFRT_WITH_PHI +phi::DenseTensor TensorListGetTensor(std::vector list, + Attribute idx) { + CHECK_LT(idx.get(), static_cast(list.size())) + << "idx should be less than list size"; + return *list[idx.get()]; +} + +int32_t TensorListGetSize(const std::vector &list) { + return list.size(); +} +#endif + DenseHostTensor ShallowCopyTensor(DenseHostTensor v) { return v; } template @@ -126,6 +144,14 @@ void RegisterTensorKernels(host_context::KernelRegistry *registry) { INFRT_KERNEL(TensorMapGetTensor)); registry->AddKernel("dt.tensor_map_get_size", INFRT_KERNEL(TensorMapGetSize)); +// TensorList related methods. +#ifdef INFRT_WITH_PHI + registry->AddKernel("dt.tensor_list_get_tensor", + INFRT_KERNEL(TensorListGetTensor)); + registry->AddKernel("dt.tensor_list_get_size", + INFRT_KERNEL(TensorListGetSize)); +#endif + registry->AddKernel("dt.shallow_copy_tensor", INFRT_KERNEL(ShallowCopyTensor)); diff --git a/paddle/infrt/kernel/tensorrt/CMakeLists.txt b/paddle/infrt/kernel/tensorrt/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd35fccbe2aa35453a4d4ac13364ef6bb5a6b6aa --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/CMakeLists.txt @@ -0,0 +1,10 @@ +if (NOT (INFRT_WITH_PHI AND INFRT_WITH_GPU AND INFRT_WITH_TRT)) + return() +endif() + +core_gather_headers() + +gather_srcs(infrt_src SRCS + registry.cc + trt_kernels.cc +) diff --git a/paddle/infrt/kernel/tensorrt/registry.cc b/paddle/infrt/kernel/tensorrt/registry.cc new file mode 100644 index 0000000000000000000000000000000000000000..a37e3c0f7f2785e23c8a0b9a25d3283396215f70 --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/registry.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
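The TensorList accessors registered in tensor_kernels.cc above bounds-check an index into a std::vector of DenseTensor pointers and return the element by value. A standalone sketch of their semantics, with int standing in for ::phi::DenseTensor and the host_context::Attribute wrapper elided (both stand-ins are assumptions for illustration):

#include <cassert>
#include <cstdint>
#include <vector>

// Mirrors dt.tensor_list_get_tensor: bounds-check, then copy the element out.
int TensorListGetTensor(const std::vector<int*>& list, int32_t idx) {
  assert(idx < static_cast<int32_t>(list.size()) &&
         "idx should be less than list size");
  return *list[idx];
}

// Mirrors dt.tensor_list_get_size.
int32_t TensorListGetSize(const std::vector<int*>& list) {
  return static_cast<int32_t>(list.size());
}

int main() {
  int a = 1, b = 2;
  std::vector<int*> list{&a, &b};
  assert(TensorListGetSize(list) == 2);
  assert(TensorListGetTensor(list, 1) == 2);
  return 0;
}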
+ +#include "paddle/infrt/kernel/tensorrt/registry.h" + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/kernel/tensorrt/trt_kernels.h" + +namespace infrt { +namespace kernel { + +void RegisterTrtKernels(host_context::KernelRegistry* registry) { + registry->AddKernel("trt.create_engine", + INFRT_KERNEL(tensorrt::CreateTrtEngine)); + registry->AddKernel("trt.inspect_engine", + INFRT_KERNEL(tensorrt::PrintTrtLayer)); + registry->AddKernel("trt.compute", INFRT_KERNEL(tensorrt::TrtEngineCompute)); +} + +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensorrt/registry.h b/paddle/infrt/kernel/tensorrt/registry.h new file mode 100644 index 0000000000000000000000000000000000000000..762329ca61d02a16edc150854afcc3dd431a941d --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/registry.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +namespace infrt { +namespace host_context { + +struct KernelRegistry; + +} // namespace host_context +} // namespace infrt + +namespace infrt { +namespace kernel { + +/** + * Register all the trt kernels to registry. + */ +void RegisterTrtKernels(host_context::KernelRegistry* registry); + +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.cc b/paddle/infrt/kernel/tensorrt/trt_kernels.cc new file mode 100644 index 0000000000000000000000000000000000000000..04847ac8982f861ab2799bd23b1c2ab723422327 --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/trt_kernels.cc @@ -0,0 +1,172 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
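RegisterTrtKernels in registry.cc above is, at its core, a name-to-function registration: each textual op name is bound to a host kernel that the translated program dispatches at runtime. A toy sketch of that dispatch shape; the real KernelRegistry and INFRT_KERNEL macro additionally marshal arguments from the execution frame, which this deliberately omits:

#include <functional>
#include <iostream>
#include <map>
#include <string>

int main() {
  // Op name -> host kernel, as in RegisterTrtKernels.
  std::map<std::string, std::function<void()>> registry;
  registry["trt.create_engine"] = [] { std::cout << "build engine\n"; };
  registry["trt.inspect_engine"] = [] { std::cout << "print layer info\n"; };
  registry["trt.compute"] = [] { std::cout << "run engine\n"; };
  // The translated program later dispatches ops by their textual name.
  registry.at("trt.compute")();
  return 0;
}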
+ +#include "paddle/infrt/kernel/tensorrt/trt_kernels.h" +#include +#include "NvInfer.h" +#include "NvInferRuntime.h" +#include "NvInferRuntimeCommon.h" +#include "glog/logging.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Value.h" +#include "paddle/infrt/backends/tensorrt/trt_engine.h" +#include "paddle/infrt/backends/tensorrt/trt_options.h" +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" +#include "paddle/infrt/host_context/symbol_table.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace infrt { +namespace kernel { +namespace tensorrt { + +::infrt::backends::tensorrt::TrtEngine CreateTrtEngine( + MlirOperationWithInfrtSymbol + create_engine_op /*, input_tensors, output_tensors, weights*/) { + // TODO(wilber): The device_id needs to get from mlir. + int device_id = 0; + backends::tensorrt::TrtEngine engine(device_id); + + auto* builder = engine.GetTrtBuilder(); + // TODO(wilber): How to process weights? + backends::tensorrt::TrtUniquePtr network; + // TODO(wilber): static_shape or dynamic_shape network? The code is just + // static_shape test. + network.reset(builder->createNetworkV2(0)); + + // TODO(wilber): The build options should be filled from mlir info. + backends::tensorrt::BuildOptions options; + options.max_batch = 4; + + // Parse mlir Region which only has one block. + mlir::Operation& operation = *create_engine_op.operation; + auto* symbol_table = create_engine_op.symbol_table; + CHECK_NOTNULL(symbol_table); + + unsigned int num_regions = operation.getNumRegions(); + CHECK_EQ(num_regions, 1U) << "only support one region case."; + auto& region = operation.getRegion(0); + auto& block = region.getBlocks().front(); + + llvm::DenseMap map_info; + std::unordered_map trt_bind_inputs; + + for (auto index_operand : llvm::enumerate(operation.getOperands())) { + mlir::Value operand = index_operand.value(); + size_t idx = index_operand.index(); + + const std::string input_name = "input_" + std::to_string(idx); + auto* v = symbol_table->GetValue(std::to_string(idx)); + CHECK_NOTNULL(v); + auto* t = &v->get(); + trt_bind_inputs[input_name] = t; + // TODO(wilber): get input info from mlir. + // TODO(wilber): input dims, now only support static_shape, and just remove + // the first dimension. + // TODO(wilber): now only support float input. + nvinfer1::Dims dims; + dims.nbDims = t->dims().size() - 1; + for (int i = 0; i < dims.nbDims; ++i) { + dims.d[i] = t->dims()[i + 1]; + } + auto* in = + network->addInput(input_name.c_str(), nvinfer1::DataType::kFLOAT, dims); + map_info[operand] = in; + } + + // TODO(wilber): Find a way to add layer.
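The input loop above encodes the static-shape convention used here: the leading (batch) dimension of each bound DenseTensor is dropped before the TensorRT network input is declared, and batching is instead governed by options.max_batch. A standalone sketch of that dims conversion (ToTrtInputDims is a hypothetical helper name):

#include <cassert>
#include <vector>

// Drop dims[0]: in static-shape (implicit batch) mode TensorRT supplies the
// batch dimension separately, so a [4, 3, 28, 28] tensor binds as {3, 28, 28}.
// Assumes tensor_dims is non-empty.
std::vector<int> ToTrtInputDims(const std::vector<int>& tensor_dims) {
  return std::vector<int>(tensor_dims.begin() + 1, tensor_dims.end());
}

int main() {
  const std::vector<int> in{4, 3, 28, 28};
  const std::vector<int> out = ToTrtInputDims(in);
  assert(out == (std::vector<int>{3, 28, 28}));
  return 0;
}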
+ for (auto& inner_op : block.without_terminator()) { + if (inner_op.getName().getStringRef() == "trt.Activation") { + trt::ActivationOp act_op = llvm::dyn_cast(inner_op); + auto in_arg = act_op.getOperand(); + if (!map_info.count(in_arg)) { + CHECK(false) << "map_info does not contain in_arg."; + } + nvinfer1::ActivationType act_type = + static_cast(act_op.activation_type()); + auto* act_layer = network->addActivation(*map_info[in_arg], act_type); + act_layer->setAlpha(act_op.alpha().convertToFloat()); + act_layer->setBeta(act_op.beta().convertToFloat()); + for (size_t i = 0; i < act_op->getNumResults(); ++i) { + nvinfer1::ITensor* act_out_tensor = act_layer->getOutput(i); + mlir::Value act_out = act_op->getResult(i); + map_info[act_out] = act_out_tensor; + } + } + + // if (inner_op.getName().getStringRef() == "trt.Constant") { + // trt::ConstantOp op = llvm::dyn_cast(inner_op); + // mlir::Value op_out = op.getResult(); + // std::vector weight_data{1}; + // auto* layer = network->addConstant(nvinfer1::Dims2(1, 1), + // nvinfer1::Weights{nvinfer1::DataType::kFLOAT, weight_data.data(), 1}); + // auto* op_out_tensor = layer->getOutput(0); + // map_info[op_out] = op_out_tensor; + // } + } + for (auto& inner_op : block.without_terminator()) { + for (mlir::Value v : inner_op.getResults()) { + for (mlir::Operation* user : v.getUsers()) { + if (user->getName().getStringRef() == "infrt.return") { + if (!map_info.count(v)) { + CHECK(false) << "map_info does not contain value"; + } + network->markOutput(*map_info[v]); + } + } + } + } + // std::unordered_map trt_bind_outputs; + mlir::Operation* ret = block.getTerminator(); + for (unsigned int i = 0; i < ret->getNumOperands(); ++i) { + mlir::Value arg = ret->getOperand(i); + CHECK(map_info.count(arg)); + map_info[arg]->setName(("output_" + std::to_string(i)).c_str()); + } + for (int i = 0; i < network->getNbOutputs(); ++i) { + engine.PrepareOutputHandle(network->getOutput(i)->getName()); + } + + VLOG(3) << "trt engine build start."; + engine.Build(std::move(network), options); + VLOG(3) << "trt engine build done."; + + // TODO(wilber): get inference options from mlir. + backends::tensorrt::InferenceOptions inference_options; + inference_options.batch = 1; + // TODO(wilber): bind trt input/output tensors. + engine.SetUpInference(inference_options, trt_bind_inputs); + return engine; +} + +void PrintTrtLayer(backends::tensorrt::TrtEngine* engine) { + engine->GetEngineInfo(); +} + +std::vector TrtEngineCompute( + backends::tensorrt::TrtEngine* engine, const phi::GPUContext& context) { + engine->Run(context); + std::vector res; + for (size_t i = 0; i < engine->GetOutputNum(); ++i) { + res.push_back(engine->GetOutput("output_" + std::to_string(i))); + } + return res; +} + +} // namespace tensorrt +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.h b/paddle/infrt/kernel/tensorrt/trt_kernels.h new file mode 100644 index 0000000000000000000000000000000000000000..546ee9dc78852e6967bf8b61ae81563d32beae66 --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/trt_kernels.h @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "mlir/IR/Operation.h" + +#include "paddle/infrt/backends/tensorrt/trt_engine.h" +#include "paddle/phi/backends/gpu/gpu_context.h" + +namespace infrt { +namespace host_context { +class SymbolTable; +} // namespace host_context + +namespace kernel { +namespace tensorrt { + +struct MlirOperationWithInfrtSymbol { + mlir::Operation* operation; + ::infrt::host_context::SymbolTable* symbol_table; +}; + +::infrt::backends::tensorrt::TrtEngine CreateTrtEngine( + MlirOperationWithInfrtSymbol engine_op); + +void PrintTrtLayer(backends::tensorrt::TrtEngine* engine); + +std::vector TrtEngineCompute( + backends::tensorrt::TrtEngine* engine, const phi::GPUContext& context); + +} // namespace tensorrt +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/tests/CMakeLists.txt b/paddle/infrt/tests/CMakeLists.txt index e5cc1ec1121fb7bbff2fad7856151916d8ea0924..58543a6864258bd6c0153150bb535262d9a8f00d 100644 --- a/paddle/infrt/tests/CMakeLists.txt +++ b/paddle/infrt/tests/CMakeLists.txt @@ -1,3 +1,5 @@ +cc_test_tiny(test_abs_model SRCS model/test_abs.cc DEPS infrt ${MLIR_IR_LIBS}) + configure_file(lit.cfg.py.in "${CMAKE_SOURCE_DIR}/paddle/infrt/tests/lit.cfg.py") add_test(NAME test_infrt_by_lit COMMAND sh -c "lit -v ${CMAKE_SOURCE_DIR}/paddle/infrt/tests --filter-out \"disabled_*\"" diff --git a/paddle/infrt/tests/dialect/disabled_trt.mlir b/paddle/infrt/tests/dialect/disabled_trt.mlir new file mode 100644 index 0000000000000000000000000000000000000000..ef86dcf1e72a04c478a7763000cf366715665d81 --- /dev/null +++ b/paddle/infrt/tests/dialect/disabled_trt.mlir @@ -0,0 +1,37 @@ +// RUN: infrtexec -i %s | FileCheck %s + +// CHECK-LABEL: @run_trt +func @run_trt(%0 : !infrt.dense_tensor, %ctx : !phi.context) { + %a = "trt.create_engine"(%0) ({ + %1 = "trt.Activation"(%0) {activation_type = 1 : si32, alpha = 1.0 : f32, beta = 6.0 : f32} : (!infrt.dense_tensor) -> !infrt.dense_tensor + "infrt.return"(%1) : (!infrt.dense_tensor) -> () + }) : (!infrt.dense_tensor) -> !trt.engine + "trt.inspect_engine"(%a) {} : (!trt.engine) -> () + + %res = "trt.compute"(%a, %ctx) {} : (!trt.engine, !phi.context) -> (!infrt.tensor_list) + %size = "dt.tensor_list_get_size"(%res) {} : (!infrt.tensor_list) -> (i32) + "infrt.print.i32"(%size) {} : (i32) -> () + + %ts0 = "dt.tensor_list_get_tensor"(%res) {id = 0 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor) + "phi_dt.print_tensor" (%ts0) : (!infrt.dense_tensor) -> () + + infrt.return +} + +// CHECK-LABEL: @main +func @main() { + %ctx = "phi_dt.create_context.gpu" (): () -> !phi.context + %t = "phi_dt.create_dense_tensor.gpu" (%ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[1:i64, 3:i64, 1:i64, 1:i64], lod=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) + + "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32, 2.4:f32, 1.3:f32]} : (!infrt.dense_tensor) -> () + "phi_dt.print_tensor" (%t) : (!infrt.dense_tensor) -> () + + //%res = + infrt.call @run_trt(%t, %ctx) : (!infrt.dense_tensor, !phi.context) -> () + //-> (!infrt.dense_tensor) + + infrt.return +} diff --git 
a/paddle/infrt/tests/dialect/rewrite.mlir b/paddle/infrt/tests/dialect/pd/rewrite.mlir similarity index 97% rename from paddle/infrt/tests/dialect/rewrite.mlir rename to paddle/infrt/tests/dialect/pd/rewrite.mlir index 9fbb09e22449ff98a28b9e22732351ddbbc49dd0..ea0248b9d95d28e0160192a44f4c542d50a4892d 100644 --- a/paddle/infrt/tests/dialect/rewrite.mlir +++ b/paddle/infrt/tests/dialect/pd/rewrite.mlir @@ -1,4 +1,4 @@ -// RUN: infrtopt --canonicalize %s | FileCheck %s +// RUN: infrtopt --pd-op-fuse %s | FileCheck %s // CHECK-LABEL: @main func @main() -> tensor { %a = "pd.feed"() {name="input0"} : () -> tensor diff --git a/paddle/infrt/tests/dialect/phi/dense_tensor.mlir b/paddle/infrt/tests/dialect/phi/dense_tensor.mlir index 3657777a5b0bce1c5a5e4df8d59695f8b122da56..b8cb1a5cec2a17d3f6d15036249fcf9f7f711948 100644 --- a/paddle/infrt/tests/dialect/phi/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/phi/dense_tensor.mlir @@ -3,7 +3,7 @@ // CHECK-LABEL: @sign_any_float32_execute func @sign_any_float32_execute() { %ctx = "phi_dt.create_context.cpu" (): () -> !phi.context - %t = "phi_dt.create_dense_tensor" (%ctx) { + %t = "phi_dt.create_dense_tensor.cpu" (%ctx) { precision=#infrt.precision, layout=#infrt.layout, lod=[1:i64], dims=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor) -> () diff --git a/paddle/infrt/tests/dialect/phi/phi_test.mlir b/paddle/infrt/tests/dialect/phi/phi_test.mlir index 5b0fa735897a31287bb6dea487e2f22eacd7b0aa..21ee8ebf0b705894446192b0d5d0bfeb9f10f326 100644 --- a/paddle/infrt/tests/dialect/phi/phi_test.mlir +++ b/paddle/infrt/tests/dialect/phi/phi_test.mlir @@ -6,7 +6,7 @@ module { } func @main() { %ctx = "phi_dt.create_context.cpu" (): () -> !phi.context - %t = "phi_dt.create_dense_tensor" (%ctx) {precision=#infrt.precision, layout=#infrt.layout, lod=[1:i64], dims=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) + %t = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision, layout=#infrt.layout, lod=[1:i64], dims=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor) -> () %2 = infrt.call@predict(%t) : (!infrt.dense_tensor) -> !infrt.dense_tensor phi_dt.print_tensor(%2 : !infrt.dense_tensor) diff --git a/paddle/infrt/tests/dialect/trt_ops.mlir b/paddle/infrt/tests/dialect/trt_ops.mlir index e3cb9670bec015e58e2a538bb55dfbe7c8b7f554..7bdf62a277896afe2f8a5e156fa8183742f1d853 100644 --- a/paddle/infrt/tests/dialect/trt_ops.mlir +++ b/paddle/infrt/tests/dialect/trt_ops.mlir @@ -1,16 +1,16 @@ // RUN: trt-exec %s // CHECK-LABEL: @main -func @main(%bias:tensor, %c:tensor, %b1:tensor, %b2:tensor, %bias1:tensor, %bias2:tensor) -> tensor { - %d = "pd.elementwise_add"(%c, %bias) {axis=-1:si32} : (tensor, tensor) -> tensor - %e = "pd.relu6"(%d) {} : (tensor) -> tensor +func @main(%bias:!infrt.dense_tensor, %c:!infrt.dense_tensor, %b1:!infrt.dense_tensor, %b2:!infrt.dense_tensor, %bias1:!infrt.dense_tensor, %bias2:!infrt.dense_tensor) -> !infrt.dense_tensor { + %d = "pd.elementwise_add"(%c, %bias) {axis=-1:si32} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %e = "pd.relu6"(%d) {} : (!infrt.dense_tensor) -> !infrt.dense_tensor - %c1 = "pd.matmul"(%e, %b1) {transpose_x=false, transpose_y=false} : (tensor, tensor) -> tensor - %d1 = "pd.elementwise_add"(%c1, %bias1) {axis=-1:si32} : (tensor, tensor) -> tensor - %e1 = "pd.relu"(%d1) {} : (tensor) -> tensor + %c1 = "pd.matmul"(%e, %b1) 
{transpose_x=false, transpose_y=false} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %d1 = "pd.elementwise_add"(%c1, %bias1) {axis=-1:si32} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %e1 = "pd.relu"(%d1) {} : (!infrt.dense_tensor) -> !infrt.dense_tensor - %c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor - %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=-1:si32} : (tensor, tensor) -> tensor - %e2 = "pd.relu"(%d2) {} : (tensor) -> tensor + %c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=-1:si32} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %e2 = "pd.relu"(%d2) {} : (!infrt.dense_tensor) -> !infrt.dense_tensor - infrt.return %e2 : tensor + infrt.return %e2 : !infrt.dense_tensor } diff --git a/paddle/infrt/tests/model/abs_model.py b/paddle/infrt/tests/model/abs_model.py new file mode 100644 index 0000000000000000000000000000000000000000..dd1632bc9d4d8e4e6ea0fb918d1179f4e28a441b --- /dev/null +++ b/paddle/infrt/tests/model/abs_model.py @@ -0,0 +1,38 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.nn import Layer +from paddle.static import InputSpec +from paddle.jit import to_static +import sys + + +class AbsNet(paddle.nn.Layer): + def __init__(self): + super(AbsNet, self).__init__() + + def forward(self, x): + x = paddle.abs(x) + return x + + +if __name__ == '__main__': + # build network + model = AbsNet() + # save inferencing format model + net = to_static( + model, input_spec=[InputSpec( + shape=[None, 1, 28, 28], name='x')]) + paddle.jit.save(net, sys.argv[1]) diff --git a/paddle/infrt/tests/model/test_abs.cc b/paddle/infrt/tests/model/test_abs.cc new file mode 100644 index 0000000000000000000000000000000000000000..5de159b86fce29f774b07770aaaee0c1b6aebd31 --- /dev/null +++ b/paddle/infrt/tests/model/test_abs.cc @@ -0,0 +1,126 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
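The test that follows drives the whole flow end to end: import the Paddle model into an MLIR module, run the phi conversion and op-fuse passes, register kernels, and execute the module. Its shared-library loop also accepts plugins that export a RegisterKernels symbol; a minimal sketch of what such a plugin translation unit could look like, noting that the exact exported signature is an assumption inferred from the reinterpret_cast in the loading loop:

#include <iostream>

namespace infrt {
namespace host_context {
struct KernelRegistry;  // opaque here; the real type lives in kernel_registry.h
}  // namespace host_context
}  // namespace infrt

// Built into a .so and passed via --shared_libs, this is the symbol the test
// resolves with SearchForAddressOfSymbol("RegisterKernels").
extern "C" void RegisterKernels(
    infrt::host_context::KernelRegistry* registry) {
  (void)registry;  // registry->AddKernel(...) calls would go here.
  std::cout << "registering custom kernels\n";
}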
+ +#include +#include +#include +#include +#include + +#include "llvm/Support/DynamicLibrary.h" +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/mlir_loader.h" +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" +#include "paddle/infrt/kernel/basic_kernels.h" +#include "paddle/infrt/kernel/control_flow_kernels.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h" +#include "paddle/infrt/kernel/phi/registry.h" +#include "paddle/infrt/kernel/tensor_kernels.h" +#include "paddle/infrt/kernel/tensor_shape_kernels.h" +#include "paddle/infrt/kernel/test_kernels.h" + +#include "paddle/infrt/kernel/phi/infershaped/infershaped_utils.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/meta_tensor.h" + +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" + +#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" +#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h" +#include "paddle/infrt/host_context/paddle_mlir.h" + +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" +#include "paddle/infrt/dialect/phi/ir/phi_kernels.h" + +static llvm::cl::list cl_shared_libs( // NOLINT + "shared_libs", + llvm::cl::desc("Specify shared library with kernels."), + llvm::cl::ZeroOrMore, + llvm::cl::MiscFlags::CommaSeparated); + +TEST(ABS_MODEL, convert_and_execute) { + std::string model_file_name = "./abs.pdmodel"; + std::string params_file_name = "./abs.pdiparams"; + // convert model + MLIRModelGenImpl myGen; + auto module_ = myGen.ImportPaddleModel(model_file_name, params_file_name); + module_.dump(); + // pick kernel + mlir::MLIRContext* context = infrt::Global::getMLIRContext(); + context->allowUnregisteredDialects(); + context->getOrLoadDialect(); + + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + + context->loadAllAvailableDialects(); + mlir::PassManager pm(context); + + mlir::OpPassManager& phi_pass_manager = pm.nest(); + std::vector valid_places = {{infrt::TargetType::CPU, + infrt::PrecisionType::FLOAT32, + infrt::LayoutType::NCHW}}; + phi_pass_manager.addPass(infrt::createPhiOpCvtPass(valid_places)); + phi_pass_manager.addPass(infrt::createInfrtOpFusePass()); + + if (mlir::failed(pm.run(module_))) { + std::cout << "\npass failed!\n" << std::endl; + } + module_.dump(); + + // execute + infrt::host_context::KernelRegistry registry; + infrt::kernel::RegisterBasicKernels(&registry); + infrt::kernel::RegisterTestKernels(&registry); + infrt::kernel::RegisterTensorShapeKernels(&registry); + infrt::kernel::RegisterTensorKernels(&registry); + infrt::kernel::RegisterControlFlowKernels(&registry); + infrt::kernel::RegisterPhiKernels(&registry); + infrt::kernel::RegisterInferShapeLaunchers(&registry); + // load extra shared library + for (const auto& lib_path : cl_shared_libs) { + std::string err; + llvm::sys::DynamicLibrary dynLib = + llvm::sys::DynamicLibrary::getPermanentLibrary(lib_path.c_str(), &err); + if (!dynLib.isValid()) { + llvm::errs()
<< "Load shared library failed. Error: " << err << "\n"; + break; + } + if (auto reg_sym = dynLib.SearchForAddressOfSymbol("RegisterKernels")) { + auto reg_func = + reinterpret_cast( + reg_sym); + reg_func(®istry); + } else { + llvm::outs() << "Symbol \"RegisterKernels\" not found in \"" << lib_path + << "\". Skip.\n"; + } + } + infrt::host_context::TestMlir(module_, ®istry); +} diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index c268742fa567bffecb2fd17a773ab56aee019853..ce40627bb0d3742aff7f60583d2e0b9cbbd8fb02 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -324,7 +324,7 @@ class PADDLE_API Tensor final { * * @return std::shared_ptr */ - std::shared_ptr impl() const; + const std::shared_ptr& impl() const; /** * @brief Set the implemention of current Tensor. @@ -333,6 +333,13 @@ class PADDLE_API Tensor final { */ void set_impl(const std::shared_ptr& impl); + /** + * @brief Set the implemention of current Tensor. + * + * @param impl + */ + void set_impl(std::shared_ptr&& impl); + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * @brief Get the stream where the tensor is currently located @@ -397,7 +404,9 @@ class PADDLE_API Tensor final { * @param blocking, Should we copy this in sync way. * @return void */ - void copy_(const Tensor& src, const bool blocking); + void copy_(const Tensor& src, + const phi::Place& target_place, + const bool blocking); /** * @brief Cast datatype from one to another * diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 42bf7a8103f837195775b33daf301a7d2e0f4c44..4cbca07236208281f38984022d17b6cb88af8ed8 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -148,4 +148,4 @@ cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw ph cc_library(sparse_api SRCS ${sparse_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl) cc_library(sparse_bw_api SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api sparse_api_custom_impl) -cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api) +cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta) diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index e1ebe8c6465cfdd7f8213c0a31416bc77412221c..0c11e2df65d0db23b4e080bf041c78d976714013 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -95,12 +95,8 @@ paddle::optional MakeMetaTensor( /* ------------------ for output ----------------------- */ phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) { - if (!out->initialized()) { - auto dense_tensor = std::make_shared( - phi::make_intrusive(phi::TransToPhiPlace(backend)), - phi::DenseTensorMeta()); - out->set_impl(dense_tensor); - return dense_tensor.get(); + if (out->impl() == nullptr) { + out->set_impl(std::make_shared()); } return static_cast(out->impl().get()); } @@ -111,9 +107,7 @@ std::vector SetKernelOutput(size_t out_size, out->reserve(out_size); std::vector results(out_size); for (size_t i = 0; i < out_size; ++i) { - auto tensor_ptr = std::make_shared( - phi::make_intrusive(phi::TransToPhiPlace(backend)), - phi::DenseTensorMeta()); + auto tensor_ptr = std::make_shared(); results[i] = tensor_ptr.get(); out->emplace_back(); out->back().set_impl(tensor_ptr); diff --git 
a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 79b8ac6d0b8352b2e817e6bdbefca74c835ad6b2..e280ab626da74a9b0951925f7472fa49996691cb 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -167,10 +167,7 @@ phi::DenseTensor TransformData(const phi::DenseTensor& tensor, if (NeedTransformPlace( out.place(), target_args_def.backend, transform_flag)) { - phi::DenseTensor result( - phi::make_intrusive( - phi::TransToPhiPlace(target_args_def.backend)), - {out.dtype(), out.dims(), out.layout()}); + phi::DenseTensor result; framework::TransDataDevice( out, phi::TransToPhiPlace(target_args_def.backend), &result); out = result; @@ -190,14 +187,14 @@ std::shared_ptr PrepareData( tensor_in->dtype(), target_args_def.dtype, transform_flag) && !NeedTransformLayout( tensor_in->layout(), target_args_def.layout, transform_flag))) { - return std::dynamic_pointer_cast(tensor_in); + return std::static_pointer_cast(tensor_in); } phi::DenseTensor out = TransformData(*(static_cast(tensor_in.get())), target_args_def, transform_flag); - return std::make_shared(out); + return std::make_shared(std::move(out)); } std::shared_ptr PrepareData( diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 40174a505dcc9b3d475254b7cec7691300c7aecf..6be85d720007e8464647974f43d42f8430a827a8 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -46,6 +46,7 @@ limitations under the License. */ * In the future, the necessary components will be moved to the this library, * or the corresponding components will be re-implemented. */ + #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/stream/cuda_stream.h" @@ -142,7 +143,12 @@ PlaceType Tensor::place() const { } paddle::platform::Place Tensor::inner_place() const { - return ConvertExtPlaceToInnerPlace(place()); + PADDLE_ENFORCE_NOT_NULL( + impl_, + phi::errors::PermissionDenied( + "Null pointer error, the impl_ of Tensor should not be " + "Null when calling Tensor::inner_place().")); + return impl_->place(); } bool Tensor::is_cpu() const { @@ -286,12 +292,16 @@ Tensor Tensor::slice(int64_t begin_idx, int64_t end_idx) const { } } -std::shared_ptr Tensor::impl() const { return impl_; } +const std::shared_ptr &Tensor::impl() const { return impl_; } void Tensor::set_impl(const std::shared_ptr &impl) { impl_ = impl; } +void Tensor::set_impl(std::shared_ptr &&impl) { + impl_ = std::move(impl); +} + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t Tensor::stream() const { return platform::stream::get_current_stream(-1)->raw_stream(); diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index 885e29b27fa8e723ad0e89f9a99c2accd3c172f6..cc797507e68ec11005d6ac35d5dca2d19418598d 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -19,9 +19,12 @@ limitations under the License. 
*/ #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/tensor_base.h" +#include "paddle/phi/api/lib/api_gen_utils.h" +#include "paddle/phi/api/lib/kernel_dispatch.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace experimental { - // declare cast api Tensor cast(const Tensor &x, DataType out_dtype); Tensor copy_to(const Tensor &x, Backend backend, bool blocking); @@ -67,12 +70,18 @@ template PADDLE_API Tensor Tensor::copy_to>( template PADDLE_API Tensor Tensor::copy_to(const PlaceType &target_place) const; -void Tensor::copy_(const Tensor &src, bool blocking) { +void Tensor::copy_(const Tensor &src, + const phi::Place &target_place, + bool blocking) { if (!src.is_initialized()) { + VLOG(8) << "Src is empty, skip copy"; return; } + // Prepare copy kernel key and outputs + auto kernel_key_set = ParseKernelKeyByInputArgs(src); + KernelType kernel_type = ParseKernelTypeByInputArgs(src); VLOG(3) << "Deep copy Tensor from " << src.name() << " to " << name(); - if (defined()) { + if (is_initialized()) { PADDLE_ENFORCE_EQ(dtype(), src.dtype(), platform::errors::PreconditionNotMet( @@ -87,10 +96,91 @@ void Tensor::copy_(const Tensor &src, bool blocking) { "Copy cannot be performed!", name(), src.name())); + PADDLE_ENFORCE_EQ(target_place, + inner_place(), + platform::errors::PreconditionNotMet( + "Place is different of dst tensor and args %s, which " + "current tensor holds %s " + "Copy cannot be performed!", + target_place.DebugString(), + inner_place().DebugString())); + kernel_key_set.backend_set = + kernel_key_set.backend_set | + BackendSet(phi::TransToPhiBackend(inner_place())); + } else { + // Deep Copy AutoGrad info from src to self. + *autograd_meta_ = *(src.autograd_meta_); + } + + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + auto *dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); + Backend kernel_backend = Backend::UNDEFINED; + DataLayout kernel_layout = DataLayout::UNDEFINED; + DataType kernel_data_type = DataType::UNDEFINED; + + if (kernel_backend == Backend::UNDEFINED || + kernel_layout == DataLayout::UNDEFINED || + kernel_data_type == DataType::UNDEFINED) { + if (kernel_backend == Backend::UNDEFINED) { + kernel_backend = kernel_key.backend(); + } + if (kernel_layout == DataLayout::UNDEFINED) { + kernel_layout = kernel_key.layout(); + } + if (kernel_data_type == DataType::UNDEFINED) { + kernel_data_type = kernel_key.dtype(); + } + } + + if (kernel_type == KernelType::DENSE_TENSOR_KENREL) { + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "copy", {kernel_backend, kernel_layout, kernel_data_type}); + VLOG(6) << "copy API kernel key: " << kernel_key; + VLOG(6) << "copy API kernel: " << kernel; + using kernel_signature = void (*)(const platform::DeviceContext &, + const phi::DenseTensor &, + phi::Place, + bool, + phi::DenseTensor *); + SetKernelOutput(kernel_backend, this); + phi::MetaTensor meta_out(impl_.get()); + phi::UnchangedInferMeta( + MakeMetaTensor( + *(std::static_pointer_cast(src.impl_))), + &meta_out); + auto *kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + (*(std::static_pointer_cast(src.impl_))), + target_place, + blocking, + static_cast(impl_.get())); + } else if (kernel_type == KernelType::SELECTED_ROWS_KENREL) { + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "copy_sr", {kernel_backend, kernel_layout, kernel_data_type}); + VLOG(6) << "copy API kernel key: " << kernel_key; + VLOG(6) << "copy API kernel: " << kernel; + using 
kernel_signature = void (*)(const platform::DeviceContext &, + const phi::SelectedRows &, + phi::Place, + bool, + phi::SelectedRows *); + SetSelectedRowsKernelOutput(kernel_backend, this); + phi::MetaTensor meta_out(impl_.get()); + phi::UnchangedInferMeta( + MakeMetaTensor( + *(std::static_pointer_cast(src.impl_))), + &meta_out); + auto *kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + (*(std::static_pointer_cast(src.impl_))), + target_place, + blocking, + static_cast(impl_.get())); + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "We currently only support dense tensor copy for now and if u need to " + "copy selected rows please raise a issue.")); } - auto copy_tensor = - src.copy_to(phi::TransToPhiBackend(src.inner_place()), blocking); - set_impl(copy_tensor.impl()); } } // namespace experimental diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index a3b252598582bc212ba66f9c18ec52e035a29a68..0394835aa8b700ba4f9ee9b106661e2d70fc50b6 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -741,6 +741,10 @@ struct GPUContext::Impl { GPUContext::GPUContext() : DeviceContext(), impl_(std::make_unique()) {} +GPUContext::GPUContext(GPUContext&&) = default; + +GPUContext& GPUContext::operator=(GPUContext&&) = default; + GPUContext::GPUContext(const GPUPlace& place) : DeviceContext(), impl_(std::make_unique(place)) {} diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index 3eb4360ad35382369681308b46050cc3e6e04ea0..cd08da1c0f2f8031a461a0410a89254823a6a903 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -77,6 +77,8 @@ class DnnWorkspaceHandle { class GPUContext : public DeviceContext { public: GPUContext(); + GPUContext(GPUContext&&); + GPUContext& operator=(GPUContext&&); explicit GPUContext(const GPUPlace& place); diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt index 0947870dcd35a689d7ebfde5ddcab12c361358e9..9bf692703860f15601ad601970ea1f5b1316442b 100644 --- a/paddle/phi/common/CMakeLists.txt +++ b/paddle/phi/common/CMakeLists.txt @@ -1,2 +1,2 @@ cc_library(phi_place SRCS place.cc) -cc_library(scalar SRCS scalar.cc) +cc_library(scalar SRCS scalar.cc DEPS phi_enforce) diff --git a/paddle/phi/core/compat/arg_map_context.h b/paddle/phi/core/compat/arg_map_context.h index 25b80279ecf10619d97b8800b24ab5353c79745d..71cec011411641ffe34918f03162800b111275a2 100644 --- a/paddle/phi/core/compat/arg_map_context.h +++ b/paddle/phi/core/compat/arg_map_context.h @@ -89,6 +89,8 @@ class ArgumentMappingContext { virtual bool IsDenseTensorInput(const std::string& name) const = 0; virtual bool IsSelectedRowsInput(const std::string& name) const = 0; + // For compatibility with LoDTensorArray + virtual bool IsDenseTensorVectorInput(const std::string& name) const = 0; virtual bool IsDenseTensorOutput(const std::string& name) const = 0; virtual bool IsSelectedRowsOutput(const std::string& name) const = 0; diff --git a/paddle/phi/core/kernel_context.cc b/paddle/phi/core/kernel_context.cc index a32e0e44f469694c62ff33863971d3b04004ff37..234e3528c363b948c0a3e3b22d5ee676660fce76 100644 --- a/paddle/phi/core/kernel_context.cc +++ b/paddle/phi/core/kernel_context.cc @@ -37,6 +37,13 @@ void KernelContext::EmplaceBackInputs( std::make_move_iterator(inputs.end())); } +void KernelContext::EmplaceBackInputsWithoutSetRange( + paddle::SmallVector inputs) { + 
inputs_.insert(inputs_.end(), + std::make_move_iterator(inputs.begin()), + std::make_move_iterator(inputs.end())); +} + void KernelContext::EmplaceBackOutput(TensorBase* output) { int index = outputs_.size(); outputs_.emplace_back(output); @@ -59,6 +66,13 @@ void KernelContext::EmplaceBackOutputs( std::make_move_iterator(outputs.end())); } +void KernelContext::EmplaceBackOutputsWithoutSetRange( + paddle::SmallVector outputs) { + outputs_.insert(outputs_.end(), + std::make_move_iterator(outputs.begin()), + std::make_move_iterator(outputs.end())); +} + void KernelContext::EmplaceBackAttr(paddle::any attr) { attrs_.emplace_back(std::move(attr)); } diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h index 213ac47d30bfdd28541bd1b9cb24bf2053b1c939..d3ca1ffc61c42c06c2b33cccdb6f1037df237a24 100644 --- a/paddle/phi/core/kernel_context.h +++ b/paddle/phi/core/kernel_context.h @@ -52,12 +52,18 @@ class KernelContext { void EmplaceBackInputs(paddle::SmallVector inputs); + void EmplaceBackInputsWithoutSetRange( + paddle::SmallVector inputs); + void EmplaceBackOutput(TensorBase* output); void EmplaceBackOutputWithoutSetRange(TensorBase* output); void EmplaceBackOutputs(paddle::SmallVector outputs); + void EmplaceBackOutputsWithoutSetRange( + paddle::SmallVector outputs); + void EmplaceBackAttr(paddle::any attr); const std::pair& InputRangeAt(size_t idx) const; diff --git a/paddle/phi/core/kernel_factory.h b/paddle/phi/core/kernel_factory.h index be91409762635e8aabdd6953aa5527d94959e4b2..e502b9cb3e02536e8d764a4cbc5e1d5509960303 100644 --- a/paddle/phi/core/kernel_factory.h +++ b/paddle/phi/core/kernel_factory.h @@ -197,8 +197,16 @@ class Kernel { const KernelArgsDef& args_def() const { return args_def_; } + const TensorArgDef& InputAt(size_t idx) const { + return args_def_.input_defs().at(idx); + } + TensorArgDef& InputAt(size_t idx) { return args_def_.input_defs().at(idx); } + const TensorArgDef& OutputAt(size_t idx) const { + return args_def_.output_defs().at(idx); + } + TensorArgDef& OutputAt(size_t idx) { return args_def_.output_defs().at(idx); } bool IsValid() { return fn_ != nullptr; } diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 38dce0dc69d317d95541f3f10ba8018b03b9d6b5..b7a7a4ec231ddfdbfd4da75e71aebaa49f99443f 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -476,6 +476,33 @@ void ElementwiseRawInferMeta(const MetaTensor& x, out->share_lod(x); } +void ExpandAsInferMeta(const MetaTensor& x, + paddle::optional y, + const std::vector& target_shape, + MetaTensor* out) { +#define MAX_RANK_SUPPORTED 6 + auto x_dims = x.dims(); + PADDLE_ENFORCE_GE( + target_shape.size(), + static_cast(x_dims.size()), + phi::errors::InvalidArgument( + "The rank of target_shape must be greater than or equal " + "to the rank of Input(X). But received Input(X): input " + "rank %u; received target_shape: rank %u.", + x_dims.size(), + target_shape.size())); + PADDLE_ENFORCE_LE(target_shape.size(), + MAX_RANK_SUPPORTED, + phi::errors::InvalidArgument( + "The rank of target_shape must be less than or equal " + "to %d. 
But received: rank %u.",
                        MAX_RANK_SUPPORTED,
                        target_shape.size()));
+  out->set_dims(phi::make_ddim(target_shape));
+  out->set_dtype(x.dtype());
+#undef MAX_RANK_SUPPORTED
+}
+
 void GatherInferMeta(const MetaTensor& x,
                      const MetaTensor& index,
                      const Scalar& axis,
@@ -571,6 +598,48 @@ void GatherTreeMeta(const MetaTensor& ids,
   out->set_dims(ids_dims);
 }
 
+void GridSampleBaseInferMeta(const MetaTensor& x,
+                             const MetaTensor& grid,
+                             MetaTensor* out,
+                             MetaConfig config) {
+  auto x_dims = x.dims();
+  auto grid_dims = grid.dims();
+  PADDLE_ENFORCE_EQ(x_dims.size(),
+                    4,
+                    phi::errors::InvalidArgument(
+                        "Input(X) of GridSampleOp should be 4-D Tensor, but "
+                        "received X dimension size(%d)",
+                        x_dims.size()));
+  PADDLE_ENFORCE_EQ(grid_dims.size(),
+                    4,
+                    phi::errors::InvalidArgument(
+                        "Input(Grid) of GridSampleOp should be 4-D Tensor, "
+                        "but received Grid dimension size(%d)",
+                        grid_dims.size()));
+  if (config.is_runtime || grid_dims[3] > 0) {
+    PADDLE_ENFORCE_EQ(
+        grid_dims[3],
+        2,
+        phi::errors::InvalidArgument(
+            "Input(Grid) dimension[3] should be 2, but received %d",
+            grid_dims[3]));
+  }
+  if (config.is_runtime) {
+    PADDLE_ENFORCE_EQ(
+        grid_dims[0],
+        x_dims[0],
+        phi::errors::InvalidArgument(
+            "Input(X) and Input(Grid) dimension[0] should be equal, but "
+            "received X dimension[0](%d) != Grid dimension[0](%d)",
+            x_dims[0],
+            grid_dims[0]));
+  }
+
+  out->set_dims({x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]});
+  out->set_dtype(x.dtype());
+  out->share_lod(x);
+}
+
 void HuberLossInferMeta(const MetaTensor& input,
                         const MetaTensor& label,
                         float delta,
@@ -686,6 +755,24 @@ void IndexSelectInferMeta(const MetaTensor& x,
   output->share_lod(x);
 }
 
+void KronInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) {
+  auto dim_x = x.dims();
+  auto dim_y = y.dims();
+  auto rank_x = dim_x.size();
+  auto rank_y = dim_y.size();
+  auto rank = (rank_x > rank_y) ? rank_x : rank_y;
+
+  std::vector<int64_t> dim_out;
+  dim_out.reserve(rank);
+  for (int i = 0; i < rank; i++) {
+    int64_t dim_xi = (i < rank - rank_x) ? 1 : dim_x.at(i - (rank - rank_x));
+    int64_t dim_yi = (i < rank - rank_y) ? 1 : dim_y.at(i - (rank - rank_y));
+    dim_out.push_back(dim_xi == -1 || dim_yi == -1 ? -1 : dim_xi * dim_yi);
+  }
+  out->set_dims(phi::make_ddim(dim_out));
+  out->set_dtype(x.dtype());
+}
+
 void LogLossInferMeta(const MetaTensor& input,
                       const MetaTensor& label,
                       float epsilon,
@@ -831,6 +918,60 @@ void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) {
   out->share_lod(x);
 }
 
+void SearchsortedInferMeta(const MetaTensor& sorted_sequence,
+                           const MetaTensor& value,
+                           bool out_int32,
+                           bool right,
+                           MetaTensor* out) {
+  auto sequences_dims = sorted_sequence.dims();
+  auto values_dims = value.dims();
+
+  bool flag = true;
+  if (sequences_dims.size() != values_dims.size()) {
+    flag = false;
+  }
+  const auto& sequences_dims_size = sequences_dims.size();
+  for (int64_t dim = 0; dim < sequences_dims_size - 1; ++dim) {
+    if (sequences_dims[dim] != values_dims[dim]) {
+      flag = false;
+      break;
+    }
+  }
+  if (sequences_dims.size() != 1) {
+    PADDLE_ENFORCE_EQ(
+        flag,
+        true,
+        phi::errors::Unavailable(
+            "The dimensions of the sorted_sequence tensor ( %s ) and the "
+            "values tensor ( %s ) do not match. The sorted_sequence tensor "
+            "must be 1-dimensional, or its first N-1 dimensions must match "
+            "those of the values tensor. "
+            "Please input appropriate sorted_sequence and values again!",
+            sequences_dims,
+            values_dims));
+  }
+
+  if (out_int32) {
+    PADDLE_ENFORCE_LT(
+        sequences_dims[sequences_dims.size() - 1],
+        std::numeric_limits<int>::max(),
+        phi::errors::Unavailable(
+            "The size of sorted_sequence %d exceeds the maximum limit %d, "
+            "the largest value representable by int32. Please set an "
+            "appropriate sorted_sequence to meet this requirement!",
+            sequences_dims[sequences_dims.size() - 1],
+            std::numeric_limits<int>::max()));
+  }
+
+  out->set_dims(values_dims);
+  if (out_int32) {
+    out->set_dtype(DataType::INT32);
+  } else {
+    out->set_dtype(DataType::INT64);
+  }
+}
+
 void SegmentPoolInferMeta(const MetaTensor& x,
                           const MetaTensor& segment_ids,
                           const std::string& pooltype,
diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h
index 8cf7ce3930e941a3c5243306fa38e4466059509a..cb680415e7d2c42de7b2339b27b22be500dfdf9b 100644
--- a/paddle/phi/infermeta/binary.h
+++ b/paddle/phi/infermeta/binary.h
@@ -90,6 +90,11 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta,
                              int axis,
                              MetaTensor* out);
 
+void ExpandAsInferMeta(const MetaTensor& x,
+                       paddle::optional<const MetaTensor&> y,
+                       const std::vector<int>& target_shape,
+                       MetaTensor* out);
+
 void GatherInferMeta(const MetaTensor& x,
                      const MetaTensor& index,
                      const Scalar& axis,
@@ -103,6 +108,11 @@ void GatherTreeMeta(const MetaTensor& ids,
                     const MetaTensor& parents,
                     MetaTensor* out);
 
+void GridSampleBaseInferMeta(const MetaTensor& x,
+                             const MetaTensor& grid,
+                             MetaTensor* out,
+                             MetaConfig config = MetaConfig());
+
 void HuberLossInferMeta(const MetaTensor& input_meta,
                         const MetaTensor& label_meta,
                         float delta,
@@ -120,6 +130,8 @@ void IndexSelectInferMeta(const MetaTensor& x,
                           int dim,
                           MetaTensor* output);
 
+void KronInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out);
+
 void LogLossInferMeta(const MetaTensor& input,
                       const MetaTensor& label,
                       float epsilon,
@@ -134,6 +146,12 @@ void MatmulInferMeta(const MetaTensor& x,
 
 void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out);
 
+void SearchsortedInferMeta(const MetaTensor& sorted_sequence,
+                           const MetaTensor& value,
+                           bool out_int32,
+                           bool right,
+                           MetaTensor* out);
+
 void SegmentPoolInferMeta(const MetaTensor& x,
                           const MetaTensor& segment_ids,
                           const std::string& pooltype,
diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc
index 262ada3eaf3169bebc919940e7630a75b0733cd9..8a2d718f124578dbab0164048f8daa09e9a54e8f 100644
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -648,6 +648,49 @@ void MaxPoolWithIndexInferMeta(const MetaTensor& x,
   mask->set_dtype(paddle::experimental::CppTypeToDataType<int>::Type());
 }
 
+void ModeInferMeta(const MetaTensor& x,
+                   int axis,
+                   bool keepdim,
+                   MetaTensor* out,
+                   MetaTensor* indices) {
+  auto input_dims = x.dims();
+  const int& dim_size = input_dims.size();
+  PADDLE_ENFORCE_EQ(
+      (axis < dim_size) && (axis >= (-1 * dim_size)),
+      true,
+      errors::InvalidArgument(
+          "the axis of ModeOp must be in [-%d, %d), but the given axis is %d",
+          dim_size,
+          dim_size,
+          axis));
+  PADDLE_ENFORCE_GE(
+      input_dims.size(),
+      1,
+      errors::InvalidArgument("the input of ModeOp must have at least 1 "
+                              "dimension"));
+  if (axis < 0) axis += dim_size;
+  std::vector<int64_t> dimvec;
+  for (int64_t i = 0; i < axis; i++) {
+    dimvec.emplace_back(input_dims[i]);
+  }
+  if (keepdim) {
+    dimvec.emplace_back(static_cast<int64_t>(1));
+  }
+  for (int64_t i = axis + 1; i < dim_size; i++) {
+    dimvec.emplace_back(input_dims[i]);
+  }
+  DDim dims = phi::make_ddim(dimvec);
+  out->set_dims(dims);
+  out->share_lod(x);
+  out->set_dtype(x.dtype());
+
+  indices->set_dims(dims);
+  indices->share_lod(x);
+  indices->set_dtype(x.dtype());
+}
+
 void MultinomialInferMeta(const MetaTensor& x,
                           int num_samples,
                           bool replacement,
@@ -1047,6 +1090,16 @@ void RollInferMeta(const MetaTensor& x,
   out->set_dtype(x.dtype());
 }
 
+void SetValueInferMeta(const MetaTensor& x, MetaTensor* out) {
+  auto in_dims = x.dims();
+  PADDLE_ENFORCE_LT(
+      in_dims.size(),
+      7,
+      phi::errors::InvalidArgument(
+          "The rank of input should be less than 7, but received %d.",
+          in_dims.size()));
+}
+
 void ShapeInferMeta(const MetaTensor& input, MetaTensor* out) {
   auto in_dim = input.dims();
   out->set_dims(phi::make_ddim({in_dim.size()}));
@@ -1341,6 +1394,55 @@ void TileInferMeta(const MetaTensor& x,
   }
 }
 
+void TopKInferMeta(const MetaTensor& x,
+                   const Scalar& k_scalar,
+                   int axis,
+                   bool largest,
+                   bool sorted,
+                   MetaTensor* out,
+                   MetaTensor* indices,
+                   MetaConfig config) {
+  auto input_dims = x.dims();
+  const int& dim_size = input_dims.size();
+  PADDLE_ENFORCE_EQ(
+      (axis < dim_size) && (axis >= (-1 * dim_size)),
+      true,
+      phi::errors::InvalidArgument(
+          "the axis of topk must be in [-%d, %d), but the given axis is %d",
+          dim_size,
+          dim_size,
+          axis));
+
+  if (axis < 0) axis += dim_size;
+
+  int k = k_scalar.to<int>();
+  if (k_scalar.FromTensor()) {
+    k = -1;
+  } else {
+    PADDLE_ENFORCE_EQ(k >= 1,
+                      true,
+                      phi::errors::InvalidArgument(
+                          "the attribute k of topk must be >= 1 or come from "
+                          "a Tensor, but received %d.",
+                          k));
+  }
+
+  PADDLE_ENFORCE_GE(
+      input_dims.size(),
+      1,
+      phi::errors::InvalidArgument("the input of topk must have at least 1 "
+                                   "dimension"));
+
+  phi::DDim dims = input_dims;
+
+  dims[axis] = k;
+  out->set_dims(dims);
+  out->share_lod(x);
+  out->set_dtype(x.dtype());
+  indices->set_dims(dims);
+  indices->share_lod(x);
+  indices->set_dtype(DataType::INT64);
+}
+
 void TraceInferMeta(
     const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out) {
   int dim1 = axis1;
diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h
index 3dfc9b797c089281cd9631642640a54be05ce679..7203a327b55698c0d4bd0271b2908cbc4a9b5ca1 100644
--- a/paddle/phi/infermeta/unary.h
+++ b/paddle/phi/infermeta/unary.h
@@ -112,6 +112,12 @@ void MaxPoolWithIndexInferMeta(const MetaTensor& x,
                                MetaTensor* mask,
                                MetaConfig config = MetaConfig());
 
+void ModeInferMeta(const MetaTensor& x,
+                   int axis,
+                   bool keepdim,
+                   MetaTensor* out,
+                   MetaTensor* indices);
+
 void MultinomialInferMeta(const MetaTensor& x,
                           int num_samples,
                           bool replacement,
@@ -171,6 +177,8 @@ void RollInferMeta(const MetaTensor& x,
                    const std::vector<int64_t>& axis,
                    MetaTensor* out);
 
+void SetValueInferMeta(const MetaTensor& x, MetaTensor* out);
+
 void ShapeInferMeta(const MetaTensor& input, MetaTensor* out);
 
 void ShardIndexInferMeta(const MetaTensor& in,
@@ -209,6 +217,15 @@ void TileInferMeta(const MetaTensor& x,
                    MetaTensor* out,
                    MetaConfig config = MetaConfig());
 
+void TopKInferMeta(const MetaTensor& x,
+                   const Scalar& k_scalar,
+                   int axis,
+                   bool largest,
+                   bool sorted,
+                   MetaTensor* out,
+                   MetaTensor* indices,
+                   MetaConfig config = MetaConfig());
+
 void TraceInferMeta(
     const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out);
 
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index d16f5f725df50dad83ca53cb957a0bff3fcd5120..02b5b2d74ad2914f60a1df08e500b06733b95aaa 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ 
b/paddle/phi/kernels/CMakeLists.txt @@ -27,7 +27,7 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) # Some kernels depend on some targets that are not commonly used. # These targets are not suitable for common dependencies. # In this case, you need to manually generate them here. -set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel math_kernel +set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel @@ -35,7 +35,6 @@ set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_k kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function) kernel_library(gumbel_softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(gumbel_softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) -kernel_library(math_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel) kernel_library(matrix_power_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) kernel_library(matrix_power_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) kernel_library(maxout_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index e0dfca756e14782b1f97618ef87290464834a0e7..241a80d85ead2d7bb6cd63105feb345c62a29a62 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -19,14 +19,14 @@ limitations under the License. */ namespace phi { -#define DECLARE_ACTIVATION_GRAD_KERNEL_DepX(name) \ +#define DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(name) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& x, \ const DenseTensor& dout, \ DenseTensor* dx); -#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(name, attr) \ +#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(name, attr) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& x, \ @@ -34,7 +34,7 @@ namespace phi { float attr, \ DenseTensor* dx); -#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(name, attr1, attr2) \ +#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(name, attr1, attr2) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& x, \ @@ -43,19 +43,28 @@ namespace phi { float attr2, \ DenseTensor* dx); -#define DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(name) \ +#define DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(name) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& out, \ const DenseTensor& dout, \ DenseTensor* dx); -#define DECLARE_ACTIVATION_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut(name, attr) \ - template \ - void name##GradKernel(const Context& dev_ctx, \ - const DenseTensor& out, \ - const DenseTensor& dout, \ - float attr, \ +#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(name, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx); + +#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(name, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ DenseTensor* dx); template @@ -107,28 +116,51 @@ void 
EluDoubleGradKernel(const Context& dev_ctx, DenseTensor* dx, DenseTensor* ddout); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cos); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Tan); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acos); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Sin); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asin); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atan); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Sinh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cosh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asinh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acosh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atanh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(TanhShrink); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Silu); - -DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Relu); -DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Tanh); - -DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, alpha) - DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(ThresholdedRelu, threshold) - DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(SoftShrink, lambda) - DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(HardShrink, threshold) - - DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, t_min, t_max) +template +void SigmoidDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + DenseTensor* dout_new, + DenseTensor* ddout); + +template +void SigmoidTripleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + const DenseTensor& d_ddout, + const DenseTensor& d_dout_new, + DenseTensor* d_out_new, + DenseTensor* d_dout, + DenseTensor* d_ddx); + +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Cos); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Tan); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Acos); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Sin); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Asin); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Atan); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Sinh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Cosh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Asinh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Acosh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Atanh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Silu); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid); + +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid); + +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, alpha); +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu, threshold); +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, lambda); +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, threshold); + +DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, t_min, t_max); + +DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, slope, offset); } // namespace phi diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h index 0762ce43ff8f06bd5cc7deaf62bc3cda7d6eb81c..dbc63a636edb188e4640fdd02895868034f1dd80 100644 --- a/paddle/phi/kernels/activation_kernel.h +++ b/paddle/phi/kernels/activation_kernel.h @@ -54,6 +54,8 @@ DECLARE_ACTIVATION_KERNEL(Relu) DECLARE_ACTIVATION_KERNEL(Tanh) DECLARE_ACTIVATION_KERNEL(TanhShrink) DECLARE_ACTIVATION_KERNEL(Silu) +DECLARE_ACTIVATION_KERNEL(Sigmoid) +DECLARE_ACTIVATION_KERNEL(LogSigmoid) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold) @@ -62,5 +64,5 @@ DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha) 
 DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max)
-
+DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardSigmoid, slope, offset)
 }  // namespace phi
diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9faaace69176690f64ff81138844567deceef689
--- /dev/null
+++ b/paddle/phi/kernels/assign_kernel.cc
@@ -0,0 +1,63 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/assign_kernel.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/utils/optional.h"
+
+namespace phi {
+
+template <typename Context>
+void AssignKernel(const Context& dev_ctx,
+                  paddle::optional<const DenseTensor&> x,
+                  DenseTensor* out) {
+  if (!x.is_initialized()) {
+    return;
+  }
+  auto& x_tensor = *x.get_ptr();
+  Copy(dev_ctx, x_tensor, x_tensor.place(), false, out);
+}
+
+// Note: use `const paddle::optional<std::vector<const DenseTensor*>>& x`
+// as input if needed
+template <typename Context>
+void AssignArrayKernel(const Context& dev_ctx,
+                       const std::vector<const DenseTensor*>& x,
+                       std::vector<DenseTensor*> out) {
+  for (size_t i = 0; i < x.size(); ++i) {
+    AssignKernel<Context>(dev_ctx, *x[i], out.at(i));
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_GENERAL_KERNEL(
+    assign, CPU, ALL_LAYOUT, phi::AssignKernel<phi::CPUContext>, ALL_DTYPE) {}
+PD_REGISTER_GENERAL_KERNEL(assign_array,
+                           CPU,
+                           ALL_LAYOUT,
+                           phi::AssignArrayKernel<phi::CPUContext>,
+                           ALL_DTYPE) {}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_GENERAL_KERNEL(
+    assign, GPU, ALL_LAYOUT, phi::AssignKernel<phi::GPUContext>, ALL_DTYPE) {}
+PD_REGISTER_GENERAL_KERNEL(assign_array,
+                           GPU,
+                           ALL_LAYOUT,
+                           phi::AssignArrayKernel<phi::GPUContext>,
+                           ALL_DTYPE) {}
+#endif
diff --git a/paddle/phi/kernels/assign_kernel.h b/paddle/phi/kernels/assign_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..7cc06818dc007f859a3a3513d211008cd2a153e6
--- /dev/null
+++ b/paddle/phi/kernels/assign_kernel.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
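+//
+// A minimal usage sketch for the kernels declared in this header
+// (illustrative only; it assumes an already-initialized CPU context and
+// tensors, which the phi framework normally provides):
+//
+//   phi::CPUContext dev_ctx;
+//   phi::DenseTensor src, dst;
+//   // assign copies src into dst; an empty optional is a no-op, which
+//   // mirrors the AsDispensable input of the original assign op.
+//   phi::AssignKernel<phi::CPUContext>(
+//       dev_ctx, paddle::optional<const phi::DenseTensor&>(src), &dst);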
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +// In order to be compatible with the `AsDispensable` input in the original +// assign op maker, the input parameter here needs to be dispensable, but +// this looks weird +template +void AssignKernel(const Context& dev_ctx, + paddle::optional x, + DenseTensor* out); + +template +void AssignArrayKernel(const Context& dev_ctx, + const std::vector& x, + std::vector out); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc index 11b396a84d0dee9172f0e5e70f9761fc2869fc89..c582261596221f4db8bd03599386082cee909096 100644 --- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc @@ -90,6 +90,23 @@ namespace phi { dev_ctx, nullptr, &out, &dout, dx, functor); \ } +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT( \ + name, functor_class, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGradImpl>( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ + } + DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CosGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, TanGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, AcosGradFunctor); @@ -103,9 +120,11 @@ DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, AcoshGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, AtanhGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, TanhShrinkGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, SiluGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, LogSigmoidGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, SigmoidGradFunctor); DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, LeakyReluGradFunctor, @@ -125,6 +144,11 @@ DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, t_min, t_max); +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, + HardSigmoidGradFunctor, + slope, + offset); + template void EluGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -204,3 +228,8 @@ PD_REGISTER_KERNEL(tanh_triple_grad, float, double, phi::dtype::float16) {} +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel) diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc index 59ce18a11cc5ea13f3964faddad622e3c9344efd..1d7b77ea4445f494105d4c23516f31f349847089 100644 --- a/paddle/phi/kernels/cpu/activation_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_kernel.cc @@ -72,6 +72,8 @@ DEFINE_CPU_ACTIVATION_KERNEL(Relu, ReluCPUFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Tanh, TanhFunctor) DEFINE_CPU_ACTIVATION_KERNEL(TanhShrink, TanhShrinkFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Silu, SiluFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Sigmoid, SigmoidFunctor) 
+DEFINE_CPU_ACTIVATION_KERNEL(LogSigmoid, LogSigmoidFunctor) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, @@ -82,6 +84,10 @@ DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, SoftShrinkFunctor, lambda) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, ELUFunctor, alpha) DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max) +DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid, + HardSigmoidFunctor, + slope, + offset) } // namespace phi PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {} @@ -109,3 +115,6 @@ PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel) PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel) PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel) PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel) +PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel) +PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel) +PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel) diff --git a/paddle/phi/kernels/cpu/copy_kernel.cc b/paddle/phi/kernels/cpu/copy_kernel.cc index 1af071f23ddc520e6733acdbeec3a0652f4e1d8f..fa11fd05bf1d656a075b996f8688d755b28cc034 100644 --- a/paddle/phi/kernels/cpu/copy_kernel.cc +++ b/paddle/phi/kernels/cpu/copy_kernel.cc @@ -38,7 +38,7 @@ void Copy(const Context& dev_ctx, << src_place; dst->Resize(src.dims()); - auto* dst_ptr = dev_ctx.Alloc(dst, src.dtype()); + auto* dst_ptr = dev_ctx.HostAlloc(dst, src.dtype()); if (src_ptr == dst_ptr) { VLOG(3) << "Skip copy the same data async from " << src_place << " to " diff --git a/paddle/phi/kernels/cpu/elementwise_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc index 37ad18df56ec30c838dd5bd03c484d7889e976c0..095d11720ce26622c31e517286d6f656869e62ff 100644 --- a/paddle/phi/kernels/cpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_kernel.cc @@ -12,10 +12,81 @@ // See the License for the specific language governing permissions and // limitations under the License. 
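+// The raw elementwise kernels below are moved here from the deleted
+// cpu/math_kernel.cc. As a rough sketch of what DEFINE_CPU_ELEMENTWISE_OP
+// expands to (illustrative only; the template arguments are written out by
+// hand and follow the SameDims##name##Functor / funcs::name##Functor naming
+// pattern assumed from the surrounding code):
+//
+//   template <typename T, typename Context>
+//   void AddRawKernel(const Context& dev_ctx,
+//                     const DenseTensor& x,
+//                     const DenseTensor& y,
+//                     int axis,
+//                     DenseTensor* out) {
+//     dev_ctx.template Alloc<T>(out);
+//     if (x.dims() == y.dims()) {
+//       // same-shape fast path, no broadcasting needed
+//       SameDimsElementwiseCompute<SameDimsAddFunctor<CPUContext, T>>()(
+//           dev_ctx, x, y, out);
+//     } else if (x.dims().size() >= y.dims().size()) {
+//       funcs::ElementwiseCompute<funcs::AddFunctor<T>, T>(
+//           dev_ctx, x, y, axis, funcs::AddFunctor<T>(), out);
+//     } else {
+//       // y has higher rank, so the inverse functor swaps the operands
+//       funcs::ElementwiseCompute<funcs::InverseAddFunctor<T>, T>(
+//           dev_ctx, x, y, axis, funcs::InverseAddFunctor<T>(), out);
+//     }
+//   }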
+#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" +namespace phi { + +#define DEFINE_CPU_ELEMENTWISE_OP(name) \ + template \ + void name##RawKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ + dev_ctx.template Alloc(out); \ + if (x.dims() == y.dims()) { \ + SameDimsElementwiseCompute>()( \ + dev_ctx, x, y, out); \ + } else { \ + auto x_dims = x.dims(); \ + auto y_dims = y.dims(); \ + if (x_dims.size() >= y_dims.size()) { \ + funcs::ElementwiseCompute, T>( \ + dev_ctx, x, y, axis, funcs::name##Functor(), out); \ + } else { \ + funcs::ElementwiseCompute, T>( \ + dev_ctx, x, y, axis, funcs::Inverse##name##Functor(), out); \ + } \ + } \ + } + +template +void DivideRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + // allocate memory for out + dev_ctx.template Alloc(out); + if (x.dims() == y.dims() && std::is_floating_point::value) { + SameDimsElementwiseCompute>()( + dev_ctx, x, y, out); + } else { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + if (x_dims.size() >= y_dims.size()) { + funcs::ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::DivideFunctor(), out); + } else { + funcs::ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::InverseDivideFunctor(), out); + } + } +} + +// Create the definition of Add +DEFINE_CPU_ELEMENTWISE_OP(Add) + +// Create the definition of Subtract +DEFINE_CPU_ELEMENTWISE_OP(Subtract) + +// Create the definition of Multiply +DEFINE_CPU_ELEMENTWISE_OP(Multiply) + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 +// using bfloat16 = ::phi::dtype::bfloat16; + PD_REGISTER_KERNEL(elementwise_fmax, CPU, ALL_LAYOUT, @@ -33,3 +104,49 @@ PD_REGISTER_KERNEL(elementwise_fmin, double, int, int64_t) {} + +PD_REGISTER_KERNEL(add_raw, + CPU, + ALL_LAYOUT, + phi::AddRawKernel, + float, + double, + int16_t, + int, + int64_t, + complex64, + complex128) {} +PD_REGISTER_KERNEL(subtract_raw, + CPU, + ALL_LAYOUT, + phi::SubtractRawKernel, + float, + double, + int16_t, + int, + int64_t, + complex64, + complex128, + phi::dtype::bfloat16) {} +PD_REGISTER_KERNEL(divide_raw, + CPU, + ALL_LAYOUT, + phi::DivideRawKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} +PD_REGISTER_KERNEL(multiply_raw, + CPU, + ALL_LAYOUT, + phi::MultiplyRawKernel, + float, + double, + int, + int64_t, + bool, + complex64, + complex128, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..cee48ed96db1c60fb77dc7c870cb256b7ce0cb6e --- /dev/null +++ b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc @@ -0,0 +1,186 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/layer_norm_grad_kernel.h" +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/funcs/layer_norm_util.h" +#if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) +#include "paddle/fluid/operators/jit/kernels.h" +#endif +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void LayerNormGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& mean, + const DenseTensor& variance, + paddle::optional scale_opt, + paddle::optional bias_opt, + const DenseTensor& out_grad, + float epsilon, + int begin_norm_axis, + bool is_test, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad) { + auto* scale = scale_opt.get_ptr(); + auto d_y = out_grad; + + // init output + auto* d_x = x_grad; + auto* d_scale = scale_grad; + auto* d_bias = bias_grad; + + const auto& x_dims = x.dims(); + auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + DDim matrix_shape({left, right}); + + d_y.Resize(matrix_shape); + + funcs::ColwiseSum2D colwise_sum(left, right, dev_ctx); + DenseTensor x_tmp = x; + + DenseTensor temp; + DenseTensor temp_norm; + if (d_scale || d_x) { + x_tmp.Resize(matrix_shape); + temp.Resize(matrix_shape); + dev_ctx.template Alloc(&temp); + + temp_norm.Resize(matrix_shape); + dev_ctx.template Alloc(&temp_norm); + // get x_norm + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + x_tmp, + mean, + /*axis*/ 0, + funcs::SubtractFunctor(), + &temp_norm); + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + temp_norm, + variance, + /*axis*/ 0, + funcs::DivAndSqrtFunctor(static_cast(epsilon)), + &temp_norm); + } + + if (d_bias) { + dev_ctx.template Alloc(d_bias); + colwise_sum(dev_ctx, d_y, d_bias); + } + if (d_scale) { + dev_ctx.template Alloc(d_scale); + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, temp_norm, d_y, 0, funcs::MultiplyFunctor(), &temp); + colwise_sum(dev_ctx, temp, d_scale); + } + + if (d_x) { + DDim vec_shape({left}); + dev_ctx.template Alloc(d_x); + auto dx_dim = d_x->dims(); + DenseTensor temp_vec; + temp_vec.Resize(vec_shape); + dev_ctx.template Alloc(&temp_vec); + + funcs::RowwiseMean2D row_mean(left, right, dev_ctx); + + if (d_scale) { + // dy_dx + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, d_y, *scale, /*axis*/ 1, funcs::MultiplyFunctor(), &temp); + phi::Copy(dev_ctx, temp, dev_ctx.GetPlace(), false, d_x); + + // dy_dmean_dx + row_mean(dev_ctx, temp, &temp_vec); + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + *d_x, + temp_vec, + /*axis*/ 0, + funcs::SubtractFunctor(), + d_x); + + // dy_var_dx + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + temp, + temp_norm, + /*axis*/ 0, + 
funcs::MultiplyFunctor(), + &temp); + } else { + // dy_dx + phi::Copy(dev_ctx, d_y, dev_ctx.GetPlace(), false, d_x); + + // dy_dmean_dx + row_mean(dev_ctx, d_y, &temp_vec); + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + *d_x, + temp_vec, + /*axis*/ 0, + funcs::SubtractFunctor(), + d_x); + + // dy_var_dx + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + d_y, + temp_norm, + /*axis*/ 0, + funcs::MultiplyFunctor(), + &temp); + } + // dy_var_dx + row_mean(dev_ctx, temp, &temp_vec); + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + temp_norm, + temp_vec, + /*axis*/ 0, + funcs::MultiplyFunctor(), + &temp); + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, *d_x, temp, /*axis*/ 0, funcs::SubtractFunctor(), d_x); + + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + *d_x, + variance, + /*axis*/ 0, + funcs::DivAndSqrtFunctor(static_cast(epsilon)), + d_x); + d_x->Resize(dx_dim); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + layer_norm_grad, CPU, ALL_LAYOUT, phi::LayerNormGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/cpu/layer_norm_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..5b09d68c7ca081e9a6157857eea8338aaa93d34d --- /dev/null +++ b/paddle/phi/kernels/cpu/layer_norm_kernel.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
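+//
+// LayerNormKernel below flattens x into a [left, right] matrix at
+// begin_norm_axis (e.g. dims [2, 3, 4] with begin_norm_axis = 2 give
+// left = 6, right = 4) and computes, per row:
+//
+//   y = scale * (x - mean) / sqrt(var + epsilon) + bias
+//
+// where mean and var are the row-wise mean and (biased) variance, matching
+// the SubAndSquare / DivAndSqrt functor pipeline used in the generic path.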
+ +#include "paddle/phi/kernels/layer_norm_kernel.h" +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/funcs/layer_norm_util.h" +#if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) +#include "paddle/fluid/operators/jit/kernels.h" +#endif +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void LayerNormKernel(const Context& dev_ctx, + const DenseTensor& x, + paddle::optional scale_opt, + paddle::optional bias_opt, + float epsilon, + int begin_norm_axis, + bool is_test, + DenseTensor* y, + DenseTensor* mean, + DenseTensor* var) { + const auto x_dims = x.dims(); + auto* scale = scale_opt.get_ptr(); + auto* bias = bias_opt.get_ptr(); + + dev_ctx.template Alloc(y); + dev_ctx.template Alloc(mean); + dev_ctx.template Alloc(var); + + auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + DDim matrix_shape({left, right}); + + auto x_tmp = x; + x_tmp.Resize(matrix_shape); + DenseTensor out; + out.ShareDataWith(*y); + out.Resize(matrix_shape); + +#if defined(PADDLE_WITH_CUDA) || defined(_WIN32) || defined(__APPLE__) || \ + defined(__OSX__) + + funcs::RowwiseMean2D row_mean(left, right, dev_ctx); + + // get mean + row_mean(dev_ctx, x_tmp, mean); + + // get variance + + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, x_tmp, *mean, 0, funcs::SubAndSquareFunctor(), &out); + + row_mean(dev_ctx, out, var); + + // get x_norm + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, x_tmp, *mean, 0, funcs::SubtractFunctor(), &out); + + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + out, + *var, + 0, + funcs::DivAndSqrtFunctor(static_cast(epsilon)), + &out); + + if (scale) { + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, out, *scale, 1, funcs::MultiplyFunctor(), &out); + } + if (bias) { + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, out, *bias, 1, funcs::AddFunctor(), &out); + } +#else + PADDLE_ENFORCE_EQ(mean->numel(), + left, + phi::errors::InvalidArgument( + "mean's length (%d) is not equal with expected (%d).", + mean->numel(), + left)); + PADDLE_ENFORCE_EQ(var->numel(), + left, + phi::errors::InvalidArgument( + "var's length (%d) is not equal with expected (%d).", + var->numel(), + left)); + if (scale) { + PADDLE_ENFORCE_EQ( + scale->numel(), + right, + phi::errors::InvalidArgument( + "scale's length (%d) is not equal with expected (%d).", + scale->numel(), + right)); + } + if (bias) { + PADDLE_ENFORCE_EQ(bias->numel(), + right, + phi::errors::InvalidArgument( + "bias's length (%d) is not equal with expected (%d).", + bias->numel(), + right)); + } + + auto ker = paddle::operators::jit::KernelFuncs< + paddle::operators::jit::LayerNormTuple, + phi::CPUPlace>::Cache() + .At(right); + ker(x_tmp.data(), + out.data(), + mean->data(), + var->data(), + scale ? scale->data() : nullptr, + bias ? 
bias->data() : nullptr, + static_cast(left), + static_cast(epsilon), + right); +#endif +} + +} // namespace phi + +PD_REGISTER_KERNEL( + layer_norm, CPU, ALL_LAYOUT, phi::LayerNormKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/math_kernel.cc b/paddle/phi/kernels/cpu/math_kernel.cc deleted file mode 100644 index 0047940fd1704be2862a4a0a4bf46f4886221464..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/cpu/math_kernel.cc +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/math_kernel.h" - -#include "paddle/phi/api/ext/dispatch.h" -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/common/scalar.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/elementwise.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" -#include "paddle/phi/kernels/funcs/elementwise_functor.h" - -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/framework/eigen.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/complex.h" - -namespace phi { - -#define DEFINE_CPU_ELEMENTWISE_OP(name) \ - template \ - void name##RawKernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ - dev_ctx.template Alloc(out); \ - if (x.dims() == y.dims()) { \ - SameDimsElementwiseCompute>()( \ - dev_ctx, x, y, out); \ - } else { \ - auto x_dims = x.dims(); \ - auto y_dims = y.dims(); \ - if (x_dims.size() >= y_dims.size()) { \ - funcs::ElementwiseCompute, T>( \ - dev_ctx, x, y, axis, funcs::name##Functor(), out); \ - } else { \ - funcs::ElementwiseCompute, T>( \ - dev_ctx, x, y, axis, funcs::Inverse##name##Functor(), out); \ - } \ - } \ - } - -template -void DivideRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out) { - // allocate memory for out - dev_ctx.template Alloc(out); - if (x.dims() == y.dims() && std::is_floating_point::value) { - SameDimsElementwiseCompute>()( - dev_ctx, x, y, out); - } else { - auto x_dims = x.dims(); - auto y_dims = y.dims(); - if (x_dims.size() >= y_dims.size()) { - funcs::ElementwiseCompute, T>( - dev_ctx, x, y, axis, funcs::DivideFunctor(), out); - } else { - funcs::ElementwiseCompute, T>( - dev_ctx, x, y, axis, funcs::InverseDivideFunctor(), out); - } - } -} - -// Create the definition of Add -DEFINE_CPU_ELEMENTWISE_OP(Add) - -// Create the definition of Subtract -DEFINE_CPU_ELEMENTWISE_OP(Subtract) - -// Create the definition of Multiply -DEFINE_CPU_ELEMENTWISE_OP(Multiply) - -} // namespace phi - -using complex64 = ::phi::dtype::complex; -using complex128 = ::phi::dtype::complex; - -// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 -// using bfloat16 = ::phi::dtype::bfloat16; -PD_REGISTER_KERNEL(add_raw, - CPU, - ALL_LAYOUT, - phi::AddRawKernel, - float, - double, - int16_t, - int, - int64_t, 
- complex64, - complex128) {} -PD_REGISTER_KERNEL(subtract_raw, - CPU, - ALL_LAYOUT, - phi::SubtractRawKernel, - float, - double, - int16_t, - int, - int64_t, - complex64, - complex128, - phi::dtype::bfloat16) {} -PD_REGISTER_KERNEL(divide_raw, - CPU, - ALL_LAYOUT, - phi::DivideRawKernel, - float, - double, - int, - int64_t, - complex64, - complex128) {} -PD_REGISTER_KERNEL(multiply_raw, - CPU, - ALL_LAYOUT, - phi::MultiplyRawKernel, - float, - double, - int, - int64_t, - bool, - complex64, - complex128, - phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc index 636018ffa68003bc85af22e580bc4ae0768fb1b7..ae1e406d16eec44168b2b7232586293bf90e4bd8 100644 --- a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc @@ -17,12 +17,12 @@ #include #include #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/compare_functors.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h" -#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/reduce_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/mode_grad_kernel.cc b/paddle/phi/kernels/cpu/mode_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..ca813c1757eacce24ecea8687b7b80bd43c5e8f9 --- /dev/null +++ b/paddle/phi/kernels/cpu/mode_grad_kernel.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/mode_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/mode.h" + +namespace phi { + +template +void ModeGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const DenseTensor& out_grad, + int axis, + bool keepdim, + DenseTensor* x_grad) { + auto in_dims = x.dims(); + auto out_dims = indices.dims(); + + // axis < 0, get the real axis + axis = (axis < 0) ? 
(in_dims.size() + axis) : axis;
+
+  if (!keepdim) {
+    std::vector<int64_t> tmp_out_shape;
+    for (int i = 0; i < axis; i++) {
+      tmp_out_shape.emplace_back(out_dims[i]);
+    }
+    tmp_out_shape.emplace_back(1);
+    for (int i = axis + 1; i < in_dims.size(); i++) {
+      tmp_out_shape.emplace_back(out_dims[i - 1]);
+    }
+    out_dims = phi::make_ddim(tmp_out_shape);
+  }
+  T* x_grad_data = dev_ctx.template Alloc<T>(x_grad);
+
+  if (axis == in_dims.size() - 1) {
+    // allocate the memory for the input_grad
+    // assign the out_grad to input_grad directly
+    const int64_t input_height =
+        phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
+    const int64_t input_width = in_dims[in_dims.size() - 1];
+
+    // init the output grad with 0, because some input elements have no grad
+    memset(x_grad_data, 0, x_grad->numel() * sizeof(T));
+    // Assign the output_grad to input_grad
+    if (keepdim) {
+      funcs::ModeAssign(input_height,
+                        input_width,
+                        in_dims.size(),
+                        &out_grad,
+                        &indices,
+                        x_grad_data);
+    } else {
+      DenseTensor out_grad_tmp;
+      dev_ctx.template Alloc<T>(&out_grad_tmp);
+      DenseTensor indices_tmp;
+      dev_ctx.template Alloc<int64_t>(&indices_tmp);
+
+      phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, &out_grad_tmp);
+      phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp);
+
+      out_grad_tmp.Resize(out_dims);
+      indices_tmp.Resize(out_dims);
+
+      funcs::ModeAssign(input_height,
+                        input_width,
+                        in_dims.size(),
+                        &out_grad_tmp,
+                        &indices_tmp,
+                        x_grad_data);
+    }
+  } else {
+    // cannot assign grad to input_grad directly, must do the transpose
+    std::vector<int> trans_axis;
+    for (int i = 0; i < axis; i++) {
+      trans_axis.emplace_back(i);
+    }
+    trans_axis.emplace_back(out_dims.size() - 1);
+    for (int i = axis + 1; i < out_dims.size() - 1; i++) {
+      trans_axis.emplace_back(i);
+    }
+    trans_axis.emplace_back(axis);
+    DDim trans_shape(out_dims);
+    DDim trans_in_shape(in_dims);
+    for (size_t i = 0; i < trans_axis.size(); i++) {
+      trans_shape[i] = out_dims[trans_axis[i]];
+      trans_in_shape[i] = in_dims[trans_axis[i]];
+    }
+    // transpose the out_grad, indices
+    DenseTensor trans_dO;
+    trans_dO.Resize(trans_shape);
+    dev_ctx.template Alloc<T>(&trans_dO);
+
+    DenseTensor trans_ind;
+    trans_ind.Resize(trans_shape);
+    dev_ctx.template Alloc<int64_t>(&trans_ind);
+
+    int ndims = trans_axis.size();
+
+    if (keepdim) {
+      // Do transpose
+      funcs::TransCompute<phi::CPUContext, T>(
+          ndims, dev_ctx, out_grad, &trans_dO, trans_axis);
+      funcs::TransCompute<phi::CPUContext, int64_t>(
+          ndims, dev_ctx, indices, &trans_ind, trans_axis);
+    } else {
+      DenseTensor out_grad_tmp;
+      dev_ctx.template Alloc<T>(&out_grad_tmp);
+
+      DenseTensor indices_tmp;
+      dev_ctx.template Alloc<int64_t>(&indices_tmp);
+
+      phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, &out_grad_tmp);
+      phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp);
+      out_grad_tmp.Resize(out_dims);
+      indices_tmp.Resize(out_dims);
+      // Do transpose
+      funcs::TransCompute<phi::CPUContext, T>(
+          ndims, dev_ctx, out_grad_tmp, &trans_dO, trans_axis);
+      funcs::TransCompute<phi::CPUContext, int64_t>(
+          ndims, dev_ctx, indices_tmp, &trans_ind, trans_axis);
+    }
+    const int64_t input_height = phi::product(
+        phi::slice_ddim(trans_in_shape, 0, trans_in_shape.size() - 1));
+    const int64_t input_width = trans_in_shape[trans_in_shape.size() - 1];
+
+    // Assign the out_grad to the transposed input_grad
+    DenseTensor tmp_out;
+    tmp_out.Resize(trans_in_shape);
+    T* t_out = dev_ctx.template Alloc<T>(&tmp_out);
+    memset(t_out, 0, x_grad->numel() * sizeof(T));
+
+    funcs::ModeAssign(input_height,
+                      input_width,
+                      in_dims.size(),
+                      &trans_dO,
+                      &trans_ind,
+                      t_out);
+
+    // Transpose back
+    funcs::TransCompute<phi::CPUContext, T>(
+        ndims, dev_ctx, tmp_out, x_grad, trans_axis);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(mode_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ModeGradKernel,
+                   float,
+                   double,
+                   int32_t,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/mode_kernel.cc b/paddle/phi/kernels/cpu/mode_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6535d1b89af420ee4266981f004983157179f34f
--- /dev/null
+++ b/paddle/phi/kernels/cpu/mode_kernel.cc
@@ -0,0 +1,121 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/mode_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/mode.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ModeKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                int axis,
+                bool keepdim,
+                DenseTensor* out,
+                DenseTensor* indices) {
+  const auto& in_dims = x.dims();
+  auto out_dims = out->dims();
+  // axis < 0, calculate the real axis
+  if (axis < 0) axis += in_dims.size();
+
+  T* output_data = dev_ctx.template Alloc<T>(out);
+  int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);
+  // if axis is not the last dim, transpose it to the last dim, do the
+  // calculation, then transpose it back to the original axis.
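+  // For example, with in_dims [2, 3, 4] and axis = 1, trans_axis is
+  // {0, 2, 1}: the mode is computed along the last dimension of the
+  // transposed tensor and the result is transposed back afterwards.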
+ if (axis == in_dims.size() - 1) { + const int64_t& input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + funcs::GetMode(input_height, + input_width, + in_dims.size(), + &x, + output_data, + indices_data); + } else { + std::vector trans_axis; + for (int i = 0; i < axis; i++) { + trans_axis.emplace_back(i); + } + trans_axis.push_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(axis); + + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); + out->Resize(tmp_out_dim); + indices->Resize(tmp_out_dim); + } + + // get the trans input_dims, out_dims + DDim trans_shape(in_dims); + DDim trans_out_shape(in_dims); + + for (size_t i = 0; i < trans_axis.size(); i++) { + trans_shape[i] = in_dims[trans_axis[i]]; + trans_out_shape[i] = in_dims[trans_axis[i]]; + } + trans_out_shape[in_dims.size() - 1] = 1; + + DenseTensor trans_input; + trans_input.Resize(trans_shape); + dev_ctx.template Alloc(&trans_input); + int ndims = trans_axis.size(); + + // transpose the input value + funcs::TransCompute( + ndims, dev_ctx, x, &trans_input, trans_axis); + + const int64_t input_height = + phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); + const int64_t input_width = trans_shape[trans_shape.size() - 1]; + DenseTensor tmp_out; + tmp_out.Resize(trans_out_shape); + T* t_out = dev_ctx.template Alloc(&tmp_out); + + DenseTensor tmp_indices; + tmp_indices.Resize(trans_out_shape); + int64_t* t_ind = dev_ctx.template Alloc(&tmp_indices); + + funcs::GetMode( + input_height, input_width, in_dims.size(), &trans_input, t_out, t_ind); + // transpose back + funcs::TransCompute( + ndims, dev_ctx, tmp_indices, indices, trans_axis); + funcs::TransCompute( + ndims, dev_ctx, tmp_out, out, trans_axis); + if (!keepdim) { + out->Resize(out_dims); + indices->Resize(out_dims); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + mode, CPU, ALL_LAYOUT, phi::ModeKernel, float, double, int32_t, int64_t) {} diff --git a/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..0eaa873590eb0ce16933de474cc028e751fdd4a9 --- /dev/null +++ b/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
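+//
+// The backward pass below scatters gradients through the stored argmax:
+// for every output element, the forward RoiPoolKernel recorded the index of
+// the input element that won the max pooling, so the gradient is accumulated
+// roughly as
+//
+//   dx[arg_max[i]] += dout[i]   (skipping positions where arg_max[i] < 0)
+//
+// which is what the nested (n, c, ph, pw) loops in RoiPoolGradKernel do.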
+ +#include "paddle/phi/kernels/roi_pool_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void RoiPoolGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + const DenseTensor& arg_max, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + DenseTensor* dx) { + if (dx) { + int rois_num = boxes.dims()[0]; + DenseTensor box_batch_id_list = Empty(dev_ctx, {rois_num}); + int* box_batch_id_data = box_batch_id_list.data(); + + int boxes_batch_size; + if (boxes_num) { + boxes_batch_size = boxes_num->numel(); + auto* boxes_num_data = boxes_num->data(); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_data[n]; ++i) { + box_batch_id_data[i] = n; + } + start += boxes_num_data[n]; + } + } else { + auto boxes_lod = boxes.lod().back(); + boxes_batch_size = boxes_lod.size() - 1; + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + box_batch_id_data[i] = n; + } + } + } + + const T* boxes_data = boxes.data(); + const T* out_grad_data = out_grad.data(); + const int64_t* arg_max_data = arg_max.data(); + T* dx_data = dev_ctx.template Alloc(dx); + + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, dx, static_cast(0)); + + auto in_stride = phi::stride(x.dims()); + auto arg_max_stride = phi::stride(arg_max.dims()); + auto roi_stride = phi::stride(boxes.dims()); + auto out_stride = phi::stride(out_grad.dims()); + + int channels = x.dims()[1]; + + for (int n = 0; n < rois_num; ++n) { + int roi_batch_idx = box_batch_id_data[n]; + T* batch_grad_data = dx_data + roi_batch_idx * in_stride[0]; + for (int c = 0; c < channels; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int pool_index = ph * pooled_width + pw; + if (arg_max_data[pool_index] >= 0) { + auto index = arg_max_data[pool_index]; + batch_grad_data[index] += out_grad_data[pool_index]; + } + } + } + batch_grad_data += in_stride[1]; + out_grad_data += out_stride[1]; + arg_max_data += arg_max_stride[1]; + } + boxes_data += roi_stride[0]; + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(roi_pool_grad, + CPU, + ALL_LAYOUT, + phi::RoiPoolGradKernel, + float, + double, + int) { + kernel->InputAt(3).SetDataType(phi::DataType::INT64); +} diff --git a/paddle/phi/kernels/cpu/roi_pool_kernel.cc b/paddle/phi/kernels/cpu/roi_pool_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..02020354cd35701b5fdcd1e8beae87bc813ca18f --- /dev/null +++ b/paddle/phi/kernels/cpu/roi_pool_kernel.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
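// [Illustrative sketch, not part of the diff] The RoiPoolGradKernel above is a
// pure scatter-add: the forward pass records, per pooled cell, the flat index
// of the input element that won the max (arg_max), so the backward pass routes
// each output gradient straight back to that index. One channel of that logic,
// with hypothetical names:

#include <cstdint>
#include <vector>

void RoiPoolGradOneChannel(const std::vector<float>& dout,       // pooled_h * pooled_w
                           const std::vector<int64_t>& arg_max,  // same size; -1 marks an empty bin
                           std::vector<float>* dx) {             // height * width, pre-zeroed
  for (size_t p = 0; p < dout.size(); ++p) {
    if (arg_max[p] >= 0) (*dx)[arg_max[p]] += dout[p];
  }
}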
+ +#include "paddle/phi/kernels/roi_pool_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { + +template +void RoiPoolKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + DenseTensor* out, + DenseTensor* arg_max) { + auto x_dims = x.dims(); + int batch_size = x_dims[0]; + int channels = x_dims[1]; + int height = x_dims[2]; + int width = x_dims[3]; + int rois_num = boxes.dims()[0]; + + auto in_stride = phi::stride(x_dims); + auto arg_max_stride = phi::stride(arg_max->dims()); + auto box_stride = phi::stride(boxes.dims()); + auto out_stride = phi::stride(out->dims()); + + const T* input_data = x.data(); + + DenseTensor box_batch_id_list = Empty(dev_ctx, {rois_num}); + int* box_batch_id_data = box_batch_id_list.data(); + + int boxes_batch_size; + if (boxes_num) { + boxes_batch_size = boxes_num->numel(); + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + phi::errors::InvalidArgument("The boxes_batch_size and imgs " + "batch_size must be the same.")); + auto* boxes_num_data = boxes_num->data(); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_data[n]; ++i) { + box_batch_id_data[i] = n; + } + start += boxes_num_data[n]; + } + } else { + auto boxes_lod = boxes.lod().back(); + boxes_batch_size = boxes_lod.size() - 1; + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + phi::errors::InvalidArgument("The boxes_batch_size and imgs " + "batch_size must be the same.")); + int rois_num_with_lod = boxes_lod[boxes_batch_size]; + PADDLE_ENFORCE_EQ( + rois_num, + rois_num_with_lod, + phi::errors::InvalidArgument("The rois_num from input " + "and lod must be the same.")); + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + box_batch_id_data[i] = n; + } + } + } + + T* output_data = dev_ctx.template Alloc(out); + int64_t* arg_max_data = dev_ctx.template Alloc(arg_max); + + const T* boxes_data = boxes.data(); + for (int n = 0; n < rois_num; ++n) { + int box_batch_id = box_batch_id_data[n]; + int box_start_w = round(boxes_data[0] * spatial_scale); + int box_start_h = round(boxes_data[1] * spatial_scale); + int box_end_w = round(boxes_data[2] * spatial_scale); + int box_end_h = round(boxes_data[3] * spatial_scale); + + // Force malformed ROIs to be 1x1 + int box_height = std::max(box_end_h - box_start_h + 1, 1); + int box_width = std::max(box_end_w - box_start_w + 1, 1); + + const float bin_size_h = + static_cast(box_height) / static_cast(pooled_height); + const float bin_size_w = + static_cast(box_width) / static_cast(pooled_width); + + const T* batch_data = input_data + box_batch_id * in_stride[0]; + + for (int c = 0; c < channels; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + // Compute pooling region for this output unit: + // start (included) = floor(ph * box_height / pooled_height_) + // end (excluded) = ceil((ph + 1) * box_height / pooled_height_) + int hstart = + static_cast(floor(static_cast(ph) * bin_size_h)); + int wstart = + static_cast(floor(static_cast(pw) * bin_size_w)); + int hend = + static_cast(ceil(static_cast(ph + 1) * bin_size_h)); + int wend = + static_cast(ceil(static_cast(pw + 1) * bin_size_w)); + + hstart = std::min(std::max(hstart + box_start_h, 0), height); + hend 
= std::min(std::max(hend + box_start_h, 0), height); + wstart = std::min(std::max(wstart + box_start_w, 0), width); + wend = std::min(std::max(wend + box_start_w, 0), width); + + const int pool_index = ph * pooled_width + pw; + + // Define an empty pooling region to be zero + bool is_empty = (hend <= hstart) || (wend <= wstart); + output_data[pool_index] = + is_empty ? 0 : -std::numeric_limits::max(); + arg_max_data[pool_index] = -1; + + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int index = h * width + w; + if (batch_data[index] > output_data[pool_index]) { + output_data[pool_index] = batch_data[index]; + arg_max_data[pool_index] = index; + } + } + } + } + } + + batch_data += in_stride[1]; + output_data += out_stride[1]; + arg_max_data += arg_max_stride[1]; + } + // Increment ROI data pointer + boxes_data += box_stride[0]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + roi_pool, CPU, ALL_LAYOUT, phi::RoiPoolKernel, float, double, int) { + kernel->OutputAt(1).SetDataType(phi::DataType::INT64); +} diff --git a/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc index 585c27bdcec97e11a68cdc536c829f76c000a8df..a5c9dc4c55e495833f40ec7499e6c0373594d319 100644 --- a/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc @@ -23,4 +23,6 @@ PD_REGISTER_KERNEL(segment_pool_grad, ALL_LAYOUT, phi::SegmentPoolGradKernel, float, - double) {} + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/segment_pool_kernel.cc b/paddle/phi/kernels/cpu/segment_pool_kernel.cc index d0413457f8177338aa450211539dc16d0880c74c..ad76a7a86bcb28f291288418c43740ed0b7adb97 100644 --- a/paddle/phi/kernels/cpu/segment_pool_kernel.cc +++ b/paddle/phi/kernels/cpu/segment_pool_kernel.cc @@ -18,5 +18,11 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -PD_REGISTER_KERNEL( - segment_pool, CPU, ALL_LAYOUT, phi::SegmentPoolKernel, float, double) {} +PD_REGISTER_KERNEL(segment_pool, + CPU, + ALL_LAYOUT, + phi::SegmentPoolKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc index ab3d3c2376b8b05b4909f2c44df260299c2fe460..4247e597acef4aac14f93066a3ea6232734e0c8c 100644 --- a/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc +++ b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc @@ -37,13 +37,8 @@ void TruncatedGaussianRandomKernel(const Context& dev_ctx, T* data = dev_ctx.template Alloc(tensor); - auto normal_cdf = [](float x) { - return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; - }; - float a_normal_cdf = normal_cdf((-2.0 - mean) / std); - float b_normal_cdf = normal_cdf((2.0 - mean) / std); - std::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, - 2.0 * b_normal_cdf - 1.0); + std::uniform_real_distribution dist(std::numeric_limits::min(), + 1.0); TruncatedNormal truncated_normal(mean, std); int64_t size = tensor->numel(); diff --git a/paddle/phi/kernels/math_kernel.cc b/paddle/phi/kernels/elementwise_kernel.cc similarity index 98% rename from paddle/phi/kernels/math_kernel.cc rename to paddle/phi/kernels/elementwise_kernel.cc index 5aad2375ebb85a52684946fe35b2a5b17a0b9efd..9d10a48c9e0795d8914c0c6cfb49b7686575cfac 100644 --- a/paddle/phi/kernels/math_kernel.cc +++ b/paddle/phi/kernels/elementwise_kernel.cc @@ -12,7 +12,7 @@ // See the License for the specific language 
governing permissions and // limitations under the License. -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/elementwise_kernel.h b/paddle/phi/kernels/elementwise_kernel.h index c1e73ad91c67d415437829d5fc731ac91a5722f5..b064ecc454c592df49670205163e73d2d3b249b3 100644 --- a/paddle/phi/kernels/elementwise_kernel.h +++ b/paddle/phi/kernels/elementwise_kernel.h @@ -15,7 +15,7 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/device_context.h" +#include "paddle/phi/infermeta/binary.h" namespace phi { @@ -33,4 +33,100 @@ void ElementwiseFMinKernel(const Context& dev_ctx, int axis, DenseTensor* out); +template +void AddRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void AddKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +void SubtractRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void SubtractKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +void DivideRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void DivideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +void MultiplyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void MultiplyKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +DenseTensor Add(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + AddKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + +template +DenseTensor Subtract(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + SubtractKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + +template +DenseTensor Divide(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + DivideKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + +template +DenseTensor Multiply(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + MultiplyKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + } // namespace phi diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index 663258fa560b21a86c881a8bd0446eb8e77804bb..6c5ffbd06e3a435d9568a6c4717d8ce83b5aec00 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -1012,6 +1012,217 @@ struct SiluGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +// sigmoid(x) = 1 / (1 + exp(-x)) +template +struct SigmoidFunctor : public BaseActivationFunctor { + template + 
void operator()(Device d, X x, Out out) const { + out.device(d) = static_cast(1) / (static_cast(1) + (-x).exp()); + } +}; + +template +struct SigmoidGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * out * (static_cast(1) - out); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +/* + Out + DOut -> SigmoidGradGrad -> DOutNew + DDX DDOut + + DDOut = (1-Out)*Out*DDX + DOutNew = (1-2*Out)*DOut*DDX +*/ +template +struct SigmoidGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* Out, + const DenseTensor* ddX, + const DenseTensor* dOut, + DenseTensor* dOutNew, + DenseTensor* ddOut) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidGradGrad")); + auto out = EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad")); + + if (dOutNew) { + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad")); + auto dout_new = EigenVector::Flatten( + GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad")); + dout_new.device(*d) = + (static_cast(1) - static_cast(2) * out) * dout * ddx; + } + if (ddOut) { + auto ddout = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad")); + ddout.device(*d) = (static_cast(1) - out) * out * ddx; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +/* + Out + DOut D_Dout + DDx -> SigmoidTripleGrad -> D_DDx + D_DDout d_OutNew + D_Dout_new + + D_Dout = (1-2*Out)*DDx*D_Dout_new + D_DDx = (1-Out)*Out*D_DDout + (1-2*Out)*DOut*D_Dout_new + D_OutNew = (DDx-2*Out*DDx)*D_DDout - 2*DOut*DDx*D_Dout_new + + Out, DDX, DOut, D_DDOut, D_DOut_New // input + D_OutNew, D_DOut, D_DDx // output +*/ +template +struct SigmoidTripleGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* Out, + const DenseTensor* ddX, + const DenseTensor* dOut, + const DenseTensor* d_DDOut, + const DenseTensor* d_dOut_New, + DenseTensor* d_d_Out, + DenseTensor* d_Out_New, + DenseTensor* d_DDx) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidTripleGrad")); + auto out = EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidTripleGrad")); + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidTripleGrad")); + auto d_ddOut = EigenVector::Flatten( + GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "SigmoidTripleGrad")); + auto d_dOutNew = EigenVector::Flatten(GET_DATA_SAFELY( + d_dOut_New, "Input", "D_DOut_New", "SigmoidTripleGrad")); + + if (d_Out_New) { + auto d_OutNew = EigenVector::Flatten(GET_DATA_SAFELY( + d_Out_New, "Output", "D_OutNew", "SigmoidTripleGrad")); + d_OutNew.device(*d) = (ddx - static_cast(2) * out * ddx) * d_ddOut - + static_cast(2) * dout * ddx * d_dOutNew; + } + if (d_d_Out) { + auto d_dOut = EigenVector::Flatten( + GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "SigmoidTripleGrad")); + d_dOut.device(*d) = + (static_cast(1) - static_cast(2) * out) * ddx * d_dOutNew; + } + if (d_DDx) { + auto d_ddx = EigenVector::Flatten( + GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "SigmoidTripleGrad")); + d_ddx.device(*d) = + (static_cast(1) - out) * out * d_ddOut + + (static_cast(1) - static_cast(2) * out) * dout * d_dOutNew; 
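// [Illustrative check, not part of the diff] The (1 - Out) * Out and
// (1 - 2 * Out) factors used in the double/triple grad functors above are the
// first and second derivatives of sigmoid written in terms of its output:
// with s = sigmoid(x), s' = s * (1 - s) and s'' = s * (1 - s) * (1 - 2 * s).
// A quick finite-difference sanity check:

#include <cmath>
#include <cstdio>

int main() {
  auto s = [](double x) { return 1.0 / (1.0 + std::exp(-x)); };
  const double x = 0.7, h = 1e-5;
  const double sx = s(x);
  const double d1 = (s(x + h) - s(x - h)) / (2 * h);           // central difference
  const double d2 = (s(x + h) - 2 * sx + s(x - h)) / (h * h);  // second difference
  std::printf("s'  fd=%.8f closed=%.8f\n", d1, sx * (1 - sx));
  std::printf("s'' fd=%.6f closed=%.6f\n", d2, sx * (1 - sx) * (1 - 2 * sx));
}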
+ } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +// Originally: logsigmoid(x) = -log (1 + exp(-x)) +// For numerical stability, we can use the log-sum-exp trick: +// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ +// We can rewrite the above equation as: +// out = -log( exp(0) + exp(-x)) [since exp(0) = 1] +// = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0))) +// = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) - exp(max(-x, 0)) * exp(-x - +// max(-x, 0))) +// = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) +// = -log( exp(max(-x, 0)) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))) +// +// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0)) +// + exp(-x - max(-x, 0)))) +template +struct LogSigmoidFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) + out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log()); + } +}; + +// Originally: f' = exp(-x) / (1 + exp(-x)) +// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + +// exp(-x - max(-x, 0))) +template +struct LogSigmoidGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) + dx.device(d) = + dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct HardSigmoidFunctor : public BaseActivationFunctor { + float slope; + float offset; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto temp = x * static_cast(slope) + static_cast(offset); + out.device(d) = + temp.cwiseMax(static_cast(0)).cwiseMin(static_cast(1)); + } +}; + +template +struct HardSigmoidGradFunctor : public BaseActivationFunctor { + float slope; + float offset; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * + ((out > static_cast(0)) * (out < static_cast(1))) + .template cast() * + static_cast(slope); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) template struct CudaReluFunctor : public BaseActivationFunctor { @@ -1653,6 +1864,112 @@ struct CudaSiluGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +template +struct CudaSigmoidFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // sigmoid(x) = 1 / (1 + exp(-x)) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(one / (one + exp(-x))); + } +}; + +template +struct CudaSigmoidGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout * out * (1 - out) + __device__ __forceinline__ T operator()(const T dout, const T out) const { + return dout * out * (one - out); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template 
+struct CudaLogSigmoidFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + + // logsigmoid(x) = log(1 / (1 + exp(-x))) + // For numerical stability, + // logsigmoid(x) = + // - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + MPType temp = x > zero ? zero : -x; + return static_cast(-temp - log(exp(-temp) + exp(-x - temp))); + } +}; + +template +struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + + // dx = dout * exp(-x) / (1 + exp(-x)) + // For numerical stability: + // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x, + // 0))) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + MPType temp1 = x > zero ? zero : -x; + MPType temp2 = exp(-x - temp1); + return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaHardSigmoidFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + T one = static_cast(1.0f); + float slope; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + // hard_sigmoid(x) = 0, when x <= -3 + // 1, when x >= 3 + // x * slope + offset, otherwise + __device__ __forceinline__ T operator()(const T x) const { + T temp = x * static_cast(slope) + static_cast(offset); + T temp_max = temp > zero ? temp : zero; + T temp_min = temp_max < one ? temp_max : one; + return temp_min; + } +}; + +template +struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + T one = static_cast(1.0f); + float slope; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + // dx = (out > 0 && out < 1) ? dout * slope : 0 + __device__ __forceinline__ T operator()(const T dout, const T out) const { + return (out > zero && out < one) ? dout * static_cast(slope) : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + #endif } // namespace funcs diff --git a/paddle/phi/kernels/funcs/layer_norm_util.h b/paddle/phi/kernels/funcs/layer_norm_util.h new file mode 100644 index 0000000000000000000000000000000000000000..e78730cbf38495637e4bd4c455a3f522b38a9017 --- /dev/null +++ b/paddle/phi/kernels/funcs/layer_norm_util.h @@ -0,0 +1,165 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
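// [Illustrative sketch, not part of the diff] Why the max(-x, 0) rewrite in
// the LogSigmoid functors above matters: the naive -log(1 + exp(-x)) overflows
// exp() for large negative x, while the rearranged form never exponentiates a
// positive argument.

#include <algorithm>
#include <cmath>
#include <cstdio>

double LogSigmoidStable(double x) {
  const double t = std::max(-x, 0.0);  // t = max(-x, 0)
  return -t - std::log(std::exp(-t) + std::exp(-x - t));
}

int main() {
  const double x = -1000.0;
  std::printf("naive  = %f\n", -std::log(1.0 + std::exp(-x)));  // exp(1000) -> inf, so -inf
  std::printf("stable = %f\n", LogSigmoidStable(x));            // -1000, as expected
}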
+ +#pragma once + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +namespace funcs { + +// Wrap RowwiseMean and ColwiseMean. +// Reuse the cpu codes and replace the gpu codes with cublas_gemv, which is +// significantly faster. Unlike the RowwiseMean and ColwiseMean, the +// implementation only considers 2D. +template +struct RowwiseMean2D { + RowwiseMean2D(int left, int right, const DeviceContext& dev_ctx); + + void operator()(const DeviceContext& context, + const DenseTensor& input, + DenseTensor* vec); +}; + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +class RowwiseMean2D { + public: + RowwiseMean2D(int left, int right, const DeviceContext& dev_ctx) + : left_(left), right_(right) { + DDim ones_dim({right_}); + divisor_.Resize(ones_dim); + dev_ctx.template Alloc(&divisor_); + phi::funcs::set_constant(dev_ctx, &divisor_, 1.0 / right); + } + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + DenseTensor* out) { + phi::funcs::GetBlas(context).GEMV(false, + left_, + right_, + 1., + input.data(), + divisor_.data(), + 0., + out->data()); + } + + private: + int left_; + int right_; + DenseTensor divisor_; +}; +#endif + +template +class RowwiseMean2D { + public: + RowwiseMean2D(int left, int right, const DeviceContext& dev_ctx) {} + + void operator()(const phi::CPUContext& context, + const DenseTensor& input, + DenseTensor* out) { + row_mean_(context, input, out); + } + + private: + phi::funcs::RowwiseMean row_mean_; +}; + +template +struct ColwiseSum2D { + ColwiseSum2D(int left, int right, const DeviceContext& dev_ctx); + + void operator()(const phi::DeviceContext& context, + const DenseTensor& input, + DenseTensor* vec); +}; + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +class ColwiseSum2D { + public: + ColwiseSum2D(int left, int right, const phi::GPUContext& dev_ctx) + : left_(left), right_(right) { + DDim ones_dim({left_}); + divisor_.Resize(ones_dim); + dev_ctx.template Alloc(&divisor_); + phi::funcs::set_constant(dev_ctx, &divisor_, 1.0); + } + + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + DenseTensor* out) { + phi::funcs::GetBlas(context).GEMV(true, + left_, + right_, + 1., + input.data(), + divisor_.data(), + 0., + out->data()); + } + + private: + int left_; + int right_; + DenseTensor divisor_; +}; +#endif + +template +class ColwiseSum2D { + public: + ColwiseSum2D(int left, int right, const phi::CPUContext& dev_ctx) {} + + void operator()(const phi::CPUContext& context, + const DenseTensor& input, + DenseTensor* out) { + col_wise_(context, input, out); + } + + private: + phi::funcs::ColwiseSum col_wise_; +}; + +template +struct SubAndSquareFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); } +}; + +template +struct DivAndSqrtFunctor { + explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; } + inline HOSTDEVICE T operator()(T a, T b) const { + return a / (sqrt(b + epsilon_)); + } + + private: + T epsilon_; +}; + +template +struct MulInvVarFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { + return a * std::sqrt(1.0 / b); + } +}; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/math_function.cc b/paddle/phi/kernels/funcs/math_function.cc index 
4201a75be8ac7ee9f7e633f6def1e002ce4b7e8a..afa2214f5b9df968d9fe01f6310e151c12e19362 100644 --- a/paddle/phi/kernels/funcs/math_function.cc +++ b/paddle/phi/kernels/funcs/math_function.cc @@ -331,12 +331,20 @@ template struct ColwiseSum; template struct ColwiseSum; template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; + template struct RowwiseSum; template struct RowwiseSum; template struct RowwiseMean; template struct RowwiseMean; +template struct RowwiseMean; +template struct RowwiseMean; + template struct ElementwiseAddTo { void operator()(paddle::platform::CPUDeviceContext* ctx, diff --git a/paddle/phi/kernels/funcs/mode.h b/paddle/phi/kernels/funcs/mode.h new file mode 100644 index 0000000000000000000000000000000000000000..1b7641762e2639acf3db540280891b518f22eed2 --- /dev/null +++ b/paddle/phi/kernels/funcs/mode.h @@ -0,0 +1,197 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +#include +#include +#include +#include +#ifdef PADDLE_WITH_MKLML +#include +#endif + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +namespace funcs { + +static int ComputeBlockSize(int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; +} + +static inline void GetDims( + const phi::DDim& dim, int axis, int* pre, int* n, int* post) { + *pre = 1; + *post = 1; + *n = dim[axis]; + for (int i = 0; i < axis; ++i) { + (*pre) *= dim[i]; + } + for (int i = axis + 1; i < dim.size(); ++i) { + (*post) *= dim[i]; + } +} + +template +static void GetMode(Type input_height, + Type input_width, + int input_dim, + const DenseTensor* input, + T* t_out, + Type* t_indices) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + std::vector> col_vec; + col_vec.reserve(input_width); + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(j), j)); + } + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(i, j), j)); + } + } + std::sort(col_vec.begin(), + col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + T mode 
= 0; + int64_t indice = 0; + int64_t cur_freq = 0; + int64_t max_freq = 0; + for (int64_t i = 0; i < input_width; ++i) { + ++cur_freq; + if (i == input_width - 1 || (col_vec[i + 1].first != col_vec[i].first)) { + if (cur_freq > max_freq) { + max_freq = cur_freq; + mode = col_vec[i].first; + indice = col_vec[i].second; + } + cur_freq = 0; + } + } + t_out[i] = mode; + t_indices[i] = indice; + } +} + +template +static void ModeAssign(const Type& input_height, + const Type& input_width, + const int& input_dim, + const DenseTensor* input, + const DenseTensor* indices, + T* output_data) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + auto e_indices = EigenVector::Flatten(*indices); + output_data[i * input_width + e_indices(0)] = e_input(0); + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + auto e_indices = EigenMatrix::Reshape(*indices, input_dim - 1); + output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); + } + } +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +static void GetModebySort(const phi::GPUContext& dev_ctx, + const DenseTensor* input_tensor, + const int64_t num_cols, + const int64_t num_rows, + T* out_tensor, + int64_t* indices_tensor) { + DenseTensor input_tmp; + input_tmp.Resize(phi::make_ddim({num_rows, num_cols})); + T* input_tmp_data = dev_ctx.Alloc(&input_tmp); + phi::Copy(dev_ctx, *input_tensor, dev_ctx.GetPlace(), false, &input_tmp); + + thrust::device_ptr out_tensor_ptr(out_tensor); + thrust::device_ptr indices_tensor_ptr(indices_tensor); + + for (int64_t i = 0; i < num_rows; ++i) { + T* begin = input_tmp_data + num_cols * i; + T* end = input_tmp_data + num_cols * (i + 1); + thrust::device_vector indices_data(num_cols); + thrust::sequence( + thrust::device, indices_data.begin(), indices_data.begin() + num_cols); + thrust::sort_by_key(thrust::device, begin, end, indices_data.begin()); + int unique = 1 + thrust::inner_product(thrust::device, + begin, + end - 1, + begin + 1, + 0, + thrust::plus(), + thrust::not_equal_to()); + thrust::device_vector keys_data(unique); + thrust::device_vector cnts_data(unique); + thrust::reduce_by_key(thrust::device, + begin, + end, + thrust::constant_iterator(1), + keys_data.begin(), + cnts_data.begin()); + auto it = thrust::max_element( + thrust::device, cnts_data.begin(), cnts_data.begin() + unique); + T mode = keys_data[it - cnts_data.begin()]; + int64_t counts = cnts_data[it - cnts_data.begin()]; + auto pos = thrust::find(thrust::device, begin, end, mode); + int64_t index = indices_data[pos - begin + counts - 1]; + out_tensor_ptr[i] = static_cast(mode); + indices_tensor_ptr[i] = static_cast(index); + } +} +#endif + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/pooling.cu b/paddle/phi/kernels/funcs/pooling.cu index 4cf5e1c02c59757ee8bd0ae91c18d0882b702da1..417c1cd234754f994383988c63ff44ba06794822 100644 --- a/paddle/phi/kernels/funcs/pooling.cu +++ b/paddle/phi/kernels/funcs/pooling.cu @@ -392,7 +392,7 @@ void Pool2dDirectCUDAFunctor::operator()( int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - // paddle::platform::ChangeThreadNum(context, &thread_num); + // backends::gpu::ChangeThreadNum(context, &thread_num); thread_num = 512; #endif int blocks = (nthreads + thread_num - 1) / thread_num; @@ -460,7 +460,7 @@ class Pool2dFunctor { int nthreads = 
batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -527,7 +527,7 @@ class Pool2dFunctor { int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -1293,7 +1293,7 @@ class Pool3dFunctor { output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -1369,7 +1369,7 @@ class Pool3dFunctor { output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -1906,7 +1906,7 @@ class MaxPool2dWithIndexFunctor { int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; @@ -2205,7 +2205,7 @@ class MaxPool3dWithIndexFunctor { output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h index 5834f091d9a4de02afe7488ededc0189ae6f21d0..85c371e9f9d450c55741b901eff6f102fa6c3f6f 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -14,8 +14,8 @@ #pragma once -// CUDA and HIP use same api -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +// CUDA, XPU and HIP use same api +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(__xpu__) #include #include @@ -220,7 +220,7 @@ struct IndexCalculator { phi::Array dims; phi::Array strides; phi::Array reduce_strides; -#ifndef PADDLE_WITH_XPU2 +#ifndef PADDLE_WITH_XPU_KP phi::Array divmoders; #endif }; @@ -231,81 +231,65 @@ struct ReduceIndexMapping { HOSTDEVICE explicit ReduceIndexMapping(const kps::DimConfig& dims) : dim(dims) {} +#ifdef PADDLE_WITH_XPU_KP __device__ __forceinline__ int BlockIdX() { -#ifdef PADDLE_WITH_XPU2 if (ReduceLastDim) { return (cluster_id() / dim.split_num_x % dim.split_num_y); } else { return cluster_id() % dim.split_num_x; } -#else - return blockIdx.x; -#endif } __device__ __forceinline__ int BlockIdY() { -#ifdef PADDLE_WITH_XPU2 if (ReduceLastDim) { return (cluster_id() % dim.split_num_x); } else { return (cluster_id() / dim.split_num_x % dim.split_num_y); } -#else - return blockIdx.y; -#endif } - __device__ __forceinline__ int BlockDimX() { -#ifdef PADDLE_WITH_XPU2 - return dim.deal_size_x; -#else - return blockDim.x; -#endif - } + __device__ __forceinline__ int BlockDimX() { return dim.deal_size_x; } - __device__ 
__forceinline__ int BlockDimY() { -#ifdef PADDLE_WITH_XPU2 - return 1; -#else - return blockDim.y; -#endif - } + __device__ __forceinline__ int BlockDimY() { return 1; } __device__ __forceinline__ int GridDimX() { -#ifdef PADDLE_WITH_XPU2 if (ReduceLastDim) { return dim.split_num_y; } else { return dim.split_num_x; } -#else - return gridDim.x; -#endif } __device__ __forceinline__ int GridDimY() { -#ifdef PADDLE_WITH_XPU2 if (ReduceLastDim) { return dim.split_num_x; } else { return dim.split_num_y; } -#else - return gridDim.y; -#endif } __device__ __forceinline__ int GetLoopSize() { -#ifdef PADDLE_WITH_XPU2 if (ReduceLastDim) { return dim.deal_size_y; } else { return dim.deal_size_x; } + } #else - return 1; + __device__ __forceinline__ int BlockIdX() { return blockIdx.x; } + + __device__ __forceinline__ int BlockIdY() { return blockIdx.y; } + + __device__ __forceinline__ int BlockDimX() { return blockDim.x; } + + __device__ __forceinline__ int BlockDimY() { return blockDim.y; } + + __device__ __forceinline__ int GridDimX() { return gridDim.x; } + + __device__ __forceinline__ int GridDimY() { return gridDim.y; } + + __device__ int GetLoopSize() { return 1; } #endif - } }; // when reduce_type == kReduceLastDim this struct will be used @@ -341,7 +325,7 @@ struct ReduceConfig { // when should_reduce_again is true, we need malloc temp space for temp data void SetOutputData(Ty* y_data, - const phi::GPUContext& dev_ctx, + const KPDevice& dev_ctx, phi::DenseTensor* tmp) { if (should_reduce_again) { tmp->Resize(phi::make_ddim( @@ -640,9 +624,7 @@ struct ReduceConfig { int blocking_size; bool should_reduce_again; bool reduce_last_dim; - Ty* output_data; - dim3 block; dim3 grid; }; @@ -770,9 +752,10 @@ __global__ void ReduceAnyKernel(const Tx* x, kps::Reduce( &reduce_var, &reduce_var, reducer, reduce_last_dim); - if (need_store) { - y[store_offset + i] = static_cast(reduce_var); - } + + Ty result = static_cast(reduce_var); + kps::details::WriteData( + y + store_offset + i, &result, static_cast(need_store)); } } @@ -882,30 +865,18 @@ static void LaunchReduceKernel(const Tx* x_data, dim.SetRem(config.reduce_num % config.block.x, 0, 0); #ifdef PADDLE_WITH_XPU_KP - ReduceAnyKernel<<<8, 64, 0, stream>>>( - x_data, - config.output_data, - reducer, - transform, - init, - config.reduce_num, - config.left_num, - config.reduce_last_dim, - reduce_index_calculator, - left_index_calculator, - dim); + auto grid_num = 8; + auto block_num = 64; #else + auto grid_num = config.grid; + auto block_num = config.block; +#endif ReduceAnyKernel<<>>( + OneDimIndexCal><<>>( x_data, config.output_data, reducer, @@ -917,7 +888,6 @@ static void LaunchReduceKernel(const Tx* x_data, reduce_index_calculator, left_index_calculator, dim); -#endif } else { int reduce_rank = config.reduce_strides.size(); @@ -938,30 +908,18 @@ static void LaunchReduceKernel(const Tx* x_data, dim.SetRem(config.reduce_num % config.block.x, 0, 0); #ifdef PADDLE_WITH_XPU_KP - ReduceAnyKernel<<<8, 64, 0, stream>>>( - x_data, - config.output_data, - reducer, - transform, - init, - config.reduce_num, - config.left_num, - config.reduce_last_dim, - reduce_index_calculator, - left_index_calculator, - dim); + auto grid_num = 8; + auto block_num = 64; #else + auto grid_num = config.grid; + auto block_num = config.block; +#endif ReduceAnyKernel<<>>( + IndexCalculator><<>>( x_data, config.output_data, reducer, @@ -973,7 +931,6 @@ static void LaunchReduceKernel(const Tx* x_data, reduce_index_calculator, left_index_calculator, dim); -#endif } if 
(config.should_reduce_again) { @@ -993,22 +950,9 @@ static void LaunchReduceKernel(const Tx* x_data, kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); dim.SetRem(config.left_num % block.x, 0, 0); #ifdef PADDLE_WITH_XPU_KP - ReduceHigherDimKernel< - Ty, - Ty, - MPType, - ReduceOp, - kps::IdentityFunctor><<<8, 64, 0, stream>>>( - config.output_data, - y_data, - reducer, - kps::IdentityFunctor(), - init, - config.grid.y, - config.left_num, - config.grid.y, - dim); -#else + grid = 8; + block = 64; +#endif ReduceHigherDimKernel< Ty, Ty, @@ -1024,7 +968,6 @@ static void LaunchReduceKernel(const Tx* x_data, config.left_num, config.grid.y, dim); -#endif } } @@ -1038,7 +981,7 @@ CubTensorReduceImpl(const Tx* x_data, Ty* y_data, const TransformOp& transform, int reduce_num, - const phi::GPUContext& dev_ctx, + const KPDevice& dev_ctx, KPStream stream) { auto reducer = ReduceOp(); cub::TransformInputIterator trans_x(x_data, @@ -1077,7 +1020,7 @@ CubTensorReduceImpl(const Tx* x_data, Ty* y_data, const TransformOp& transform, int reduce_num, - const phi::GPUContext& dev_ctx, + const KPDevice& dev_ctx, KPStream stream) { PADDLE_THROW(phi::errors::InvalidArgument( "Tx should not be float16 when using cub::DeviceReduce::Reduce().")); @@ -1087,12 +1030,16 @@ template class ReduceOp, typename TransformOp> -void ReduceKernel(const phi::GPUContext& dev_ctx, +void ReduceKernel(const KPDevice& dev_ctx, const phi::DenseTensor& x, phi::DenseTensor* y, const TransformOp& transform, const std::vector& origin_reduce_dims) { +#ifdef PADDLE_WITH_XPU_KP + auto stream = dev_ctx.x_context()->xpu_stream; +#else auto stream = dev_ctx.stream(); +#endif dev_ctx.Alloc(y); auto x_dim = phi::vectorize(x.dims()); @@ -1149,11 +1096,17 @@ void ReduceKernel(const phi::GPUContext& dev_ctx, 0); #ifdef PADDLE_WITH_XPU_KP + auto grid_num = 8; + auto block_num = 64; +#else + auto grid_num = config.grid; + auto block_num = config.block; +#endif ReduceHigherDimKernel, - TransformOp><<<8, 64, 0, stream>>>( + TransformOp><<>>( x_data, config.output_data, reducer, @@ -1163,23 +1116,6 @@ void ReduceKernel(const phi::GPUContext& dev_ctx, config.left_num, config.blocking_size, dim); -#else - ReduceHigherDimKernel< - Tx, - Ty, - MPType, - ReduceOp, - TransformOp><<>>( - x_data, - config.output_data, - reducer, - transform, - reducer.initial(), - config.reduce_num, - config.left_num, - config.blocking_size, - dim); -#endif if (config.should_reduce_again) { dim3 block = dim3(config.block.x, 1, 1); @@ -1189,22 +1125,9 @@ void ReduceKernel(const phi::GPUContext& dev_ctx, dim2.SetRem(config.left_num % config.block.x, 0, 0); #ifdef PADDLE_WITH_XPU_KP - ReduceHigherDimKernel< - Ty, - Ty, - MPType, - ReduceOp, - kps::IdentityFunctor><<<8, 64, 0, stream>>>( - config.output_data, - y_data, - reducer, - kps::IdentityFunctor(config.grid.y), - reducer.initial(), - config.grid.y, - config.left_num, - config.grid.y, - dim2); -#else + grid = 8; + block = 64; +#endif ReduceHigherDimKernel< Ty, Ty, @@ -1220,7 +1143,6 @@ void ReduceKernel(const phi::GPUContext& dev_ctx, config.left_num, config.grid.y, dim2); -#endif } return; } diff --git a/paddle/phi/kernels/funcs/segment_pooling.cc b/paddle/phi/kernels/funcs/segment_pooling.cc index bf4a21f37223dab5a67649406496e9828b0bcf3f..fbd744430aa11ab1a5a17c76b6d37c10c3085556 100644 --- a/paddle/phi/kernels/funcs/segment_pooling.cc +++ b/paddle/phi/kernels/funcs/segment_pooling.cc @@ -149,10 +149,19 @@ template class SegmentPoolFunctor; template class SegmentPoolFunctor; template class SegmentPoolFunctor; 
template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; + template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/segment_pooling.cu b/paddle/phi/kernels/funcs/segment_pooling.cu index 305cd39f077bc359543b399a8775b5a92a2eb00d..95606b152672916116813c97cbbc0856d33e49a7 100644 --- a/paddle/phi/kernels/funcs/segment_pooling.cu +++ b/paddle/phi/kernels/funcs/segment_pooling.cu @@ -453,10 +453,19 @@ template class SegmentPoolFunctor; template class SegmentPoolFunctor; template class SegmentPoolFunctor; template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; + template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/select_impl.cu.h b/paddle/phi/kernels/funcs/select_impl.cu.h new file mode 100644 index 0000000000000000000000000000000000000000..3a1d9b8ea7a7a36c31f31f7fc60dffc1f827d34e --- /dev/null +++ b/paddle/phi/kernels/funcs/select_impl.cu.h @@ -0,0 +1,447 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +// CUDA and HIP use same api +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/primitive/kernel_primitives.h" + +namespace kps = phi::kps; + +namespace phi { +namespace funcs { +using Mode = kps::details::ReduceMode; + +/* +* Count how many of the data being processed by the current block are true +* 1. Load data from global memory and cast from bool to int64_t +* 2. Get result of this thread according to thread reduce +* 3. Get result of this block according to block reduce +* 4. 
the first block stores 0, then each block stores its own count +*/ +template +struct NonZeroFunctor { + HOSTDEVICE NonZeroFunctor() {} + HOSTDEVICE inline T operator()(const T in) { + if (in) { + return static_cast(1); + } else { + return static_cast(0); + } + } +}; + +template +__device__ void GetBlockCountImpl(const InT *in, + OutT *out, + int num, + int repeat) { + InT in_data[VecSize]; + OutT temp[VecSize]; + OutT result = static_cast(0.0f); + using Add = kps::AddFunctor; + using Cast = NonZeroFunctor; + int store_fix = BLOCK_ID_X + repeat * GRID_NUM_X; + + kps::Init(&in_data[0], static_cast(0.0f)); + kps::ReadData(&in_data[0], in, num); + kps::ElementwiseUnary( + &temp[0], &in_data[0], Cast()); + kps::Reduce( + &result, &temp[0], Add(), true); + kps::Reduce( + &result, &result, Add(), true); + if (store_fix == 0) { + // first block's fix_size = 0; + OutT tmp = static_cast(0.0f); + kps::WriteData(out + store_fix, &tmp, 1); + } + + // store this block's count + kps::WriteData(out + store_fix + 1, &result, 1); +} + +// Count how many elements are nonzero in the current block +template +__global__ void GetBlockCountKernel(const InT *in, + OutT *out, + int64_t numel, + int64_t main_offset) { + int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; + int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + int repeat = 0; + for (; data_offset < main_offset; data_offset += stride) { + GetBlockCountImpl( + in + data_offset, out, BLOCK_NUM_X * VecSize, repeat); + repeat++; // to get the real blockIdx + } + + int num = numel - data_offset; + if (num > 0) { + GetBlockCountImpl( + in + data_offset, out, num, repeat); + } +} + +/* +* Compute the prefix sum of the per-block counts using one block; VecSize must be 2 +* 1. Each thread loads 2 elements: threadIdx.x and threadIdx.x + blockDim.x +* 2. Cumsum limitation: blockDim.x must be less than 512 +*/ + +template +__device__ void CumsumImpl( + const InT *in, OutT *out, OutT *pre_cumsum, int num, Functor func) { + __shared__ OutT max_thread_data; + OutT temp[VecSize]; + InT arg[VecSize]; + OutT result[VecSize]; + // init data + kps::Init(&arg[0], static_cast(0.0f)); + // set pre_cumsum + kps::Init(&temp[0], *pre_cumsum); + // load data to arg + kps::ReadData( + &arg[0], in, num, 1, BLOCK_NUM_X, 1); + // block cumsum + kps::Cumsum(&result[0], &arg[0], func); + // result = cumsum_result + pre_cumsum + kps::ElementwiseBinary( + &result[0], &result[0], &temp[0], func); + // get the last prefix sum + if ((THREAD_ID_X == BLOCK_NUM_X - 1) && !IsBoundary) { + max_thread_data = result[VecSize - 1]; + } + __syncthreads(); + // update pre_cumsum + *pre_cumsum = max_thread_data; + kps::WriteData( + out, &result[0], num, 1, BLOCK_NUM_X, 1); +} + +// Compute the store_offset of this block +template +__global__ void CumsumOneBlock( + const InT *in, OutT *out, int numel, int main_offset, Functor func) { + int stride = BLOCK_NUM_X * VecSize; + int offset = 0; + OutT pre_cumsum = static_cast(0); + for (; offset < main_offset; offset += stride) { + CumsumImpl( + in + offset, out + offset, &pre_cumsum, BLOCK_NUM_X * VecSize, func); + } + + int num = numel - offset; + if (num > 0) { + CumsumImpl( + in + offset, out + offset, &pre_cumsum, num, func); + } +} + +template +struct SelectCaller { + __device__ void inline operator()(OutT *store_data, + const MT *mask_data, + const InT *in, + Functor func, + int num, + int data_offset) { + // where_index op + IdT index_reg[VecSize]; + // Set data index of global + kps::InitWithDataIndex(&index_reg[0], data_offset); + // Get store data according to mask_idt + kps::OperatorTernary( +
store_data, mask_data, &index_reg[0], func, VecSize); + } +}; + +template +struct SelectCaller { // masked_select + __device__ void inline operator()(OutT *store_data, + const MT *mask_data, + const InT *in, + Functor func, + int num, + int data_offset) { + InT in_data[VecSize]; + kps::ReadData(&in_data[0], in, num); + // Get store data according to mask_idt + kps::OperatorTernary( + store_data, mask_data, &in_data[0], func, VecSize); + } +}; + +/** +* Get the mask's index if mask == true +*/ +template // SelectType = 1: masked_select; otherwise where_index +__device__ void +SelectKernelImpl(OutT *out, + const MT *mask, + const InT *in, + Functor func, + int num, + int data_offset, + int store_rank) { + const int kCVecSize = 2; + // each thread cumsums 2 elements + using IdT = int64_t; + // Set index data type + using Add = kps::AddFunctor; // for cumsum + using Cast = NonZeroFunctor; // for mask + + IdT init_idx = static_cast(0.0f); + MT init_mask = static_cast(0.0f); + + IdT num_thread[kCVecSize]; + IdT cumsum_thread[kCVecSize]; + + OutT store_data[VecSize * phi::DDim::kMaxRank]; + MT mask_data[VecSize]; + IdT mask_idt[VecSize]; + // init data + kps::Init(&cumsum_thread[0], init_idx); + kps::Init(&num_thread[0], init_idx); + kps::Init(&mask_data[0], init_mask); + // Load mask + kps::ReadData(&mask_data[0], mask, num); + // Cast from MT to int + kps::ElementwiseUnary( + &mask_idt[0], &mask_data[0], Cast()); + // Get the per-thread count; only num_thread[1] has data + kps::Reduce( + &num_thread[0], &mask_idt[0], Add(), true); + // Get cumsum_thread, the cumsum over num_thread; cumsum_thread[0] is the + // thread_fix + kps::Cumsum(&cumsum_thread[0], &num_thread[0], Add()); + // Get store data(index) according to mask_idt + SelectCaller + compute; + compute(&store_data[0], &mask_data[0], in, func, num, data_offset); + // get thread_fix + int thread_fix = + (static_cast(cumsum_thread[0] - num_thread[0]) * store_rank); + // get how many data need to store + int store_num = static_cast(num_thread[0]) * store_rank; + // each thread stores store_num data, and the count may differ per thread + kps::details::WriteData(out + thread_fix, &store_data[0], store_num); +} + +template +__global__ void SelectKernel(OutT *out, + const MT *mask, + const InT *in, + CT *cumsum, + Functor func, + const int64_t numel, + int64_t main_offset, + int store_rank) { + int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; + int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + int repeat = 0; + int size = VecSize * BLOCK_ID_X; + for (; data_offset < main_offset; data_offset += stride) { + // Cumsum index + int idx_cumsum = repeat * GRID_NUM_X + BLOCK_ID_X; + // TODO(niuliling): use ReadData API + int block_store_offset = cumsum[idx_cumsum]; + SelectKernelImpl( + out + block_store_offset * store_rank, + mask + data_offset, + in + data_offset, + func, + size, + data_offset, + store_rank); + repeat++; + } + + int num = numel - data_offset; + if (num > 0) { + // Cumsum index + int idx_cumsum = repeat * GRID_NUM_X + BLOCK_ID_X; + // TODO(niuliling): use ReadData API + int block_store_offset = static_cast(cumsum[idx_cumsum]); + SelectKernelImpl( + out + block_store_offset * store_rank, + mask + data_offset, + in + data_offset, + func, + num, + data_offset, + store_rank); + } +} + +inline int64_t Floor(int64_t in, int64_t div) { return in / div * div; } + +// SelectData = 1 then masked_select; SelectData = 0 then where_index +template +void SelectKernel(const KPDevice &dev_ctx, + const DenseTensor &condition, + const DenseTensor &in_data, + DenseTensor *out, +
Functor func) { + const MT *cond_data = condition.data(); + const int64_t numel = condition.numel(); + auto dims = condition.dims(); + int rank = SelectData ? 1 : dims.size(); + const InT *in_data_ptr = SelectData ? in_data.data() : nullptr; + // calculate the inclusive prefix sum of "true_num_array" + // to get the index of "out" tensor, + // and the total number of cond_data[i]==true. + // Example: + // condition: F T T F F F T T + // before: 0 1 1 0 0 0 1 1 + // after: 0 1 2 2 2 2 3 4 + // out: 1 2 6 7 + // alloc for cpu + using CT = int64_t; // set Count_data Type + const int t_size = sizeof(CT); + + const paddle::platform::CUDAPlace &cuda_place = dev_ctx.GetPlace(); + paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace(); + + // 1.1 get the number of stored elements per block + int total_true_num = 0; // init + const int kVecSize = 4; +#ifdef PADDLE_WITH_XPU_KP + int block = 64; + auto stream = dev_ctx.x_context()->xpu_stream; + const int num_per_block = kVecSize * block; + const int need_grids = (numel + num_per_block - 1) / num_per_block; + const int grid = std::min(need_grids, 8); +#else + const int block = 256; + const int num_per_block = kVecSize * block; + const int need_grids = (numel + num_per_block - 1) / num_per_block; + const int grid = std::min(need_grids, 256); + auto stream = dev_ctx.stream(); +#endif + const int64_t main_offset = Floor(numel, num_per_block); + // 1.2 alloc tmp data for the block counts + const int size_count_block = need_grids + 1; + std::vector dims_vec = {size_count_block * 2}; + ScalarArray dims_array(dims_vec); + DenseTensor count_mem = phi::Empty(dev_ctx, dims_array); + CT *count_data = count_mem.data(); + // 1.3 launch GetBlockCountKernel + GetBlockCountKernel<<>>( + cond_data, count_data, numel, main_offset); + // 2.1 alloc cumsum data for the block-count prefix sum + DenseTensor cumsum_mem = phi::Empty(dev_ctx, dims_array); + CT *cumsum_data = cumsum_mem.data(); + // 2.2 get prefix of count_data for real out_index + const int kCumVesize = 2; + const int block_c = 256; + const int main_offset_c = Floor(size_count_block, (kCumVesize * block_c)); + using Add = kps::AddFunctor; + CumsumOneBlock<<<1, block_c, 0, stream>>>( + count_data, cumsum_data, size_count_block, main_offset_c, Add()); + // 3.1 set temp ptr for in; + // 3.1 alloc for out + // 3.1.1 get true_num for gpu place; the last cumsum is the true_num + paddle::memory::Copy(cpu_place, + &total_true_num, + cuda_place, + cumsum_data + need_grids, + t_size, + dev_ctx.stream()); + + dev_ctx.Wait(); + // 3.1.2 alloc for out with total_true_num + std::vector out_dim = {static_cast(total_true_num)}; + if (SelectData == 0) { // where_index + out_dim.push_back(rank); + } + out->Resize(phi::make_ddim(out_dim)); + auto out_data = out->mutable_data(cuda_place); + // 3.2 get true data's index according to cond_data and cumsum_data + if (total_true_num <= 0) return; + SelectKernel<<>>(out_data, + cond_data, + in_data_ptr, + cumsum_data, + func, + numel, + main_offset, + rank); +} + +} // namespace funcs +} // namespace phi + +#endif diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index b12fc6975b37d79ac9d49284b34b746d24c53681..c912d0c4686ff3fee88925f4d7121f38f24a5485 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -142,8 +142,27 @@ void ActivationGradGPUImpl(const Context& dev_ctx, dev_ctx, nullptr, &out, &dout, dx, functor); \ } +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT( \ +
diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
index b12fc6975b37d79ac9d49284b34b746d24c53681..c912d0c4686ff3fee88925f4d7121f38f24a5485 100644
--- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
@@ -142,8 +142,27 @@ void ActivationGradGPUImpl(const Context& dev_ctx,
         dev_ctx, nullptr, &out, &dout, dx, functor);             \
   }
 
+#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(        \
+    name, functor_class, attr1, attr2)                           \
+  template <typename T, typename Context>                        \
+  void name##GradKernel(const Context& dev_ctx,                  \
+                        const DenseTensor& out,                  \
+                        const DenseTensor& dout,                 \
+                        float attr1,                             \
+                        float attr2,                             \
+                        DenseTensor* dx) {                       \
+    funcs::functor_class<T> functor;                             \
+    auto attrs = functor.GetAttrs();                             \
+    *(attrs[0].second) = attr1;                                  \
+    *(attrs[1].second) = attr2;                                  \
+    ActivationGradGPUImpl<T, Context, funcs::functor_class<T>>(  \
+        dev_ctx, nullptr, &out, &dout, dx, functor);             \
+  }
+
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, CudaReluGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, CudaTanhGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, CudaSigmoidGradFunctor);
+
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CudaCosGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, CudaTanGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, CudaAcosGradFunctor);
@@ -157,6 +176,7 @@ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, CudaAcoshGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, CudaAtanhGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, CudaTanhShrinkGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, CudaSiluGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, CudaLogSigmoidGradFunctor);
 
 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
                                                CudaLeakyReluGradFunctor,
@@ -176,6 +196,11 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu,
                                                t_min,
                                                t_max);
+DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid,
+                                                 CudaHardSigmoidGradFunctor,
+                                                 slope,
+                                                 offset);
+
 template <typename T, typename Context>
 void EluGradKernel(const Context& dev_ctx,
                    const DenseTensor& x,
@@ -270,3 +295,8 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel)
diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu
index cd9330ead84295769244485365f0a0f06d44082e..6b598c764debb059072ba3ae3ac90e6985479133 100644
--- a/paddle/phi/kernels/gpu/activation_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_kernel.cu
@@ -91,6 +91,8 @@ DEFINE_GPU_ACTIVATION_KERNEL(Relu, CudaReluFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(Tanh, CudaTanhFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(TanhShrink, CudaTanhShrinkFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(Silu, CudaSiluFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Sigmoid, CudaSigmoidFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(LogSigmoid, CudaLogSigmoidFunctor)
 
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha)
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
@@ -103,6 +105,10 @@ DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, CudaSoftShrinkFunctor, lambda)
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, CudaELUFunctor, alpha)
 DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, CudaBReluFunctor, t_min, t_max)
+DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid,
+                                     CudaHardSigmoidFunctor,
+                                     slope,
+                                     offset)
 
 }  // namespace phi
 
@@ -155,3 +161,6 @@ PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink,
TanhShrinkKernel) PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel) PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel) +PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel) +PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel) +PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel) diff --git a/paddle/phi/kernels/gpu/elementwise_kernel.cu b/paddle/phi/kernels/gpu/elementwise_kernel.cu index 2cffc68fa0648937b96095f5bd58210adaf865b3..a57d89013f921e3adb5587c70b7bbb12c383de61 100644 --- a/paddle/phi/kernels/gpu/elementwise_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_kernel.cu @@ -13,9 +13,50 @@ // limitations under the License. #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" +namespace phi { + +#define DEFINE_CUDA_ELEMENTWISE_OP(name) \ + template \ + void name##RawKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ + std::vector inputs; \ + std::vector outputs; \ + inputs.emplace_back(&x); \ + inputs.emplace_back(&y); \ + outputs.emplace_back(out); \ + dev_ctx.template Alloc(out); \ + funcs::BroadcastKernel( \ + dev_ctx, inputs, &outputs, axis, funcs::name##Functor()); \ + } + +/** + * Kernels + */ + +// Create the definition of Add +DEFINE_CUDA_ELEMENTWISE_OP(Add) +// Create the definition of Subtract +DEFINE_CUDA_ELEMENTWISE_OP(Subtract) +// Create the definition of Multiply +DEFINE_CUDA_ELEMENTWISE_OP(Multiply) +// Create the definition of Divide +DEFINE_CUDA_ELEMENTWISE_OP(Divide) + +} // namespace phi + +using float16 = phi::dtype::float16; +using bfloat16 = phi::dtype::bfloat16; +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + PD_REGISTER_KERNEL(elementwise_fmax, GPU, ALL_LAYOUT, @@ -33,3 +74,55 @@ PD_REGISTER_KERNEL(elementwise_fmin, double, int, int64_t) {} + +PD_REGISTER_KERNEL(add_raw, + GPU, + ALL_LAYOUT, + phi::AddRawKernel, + float, + double, + int16_t, + int, + int64_t, + float16, + bfloat16, + complex64, + complex128) {} +PD_REGISTER_KERNEL(subtract_raw, + GPU, + ALL_LAYOUT, + phi::SubtractRawKernel, + float, + double, + int16_t, + int, + int64_t, + float16, + bfloat16, + complex64, + complex128) {} +PD_REGISTER_KERNEL(divide_raw, + GPU, + ALL_LAYOUT, + phi::DivideRawKernel, + float, + double, + int, + int64_t, + float16, + bfloat16, + complex64, + complex128) {} +PD_REGISTER_KERNEL(multiply_raw, + GPU, + ALL_LAYOUT, + phi::MultiplyRawKernel, + float, + double, + int, + int64_t, + bool, + float16, + complex64, + complex128, + bfloat16) {} diff --git a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..c3f7a5261712a1d33bb4ad47dd080a489b303717 --- /dev/null +++ b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu @@ -0,0 +1,139 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/layer_norm_grad_kernel.h" + +#include "paddle/fluid/operators/layer_norm_kernel.cu.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/layer_norm_util.h" + +namespace phi { + +template +void LayerNormGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &mean, + const DenseTensor &variance, + paddle::optional scale_opt, + paddle::optional bias_opt, + const DenseTensor &out_grad, + float epsilon, + int begin_norm_axis, + bool is_test, + DenseTensor *x_grad, + DenseTensor *scale_grad, + DenseTensor *bias_grad) { + using U = paddle::operators::LayerNormParamType; + // d_x, d_scale, d_bias may be nullptr + auto *d_x = x_grad; + auto *d_scale = scale_grad; + auto *d_bias = bias_grad; + + auto *scale = scale_opt.get_ptr(); + auto *bias = bias_opt.get_ptr(); + auto *d_y = &out_grad; + + const auto &x_dims = x.dims(); + auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + int64_t batch_size = static_cast(matrix_dim[0]); + int64_t feature_size = static_cast(matrix_dim[1]); + + auto *x_data = x.data(); + auto *d_y_data = d_y->data(); + + auto *mean_data = mean.data(); + auto *var_data = variance.data(); + + auto *d_x_data = (d_x == nullptr ? nullptr : dev_ctx.template Alloc(d_x)); + + auto x_dtype = x.dtype(); + + phi::DataType scale_bias_dtype; + if (scale != nullptr) { + scale_bias_dtype = scale->dtype(); + } else { + // FIXME(zengjinle): do not find a better way to get the right + // data type of the d_scale and d_bias if scale == nullptr. + if (bias != nullptr) { + scale_bias_dtype = bias->dtype(); + } else { + scale_bias_dtype = x_dtype; + } + } + +#define PADDLE_LAUNCH_LAYERNORM_BWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ + do { \ + auto *scale_data = \ + (scale == nullptr ? nullptr : scale->data()); \ + auto *d_scale_data = \ + (d_scale == nullptr ? nullptr \ + : dev_ctx.template Alloc(d_scale)); \ + auto *d_bias_data = \ + (d_bias == nullptr ? nullptr \ + : dev_ctx.template Alloc(d_bias)); \ + auto *d_x_data = \ + (d_x == nullptr ? 
nullptr : dev_ctx.template Alloc(d_x)); \ + paddle::operators::LayerNormBackward( \ + x_data, \ + d_y_data, \ + scale_data, \ + mean_data, \ + var_data, \ + d_x_data, \ + d_scale_data, \ + d_bias_data, \ + epsilon, \ + batch_size, \ + feature_size, \ + dev_ctx); \ + } while (0) + + if (scale_bias_dtype == x_dtype) { + PADDLE_LAUNCH_LAYERNORM_BWD(T, true); + } else { + PADDLE_LAUNCH_LAYERNORM_BWD(U, false); + } + +#undef PADDLE_LAUNCH_LAYERNORM_BWD +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL(layer_norm_grad, + GPU, + ALL_LAYOUT, + phi::LayerNormGradKernel, + float, + phi::dtype::float16) {} +#elif CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(layer_norm_grad, + GPU, + ALL_LAYOUT, + phi::LayerNormGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +PD_REGISTER_KERNEL(layer_norm_grad, + GPU, + ALL_LAYOUT, + phi::LayerNormGradKernel, + float, + double, + phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..d87b7c2193811cd6cf8138d1904c7fce01d3884a --- /dev/null +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -0,0 +1,229 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "paddle/phi/kernels/layer_norm_kernel.h"
+
+#include "paddle/fluid/operators/layer_norm_kernel.cu.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/layer_norm_util.h"
+
+namespace phi {
+
+template <typename T>
+void LayerNormDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
+                                               const T *input,
+                                               std::vector<int> input_shape,
+                                               const T *bias,
+                                               const T *scale,
+                                               T *output,
+                                               T *mean,
+                                               T *variance,
+                                               int begin_norm_axis,
+                                               float eps) {
+  const auto x_dims = phi::make_ddim(input_shape);
+  auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
+  int64_t batch_size = static_cast<int64_t>(matrix_dim[0]);
+  int64_t feature_size = static_cast<int64_t>(matrix_dim[1]);
+  switch (paddle::operators::GetDesiredBlockDim(feature_size)) {
+    FIXED_BLOCK_DIM_CASE(paddle::operators::LayerNormForward<
+                             T,
+                             T,
+                             kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
+        input, scale, bias, output, mean, variance, eps, feature_size));
+    default:
+      PADDLE_THROW(phi::errors::InvalidArgument(
+          "Product from begin_norm_axis to end in layer_norm must be larger "
+          "than 1"));
+      break;
+  }
+}
+
+template class LayerNormDirectCUDAFunctor<float>;
+
+template <typename T, typename Context>
+void LayerNormKernel(const Context &dev_ctx,
+                     const DenseTensor &x,
+                     paddle::optional<const DenseTensor&> scale_opt,
+                     paddle::optional<const DenseTensor&> bias_opt,
+                     float epsilon,
+                     int begin_norm_axis,
+                     bool is_test,
+                     DenseTensor *y,
+                     DenseTensor *mean,
+                     DenseTensor *var) {
+  using U = paddle::operators::LayerNormParamType<T>;
+  auto *scale = scale_opt.get_ptr();
+  auto *bias = bias_opt.get_ptr();
+
+  const auto x_dims = x.dims();
+  auto *x_data = x.data<T>();
+  auto *y_data = dev_ctx.template Alloc<T>(y);
+  auto *mean_data = dev_ctx.template Alloc<U>(mean);
+  auto *var_data = dev_ctx.template Alloc<U>(var);
+
+  auto *void_scale_data = (scale == nullptr ? nullptr : scale->data());
+  auto *void_bias_data = (bias == nullptr ? nullptr : bias->data());
+
+  auto x_dtype = x.dtype();
+  phi::DataType scale_bias_dtype;
+  if (void_scale_data != nullptr) {
+    scale_bias_dtype = scale->dtype();
+    if (void_bias_data != nullptr) {
+      PADDLE_ENFORCE_EQ(
+          scale->dtype(),
+          bias->dtype(),
+          phi::errors::InvalidArgument("The Scale and Bias of layer_norm op "
+                                       "should have the same data type."));
+    }
+  } else {
+    scale_bias_dtype = (void_bias_data != nullptr ?
bias->dtype() : x_dtype); + } + + bool is_scale_bias_same_dtype_with_x = x_dtype == scale_bias_dtype; + if (!is_scale_bias_same_dtype_with_x) { + PADDLE_ENFORCE_EQ(scale_bias_dtype, + paddle::experimental::CppTypeToDataType::Type(), + phi::errors::InvalidArgument( + "Unsupported data type of Scale and Bias")); + } + + auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + int64_t batch_size = static_cast(matrix_dim[0]); + int64_t feature_size = static_cast(matrix_dim[1]); + + auto stream = dev_ctx.stream(); + +#define PADDLE_LAUNCH_LAYERNORM_FWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ + do { \ + switch (paddle::operators::GetDesiredBlockDim(feature_size)) { \ + FIXED_BLOCK_DIM_CASE(paddle::operators::LayerNormForward< \ + T, \ + U, \ + kBlockDim, \ + IsScaleBiasSameDTypeWithX><<>>( \ + x_data, \ + static_cast(void_scale_data), \ + static_cast(void_bias_data), \ + y_data, \ + mean_data, \ + var_data, \ + epsilon, \ + feature_size)); \ + default: \ + PADDLE_THROW(phi::errors::InvalidArgument( \ + "Product from begin_norm_axis to end must be larger than 1")); \ + break; \ + } \ + } while (0) + +#ifdef PADDLE_WITH_CUDA + bool can_call_1024_kernel = false; + if (feature_size == 1024 && scale != nullptr && bias != nullptr) { + can_call_1024_kernel = true; + } + if (can_call_1024_kernel) { + const int WARPS_M = 4; + const int WARPS_N = 1; + const int THREADS_PER_WARP = 32; + const int BYTES_PER_LDG = 16; + const int VecSize = BYTES_PER_LDG / sizeof(T); + + const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; + const int ROWS_PER_CTA = WARPS_M; + + const int grid = static_cast( + std::ceil(batch_size / static_cast(ROWS_PER_CTA))); + if (is_scale_bias_same_dtype_with_x) { + paddle::operators::ln_fwd_1024_kernel< + T, + U, + T, + VecSize, + WARPS_M, + WARPS_N, + BYTES_PER_LDG><<>>( + batch_size, + feature_size, + epsilon, + x_data, + static_cast(void_scale_data), + static_cast(void_bias_data), + mean_data, + var_data, + y_data); + } else { + paddle::operators::ln_fwd_1024_kernel< + T, + U, + U, + VecSize, + WARPS_M, + WARPS_N, + BYTES_PER_LDG><<>>( + batch_size, + feature_size, + epsilon, + x_data, + static_cast(void_scale_data), + static_cast(void_bias_data), + mean_data, + var_data, + y_data); + } + } else { +#endif + if (is_scale_bias_same_dtype_with_x) { + PADDLE_LAUNCH_LAYERNORM_FWD(T, true); + } else { + PADDLE_LAUNCH_LAYERNORM_FWD(U, false); + } +#ifdef PADDLE_WITH_CUDA + } +#endif + +#undef PADDLE_LAUNCH_LAYERNORM_FWD +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL(layer_norm, + GPU, + ALL_LAYOUT, + phi::LayerNormKernel, + float, + phi::dtype::float16) {} +#elif CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(layer_norm, + GPU, + ALL_LAYOUT, + phi::LayerNormKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +PD_REGISTER_KERNEL(layer_norm, + GPU, + ALL_LAYOUT, + phi::LayerNormKernel, + float, + double, + phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/gpu/masked_select_kernel.cu b/paddle/phi/kernels/gpu/masked_select_kernel.cu index fc4adca2f42438f464346ad83bc7e49448826bb2..b443ae6b8fb5e6c3bf5264a50d25205a419f22ad 100644 --- a/paddle/phi/kernels/gpu/masked_select_kernel.cu +++ b/paddle/phi/kernels/gpu/masked_select_kernel.cu @@ -19,34 +19,27 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/select_impl.cu.h" #include "paddle/phi/kernels/masked_select_kernel.h" namespace phi { 
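// For reference: the MaskedSelectFunctor introduced below compacts, in order,
// the elements whose mask is true. A minimal host-side sketch of the same
// semantics (the helper name is illustrative, not part of the patch):
template <typename T>
std::vector<T> MaskedSelectRef(const std::vector<bool>& mask,
                               const std::vector<T>& value) {
  std::vector<T> out;
  out.reserve(value.size());
  for (size_t i = 0; i < mask.size(); ++i) {
    if (mask[i]) out.push_back(value[i]);  // keep only elements with a true mask
  }
  return out;
}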
-__global__ void SetMaskArray(const bool* mask, int32_t* mask_array, int size) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - for (; idx < size; idx += blockDim.x * gridDim.x) { - if (mask[idx]) - mask_array[idx] = 1; - else - mask_array[idx] = 0; - } -} +template +struct MaskedSelectFunctor { + HOSTDEVICE MaskedSelectFunctor() {} -template -__global__ void SelectWithPrefixMask(const int32_t* mask_prefix_sum, - const bool* mask, - const T* input, - T* out, - int size) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - for (; idx < size; idx += blockDim.x * gridDim.x) { - if (mask[idx]) { - int index = mask_prefix_sum[idx]; - out[index] = input[idx]; + HOSTDEVICE inline void operator()(OutT* out, + const MT* mask, + const InT* value, + int num) { + int store_fix = 0; + for (int idx = 0; idx < num; idx++) { + if (mask[idx]) { + out[store_fix++] = value[idx]; + } } } -} +}; template void MaskedSelectKernel(const Context& dev_ctx, @@ -68,42 +61,9 @@ void MaskedSelectKernel(const Context& dev_ctx, "value.", input_dim, mask_dim)); - - thrust::device_ptr mask_dev_ptr = - thrust::device_pointer_cast(mask_data); - thrust::device_vector mask_vec(mask_dev_ptr, mask_dev_ptr + mask_size); - auto out_size = thrust::count(mask_vec.begin(), mask_vec.end(), true); - - DDim out_dim{out_size}; - out->Resize(out_dim); - auto out_data = out->mutable_data(dev_ctx.GetPlace()); - - DenseTensor mask_array; - DenseTensor mask_prefix_sum; - mask_array.Resize(mask_dim); - mask_prefix_sum.Resize(mask_dim); - - int32_t* mask_array_data = - mask_array.mutable_data(dev_ctx.GetPlace()); - int32_t* mask_prefix_sum_data = - mask_prefix_sum.mutable_data(dev_ctx.GetPlace()); - int threads = 512; - int grid = (mask_size + threads - 1) / threads; - auto stream = dev_ctx.stream(); - SetMaskArray<<>>( - mask_data, mask_array_data, mask_size); - - thrust::device_ptr mask_array_dev_ptr = - thrust::device_pointer_cast(mask_array_data); - thrust::device_vector mask_array_vec(mask_array_dev_ptr, - mask_array_dev_ptr + mask_size); - thrust::exclusive_scan(thrust::device, - mask_array_vec.begin(), - mask_array_vec.end(), - mask_prefix_sum_data); - - SelectWithPrefixMask<<>>( - mask_prefix_sum_data, mask_data, input_data, out_data, mask_size); + using Functor = MaskedSelectFunctor; + phi::funcs::SelectKernel( + dev_ctx, mask, x, out, Functor()); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/math_kernel.cu b/paddle/phi/kernels/gpu/math_kernel.cu deleted file mode 100644 index d33f216468220da7ef9fc09533226e8fdd0c702f..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/gpu/math_kernel.cu +++ /dev/null @@ -1,125 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/phi/kernels/math_kernel.h" - -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/kernels/funcs/broadcast_function.h" -#include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/phi/kernels/gpu/reduce.h" - -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif - -#include "paddle/phi/common/complex.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/core/compat/convert_utils.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace phi { - -#define DEFINE_CUDA_ELEMENTWISE_OP(name) \ - template \ - void name##RawKernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ - std::vector inputs; \ - std::vector outputs; \ - inputs.emplace_back(&x); \ - inputs.emplace_back(&y); \ - outputs.emplace_back(out); \ - dev_ctx.template Alloc(out); \ - funcs::BroadcastKernel( \ - dev_ctx, inputs, &outputs, axis, funcs::name##Functor()); \ - } - -/** - * Kernels - */ - -// Create the definition of Add -DEFINE_CUDA_ELEMENTWISE_OP(Add) -// Create the definition of Subtract -DEFINE_CUDA_ELEMENTWISE_OP(Subtract) -// Create the definition of Multiply -DEFINE_CUDA_ELEMENTWISE_OP(Multiply) -// Create the definition of Divide -DEFINE_CUDA_ELEMENTWISE_OP(Divide) - -} // namespace phi - -using float16 = phi::dtype::float16; -using bfloat16 = phi::dtype::bfloat16; -using complex64 = ::phi::dtype::complex; -using complex128 = ::phi::dtype::complex; - -PD_REGISTER_KERNEL(add_raw, - GPU, - ALL_LAYOUT, - phi::AddRawKernel, - float, - double, - int16_t, - int, - int64_t, - float16, - bfloat16, - complex64, - complex128) {} -PD_REGISTER_KERNEL(subtract_raw, - GPU, - ALL_LAYOUT, - phi::SubtractRawKernel, - float, - double, - int16_t, - int, - int64_t, - float16, - bfloat16, - complex64, - complex128) {} -PD_REGISTER_KERNEL(divide_raw, - GPU, - ALL_LAYOUT, - phi::DivideRawKernel, - float, - double, - int, - int64_t, - float16, - bfloat16, - complex64, - complex128) {} -PD_REGISTER_KERNEL(multiply_raw, - GPU, - ALL_LAYOUT, - phi::MultiplyRawKernel, - float, - double, - int, - int64_t, - bool, - float16, - complex64, - complex128, - bfloat16) {} diff --git a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu index 7796132ec07f433d8495d1dba197c06d536e1338..66ba30f7ce6945693a974733c77a47f0d328e50b 100644 --- a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu +++ b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu @@ -23,11 +23,11 @@ #include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/abs_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/compare_functors.h" #include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h" -#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/reduce_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/mode_grad_kernel.cu b/paddle/phi/kernels/gpu/mode_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..43502621c2d3a878a144de1878aa09b8d64b6a47 --- /dev/null +++ b/paddle/phi/kernels/gpu/mode_grad_kernel.cu @@ -0,0 +1,85 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/mode_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/mode.h"
+
+namespace phi {
+
+template <typename T>
+__global__ void AssignGradWithAxis(const T* grad_out,
+                                   const int64_t* indices,
+                                   T* grad_in,
+                                   int pre,
+                                   int post,
+                                   int raw_height,
+                                   int k) {
+  // raw_height is the length of the topk axis
+  for (int i = blockIdx.x; i < pre; i += gridDim.x) {
+    int base_index = i * post * k;
+    int base_grad = i * post * raw_height;
+    for (int j = threadIdx.x; j < raw_height * post; j += blockDim.x) {
+      grad_in[base_grad + j] = static_cast<T>(0);
+    }
+    __syncthreads();
+    for (int j = threadIdx.x; j < k * post; j += blockDim.x) {
+      int64_t idx_ij = indices[base_index + j];
+      int64_t in_ij = base_grad + (idx_ij * post) + (j % post);
+      grad_in[in_ij] = grad_out[base_index + j];
+    }
+  }
+}
+
+template <typename T, typename Context>
+void ModeGradKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& indices,
+                    const DenseTensor& out_grad,
+                    int axis,
+                    bool keepdim,
+                    DenseTensor* x_grad) {
+  const auto& in_dims = x.dims();
+  auto out_dims = indices.dims();
+
+  if (axis < 0) axis += in_dims.size();
+  // allocate the cuda memory for the x_grad
+  T* x_grad_data = dev_ctx.template Alloc<T>(x_grad);
+  const T* out_grad_data = out_grad.data<T>();
+  const int64_t* indices_data = indices.data<int64_t>();
+
+  int pre, n, post;
+  funcs::GetDims(in_dims, axis, &pre, &n, &post);
+
+  // calculate the block and grid num
+  int block_size = funcs::ComputeBlockSize(post);
+  int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+  const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
+  int grid_size = std::min(max_blocks, pre);
+  AssignGradWithAxis<T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
+      out_grad_data, indices_data, x_grad_data, pre, post, n, 1);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(mode_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ModeGradKernel,
+                   float,
+                   double,
+                   int32_t,
+                   int64_t) {}
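As a cross-check for the scatter in AssignGradWithAxis above, here is a minimal host-side reference for the k = 1 mode case, assuming a flattened [pre, raw_height, post] layout (names are illustrative, not part of the patch):

#include <cstdint>
#include <vector>

// Zero the full input gradient, then route each out_grad value back to the
// slot along the mode axis that its recorded index names.
void AssignGradRef(const std::vector<float>& grad_out,   // [pre, 1, post]
                   const std::vector<int64_t>& indices,  // [pre, 1, post]
                   std::vector<float>* grad_in,          // [pre, raw_height, post]
                   int pre, int post, int raw_height) {
  grad_in->assign(static_cast<size_t>(pre) * raw_height * post, 0.0f);
  for (int i = 0; i < pre; ++i) {
    for (int j = 0; j < post; ++j) {
      int64_t idx = indices[i * post + j];  // position along the mode axis
      (*grad_in)[(static_cast<int64_t>(i) * raw_height + idx) * post + j] =
          grad_out[i * post + j];
    }
  }
}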
diff --git a/paddle/phi/kernels/gpu/mode_kernel.cu b/paddle/phi/kernels/gpu/mode_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..629b9722cd6bcfe12d0fb5a7e8be6439f5ea286f
--- /dev/null
+++ b/paddle/phi/kernels/gpu/mode_kernel.cu
@@ -0,0 +1,119 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/mode_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/mode.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ModeKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                int axis,
+                bool keepdim,
+                DenseTensor* out,
+                DenseTensor* indices) {
+  // get the input dims
+  const auto& in_dims = x.dims();
+  // calculate the real axis
+  if (axis < 0) axis += in_dims.size();
+
+  auto out_dims = out->dims();
+
+  const T* input_data = x.data<T>();
+  T* output_data = dev_ctx.template Alloc<T>(out);
+  int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);
+
+  if (axis == in_dims.size() - 1) {
+    const int64_t& input_height =
+        phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
+    const int64_t& input_width = in_dims[in_dims.size() - 1];
+    funcs::GetModebySort<T>(
+        dev_ctx, &x, input_width, input_height, output_data, indices_data);
+  } else {
+    std::vector<int> trans_axis;
+    for (int i = 0; i < axis; i++) {
+      trans_axis.emplace_back(i);
+    }
+    trans_axis.emplace_back(in_dims.size() - 1);
+    for (int i = axis + 1; i < in_dims.size() - 1; i++) {
+      trans_axis.emplace_back(i);
+    }
+    trans_axis.emplace_back(axis);
+
+    if (!keepdim) {
+      std::vector<int64_t> tmp_out_shape;
+      for (int i = 0; i < axis; i++) {
+        tmp_out_shape.emplace_back(in_dims[i]);
+      }
+      tmp_out_shape.emplace_back(1);
+      for (int i = axis + 1; i < in_dims.size(); i++) {
+        tmp_out_shape.emplace_back(in_dims[i]);
+      }
+      DDim tmp_out_dim = phi::make_ddim(tmp_out_shape);
+      out->Resize(tmp_out_dim);
+      indices->Resize(tmp_out_dim);
+    }
+
+    DDim trans_shape(in_dims);
+    DDim trans_out_shape(in_dims);
+    for (int i = 0; i < trans_axis.size(); i++) {
+      trans_shape[i] = in_dims[trans_axis[i]];
+      trans_out_shape[i] = in_dims[trans_axis[i]];
+    }
+    trans_out_shape[in_dims.size() - 1] = 1;
+
+    // second step, transpose the input
+    DenseTensor trans_input;
+    trans_input.Resize(trans_shape);
+    dev_ctx.template Alloc<T>(&trans_input);
+
+    int ndims = trans_axis.size();
+    funcs::TransCompute<phi::GPUContext, T>(
+        ndims, dev_ctx, x, &trans_input, trans_axis);
+    DenseTensor trans_ind;
+    trans_ind.Resize(trans_out_shape);
+    int64_t* trans_ind_data = dev_ctx.template Alloc<int64_t>(&trans_ind);
+
+    DenseTensor trans_out;
+    trans_out.Resize(trans_out_shape);
+    T* trans_out_data = dev_ctx.template Alloc<T>(&trans_out);
+
+    const int64_t input_height =
+        phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1));
+    const int64_t input_width = trans_shape[trans_shape.size() - 1];
+    funcs::GetModebySort<T>(dev_ctx,
+                            &trans_input,
+                            input_width,
+                            input_height,
+                            trans_out_data,
+                            trans_ind_data);
+    // last step, transpose back the indices and output
+    funcs::TransCompute<phi::GPUContext, int64_t>(
+        ndims, dev_ctx, trans_ind, indices, trans_axis);
+    funcs::TransCompute<phi::GPUContext, T>(
+        ndims, dev_ctx, trans_out, out, trans_axis);
+    if (!keepdim) {
+      out->Resize(out_dims);
+      indices->Resize(out_dims);
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    mode, GPU, ALL_LAYOUT, phi::ModeKernel, float, double, int32_t, int64_t) {}
diff --git a/paddle/phi/kernels/gpu/roi_align_kernel.cu b/paddle/phi/kernels/gpu/roi_align_kernel.cu
index cd4ed29cdd1dd7b48a9135597ca79ab401a0cfba..cb3375dee95a5992fd598fdc8ba4f5e176f357a2 100644
--- a/paddle/phi/kernels/gpu/roi_align_kernel.cu
+++ b/paddle/phi/kernels/gpu/roi_align_kernel.cu
@@ -18,7 +18,6 @@
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/empty_kernel.h"
 
 #include
"paddle/fluid/memory/memory.h" diff --git a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..d093a71d23f4ea96f9d7e7de11dcfefade3788ee --- /dev/null +++ b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu @@ -0,0 +1,165 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roi_pool_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" + +namespace phi { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__global__ void GPURoiPoolBackward(const int nthreads, + const T* input_rois, + const T* output_grad, + const int64_t* arg_max_data, + const int num_rois, + const float spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + int* box_batch_id_data, + T* input_grad) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % channels; + int n = i / pooled_width / pooled_height / channels; + + int roi_batch_ind = box_batch_id_data[n]; + int input_offset = (roi_batch_ind * channels + c) * height * width; + int output_offset = (n * channels + c) * pooled_height * pooled_width; + const T* offset_output_grad = output_grad + output_offset; + T* offset_input_grad = input_grad + input_offset; + const int64_t* offset_arg_max_data = arg_max_data + output_offset; + + int arg_max = offset_arg_max_data[ph * pooled_width + pw]; + if (arg_max != -1) { + paddle::platform::CudaAtomicAdd( + offset_input_grad + arg_max, + static_cast(offset_output_grad[ph * pooled_width + pw])); + } + } +} + +template +void RoiPoolGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + const DenseTensor& arg_max, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + DenseTensor* dx) { + auto x_dims = x.dims(); + int channels = x_dims[1]; + int height = x_dims[2]; + int width = x_dims[3]; + int rois_num = boxes.dims()[0]; + + if (dx) { + DenseTensor box_batch_id_list; + box_batch_id_list.Resize({rois_num}); + int* box_batch_id_data = + dev_ctx.template HostAlloc(&box_batch_id_list); + + auto gplace = dev_ctx.GetPlace(); + if (boxes_num) 
{ + int boxes_batch_size = boxes_num->numel(); + std::vector boxes_num_list(boxes_batch_size); + paddle::memory::Copy(phi::CPUPlace(), + boxes_num_list.data(), + gplace, + boxes_num->data(), + sizeof(int) * boxes_batch_size, + 0); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_list[n]; ++i) { + box_batch_id_data[i] = n; + } + start += boxes_num_list[n]; + } + } else { + auto boxes_lod = boxes.lod().back(); + int boxes_batch_size = boxes_lod.size() - 1; + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + box_batch_id_data[i] = n; + } + } + } + int bytes = box_batch_id_list.numel() * sizeof(int); + auto roi_ptr = paddle::memory::Alloc(dev_ctx, bytes); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + paddle::memory::Copy(gplace, + roi_id_data, + phi::CPUPlace(), + box_batch_id_data, + bytes, + dev_ctx.stream()); + + dev_ctx.template Alloc(dx); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, dx, static_cast(0)); + + int output_grad_size = out_grad.numel(); + int blocks = NumBlocks(output_grad_size); + int threads = kNumCUDAThreads; + + if (output_grad_size > 0) { + GPURoiPoolBackward<<>>( + output_grad_size, + boxes.data(), + out_grad.data(), + arg_max.data(), + rois_num, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + roi_id_data, + dx->data()); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + roi_pool_grad, GPU, ALL_LAYOUT, phi::RoiPoolGradKernel, float, double) { + kernel->InputAt(3).SetDataType(phi::DataType::INT64); +} diff --git a/paddle/phi/kernels/gpu/roi_pool_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..ab33e2cf64751f1cd5be44fc6f759acffd2fb93d --- /dev/null +++ b/paddle/phi/kernels/gpu/roi_pool_kernel.cu @@ -0,0 +1,220 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/roi_pool_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/memory/memory.h" + +namespace phi { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__global__ void GPURoiPoolForward(const int nthreads, + const T* input_data, + const T* input_rois, + const float spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + int* box_batch_id_data, + T* output_data, + int64_t* arg_max_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % channels; + int n = i / pooled_width / pooled_height / channels; + + const T* offset_input_rois = input_rois + n * kROISize; + int box_batch_ind = box_batch_id_data[n]; + int box_start_w = round(offset_input_rois[0] * spatial_scale); + int box_start_h = round(offset_input_rois[1] * spatial_scale); + int box_end_w = round(offset_input_rois[2] * spatial_scale); + int box_end_h = round(offset_input_rois[3] * spatial_scale); + + int box_width = max(box_end_w - box_start_w + 1, 1); + int box_height = max(box_end_h - box_start_h + 1, 1); + + int hstart = static_cast(floor(static_cast(ph) * + static_cast(box_height) / + static_cast(pooled_height))); + int wstart = static_cast(floor(static_cast(pw) * + static_cast(box_width) / + static_cast(pooled_width))); + int hend = static_cast(ceil(static_cast(ph + 1) * + static_cast(box_height) / + static_cast(pooled_height))); + int wend = static_cast(ceil(static_cast(pw + 1) * + static_cast(box_width) / + static_cast(pooled_width))); + hstart = min(max(hstart + box_start_h, 0), height); + hend = min(max(hend + box_start_h, 0), height); + wstart = min(max(wstart + box_start_w, 0), width); + wend = min(max(wend + box_start_w, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + T maxval = is_empty ? 
0 : -std::numeric_limits::max(); + int maxidx = -1; + const T* offset_input_data = + input_data + (box_batch_ind * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int input_data_index = h * width + w; + if (offset_input_data[input_data_index] > maxval) { + maxval = offset_input_data[input_data_index]; + maxidx = input_data_index; + } + } + } + output_data[i] = maxval; + if (arg_max_data) { + arg_max_data[i] = maxidx; + } + } +} + +template +void RoiPoolKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + DenseTensor* out, + DenseTensor* arg_max) { + auto x_dims = x.dims(); + int batch_size = x_dims[0]; + auto in_stride = phi::stride(x_dims); + int channels = x_dims[1]; + int height = x_dims[2]; + int width = x_dims[3]; + + int rois_num = boxes.dims()[0]; + + if (rois_num == 0) return; + + int output_size = out->numel(); + int blocks = NumBlocks(output_size); + int threads = kNumCUDAThreads; + + DenseTensor box_batch_id_list; + box_batch_id_list.Resize({rois_num}); + int* box_batch_id_data = dev_ctx.template HostAlloc(&box_batch_id_list); + auto gplace = dev_ctx.GetPlace(); + + if (boxes_num) { + int boxes_batch_size = boxes_num->numel(); + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + phi::errors::InvalidArgument( + "The batch size of input(ROIs) and input(X) must be the same but " + "received batch size of input(ROIs) and input(X) is %d and %d " + "respectively.", + boxes_batch_size, + batch_size)); + std::vector boxes_num_list(boxes_batch_size); + paddle::memory::Copy(phi::CPUPlace(), + boxes_num_list.data(), + gplace, + boxes_num->data(), + sizeof(int) * boxes_batch_size, + 0); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_list[n]; ++i) { + box_batch_id_data[i] = n; + } + start += boxes_num_list[n]; + } + } else { + auto boxes_lod = boxes.lod().back(); + int boxes_batch_size = boxes_lod.size() - 1; + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + phi::errors::InvalidArgument( + "The batch size of input(ROIs) and input(X) must be the same but " + "received batch size of input(ROIs) and input(X) is %d and %d " + "respectively.", + boxes_batch_size, + batch_size)); + + int boxes_num_with_lod = boxes_lod[boxes_batch_size]; + PADDLE_ENFORCE_EQ(rois_num, + boxes_num_with_lod, + phi::errors::InvalidArgument( + "The number of rois from input(ROIs) and its LOD " + "must be the same. 
Received %d rois from input(ROIs), "
+                          "but %d rois from its LOD.",
+                          rois_num,
+                          boxes_num_with_lod));
+    for (int n = 0; n < boxes_batch_size; ++n) {
+      for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) {
+        box_batch_id_data[i] = n;
+      }
+    }
+  }
+
+  int bytes = box_batch_id_list.numel() * sizeof(int);
+  auto box_ptr = paddle::memory::Alloc(dev_ctx, bytes);
+  int* box_id_data = reinterpret_cast<int*>(box_ptr->ptr());
+  paddle::memory::Copy(gplace,
+                       box_id_data,
+                       phi::CPUPlace(),
+                       box_batch_id_data,
+                       bytes,
+                       dev_ctx.stream());
+
+  T* output_data = dev_ctx.template Alloc<T>(out);
+  int64_t* arg_max_data = dev_ctx.template Alloc<int64_t>(arg_max);
+
+  GPURoiPoolForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
+      output_size,
+      x.data<T>(),
+      boxes.data<T>(),
+      spatial_scale,
+      channels,
+      height,
+      width,
+      pooled_height,
+      pooled_width,
+      box_id_data,
+      output_data,
+      arg_max_data);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    roi_pool, GPU, ALL_LAYOUT, phi::RoiPoolKernel, float, double) {
+  kernel->OutputAt(1).SetDataType(phi::DataType::INT64);
+}
diff --git a/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu
index d9618dc159a6d3f5b24bdfcfdb219ec649e051f9..9d1769e18b4b809fbc353513a05553e0ccd97572 100644
--- a/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu
@@ -24,4 +24,6 @@ PD_REGISTER_KERNEL(segment_pool_grad,
                    ALL_LAYOUT,
                    phi::SegmentPoolGradKernel,
                    float,
-                   double) {}
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/gpu/segment_pool_kernel.cu b/paddle/phi/kernels/gpu/segment_pool_kernel.cu
index c38e935adf837ef00c48fa31bc1e37eea2948673..3128e534166acba6ca136331ad8efea66b18621f 100644
--- a/paddle/phi/kernels/gpu/segment_pool_kernel.cu
+++ b/paddle/phi/kernels/gpu/segment_pool_kernel.cu
@@ -19,5 +19,11 @@
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 
-PD_REGISTER_KERNEL(
-    segment_pool, GPU, ALL_LAYOUT, phi::SegmentPoolKernel, float, double) {}
+PD_REGISTER_KERNEL(segment_pool,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SegmentPoolKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu
index bb04e7ee8515bb6320860e4fd20366995d26c991..f27b32ca7b8319440b62f0d03d21129133c8470c 100644
--- a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu
+++ b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu
@@ -33,27 +33,23 @@ struct GPUTruncatedNormal {
   T mean, std;
   T a_normal_cdf;
   T b_normal_cdf;
-  unsigned int seed;
   T numeric_min;
 
   __host__ __device__ GPUTruncatedNormal(T mean, T std, T numeric_min, int seed)
       : mean(mean), std(std), seed(seed), numeric_min(numeric_min) {
-    auto normal_cdf = [](float x) {
-      return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0;
-    };
-    a_normal_cdf = normal_cdf((-2.0 - mean) / std);
-    b_normal_cdf = normal_cdf((2.0 - mean) / std);
+    a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0;
+    b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0;
   }
 
   __host__ __device__ T operator()(const unsigned int n) const {
     thrust::minstd_rand rng;
     rng.seed(seed);
-    thrust::uniform_real_distribution<T> dist(2.0 * a_normal_cdf - 1.0,
-                                              2.0 * b_normal_cdf - 1.0);
+    thrust::uniform_real_distribution<T> dist(numeric_min, 1);
     rng.discard(n);
     T value = dist(rng);
-    return std::sqrt(2.0) * erfinvf(value) * std + mean;
+    auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
+    return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean;
   }
 };
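The rewritten functor above draws u uniformly, squeezes it into the CDF window [Phi(-2), Phi(2)] via p = a + (b - a) * u, and applies the normal quantile sqrt(2) * erfinv(2p - 1) * std + mean. A host-side sketch of that transform, with a Newton-iteration stand-in for CUDA's erfinvf (names are illustrative, not part of the patch):

#include <cmath>

// Newton iteration for the inverse of erf; adequate for the moderate |y|
// values produced by the +/-2 sigma truncation window used above.
double ErfInv(double y) {
  const double kSqrtPi = 1.772453850905516;
  double x = 0.0;  // erf(0) = 0 is a safe starting point
  for (int i = 0; i < 50; ++i) {
    x -= (std::erf(x) - y) * kSqrtPi / 2.0 * std::exp(x * x);
  }
  return x;
}

// Map a uniform sample u in (0, 1) to a normal sample truncated to
// (mean - 2 * stddev, mean + 2 * stddev), mirroring the functor above.
double TruncatedNormalSample(double mean, double stddev, double u) {
  const double a_cdf = (1.0 + std::erf(-2.0 / std::sqrt(2.0))) / 2.0;  // Phi(-2)
  const double b_cdf = (1.0 + std::erf(2.0 / std::sqrt(2.0))) / 2.0;   // Phi(+2)
  double p = a_cdf + (b_cdf - a_cdf) * u;  // squeeze u into (Phi(-2), Phi(+2))
  return std::sqrt(2.0) * ErfInv(2.0 * p - 1.0) * stddev + mean;  // quantile
}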
@@ -73,21 +69,18 @@ struct TruncatedNormalOffset {
       seed(seed),
       numeric_min(numeric_min),
       offset_(offset) {
-    auto normal_cdf = [](float x) {
-      return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0;
-    };
-    a_normal_cdf = normal_cdf((-2.0 - mean) / std);
-    b_normal_cdf = normal_cdf((2.0 - mean) / std);
+    a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0;
+    b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0;
   }
 
   __host__ __device__ T operator()(const unsigned int n) const {
     thrust::minstd_rand rng;
     rng.seed(seed);
-    thrust::uniform_real_distribution<T> dist(2.0 * a_normal_cdf - 1.0,
-                                              2.0 * b_normal_cdf - 1.0);
+    thrust::uniform_real_distribution<T> dist(numeric_min, 1);
     rng.discard(n + offset_);
     T value = dist(rng);
-    return std::sqrt(2.0) * erfinvf(value) * std + mean;
+    auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
+    return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean;
   }
 };
diff --git a/paddle/phi/kernels/gpu/where_index_kernel.cu b/paddle/phi/kernels/gpu/where_index_kernel.cu
index 535cb812a20ea90bdb3f07b731af52c2822f0ec2..9538533f70d597e21b393d2650d56bebd823c360 100644
--- a/paddle/phi/kernels/gpu/where_index_kernel.cu
+++ b/paddle/phi/kernels/gpu/where_index_kernel.cu
@@ -20,150 +20,59 @@ namespace cub = hipcub;
 #endif
 
-#include "paddle/phi/kernels/where_index_kernel.h"
-
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/select_impl.cu.h"
+#include "paddle/phi/kernels/where_index_kernel.h"
 
 namespace phi {
-
-template <typename T>
-__global__ void GetTrueNum(const T *cond_data,
-                           const int64_t numel,
-                           int64_t *true_num_array) {
-  const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-  for (int64_t idx = tid; idx < numel; idx += gridDim.x * blockDim.x) {
-    true_num_array[idx] =
-        static_cast<int64_t>(static_cast<bool>(cond_data[idx]));
+template <typename T1, typename T2, typename OutT>
+struct IndexFunctor {
+  T2 stride[phi::DDim::kMaxRank];
+  int dims;
+  explicit IndexFunctor(const phi::DDim &in_dims) {
+    dims = in_dims.size();
+    std::vector<T2> strides_in_tmp;
+    strides_in_tmp.resize(dims, 1);
+    // get strides according to in_dims
+    for (T2 i = 1; i < dims; i++) {
+      strides_in_tmp[i] = strides_in_tmp[i - 1] * in_dims[dims - i];
+    }
+    memcpy(stride, strides_in_tmp.data(), dims * sizeof(T2));
  }
-}
-
-template <typename T>
-__global__ void SetTrueIndex(int64_t *out_ptr,
-                             const T *cond_data,
-                             const int64_t numel,
-                             const int64_t *stride_array,
-                             const int64_t rank,
-                             const int64_t *true_num_array) {
-  const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-  for (int64_t idx = tid; idx < numel; idx += gridDim.x * blockDim.x) {
-    // true_num_array is calculated by cub::InclusiveSum,
-    // cause the first element of true_num_array is 1,
-    // so we need substract 1 to get true index.
- const int64_t true_index = true_num_array[idx] - 1; - if (static_cast(cond_data[idx])) { - int64_t rank_index = idx; - for (int j = 0; j < rank; j++) { - const int64_t out_index = rank_index / stride_array[j]; - out_ptr[true_index * rank + j] = out_index; - rank_index -= out_index * stride_array[j]; + HOSTDEVICE inline void operator()(OutT *out, + const T1 *mask, + const T2 *index, + const int num) { + int store_fix = 0; + for (int idx = 0; idx < num; idx++) { + if (mask[idx]) { + T2 data_index = index[idx]; + // get index + for (int rank_id = dims - 1; rank_id >= 0; --rank_id) { + out[store_fix] = static_cast(data_index / stride[rank_id]); + data_index = data_index % stride[rank_id]; + store_fix++; + } } } } -} +}; template void WhereIndexKernel(const Context &dev_ctx, const DenseTensor &condition, DenseTensor *out) { - const T *cond_data = condition.data(); - const int64_t numel = condition.numel(); + DenseTensor in_data; auto dims = condition.dims(); - const int rank = dims.size(); - - auto d_array_mem = - paddle::memory::Alloc(dev_ctx, (numel + rank) * sizeof(int64_t)); - auto h_array_mem = - paddle::memory::Alloc(phi::CPUPlace(), (rank + 1) * sizeof(int64_t)); - - // "stride_array" is an array and len(stride_array)==rank, - // each element is the stride of each dimension -- the length from i to i+1. - int64_t *h_stride_array = reinterpret_cast(h_array_mem->ptr()); - int64_t *d_stride_array = reinterpret_cast(d_array_mem->ptr()); - - // "true_num_array" is an array and len(stride_array)==numel, - // at the beginning, - // "true_num_array" will set 1 if condition[i] == true else 0, - // then it will be calculated by cub::InclusiveSum, - // so that we can get the true number before i as the out index - int64_t *d_true_num_array = d_stride_array + rank; - - // the total_true_num is the total number of condition[i] == true - int64_t *h_total_true_num = h_stride_array + rank; - - // alloce cub memory - size_t cub_size = 0; - cub::DeviceScan::InclusiveSum(nullptr, - cub_size, - d_true_num_array, - d_true_num_array, - numel, - dev_ctx.stream()); - auto cub_mem = paddle::memory::Alloc(dev_ctx, cub_size * sizeof(int64_t)); - void *cub_data = cub_mem->ptr(); - - // set d_true_num_array[i]=1 if cond_data[i]==true else 0 - const int threads = std::min(numel, static_cast(128)); - const int64_t need_grids = (numel + threads - 1) / threads; - const int grids = std::min(need_grids, static_cast(256)); - GetTrueNum<<>>( - cond_data, numel, d_true_num_array); - - // calculate the inclusive prefix sum of "true_num_array" - // to get the index of "out" tensor, - // and the total number of cond_data[i]==true. 
- // Example: - // condition: F T T F F F T T - // before: 0 1 1 0 0 0 1 1 - // after: 0 1 2 2 2 2 3 4 - // out: 1 2 6 7 - cub::DeviceScan::InclusiveSum(cub_data, - cub_size, - d_true_num_array, - d_true_num_array, - numel, - dev_ctx.stream()); - - // calculate each dimension's stride - h_stride_array[rank - 1] = 1; - for (int i = rank - 2; i >= 0; i--) { - h_stride_array[i] = h_stride_array[i + 1] * dims[i + 1]; - } - paddle::memory::Copy(dev_ctx.GetPlace(), - d_stride_array, - phi::CPUPlace(), - h_stride_array, - rank * sizeof(int64_t), - dev_ctx.stream()); - - // get total ture number and set output size - // the last element of cub::InclusiveSum is the total number - paddle::memory::Copy(phi::CPUPlace(), - h_total_true_num, - dev_ctx.GetPlace(), - d_true_num_array + numel - 1, - sizeof(int64_t), - dev_ctx.stream()); - dev_ctx.Wait(); - - int64_t true_num = *h_total_true_num; - out->Resize(phi::make_ddim({static_cast(true_num), rank})); - auto *out_data = dev_ctx.template Alloc(out); - - if (true_num == 0) { - return; - } - - // using true_num_array and stride_array to calculate the output index - SetTrueIndex<<>>( - out_data, cond_data, numel, d_stride_array, rank, d_true_num_array); + using Functor = IndexFunctor; + Functor index_functor = Functor(dims); + phi::funcs::SelectKernel( + dev_ctx, condition, in_data, out, index_functor); } - } // namespace phi PD_REGISTER_KERNEL(where_index, diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index 2b2dd5118969cf35c4762f3ab774ce41c04d2e4d..77159bfc876da603f703a13592f525d808adfbbf 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -121,17 +121,10 @@ struct ReduceMaxFunctor { }; template -struct ExpSubFunctor { - HOSTDEVICE inline ExpSubFunctor() { y = static_cast(0.0f); } - - HOSTDEVICE explicit inline ExpSubFunctor(Tx y) : y((Tx)(y)) {} - +struct ExpFunctor { HOSTDEVICE inline Ty operator()(const Tx& x) const { - return static_cast(std::exp(x - y)); + return static_cast(std::exp(x)); } - - private: - Tx y; }; template @@ -293,10 +286,14 @@ __global__ void WarpSoftmaxForward(T* softmax, } // data src - AccT srcdata[kBatchSize][kLoopsV][kVSize]; - T src_tmp[kBatchSize][kLoopsV][kVSize]; - kps::Init(&srcdata[0][0][0], kLowInf); - kps::Init(&src_tmp[0][0][0], -std::numeric_limits::infinity()); + // src_data: the raw data form global memory + // sub_data: store the data obtained by (src_data - max), used by log_softmax + // exp_data: store the data obtained by (exp(sub_data)), used by softmax + T src_data[kBatchSize][kLoopsV][kVSize]; + AccT sub_data[kBatchSize][kLoopsV][kVSize]; + AccT exp_data[kBatchSize][kLoopsV][kVSize]; + kps::Init(&sub_data[0][0][0], kLowInf); + kps::Init(&src_data[0][0][0], -std::numeric_limits::infinity()); // data dst T out_tmp[kBatchSize][kLoopsV][kVSize]; @@ -313,11 +310,11 @@ __global__ void WarpSoftmaxForward(T* softmax, for (int i = 0; i < kBatchSize; ++i) { const VecT* src_v = reinterpret_cast(&src[(first_batch + i) * stride]); - VecT* reg_v = reinterpret_cast(&src_tmp[i][0][0]); + VecT* reg_v = reinterpret_cast(&src_data[i][0][0]); kps::ReadData( ®_v[0], &src_v[0], idx_max_v[i], 0, kWarpSize, 1); kps::ElementwiseUnary>( - &srcdata[i][0][0], &src_tmp[i][0][0], DataTransFunctor()); + &sub_data[i][0][0], &src_data[i][0][0], DataTransFunctor()); } // compute max @@ -327,14 +324,16 @@ __global__ void WarpSoftmaxForward(T* softmax, 1, ReduceMaxFunctor, kMode::kLocalMode>( - &max[0], &srcdata[0][0][0], 
ReduceMaxFunctor(), true); + &max[0], &sub_data[0][0][0], ReduceMaxFunctor(), true); WarpReduceMax(max); // compute sum #pragma unroll for (int i = 0; i < kBatchSize; ++i) { - kps::ElementwiseUnary>( - &srcdata[i][0][0], &srcdata[i][0][0], ExpSubFunctor(max[i])); + kps::ElementwiseUnary>( + &sub_data[i][0][0], &sub_data[i][0][0], UnarySubFunctor(max[i])); + kps::ElementwiseUnary>( + &exp_data[i][0][0], &sub_data[i][0][0], ExpFunctor()); } kps::Reduce, kMode::kLocalMode>( - &sum[0], &srcdata[0][0][0], kps::AddFunctor(), true); + &sum[0], &exp_data[0][0][0], kps::AddFunctor(), true); WarpReduceSum(sum); // write data to global memory @@ -352,15 +351,13 @@ __global__ void WarpSoftmaxForward(T* softmax, reinterpret_cast(&softmax[(first_batch + i) * stride]); VecT* reg_v = reinterpret_cast(&out_tmp[i][0][0]); if (LogMode) { - kps::ElementwiseUnary>( - &srcdata[i][0][0], &srcdata[i][0][0], UnaryLogFunctor()); kps::ElementwiseUnary>( &out_tmp[i][0][0], - &srcdata[i][0][0], + &sub_data[i][0][0], UnarySubFunctor(std::log(sum[i]))); } else { kps::ElementwiseUnary>( - &out_tmp[i][0][0], &srcdata[i][0][0], UnaryDivFunctor(sum[i])); + &out_tmp[i][0][0], &exp_data[i][0][0], UnaryDivFunctor(sum[i])); } kps::WriteData( &softmax_v[0], ®_v[0], idx_max_v[i], 0, kWarpSize, 1); diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h index a95f49c0e7cfd32802f1d1899a1fe1590fdf6a87..7d6b6dc72ea60214ff4c9974b4ff885feecb5822 100644 --- a/paddle/phi/kernels/impl/activation_grad_impl.h +++ b/paddle/phi/kernels/impl/activation_grad_impl.h @@ -222,4 +222,57 @@ void EluDoubleGradKernel(const Context& dev_ctx, functor(dev_ctx, &x, &ddx, ddout, &dout, dx); } +template +void SigmoidDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + DenseTensor* dout_new, + DenseTensor* ddout) { + if (dout_new) { + dout_new->Resize(out.dims()); + dev_ctx.template Alloc(dout_new); + } + if (ddout) { + ddout->Resize(out.dims()); + dev_ctx.template Alloc(ddout); + } + funcs::SigmoidGradGradFunctor functor; + functor(dev_ctx, &out, &ddx, &dout, dout_new, ddout); +} + +template +void SigmoidTripleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + const DenseTensor& d_ddout, + const DenseTensor& d_dout_new, + DenseTensor* d_out_new, + DenseTensor* d_dout, + DenseTensor* d_ddx) { + if (d_dout) { + d_dout->Resize(out.dims()); + dev_ctx.template Alloc(d_dout); + } + if (d_out_new) { + d_dout->Resize(out.dims()); + dev_ctx.template Alloc(d_out_new); + } + if (d_ddx) { + d_dout->Resize(ddx.dims()); + dev_ctx.template Alloc(d_ddx); + } + funcs::SigmoidTripleGradFunctor functor; + functor(dev_ctx, + &out, + &ddx, + &dout, + &d_ddout, + &d_dout_new, + d_dout, + d_out_new, + d_ddx); +} + } // namespace phi diff --git a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h index 72741e6d3a01ae374c43a24ac519ff5106b5733e..e3ea10705d24e90a76246d439c6d9263e072bc39 100644 --- a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h @@ -19,6 +19,7 @@ #include "paddle/phi/kernels/cholesky_solve_kernel.h" #include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/expand_kernel.h" #include 
"paddle/phi/kernels/funcs/blas/blas.h" @@ -27,7 +28,6 @@ #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_reduce.h" #include "paddle/phi/kernels/funcs/tril_triu_compute.h" -#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h index 038ef0c214bc73b41fc3aff661e296207d615df1..e4356e9af39372cd330991502078a13520d05586 100644 --- a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h @@ -17,13 +17,13 @@ #include "paddle/phi/kernels/determinant_grad_kernel.h" #include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" -#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h index 5b71fd7fa3a5ecd1c864c155df2586d293d3d2e6..5e06435b28e2719c2e9fc18de034073f9674a977 100644 --- a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h @@ -16,11 +16,11 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" -#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" diff --git a/paddle/phi/kernels/layer_norm_grad_kernel.h b/paddle/phi/kernels/layer_norm_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c32be63db4178f92d9564f357c30bb28fb415516 --- /dev/null +++ b/paddle/phi/kernels/layer_norm_grad_kernel.h @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
diff --git a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h index 72741e6d3a01ae374c43a24ac519ff5106b5733e..e3ea10705d24e90a76246d439c6d9263e072bc39 100644 --- a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h @@ -19,6 +19,7 @@ #include "paddle/phi/kernels/cholesky_solve_kernel.h" #include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/expand_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -27,7 +28,6 @@ #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_reduce.h" #include "paddle/phi/kernels/funcs/tril_triu_compute.h" -#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h index 038ef0c214bc73b41fc3aff661e296207d615df1..e4356e9af39372cd330991502078a13520d05586 100644 --- a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h @@ -17,13 +17,13 @@ #include "paddle/phi/kernels/determinant_grad_kernel.h" #include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" -#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h index 5b71fd7fa3a5ecd1c864c155df2586d293d3d2e6..5e06435b28e2719c2e9fc18de034073f9674a977 100644 --- a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h @@ -16,11 +16,11 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" -#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" diff --git a/paddle/phi/kernels/layer_norm_grad_kernel.h b/paddle/phi/kernels/layer_norm_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c32be63db4178f92d9564f357c30bb28fb415516 --- /dev/null +++ b/paddle/phi/kernels/layer_norm_grad_kernel.h @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LayerNormGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& mean, + const DenseTensor& variance, + paddle::optional scale, + paddle::optional bias, + const DenseTensor& out_grad, + float epsilon, + int begin_norm_axis, + bool is_test, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/layer_norm_kernel.h b/paddle/phi/kernels/layer_norm_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c9679420bda5cf6beffb56b7ec319c1b80ac4eda --- /dev/null +++ b/paddle/phi/kernels/layer_norm_kernel.h @@ -0,0 +1,51 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/backends/gpu/gpu_decls.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LayerNormKernel(const Context& ctx, + const DenseTensor& x, + paddle::optional scale, + paddle::optional bias, + float epsilon, + int begin_norm_axis, + bool is_test, + DenseTensor* out, + DenseTensor* mean, + DenseTensor* variance); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +class LayerNormDirectCUDAFunctor { + public: + void operator()(gpuStream_t stream, + const T* input, + std::vector input_shape, + const T* bias, + const T* scale, + T* output, + T* mean, + T* variance, + int begin_norm_axis, + float eps); +}; +#endif + +} // namespace phi diff --git a/paddle/phi/kernels/math_kernel.h b/paddle/phi/kernels/math_kernel.h deleted file mode 100644 index ddc3a46e989f5cc86e294eb16ca0f82fcd7d8115..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/math_kernel.h +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/infermeta/binary.h" -namespace phi { - -template -void AddRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void AddKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - -template -void SubtractRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void SubtractKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - -template -void DivideRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void DivideKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - -template -void MultiplyRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void MultiplyKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - -template -DenseTensor Add(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - ElementwiseInferMeta(x, y, &meta_out); - AddKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - -template -DenseTensor Subtract(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - ElementwiseInferMeta(x, y, &meta_out); - SubtractKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - -template -DenseTensor Divide(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - ElementwiseInferMeta(x, y, &meta_out); - DivideKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - -template -DenseTensor Multiply(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - ElementwiseInferMeta(x, y, &meta_out); - MultiplyKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - -} // namespace phi diff --git a/paddle/phi/kernels/mode_grad_kernel.h b/paddle/phi/kernels/mode_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..ccde8c3648fa556401f1937c78039743daf43f4c --- /dev/null +++ b/paddle/phi/kernels/mode_grad_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ModeGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const DenseTensor& out_grad, + int axis, + bool keepdim, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/mode_kernel.h b/paddle/phi/kernels/mode_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..831c4369304e5c5d27cddf01bcba021745bf7083 --- /dev/null +++ b/paddle/phi/kernels/mode_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ModeKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool keepdim, + DenseTensor* out, + DenseTensor* indices); + +} // namespace phi diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h index 632ad00f6d06ed8a02b2d9677ff665c677cf8cb9..e02f4450a8babb9dd90cae6d8d1622938ae2f795 100644 --- a/paddle/phi/kernels/primitive/compute_primitives.h +++ b/paddle/phi/kernels/primitive/compute_primitives.h @@ -22,7 +22,6 @@ #endif #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -// #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" namespace phi { @@ -591,7 +590,7 @@ __device__ __forceinline__ void Cumsum(OutT* out, int index = (tidx + 1) * 2 * stride - 1; if (index < (blockDim.x * 2)) { temp[index + index / 32] = - compute(temp[index + index / 2], + compute(temp[index + index / 32], temp[index - stride + (index - stride) / 32]); } } diff --git a/paddle/phi/kernels/primitive/datamover_primitives.h b/paddle/phi/kernels/primitive/datamover_primitives.h index 2f1e2f589c5122987d9776700f3aa7bd95daa7a5..1d4181f3b9a89509ada2a8fe27d584a9b5aa039c 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives.h +++ b/paddle/phi/kernels/primitive/datamover_primitives.h @@ -115,6 +115,14 @@ struct BroadcastConfig { } }; +template +__device__ __forceinline__ void WriteData(T* dst, + T* __restrict__ src, + int num) { + for (int i = 0; i < num; i++) { + dst[i] = src[i]; + } +} #undef INT_BITS } // namespace details diff --git a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h index 53a8b7d0c9ef9489056ab293d97e5767b23531fe..d2cfdbdec3064c8e9cf20d101afc2adf0ed011a8 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h @@ -76,6 +76,16 @@ struct BroadcastConfig { }; #pragma pack() +template +__device__ __forceinline__ void WriteData(T* _global_ptr_ dst, + T* src, + int num) { + if (num > 0) { + LM2GM(src, dst, num * sizeof(T)); + } +} +#undef INT_BITS + } // namespace details /**
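The `Cumsum` change in compute_primitives.h above is a genuine logic fix: the shared-memory scan stores logical slot i at padded offset i + i/32 (what appears to be the classic conflict-free-offset trick for avoiding bank conflicts), but the read on the right-hand side used i + i/2, which dereferences a different logical slot. A small Python sketch of why the read and write mappings must match (illustrative only, not a scan implementation):

```python
# Logical slot i lives at padded offset i + i // 32, one skipped slot per 32.
pad = lambda i: i + i // 32

data = list(range(100))
padded = [0] * (len(data) + len(data) // 32 + 1)
for i, v in enumerate(data):
    padded[pad(i)] = v

# Reading with the same mapping recovers every element...
assert all(padded[pad(i)] == data[i] for i in range(len(data)))
# ...while the old-style `i + i // 2` offset lands on some other logical slot.
assert padded[64 + 64 // 2] != data[64]
```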
diff --git a/paddle/phi/kernels/roi_pool_grad_kernel.h b/paddle/phi/kernels/roi_pool_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..d7f1c378f75c398a714f6aa4e4d857e314f47eeb --- /dev/null +++ b/paddle/phi/kernels/roi_pool_grad_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void RoiPoolGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + const DenseTensor& arg_max, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + DenseTensor* dx); + +} // namespace phi diff --git a/paddle/phi/kernels/roi_pool_kernel.h b/paddle/phi/kernels/roi_pool_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c6ff6f223612a46c00abff103c3b3a193264b122 --- /dev/null +++ b/paddle/phi/kernels/roi_pool_kernel.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +static constexpr int kROISize = 4; + +template +void RoiPoolKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + DenseTensor* out, + DenseTensor* arg_max); + +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/assign_kernel.cc b/paddle/phi/kernels/selected_rows/assign_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..fae876facfc8fae9b2db783576444ac8bfde09a1 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/assign_kernel.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include "paddle/phi/kernels/selected_rows/assign_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/assign_kernel.h" + +namespace phi { +namespace sr { + +// Note: use `const paddle::optional x` +// as input if needed +template +void AssignKernel(const Context& dev_ctx, + const SelectedRows& x, + SelectedRows* out) { + out->set_rows(x.rows()); + out->set_height(x.height()); + phi::AssignKernel(dev_ctx, x.value(), out->mutable_value()); +} + +} // namespace sr +} // namespace phi + +PD_REGISTER_GENERAL_KERNEL(assign_sr, + CPU, + ALL_LAYOUT, + phi::sr::AssignKernel, + ALL_DTYPE) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_GENERAL_KERNEL(assign_sr, + GPU, + ALL_LAYOUT, + phi::sr::AssignKernel, + ALL_DTYPE) {} +#endif diff --git a/paddle/phi/kernels/selected_rows/assign_kernel.h b/paddle/phi/kernels/selected_rows/assign_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..2ba465615a73a3036d4b029c8ecb54002b86cb97 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/assign_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/selected_rows.h" + +namespace phi { +namespace sr { + +template +void AssignKernel(const Context& dev_ctx, + const SelectedRows& x, + SelectedRows* out); + +} // namespace sr +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/copy_kernel.cc b/paddle/phi/kernels/selected_rows/copy_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..cf71ab0583f6120e7bf10f26f00024b27a56ca79 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/copy_kernel.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/selected_rows/copy_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +namespace phi { +namespace sr { + +template +void Copy(const Context& dev_ctx, + const SelectedRows& src, + Place dst_place, + bool blocking, + SelectedRows* dst) { + if (src.value().Holder() != dst->value().Holder() || + src.value().data() != dst->value().data()) { + dst->set_rows(src.rows()); + dst->set_height(src.height()); + } + phi::Copy( + dev_ctx, src.value(), dst_place, blocking, dst->mutable_value()); +} + +} // namespace sr +} // namespace phi + +PD_REGISTER_GENERAL_KERNEL( + copy_sr, CPU, ALL_LAYOUT, phi::sr::Copy, ALL_DTYPE) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_GENERAL_KERNEL( + copy_sr, GPU, ALL_LAYOUT, phi::sr::Copy, ALL_DTYPE) {} +#endif diff --git a/paddle/phi/kernels/selected_rows/copy_kernel.h b/paddle/phi/kernels/selected_rows/copy_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..4aa848bea2a717ffcda4dff562ec56a702b7dbc5 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/copy_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/selected_rows.h" +#include "paddle/phi/core/sparse_csr_tensor.h" + +namespace phi { +namespace sr { + +template +void Copy(const Context& dev_ctx, + const SelectedRows& src, + Place dst_place, + bool blocking, + SelectedRows* dst); + +} // namespace sr +} // namespace phi diff --git a/paddle/phi/kernels/truncated_gaussian_random_kernel.h b/paddle/phi/kernels/truncated_gaussian_random_kernel.h index c4c13578a989961839600d8ee403e478c76d1345..f8547ced41934a9810dc6874c090ab5aefd43497 100644 --- a/paddle/phi/kernels/truncated_gaussian_random_kernel.h +++ b/paddle/phi/kernels/truncated_gaussian_random_kernel.h @@ -141,9 +141,19 @@ T Erfinv(T x) { template struct TruncatedNormal { T mean, std; - TruncatedNormal(T mean, T std) : mean(mean), std(std) {} + T a_normal_cdf; + T b_normal_cdf; + TruncatedNormal(T mean, T std) : mean(mean), std(std) { + auto normal_cdf = [](T x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + a_normal_cdf = normal_cdf(-2.0); + b_normal_cdf = normal_cdf(2.0); + } + T operator()(T value) const { - return std::sqrt(2.0) * Erfinv(value) * std + mean; + auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; + return std::sqrt(2.0) * Erfinv(2 * p - 1) * std + mean; } }; diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc index 890dbadf17c81fa40f629114df47f518fdcc387b..7ae0dc45c5e1be09a31821c171b84fbb47fe1c9e 100644 --- a/paddle/phi/ops/compat/activation_sig.cc +++ b/paddle/phi/ops/compat/activation_sig.cc @@ -56,9 +56,14 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(SoftShrink, "soft_shrink", "lambda"); DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardShrink, "hard_shrink", "threshold"); DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(TanhShrink, "tanh_shrink", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Silu, "silu", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LogSigmoid, "logsigmoid", ); // NOLINT -DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", ); // NOLINT -DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sigmoid, "sigmoid", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(HardSigmoid, + "hard_sigmoid", + "slope" comma "offset"); // NOLINT KernelSignature ReluDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { @@ -79,6 +84,20 @@ KernelSignature TanhTripleGradOpArgumentMapping( {"D_OutNew", "D_DOut", "D_DDx"}); } +KernelSignature SigmoidDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "sigmoid_double_grad", {"Out", "DDX", "DOut"}, {}, {"DOutNew", "DDOut"}); +} + +KernelSignature SigmoidTripleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("sigmoid_triple_grad", + {"Out", "DDX", "DOut", "D_DDOut", "D_DOut_New"}, + {}, + {"D_OutNew", "D_DOut", "D_DDx"}); +} + KernelSignature LeakyReluDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( @@ -114,6 +133,7 @@ PD_REGISTER_BASE_KERNEL_NAME(leaky_relu_grad_grad, leaky_relu_double_grad); PD_REGISTER_BASE_KERNEL_NAME(softshrink, soft_shrink); PD_REGISTER_BASE_KERNEL_NAME(softshrink_grad, soft_shrink_grad); PD_REGISTER_BASE_KERNEL_NAME(elu_grad_grad, elu_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(sigmoid_grad_grad, sigmoid_double_grad); PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping); 
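The `TruncatedNormal` change above (truncated_gaussian_random_kernel.h) fixes the sampling math: previously the uniform `value` went straight into `Erfinv`, which does not implement the two-sided truncation; now `value` is first mapped into [cdf(-2), cdf(2)] before the inverse CDF, which pins every sample within two standard deviations of the mean. A stdlib-only Python sketch of the same inverse-CDF trick (illustrative; `truncated_normal` is a made-up helper, not a Paddle API):

```python
from statistics import NormalDist
import random

def truncated_normal(mean, std, rng=random.random):
    n = NormalDist()
    a_cdf, b_cdf = n.cdf(-2.0), n.cdf(2.0)
    # Squeeze the uniform sample into [cdf(-2), cdf(2)] first...
    p = a_cdf + (b_cdf - a_cdf) * rng()
    # ...so inv_cdf (== sqrt(2) * erfinv(2p - 1)) stays within [-2, 2].
    return mean + std * n.inv_cdf(p)

samples = [truncated_normal(0.0, 1.0) for _ in range(10000)]
assert all(-2.0 <= s <= 2.0 for s in samples)  # never escapes the truncation
```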
@@ -152,3 +172,12 @@ PD_REGISTER_ARG_MAPPING_FN(elu, phi::EluOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elu_grad, phi::EluGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elu_grad_grad, phi::EluDoubleGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(silu_grad, phi::SiluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad, phi::SigmoidGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad_grad, + phi::SigmoidDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sigmoid_triple_grad, + phi::SigmoidTripleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(logsigmoid_grad, + phi::LogSigmoidGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(hard_sigmoid_grad, + phi::HardSigmoidGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/assign_sig.cc b/paddle/phi/ops/compat/assign_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..d149e8e6a9aa04d3cc8d02e370e7e07e3cbebeb0 --- /dev/null +++ b/paddle/phi/ops/compat/assign_sig.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature AssignOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("X")) { + if (ctx.IsDenseTensorVectorInput("X")) { + return KernelSignature("assign_array", {"X"}, {}, {"Out"}); + } else if (ctx.IsSelectedRowsInput("X")) { + return KernelSignature("assign_sr", {"X"}, {}, {"Out"}); + } else { + return KernelSignature("assign", {"X"}, {}, {"Out"}); + } + } else { + return KernelSignature("assign", {"X"}, {}, {"Out"}); + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(assign, phi::AssignOpArgumentMapping); diff --git a/paddle/phi/ops/compat/layer_norm_sig.cc b/paddle/phi/ops/compat/layer_norm_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..17a81e9ec012f2c116762ff2d653bb96f0e1c4f4 --- /dev/null +++ b/paddle/phi/ops/compat/layer_norm_sig.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature LayerNormOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("layer_norm", + {"X", "Scale", "Bias"}, + {"epsilon", "begin_norm_axis", "is_test"}, + {"Y", "Mean", "Variance"}); +} + +KernelSignature LayerNormGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "layer_norm_grad", + {"X", "Mean", "Variance", "Scale", "Bias", GradVarName("Y")}, + {"epsilon", "begin_norm_axis", "is_test"}, + {GradVarName("X"), GradVarName("Scale"), GradVarName("Bias")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(layer_norm, phi::LayerNormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(layer_norm_grad, + phi::LayerNormGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/mode_sig.cc b/paddle/phi/ops/compat/mode_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..20994c08aa73c33328568e334d258c44eef68171 --- /dev/null +++ b/paddle/phi/ops/compat/mode_sig.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature ModeOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "mode", {"X"}, {"axis", "keepdim"}, {"Out", "Indices"}); +} + +KernelSignature ModeGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("mode_grad", + {"X", "Indices", GradVarName("Out")}, + {"axis", "keepdim"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(mode, phi::ModeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(mode_grad, phi::ModeGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/roi_pool_sig.cc b/paddle/phi/ops/compat/roi_pool_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..d04c645f183c6e1ac91e4bf6003427008a24fe42 --- /dev/null +++ b/paddle/phi/ops/compat/roi_pool_sig.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature RoiPoolOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("roi_pool", + {"X", "ROIs", "RoisNum"}, + {"pooled_height", "pooled_width", "spatial_scale"}, + {"Out", "Argmax"}); +} + +KernelSignature RoiPoolOpGradArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("roi_pool_grad", + {"X", "ROIs", "RoisNum", "Argmax", GradVarName("Out")}, + {"pooled_height", "pooled_width", "spatial_scale"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(roi_pool, phi::RoiPoolOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(roi_pool_grad, phi::RoiPoolOpGradArgumentMapping); diff --git a/paddle/phi/ops/compat/set_value_sig.cc b/paddle/phi/ops/compat/set_value_sig.cc index 9653250bded84f8ff87f613f6e17e50e351504fa..5feff54b028ba437125d65e4a6709254704164d8 100644 --- a/paddle/phi/ops/compat/set_value_sig.cc +++ b/paddle/phi/ops/compat/set_value_sig.cc @@ -19,9 +19,9 @@ namespace phi { KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.IsDenseTensorInput("Input")) { - if (ctx.HasInput("StartsTensorList")) { - if (ctx.HasInput("EndsTensorList")) { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("StartsTensorList") > 0) { + if (ctx.InputSize("EndsTensorList") > 0) { + if (ctx.InputSize("StepsTensorList") > 0) { if (ctx.HasInput("ValueTensor")) { return KernelSignature("set_value_with_tensor", {"Input", "ValueTensor"}, @@ -197,7 +197,7 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) { } } } else { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("StepsTensorList") > 0) { if (ctx.HasInput("ValueTensor")) { return KernelSignature("set_value_with_tensor", {"Input", "ValueTensor"}, @@ -374,8 +374,8 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) { } } } else { - if (ctx.HasInput("EndsTensorList")) { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("EndsTensorList") > 0) { + if (ctx.InputSize("StepsTensorList") > 0) { if (ctx.HasInput("ValueTensor")) { return KernelSignature("set_value_with_tensor", {"Input", "ValueTensor"}, @@ -551,7 +551,7 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) { } } } else { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("StepsTensorList") > 0) { if (ctx.HasInput("ValueTensor")) { return KernelSignature("set_value_with_tensor", {"Input", "ValueTensor"}, @@ -734,9 +734,9 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature SetValueGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - if (ctx.HasInput("StartsTensorList")) { - if (ctx.HasInput("EndsTensorList")) { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("StartsTensorList") > 0) { + if (ctx.InputSize("EndsTensorList") > 0) { + if (ctx.InputSize("StepsTensorList") > 0) { return KernelSignature( "set_value_grad", {GradVarName("Out")}, @@ -760,7 +760,7 @@ KernelSignature SetValueGradOpArgumentMapping( {GradVarName("Input"), GradVarName("ValueTensor")}); } } else { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("StepsTensorList") > 0) { return KernelSignature( "set_value_grad", {GradVarName("Out")}, @@ -785,8 +785,8 @@ KernelSignature SetValueGradOpArgumentMapping( } } } else { - if (ctx.HasInput("EndsTensorList")) { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("EndsTensorList") > 0) { + if 
(ctx.InputSize("StepsTensorList") > 0) { return KernelSignature( "set_value_grad", {GradVarName("Out")}, @@ -810,7 +810,7 @@ KernelSignature SetValueGradOpArgumentMapping( {GradVarName("Input"), GradVarName("ValueTensor")}); } } else { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("StepsTensorList") > 0) { return KernelSignature( "set_value_grad", {GradVarName("Out")}, diff --git a/paddle/phi/tests/kernels/test_copy_dev_api.cc b/paddle/phi/tests/kernels/test_copy_dev_api.cc index d69c7b2174f726d5757ea707678ddb383cf19d68..460d85f83133f9ecef83daa4e6a446e53485cd0e 100644 --- a/paddle/phi/tests/kernels/test_copy_dev_api.cc +++ b/paddle/phi/tests/kernels/test_copy_dev_api.cc @@ -61,6 +61,10 @@ TEST(DEV_API, copy) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); + dev_ctx.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); dev_ctx.Init(); phi::Copy( dev_ctx, *(dense_src.get()), phi::CPUPlace(), false, dense_dst.get()); diff --git a/paddle/phi/tests/kernels/test_elementwise_dev_api.cc b/paddle/phi/tests/kernels/test_elementwise_dev_api.cc index 3e5f96507415624750eb297953719f397e294230..9552c02976f30d11601967034815545f94ff1f97 100644 --- a/paddle/phi/tests/kernels/test_elementwise_dev_api.cc +++ b/paddle/phi/tests/kernels/test_elementwise_dev_api.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/api/lib/utils/allocator.h" diff --git a/paddle/phi/tests/kernels/test_flatten_dev_api.cc b/paddle/phi/tests/kernels/test_flatten_dev_api.cc index dc283728ee5f761e79c9c396d63121d555139dee..e3f2e8b57e3df48d860734f164f41be95f6f3d96 100644 --- a/paddle/phi/tests/kernels/test_flatten_dev_api.cc +++ b/paddle/phi/tests/kernels/test_flatten_dev_api.cc @@ -58,6 +58,10 @@ TEST(DEV_API, flatten) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); + dev_ctx.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); dev_ctx.Init(); // 2. test API diff --git a/paddle/phi/tests/kernels/test_reshape_dev_api.cc b/paddle/phi/tests/kernels/test_reshape_dev_api.cc index 16ad4fc341be0ac68c571b29ffe182ae5d4c625f..7de039372fa9c2b46d5b6f9b430a816382072449 100644 --- a/paddle/phi/tests/kernels/test_reshape_dev_api.cc +++ b/paddle/phi/tests/kernels/test_reshape_dev_api.cc @@ -50,6 +50,10 @@ TEST(DEV_API, reshape) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); + dev_ctx.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); dev_ctx.Init(); auto out = phi::Reshape(dev_ctx, dense_x, shape); // 3. 
check result diff --git a/paddle/phi/tests/ops/test_op_signature.h b/paddle/phi/tests/ops/test_op_signature.h index 06048f33d940a28ddf9e3aa488a6e24a9e4a93b6..8468dad10eb64a066cc11dafa125dde3174b7e30 100644 --- a/paddle/phi/tests/ops/test_op_signature.h +++ b/paddle/phi/tests/ops/test_op_signature.h @@ -72,6 +72,11 @@ class TestArgumentMappingContext : public phi::ArgumentMappingContext { return selected_rows_inputs.count(name) > 0; } + // add member if needed + bool IsDenseTensorVectorInput(const std::string& name) const override { + return false; + } + bool IsDenseTensorOutput(const std::string& name) const override { return dense_tensor_outputs.count(name) > 0; } diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index 3b2df68074a82b7485b8c8f67e7d0d0fadf5fbd9..850d4015abf7a8164add9d4896d5a9bdfa26989d 100755 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -44,6 +44,8 @@ function update_pd_ops() { cd ${PADDLE_ROOT}/tools/infrt/ python3 generate_pd_op_dialect_from_paddle_op_maker.py python3 generate_phi_kernel_dialect.py + # generate test model + python3 paddle/infrt/tests/model/abs_model.py ${PADDLE_ROOT}/build/paddle/infrt/tests/abs } function init() { diff --git a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py index 3fadf25150f9ef3556a343fdce8acc24d788f5dc..f97c2778c0918ecbfbed546089c17e9d505818cd 100644 --- a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py +++ b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py @@ -52,6 +52,30 @@ def parse_args(): '--debug', action='store_true', help='If used, the graph of Quant model is drawn.') + parser.add_argument( + '--quant_model_filename', + type=str, + default="", + help='The input model`s file name. If empty, search default `__model__` and separate parameter files and use them or in case if not found, attempt loading `model` and `params` files.' + ) + parser.add_argument( + '--quant_params_filename', + type=str, + default="", + help='If quant_model_filename is empty, this field is ignored. The input model`s all parameters file name. If empty load parameters from separate files.' + ) + parser.add_argument( + '--save_model_filename', + type=str, + default="__model__", + help='The name of file to save the inference program itself. If is set None, a default filename __model__ will be used.' + ) + parser.add_argument( + '--save_params_filename', + type=str, + default=None, + help='The name of file to save all related parameters. 
If it is set None, parameters will be saved in separate files' + ) test_args, args = parser.parse_known_args(namespace=unittest) return test_args, sys.argv[:1] + args @@ -61,18 +85,29 @@ def transform_and_save_int8_model(original_path, save_path, ops_to_quantize='', op_ids_to_skip='', - debug=False): + debug=False, + quant_model_filename='', + quant_params_filename='', + save_model_filename='', + save_params_filename=''): place = fluid.CPUPlace() exe = fluid.Executor(place) inference_scope = fluid.executor.global_scope() with fluid.scope_guard(inference_scope): - if os.path.exists(os.path.join(original_path, '__model__')): - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(original_path, exe) + if not quant_model_filename: + if os.path.exists(os.path.join(original_path, '__model__')): + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(original_path, + exe) + else: + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model( + original_path, exe, 'model', 'params') else: [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(original_path, exe, - 'model', 'params') + fetch_targets] = fluid.io.load_inference_model( + original_path, exe, quant_model_filename, + quant_params_filename) ops_to_quantize_set = set() print(ops_to_quantize) @@ -97,8 +132,14 @@ def transform_and_save_int8_model(original_path, graph = transform_to_mkldnn_int8_pass.apply(graph) inference_program = graph.to_program() with fluid.scope_guard(inference_scope): - fluid.io.save_inference_model(save_path, feed_target_names, - fetch_targets, exe, inference_program) + fluid.io.save_inference_model( + save_path, + feed_target_names, + fetch_targets, + exe, + inference_program, + model_filename=save_model_filename, + params_filename=save_params_filename) print( "Success! INT8 model obtained from the Quant model can be found at {}\n" .format(save_path)) @@ -109,4 +150,6 @@ if __name__ == '__main__': test_args, remaining_args = parse_args() transform_and_save_int8_model( test_args.quant_model_path, test_args.int8_model_save_path, - test_args.ops_to_quantize, test_args.op_ids_to_skip, test_args.debug) + test_args.ops_to_quantize, test_args.op_ids_to_skip, test_args.debug, + test_args.quant_model_filename, test_args.quant_params_filename, + test_args.save_model_filename, test_args.save_params_filename) diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 8149d69d36a27fadcefa8dc6b6ff1dd89792e29e..9439982858530e1e81156be4b32ef2d91dc4a33a 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -565,16 +565,25 @@ def grad(outputs, if isinstance(in_out_list, (list, tuple)): assert len(in_out_list) > 0, "{} cannot be empty".format(name) for each_var in in_out_list: - assert isinstance( - each_var, - core.VarBase), "Elements of {} must be Variable".format( - name) + if core._in_eager_mode(): + assert isinstance( + each_var, core.eager. + Tensor), "Elements of {} must be Tensor".format(name) + else: + assert isinstance( + each_var, + core.VarBase), "Elements of {} must be Variable".format( + name) return in_out_list else: - assert isinstance( - in_out_list, - core.VarBase), "{} must be Variable or list of Variable".format( - name) + if core._in_eager_mode(): + assert isinstance( + in_out_list, core.eager. 
+ Tensor), "{} must be Tensor or list of Tensor".format(name) + else: + assert isinstance( + in_out_list, core.VarBase + ), "{} must be Variable or list of Variable".format(name) return [in_out_list] outputs = check_in_out(outputs, 'outputs') @@ -586,9 +595,14 @@ def grad(outputs, for each_var in grad_outputs: if each_var is not None: - assert isinstance( - each_var, core.VarBase - ), "grad_outputs must be None, a Variable or a list containing None or Variables" + if core._in_eager_mode(): + assert isinstance( + each_var, core.eager.Tensor + ), "grad_outputs must be None, a Variable or a list containing None or Variables" + else: + assert isinstance( + each_var, core.VarBase + ), "grad_outputs must be None, a Variable or a list containing None or Variables" else: grad_outputs = [] @@ -600,14 +614,27 @@ def grad(outputs, no_grad_vars = [] elif isinstance(no_grad_vars, core.VarBase): no_grad_vars = [no_grad_vars] + elif isinstance(no_grad_vars, core.eager.Tensor): + no_grad_vars = [no_grad_vars] elif isinstance(no_grad_vars, (list, tuple, set)): no_grad_vars = list(no_grad_vars) for var in no_grad_vars: - assert isinstance( - var, core.VarBase), "no_grad_vars can only contains Variable" + if core._in_eager_mode(): + assert isinstance( + var, + core.eager.Tensor), "no_grad_vars can only contains Tensor" + else: + assert isinstance( + var, + core.VarBase), "no_grad_vars can only contains Variable" else: - raise AssertionError( - "no_grad_vars must be None, Variable or list/tuple/set of Variables") + if core._in_eager_mode(): + raise AssertionError( + "no_grad_vars must be None, Tensor or list/tuple/set of Tensors") + else: + raise AssertionError( + "no_grad_vars must be None, Variable or list/tuple/set of Variables" + ) assert isinstance(create_graph, bool), "create_graph must be True or False" @@ -622,6 +649,11 @@ def grad(outputs, assert isinstance(only_inputs, bool), "only_inputs must be True or False" assert only_inputs, "only_inputs=False is not supported yet" + if core._in_eager_mode(): + return core.eager.run_partial_grad( + outputs, inputs, grad_outputs, retain_graph, create_graph, + only_inputs, allow_unused, no_grad_vars) + place = core.Place() place.set_place(framework._current_expected_place()) return core.dygraph_partial_grad(inputs, outputs, grad_outputs, diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index f58952d3036c506341955eff2472079bb696bb1f..a36164a277dec0762e7ba49a1d158837f27bc517 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -30,6 +30,7 @@ from paddle.fluid.layers import nn from paddle.fluid.layers.utils import _hash_with_id from paddle.fluid.dygraph.base import switch_to_static_graph from paddle.fluid.framework import in_dygraph_mode +from paddle import _C_ops __all__ = ['TranslatedLayer'] @@ -761,6 +762,21 @@ def _construct_params_and_buffers(model_path, return var_dict +def _valid_vars(vars): + if vars: + return vars + if framework._in_eager_mode(): + return [ + core.eager.Tensor(core.VarDesc.VarType.FP32, [], "Fake_var", + core.VarDesc.VarType.RAW, False) + ] + else: + return [ + core.VarBase(core.VarDesc.VarType.FP32, [], "Fake_var", + core.VarDesc.VarType.RAW, False) + ] + + def _run_dygraph(instance, input, program_holder): # 1. 
prepare inputs, outputs, attrs @@ -826,17 +842,12 @@ def _run_dygraph(instance, input, program_holder): # hold forward variables if framework._in_eager_mode(): - tmp_scope_vec = core.eager.Tensor( - dtype=core.VarDesc.VarType.FP32, - dims=[], - name="program_out_scope", - type=core.VarDesc.VarType.STEP_SCOPES, - persistable=True) + tmp_scope_vec = [program_holder.scope] else: tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [], "program_out_scope", core.VarDesc.VarType.STEP_SCOPES, True) - tmp_scope_vec.value().set_scope(program_holder.scope) + tmp_scope_vec.value().set_scope(program_holder.scope) double_grad_vars = [] for var_desc in program_holder.double_grad_descs: @@ -852,41 +863,18 @@ def _run_dygraph(instance, input, program_holder): var_desc.shape(), var_desc.name(), var_desc.type(), False) double_grad_vars.append(var) - if len(double_grad_vars) == 0: - if framework._in_eager_mode(): - double_grad_vars = [ - core.eager.Tensor( - value=[1], - name='Fake_var', - place=framework._current_expected_place()) - ] - else: - double_grad_vars = [ - core.VarBase( - value=[1], - name='Fake_var', - place=framework._current_expected_place()) - ] # 2. run program by op trace_program = program_holder.infer_program if instance._is_test else program_holder.train_program end_op_index = program_holder.infer_program.block(0).op_size() - framework._dygraph_tracer().trace_op( - type='run_program', - inputs={'X': input_vars, - 'Params': persistable_vars}, - outputs={ - 'Out': output_vars, - 'OutScope': tmp_scope_vec, - 'DOut': double_grad_vars - }, - attrs={ - 'global_block': trace_program.block(0), - 'start_op_index': 0, - 'end_op_index': end_op_index, - 'is_test': instance._is_test, - 'program_id': _hash_with_id(trace_program, instance) - }) + attrs = ('global_block', trace_program.block(0), 'start_op_index', 0, + 'end_op_index', end_op_index, 'is_test', instance._is_test, + 'program_id', _hash_with_id(trace_program, instance)) + _C_ops.run_program( + _valid_vars(input_vars), + _valid_vars(persistable_vars), + _valid_vars(output_vars), tmp_scope_vec, + _valid_vars(double_grad_vars), *attrs) # NOTE: [ why need set param's gradient type here ] # if user set sparse gradient mode, the param's gradient # will be SelectedRows, not LoDTensor. 
But tracer will just @@ -914,8 +902,10 @@ def _run_dygraph(instance, input, program_holder): def drop_scope_if_no_grad(instance, scope_vec): tracer = framework._dygraph_tracer() + scope = scope_vec.value().get_scope() if isinstance(scope_vec, ( + core.VarBase)) else scope_vec[0] if (not instance._is_test) and (not tracer._has_grad): - scope_vec.value().get_scope().drop_kids() + scope.drop_kids() def _run_static_graph(input, program_holder, trace_program): diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index b1865691b2475c4f855f51244e627965047d7720..1e1ce3ba7e4912d391085c7acbd7aa4bbb6a4da1 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -821,7 +821,7 @@ def save(layer, path, input_spec=None, **configs): for var in flatten(input_spec): if isinstance(var, paddle.static.InputSpec): inner_input_spec.append(var) - elif isinstance(var, (core.VarBase, Variable)): + elif isinstance(var, (core.VarBase, core.eager.Tensor, Variable)): inner_input_spec.append( paddle.static.InputSpec.from_tensor(var)) else: diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 53dbf1a66b27f35a75b44a0b6444cd8282c5278c..6957850d205794363183b4e6ca58a6daf3e11358 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -760,7 +760,8 @@ class Layer(object): raise KeyError("The name of buffer can not be empty.") elif hasattr(self, name) and name not in self._buffers: raise KeyError("attribute '{}' already exists.".format(name)) - elif tensor is not None and not type(tensor) == core.VarBase: + elif tensor is not None and not (type(tensor) == core.VarBase or + type(tensor) == core.eager.Tensor): raise TypeError( "The registered buffer should be a core.VarBase, but received {}.". 
format(type(tensor).__name__)) diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py index d0552ca41f0daf56ce23317dd06cb5744baaff84..d8b1883fc62a0fb4575a2e525d7d37a9029cf40d 100644 --- a/python/paddle/fluid/dygraph/tracer.py +++ b/python/paddle/fluid/dygraph/tracer.py @@ -35,6 +35,12 @@ final_state_name_mapping = { "x": "X", "out": "Out", }, + "pool2d": { + "final_op_name": "final_state_pool2d", + "x": "X", + "kernel_size": "ksize", + "out": "Out", + }, "abs": { "final_op_name": "final_state_abs", "x": "X", diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index cbe360f556cd986646bc7f45b3a80ab0f5edb9eb..c82172780b7b2e27e430d0494ce59f7dce626d74 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1118,9 +1118,9 @@ set_tests_properties(test_cumprod_op PROPERTIES TIMEOUT 120) set_tests_properties(test_split_program PROPERTIES TIMEOUT 120) if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT 120) - set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 150) set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT 200) - set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 150) set_tests_properties(test_parallel_dygraph_no_sync_gradient_check PROPERTIES TIMEOUT 30) set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py index cac64c7391351b23c4b9f9275c4b20bf85f571fd..2b8307461b8f57ea73503cf6ad4e8a90cdba652c 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py @@ -27,6 +27,7 @@ from paddle.fluid.dygraph.nn import Conv2D, Linear, Pool2D from paddle.fluid.optimizer import AdamOptimizer from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator +from paddle.fluid.framework import _test_eager_guard from predictor_utils import PredictorTools @@ -155,6 +156,13 @@ class TestMNISTWithToStatic(TestMNIST): np.allclose(dygraph_loss, static_loss), msg='dygraph is {}\n static_res is \n{}'.format(dygraph_loss, static_loss)) + with _test_eager_guard(): + dygraph_loss = self.train_dygraph() + static_loss = self.train_static() + self.assertTrue( + np.allclose(dygraph_loss, static_loss), + msg='dygraph is {}\n static_res is \n{}'.format(dygraph_loss, + static_loss)) def test_mnist_declarative_cpu_vs_mkldnn(self): dygraph_loss_cpu = self.train_dygraph() diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py index 8ff68a1ce0d69307547db2fd1f83526094c9bfcf..91c340c35d478d9576dcc3f1b15d4d2300692c5a 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py @@ -19,6 +19,7 
@@ import unittest import os import numpy as np import random +import socket import paddle import paddle.nn as nn @@ -31,13 +32,26 @@ from paddle.optimizer import SGD from paddle.fluid.initializer import NumpyArrayInitializer +def net_is_used(port, ip='127.0.0.1'): + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + try: + s.connect((ip, port)) + s.shutdown(2) + return True + except Exception as e: + return False + + def init_process_group(strategy=None): nranks = ParallelEnv().nranks rank = ParallelEnv().local_rank is_master = True if rank == 0 else False - store = paddle.fluid.core.TCPStore("127.0.0.1", 6172, is_master, nranks) - group = core.ProcessGroupNCCL(store, rank, nranks) - return group + for port in range(20000, 21000): + if not net_is_used(port): + store = paddle.fluid.core.TCPStore("127.0.0.1", port, is_master, + nranks) + group = core.ProcessGroupNCCL(store, rank, nranks) + return group class LinearModel(nn.Layer): diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index 27aec284de4cdebb5ebb9191bfb67d48c1b327f5..98ef339e04535bb943add02b6cf6efe490f0354b 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -52,7 +52,7 @@ class EagerScaleTestCase(unittest.TestCase): out_eager = core.eager.scale(data_eager, 1.0, 0.9, True, True) self.assertIsNone(data_eager.grad) out_eager.backward(grad_eager, False) - self.assertTrue(data_eager.grad._is_initialized()) + self.assertIsNotNone(data_eager.grad) self.assertTrue(np.array_equal(data_eager.grad.numpy(), input_data)) def test_retain_grad_and_run_backward_raises(self): diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py index d1d391a3949ead28697c0756803e873c41914079..318e826058f2c111f825b113c8ee4676ff87d630 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py @@ -17,7 +17,7 @@ import unittest import numpy as np import paddle import paddle.fluid.core as core -from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py index 00967cb503fe5fd677839a869798964bb5fb0b71..b35b2840ed30a2650e6e19a4cfbc381f50fd5024 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py @@ -23,7 +23,7 @@ import paddle.fluid.core as core from paddle.fluid import Program, compiler, program_guard from paddle.fluid.op import Operator -from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 class ElementwiseMulOp(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py index cd4ba5b054264afca65d4c4d8359eb1854fbb658..7436e9eb7b12623296d7a714e742cc4212c4ca91 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py +++ 
b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,6 +19,9 @@ from paddle.vision.models import resnet50, resnet101 import unittest from unittest import TestCase import numpy as np +import paddle.compat as cpt +from paddle.fluid.framework import _test_eager_guard +import paddle.fluid.core as core def _dygraph_guard_(func): @@ -40,6 +43,80 @@ def random_var(size, low=-1, high=1, dtype='float32'): return fluid.dygraph.to_variable(x_np) +class TestEagerGrad(TestCase): + def func_simple_example_eager_grad(self): + np.random.seed(2021) + paddle.set_device('cpu') + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + out = paddle.matmul(x, y) + dx = fluid.dygraph.grad(out, x) + + dout = np.ones_like(np_y) + expected_dx = np.matmul(dout, np.transpose(np_y)) + + # stop_gradient == (not create_graph); create_graph defaults to False + self.assertEqual(dx[0].stop_gradient, True) + self.assertTrue(np.allclose(dx[0].numpy(), expected_dx)) + + def test_simple_example_eager_grad(self): + with _test_eager_guard(): + self.func_simple_example_eager_grad() + self.func_simple_example_eager_grad() + + def func_simple_example_eager_grad_allow_unused(self): + np.random.seed(2021) + paddle.set_device('cpu') + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + np_z = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False) + out_z = paddle.nn.functional.sigmoid(z) + out = paddle.matmul(x, y) + + dx = fluid.dygraph.grad(out, [x, z], allow_unused=True) + dout = np.ones_like(np_y) + expected_dx = np.matmul(dout, np.transpose(np_y)) + self.assertTrue(np.allclose(dx[0].numpy(), expected_dx)) + # stop_gradient == (not create_graph); create_graph defaults to False + self.assertEqual(dx[0].stop_gradient, True) + # z is unused in the graph, so its gradient is None + self.assertEqual(dx[1], None) + + def test_simple_example_eager_grad_allow_unused(self): + with _test_eager_guard(): + self.func_simple_example_eager_grad_allow_unused() + self.func_simple_example_eager_grad_allow_unused() + + def func_simple_example_eager_grad_not_allow_unused(self): + np.random.seed(2021) + paddle.set_device('cpu') + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + np_z = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False) + out_z = paddle.nn.functional.sigmoid(z) + out = paddle.matmul(x, y) + + # allow_unused is False by default, so taking the gradient of the + # unused input z is expected to raise ValueError + with self.assertRaises(ValueError) as ctx: + dx = fluid.dygraph.grad(out, [x, z]) + error_msg = cpt.get_exception_message(ctx.exception) + self.assertIn("allow_unused", error_msg) + + def test_simple_example_eager_grad_not_allow_unused(self): + with _test_eager_guard(): + self.func_simple_example_eager_grad_not_allow_unused() + self.func_simple_example_eager_grad_not_allow_unused() + + class TestDygraphDoubleGrad(TestCase): def setUp(self):
self.sort_sum_gradient = False @@ -64,7 +141,7 @@ class TestDygraphDoubleGrad(TestCase): allow_unused=allow_unused) @dygraph_guard - def test_exception(self): + def func_exception(self): with self.assertRaises(AssertionError): self.grad(None, None) @@ -93,8 +170,13 @@ class TestDygraphDoubleGrad(TestCase): with self.assertRaises(AssertionError): self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1) + def test_exception(self): + with _test_eager_guard(): + self.func_exception() + self.func_exception() + @dygraph_guard - def test_simple_example(self): + def func_simple_example(self): x = random_var(self.shape) x.stop_gradient = False y = x + 1 @@ -123,8 +205,44 @@ class TestDygraphDoubleGrad(TestCase): self.assertNotEqual(grad_with_none_and_not_none.stop_gradient, create_graph) + def test_simple_example(self): + with _test_eager_guard(): + self.func_simple_example() + self.func_simple_example() + @dygraph_guard - def test_none_one_initial_gradient(self): + def func_example_no_grad_vars(self): + x = random_var(self.shape) + x_np = x.numpy() + numel = x_np.size + x.stop_gradient = False + + y1 = fluid.layers.relu(x) + y2 = fluid.layers.relu(x) + z = y1 + y2 + w = z * z + + w_mean = fluid.layers.reduce_mean(w) + del y1, z, w + + dx_actual, = self.grad( + [w_mean], [x], create_graph=True, no_grad_vars=[y2]) + + self.assertFalse(y2.stop_gradient) + self.assertFalse(dx_actual.stop_gradient) + + dx_expected = (1.0 / float(numel) * (np.maximum(x_np, 0) + y2.numpy()) * + (x_np > 0) * 2).astype('float32') + + self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) + + def test_example_no_grad_vars(self): + with _test_eager_guard(): + self.func_example_no_grad_vars() + self.func_example_no_grad_vars() + + @dygraph_guard + def func_none_one_initial_gradient(self): numel = 1 for s in self.shape: numel *= s @@ -190,8 +308,13 @@ class TestDygraphDoubleGrad(TestCase): np.array_equal(grad_z.numpy(), original_random_grad_z)) + def test_none_one_initial_gradient(self): + with _test_eager_guard(): + self.func_none_one_initial_gradient() + self.func_none_one_initial_gradient() + @dygraph_guard - def test_example_with_gradient_accumulation_and_create_graph(self): + def func_example_with_gradient_accumulation_and_create_graph(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -214,25 +337,33 @@ class TestDygraphDoubleGrad(TestCase): (x_np > 0) * 2).astype('float32') self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward(retain_graph=True) - - x_grad_actual = x.gradient() - x_grad_expected = (2.0 / float(numel) * - (x_np + dx_expected * - (x_np > 0) * 2 / float(numel))).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) - - for i in range(5): + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) loss.backward(retain_graph=True) + x_grad_actual = x.gradient() - x_grad_expected = (i + 2) * (2.0 / float(numel) * ( + x_grad_expected = (2.0 / float(numel) * ( x_np + dx_expected * (x_np > 0) * 2 / float(numel))).astype('float32') self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + for i in range(5): + loss.backward(retain_graph=True) + x_grad_actual = x.gradient() + x_grad_expected = (i + 2) * (2.0 / float(numel) * ( + x_np + dx_expected * + (x_np > 0) * 2 / float(numel))).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def 
test_example_with_gradient_accumulation_and_create_graph(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_create_graph() + self.func_example_with_gradient_accumulation_and_create_graph() + @dygraph_guard - def test_example_with_gradient_accumulation_and_no_grad_vars(self): + def func_example_with_gradient_accumulation_and_no_grad_vars(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -256,17 +387,25 @@ class TestDygraphDoubleGrad(TestCase): (x_np > 0) * 2).astype('float32') self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() - x_grad_actual = x.gradient() - x_grad_expected = (2.0 / float(numel) * - (x_np + dx_expected * - (x_np > 0) * 4 / float(numel))).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + x_grad_actual = x.gradient() + x_grad_expected = (2.0 / float(numel) * ( + x_np + dx_expected * + (x_np > 0) * 4 / float(numel))).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def test_example_with_gradient_accumulation_and_no_grad_vars(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_no_grad_vars() + self.func_example_with_gradient_accumulation_and_no_grad_vars() @dygraph_guard - def test_example_with_gradient_accumulation_and_not_create_graph(self): + def func_example_with_gradient_accumulation_and_not_create_graph(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -289,12 +428,20 @@ class TestDygraphDoubleGrad(TestCase): self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() - x_grad_actual = x.gradient() - x_grad_expected = (2.0 * x_np / float(numel)).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + x_grad_actual = x.gradient() + x_grad_expected = (2.0 * x_np / float(numel)).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def test_example_with_gradient_accumulation_and_not_create_graph(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_not_create_graph() + self.func_example_with_gradient_accumulation_and_not_create_graph() class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad): @@ -304,7 +451,7 @@ class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad): class TestDygraphDoubleGradVisitedUniq(TestCase): - def test_compare(self): + def func_compare(self): value = np.random.uniform(-0.5, 0.5, 100).reshape(10, 2, 5).astype("float32") @@ -349,6 +496,11 @@ class TestDygraphDoubleGradVisitedUniq(TestCase): self.assertTrue(np.array_equal(grad_1, grad_2)) + def test_compare(self): + with _test_eager_guard(): + self.func_compare() + self.func_compare() + class TestRaiseNoDoubleGradOp(TestCase): def raise_no_grad_op(self): diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py index ca9a489c7496f33cb084f1cd43158cebc7a1add6..b75dc2c964ca0b22219de1b33cdbfc3d74c19e45 100644 --- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py @@ -238,6 +238,7 @@ def test_check_forward_backward_with_scale_and_bias(self): self.check_forward_backward(shape=[1, 3, 4, 5], begin_norm_axis=1) + self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1) self.check_forward_backward( shape=[2, 3, 4, 5], @@ -432,4 +433,5 @@ class TestGetSetKeepLayerNormScaleBiasFP32Flag(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py index 2ffe523ef6dda18a24813e702a1892c335ba6a68..531e9663a2b728a2871dff404425b063a0c47e67 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,6 +18,8 @@ import unittest from unittest import TestCase import numpy as np import paddle +from paddle.fluid.framework import _test_eager_guard +import paddle.fluid.core as core def _dygraph_guard_(func): @@ -62,7 +64,7 @@ class TestDygraphDoubleGrad(TestCase): allow_unused=allow_unused) @dygraph_guard - def test_exception(self): + def func_exception(self): with self.assertRaises(AssertionError): self.grad(None, None) @@ -91,8 +93,13 @@ class TestDygraphDoubleGrad(TestCase): with self.assertRaises(AssertionError): self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1) + def test_exception(self): + with _test_eager_guard(): + self.func_exception() + self.func_exception() + @dygraph_guard - def test_simple_example(self): + def func_simple_example(self): x = random_var(self.shape) x.stop_gradient = False y = x + 1 @@ -121,8 +128,13 @@ class TestDygraphDoubleGrad(TestCase): self.assertNotEqual(grad_with_none_and_not_none.stop_gradient, create_graph) + def test_simple_example(self): + with _test_eager_guard(): + self.func_simple_example() + self.func_simple_example() + @dygraph_guard - def test_none_one_initial_gradient(self): + def func_none_one_initial_gradient(self): numel = 1 for s in self.shape: numel *= s @@ -188,8 +200,13 @@ class TestDygraphDoubleGrad(TestCase): np.array_equal(grad_z.numpy(), original_random_grad_z)) + def test_none_one_initial_gradient(self): + with _test_eager_guard(): + self.func_none_one_initial_gradient() + self.func_none_one_initial_gradient() + @dygraph_guard - def test_example_with_gradient_accumulation_and_create_graph(self): + def func_example_with_gradient_accumulation_and_create_graph(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -212,17 +229,25 @@ class TestDygraphDoubleGrad(TestCase): (x_np > 0) * 2).astype('float32') self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss =
fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() - x_grad_actual = x.gradient() - x_grad_expected = (2.0 / float(numel) * - (x_np + dx_expected * - (x_np > 0) * 2 / float(numel))).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + x_grad_actual = x.gradient() + x_grad_expected = (2.0 / float(numel) * ( + x_np + dx_expected * + (x_np > 0) * 2 / float(numel))).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def test_example_with_gradient_accumulation_and_create_graph(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_create_graph() + self.func_example_with_gradient_accumulation_and_create_graph() @dygraph_guard - def test_example_with_gradient_accumulation_and_no_grad_vars(self): + def func_example_with_gradient_accumulation_and_no_grad_vars(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -246,17 +271,25 @@ class TestDygraphDoubleGrad(TestCase): (x_np > 0) * 2).astype('float32') self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() + + x_grad_actual = x.gradient() + x_grad_expected = (2.0 / float(numel) * ( + x_np + dx_expected * + (x_np > 0) * 4 / float(numel))).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) - x_grad_actual = x.gradient() - x_grad_expected = (2.0 / float(numel) * - (x_np + dx_expected * - (x_np > 0) * 4 / float(numel))).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + def test_example_with_gradient_accumulation_and_no_grad_vars(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_no_grad_vars() + self.func_example_with_gradient_accumulation_and_no_grad_vars() @dygraph_guard - def test_example_with_gradient_accumulation_and_not_create_graph(self): + def func_example_with_gradient_accumulation_and_not_create_graph(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -279,12 +312,20 @@ class TestDygraphDoubleGrad(TestCase): self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() - x_grad_actual = x.gradient() - x_grad_expected = (2.0 * x_np / float(numel)).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + x_grad_actual = x.gradient() + x_grad_expected = (2.0 * x_np / float(numel)).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def test_example_with_gradient_accumulation_and_not_create_graph(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_not_create_graph() + self.func_example_with_gradient_accumulation_and_not_create_graph() class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py index 69bca8dd9ef15459021f44fd1b4887e636516ec6..66f2e871dac462c8e6e47357e7367755d2fc0cfc 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py @@ -849,6 
+849,38 @@ def ref_softsign(x): return out +class XPUTestSoftshrinkOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'softshrink' + self.use_dynamic_create_class = False + + class XPUTestSoftshrink(TestActivationOPBase): + def set_case(self): + self.op_type = "softshrink" + self.dtype = self.in_type + + threshold = 0.5 + np.random.seed(1023) + x = np.random.uniform(0.25, 10, [10, 12]).astype(self.dtype) + out = ref_softshrink(x, threshold) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('softshrink') +for stype in support_types: + create_test_class(globals(), XPUTestSoftshrinkOP, stype) + + +def ref_softshrink(x, threshold=0.5): + out = np.copy(x) + out = (out < -threshold) * (out + threshold) + (out > threshold) * ( + out - threshold) + return out + + class XPUTestSwishOP(XPUOpTestWrapper): def __init__(self): self.op_name = 'swish' @@ -879,5 +911,36 @@ def ref_swish(x): return out +class XPUTestThresholdedReluOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'thresholded_relu' + self.use_dynamic_create_class = False + + class XPUTestThresholdedRelu(TestActivationOPBase): + def set_case(self): + self.op_type = "thresholded_relu" + self.dtype = self.in_type + + threshold = 1.0 + np.random.seed(1024) + x = np.random.uniform(-20, 20, [10, 12]).astype(self.dtype) + x[np.abs(x) < 0.005] = 0.02 + out = ref_thresholded_relu(x, threshold) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('thresholded_relu') +for stype in support_types: + create_test_class(globals(), XPUTestThresholdedReluOP, stype) + + +def ref_thresholded_relu(x, threshold=1.0): + out = (x > threshold) * x + return out + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py index 2ad79dd0cca00585b01065e1ae6fbb34da4970d4..9999217041859f43a26b5cb071a2f4942634de2d 100755 --- a/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py @@ -21,6 +21,8 @@ import random import sys sys.path.append("../") from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types +from xpu.get_test_cover_info import XPUOpTestWrapper paddle.enable_static() np.set_printoptions(threshold=np.inf) @@ -73,188 +75,198 @@ def seqconv(x, return np.dot(col, filter) -class TestSeqProject(XPUOpTest): - def setUp(self): - self.init_test_case() - self.op_type = 'sequence_conv' - self.use_xpu = True - - if self.context_length == 1 \ - and self.context_start == 0 \ - and self.padding_trainable: - print("If context_start is 0 " \ - "and context_length is 1," \ - " padding_trainable should be false.") - return - - # one level, batch size - x = np.random.uniform(-6.10907e-05, 0.000104218, - [self.input_size[0], - self.input_size[1]]).astype('float32') - w = np.random.uniform(-3.17068e-05, 0.000159822, [ - self.context_length * self.input_size[1], self.output_represention - ]).astype('float32') - - begin_pad = np.max([0, -self.context_start]) - end_pad = np.max([0, self.context_start + self.context_length - 1]) - total_pad = begin_pad + end_pad - padding_data = np.random.uniform( - 0, 0, [total_pad, self.input_size[1]]).astype('float32') - self.pad_data = padding_data - self.inputs = { - 'X': (x, 
self.lod), - 'Filter': w, - } - self.inputs_val = ['X', 'Filter'] - self.inputs_val_no_x = ['Filter'] - self.inputs_val_no_f = ['X'] - - if total_pad != 0: - self.inputs['PaddingData'] = padding_data - self.inputs_val = ['X', 'PaddingData', 'Filter'] - self.inputs_val_no_x = ['PaddingData', 'Filter'] - self.inputs_val_no_f = ['PaddingData', 'X'] - - self.attrs = { - 'contextStart': self.context_start, - 'contextLength': self.context_length, - 'paddingTrainable': self.padding_trainable, - 'contextStride': self.context_stride - } - out = seqconv(x, self.lod, w, self.context_length, self.context_start, - self.padding_trainable, self.pad_data) - self.outputs = {'Out': out} - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad_input(self): - self.check_grad(['X'], 'Out', no_grad_set=set(self.inputs_val_no_x)) - - def test_check_grad_padding_data(self): - if self.padding_trainable: +class XPUTestSequenceConv(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'sequence_conv' + + class TestSeqProject(XPUOpTest): + def setUp(self): + self.init_test_case() + self.op_type = 'sequence_conv' + self.dtype = self.in_type + self.use_xpu = True + + if self.context_length == 1 \ + and self.context_start == 0 \ + and self.padding_trainable: + print("If context_start is 0 " \ + "and context_length is 1," \ + " padding_trainable should be false.") + return + + # one level, batch size + x = np.random.uniform(-6.10907e-05, 0.000104218, + [self.input_size[0], + self.input_size[1]]).astype(self.dtype) + w = np.random.uniform(-3.17068e-05, 0.000159822, [ + self.context_length * self.input_size[1], + self.output_represention + ]).astype(self.dtype) + + begin_pad = np.max([0, -self.context_start]) + end_pad = np.max([0, self.context_start + self.context_length - 1]) + total_pad = begin_pad + end_pad + padding_data = np.random.uniform( + 0, 0, [total_pad, self.input_size[1]]).astype(self.dtype) + self.pad_data = padding_data + self.inputs = { + 'X': (x, self.lod), + 'Filter': w, + } + self.inputs_val = ['X', 'Filter'] + self.inputs_val_no_x = ['Filter'] + self.inputs_val_no_f = ['X'] + + if total_pad != 0: + self.inputs['PaddingData'] = padding_data + self.inputs_val = ['X', 'PaddingData', 'Filter'] + self.inputs_val_no_x = ['PaddingData', 'Filter'] + self.inputs_val_no_f = ['PaddingData', 'X'] + + self.attrs = { + 'contextStart': self.context_start, + 'contextLength': self.context_length, + 'paddingTrainable': self.padding_trainable, + 'contextStride': self.context_stride + } + out = seqconv(x, self.lod, w, self.context_length, + self.context_start, self.padding_trainable, + self.pad_data) + self.outputs = {'Out': out} + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad_input(self): + self.check_grad(['X'], 'Out', no_grad_set=set(self.inputs_val_no_x)) + + def test_check_grad_padding_data(self): + if self.padding_trainable: + self.check_grad( + ['PaddingData'], 'Out', no_grad_set=set(['X', 'Filter'])) + + def test_check_grad_Filter(self): self.check_grad( - ['PaddingData'], 'Out', no_grad_set=set(['X', 'Filter'])) - - def test_check_grad_Filter(self): - self.check_grad( - ['Filter'], 'Out', no_grad_set=set(self.inputs_val_no_f)) - - def test_check_grad_input_filter(self): - if self.padding_trainable: - self.check_grad( - ['X', 'Filter'], 'Out', no_grad_set=set(['PaddingData'])) - - def test_check_grad_padding_input(self): - if self.padding_trainable: - self.check_grad( - 
self.inputs_val_no_f, 'Out', no_grad_set=set(['Filter'])) - - def test_check_grad_padding_filter(self): - if self.padding_trainable: - self.check_grad(self.inputs_val_no_x, 'Out', no_grad_set=set(['X'])) - - def init_test_case(self): - self.input_row = 7 - self.input_col = 25 - self.context_start = -2 - self.context_length = 5 - self.padding_trainable = False - self.context_stride = 1 - - self.input_size = [self.input_row, self.input_col] - offset_lod = [[0, 1, self.input_row]] - self.lod = [[]] - # convert from offset-based lod to length-based lod - for i in range(len(offset_lod[0]) - 1): - self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) - self.output_represention = 8 # output feature size - - -class TestSeqProjectCase1(TestSeqProject): - def init_test_case(self): - self.input_row = 11 - self.context_start = -2 - self.context_length = 5 - self.padding_trainable = False - self.context_stride = 1 - - self.input_size = [self.input_row, 50] - offset_lod = [[0, 4, 5, 8, self.input_row]] - self.lod = [[]] - # convert from offset-based lod to length-based lod - for i in range(len(offset_lod[0]) - 1): - self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) - self.output_represention = 8 # output feature size - - -class TestSeqProjectCase2Len0(TestSeqProject): - def init_test_case(self): - self.input_row = 11 - self.context_start = -2 - self.context_length = 5 - self.padding_trainable = False - self.context_stride = 1 - - self.input_size = [self.input_row, 50] - offset_lod = [[0, 0, 4, 5, 5, 8, self.input_row, self.input_row]] - self.lod = [[]] - # convert from offset-based lod to length-based lod - for i in range(len(offset_lod[0]) - 1): - self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) - self.output_represention = 8 # output feature size - - -class TestSeqProjectCase3(TestSeqProject): - def init_test_case(self): - self.input_row = 25 - self.context_start = -2 - self.context_length = 5 - self.padding_trainable = False - self.context_stride = 1 - - self.input_size = [self.input_row, 25] - idx = list(range(self.input_size[0])) - del idx[0] - offset_lod = [[0] + np.sort(random.sample(idx, 8)).tolist() + - [self.input_size[0]]] - self.lod = [[]] - # convert from offset-based lod to length-based lod - for i in range(len(offset_lod[0]) - 1): - self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) - self.output_represention = 8 # output feature size - - -class TestSeqProjectCase4(TestSeqProject): - def init_test_case(self): - self.input_row = 7835 - self.input_col = 128 - self.context_start = -2 - self.context_length = 5 - self.padding_trainable = False - self.context_stride = 1 - - self.input_size = [self.input_row, self.input_col] - offset_lod = [[ - 0, 1, 2, 3, 131, 241, 242, 263, 264, 265, 266, 267, 268, 387, 515, - 516, 644, 645, 772, 794, 922, 923, 924, 944, 945, 1073, 1074, 1202, - 1330, 1458, 1556, 1557, 1558, 1686, 1748, 1876, 1912, 1913, 1914, - 2032, 2066, 2194, 2308, 2309, 2347, 2475, 2476, 2477, 2478, 2606, - 2607, 2735, 2736, 2737, 2738, 2838, 2966, 2967, 2968, 2969, 3097, - 3225, 3353, 3481, 3482, 3520, 3642, 3643, 3754, 3882, 3883, 4010, - 4011, 4012, 4140, 4219, 4228, 4356, 4357, 4415, 4475, 4476, 4604, - 4605, 4606, 4694, 4695, 4808, 4936, 4961, 4962, 5004, 5132, 5260, - 5312, 5440, 5441, 5569, 5570, 5675, 5676, 5750, 5810, 5811, 5939, - 6021, 6149, 6277, 6278, 6364, 6425, 6519, 6647, 6648, 6739, 6867, - 6995, 6996, 7120, 7223, 7244, 7367, 7407, 7408, 7467, 7595, 7699, - 7827, 7835 - ]] - self.lod = [[]] - # convert from offset-based lod to 
length-based lod - for i in range(len(offset_lod[0]) - 1): - self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) - self.output_represention = 8 # output feature size + ['Filter'], 'Out', no_grad_set=set(self.inputs_val_no_f)) + + def test_check_grad_input_filter(self): + if self.padding_trainable: + self.check_grad( + ['X', 'Filter'], 'Out', no_grad_set=set(['PaddingData'])) + + def test_check_grad_padding_input(self): + if self.padding_trainable: + self.check_grad( + self.inputs_val_no_f, 'Out', no_grad_set=set(['Filter'])) + + def test_check_grad_padding_filter(self): + if self.padding_trainable: + self.check_grad( + self.inputs_val_no_x, 'Out', no_grad_set=set(['X'])) + + def init_test_case(self): + self.input_row = 7 + self.input_col = 25 + self.context_start = -2 + self.context_length = 5 + self.padding_trainable = False + self.context_stride = 1 + + self.input_size = [self.input_row, self.input_col] + offset_lod = [[0, 1, self.input_row]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) + self.output_represention = 8 # output feature size + + class TestSeqProjectCase1(TestSeqProject): + def init_test_case(self): + self.input_row = 11 + self.context_start = -2 + self.context_length = 5 + self.padding_trainable = False + self.context_stride = 1 + + self.input_size = [self.input_row, 50] + offset_lod = [[0, 4, 5, 8, self.input_row]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) + self.output_represention = 8 # output feature size + + class TestSeqProjectCase2Len0(TestSeqProject): + def init_test_case(self): + self.input_row = 11 + self.context_start = -2 + self.context_length = 5 + self.padding_trainable = False + self.context_stride = 1 + + self.input_size = [self.input_row, 50] + offset_lod = [[0, 0, 4, 5, 5, 8, self.input_row, self.input_row]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) + self.output_represention = 8 # output feature size + + class TestSeqProjectCase3(TestSeqProject): + def init_test_case(self): + self.input_row = 25 + self.context_start = -2 + self.context_length = 5 + self.padding_trainable = False + self.context_stride = 1 + + self.input_size = [self.input_row, 25] + idx = list(range(self.input_size[0])) + del idx[0] + offset_lod = [[0] + np.sort(random.sample(idx, 8)).tolist() + + [self.input_size[0]]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) + self.output_represention = 8 # output feature size + + class TestSeqProjectCase4(TestSeqProject): + def init_test_case(self): + self.input_row = 7835 + self.input_col = 128 + self.context_start = -2 + self.context_length = 5 + self.padding_trainable = False + self.context_stride = 1 + + self.input_size = [self.input_row, self.input_col] + offset_lod = [[ + 0, 1, 2, 3, 131, 241, 242, 263, 264, 265, 266, 267, 268, 387, + 515, 516, 644, 645, 772, 794, 922, 923, 924, 944, 945, 1073, + 1074, 1202, 1330, 1458, 1556, 1557, 1558, 1686, 1748, 1876, + 1912, 1913, 1914, 2032, 2066, 2194, 2308, 2309, 2347, 2475, + 2476, 2477, 2478, 2606, 2607, 2735, 2736, 2737, 2738, 2838, + 2966, 2967, 2968, 2969, 3097, 3225, 
3353, 3481, 3482, 3520, + 3642, 3643, 3754, 3882, 3883, 4010, 4011, 4012, 4140, 4219, + 4228, 4356, 4357, 4415, 4475, 4476, 4604, 4605, 4606, 4694, + 4695, 4808, 4936, 4961, 4962, 5004, 5132, 5260, 5312, 5440, + 5441, 5569, 5570, 5675, 5676, 5750, 5810, 5811, 5939, 6021, + 6149, 6277, 6278, 6364, 6425, 6519, 6647, 6648, 6739, 6867, + 6995, 6996, 7120, 7223, 7244, 7367, 7407, 7408, 7467, 7595, + 7699, 7827, 7835 + ]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) + self.output_represention = 8 # output feature size + + +support_types = get_xpu_op_support_types('sequence_conv') +for stype in support_types: + create_test_class(globals(), XPUTestSequenceConv, stype) class TestSeqConvApi(unittest.TestCase): diff --git a/python/paddle/incubate/tensor/math.py b/python/paddle/incubate/tensor/math.py index cb85ad0b7411c120b2704eb1639889202d77a0de..cb5458cf550103896a730fc7f248d3b8bfd88bbc 100644 --- a/python/paddle/incubate/tensor/math.py +++ b/python/paddle/incubate/tensor/math.py @@ -30,7 +30,7 @@ def segment_sum(data, segment_ids, name=None): where sum is over j such that `segment_ids[j] == i`. Args: - data (Tensor): A tensor, available data type float32, float64. + data (Tensor): A tensor, available data type float32, float64, int32, int64. segment_ids (Tensor): A 1-D tensor, which have the same size with the first dimension of input data. Available data type is int32, int64. @@ -57,7 +57,8 @@ def segment_sum(data, segment_ids, name=None): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "SUM") return out - check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool") + check_variable_and_dtype(data, "X", ("float32", "float64", "int32", + "int64"), "segment_pool") check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"), "segment_pool") @@ -85,7 +86,7 @@ def segment_mean(data, segment_ids, name=None): of all index 'segment_ids[j] == i'. Args: - data (tensor): a tensor, available data type float32, float64. + data (tensor): a tensor, available data type float32, float64, int32, int64. segment_ids (tensor): a 1-d tensor, which have the same size with the first dimension of input data. available data type is int32, int64. @@ -113,7 +114,8 @@ def segment_mean(data, segment_ids, name=None): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MEAN") return out - check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool") + check_variable_and_dtype(data, "X", ("float32", "float64", "int32", + "int64"), "segment_pool") check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"), "segment_pool") @@ -140,7 +142,7 @@ def segment_min(data, segment_ids, name=None): where min is over j such that `segment_ids[j] == i`. Args: - data (tensor): a tensor, available data type float32, float64. + data (tensor): a tensor, available data type float32, float64, int32, int64. segment_ids (tensor): a 1-d tensor, which have the same size with the first dimension of input data. available data type is int32, int64. 
@@ -167,7 +169,8 @@ def segment_min(data, segment_ids, name=None): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MIN") return out - check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool") + check_variable_and_dtype(data, "X", ("float32", "float64", "int32", + "int64"), "segment_pool") check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"), "segment_pool") @@ -194,7 +197,7 @@ def segment_max(data, segment_ids, name=None): where max is over j such that `segment_ids[j] == i`. Args: - data (tensor): a tensor, available data type float32, float64. + data (tensor): a tensor, available data type float32, float64, int32, int64. segment_ids (tensor): a 1-d tensor, which have the same size with the first dimension of input data. available data type is int32, int64. @@ -221,7 +224,8 @@ def segment_max(data, segment_ids, name=None): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MAX") return out - check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool") + check_variable_and_dtype(data, "X", ("float32", "float64", "int32", + "int64"), "segment_pool") check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"), "segment_pool") diff --git a/python/paddle/static/input.py b/python/paddle/static/input.py index f06c45cc369737403025ed264815a98b81acc6da..7c0c71951aa1d7a566cabf73ecb9d26e03b8dab6 100644 --- a/python/paddle/static/input.py +++ b/python/paddle/static/input.py @@ -193,7 +193,7 @@ class InputSpec(object): print(x_spec) # InputSpec(shape=(2, 2), dtype=VarType.FP32, name=x) """ - if isinstance(tensor, (Variable, core.VarBase)): + if isinstance(tensor, (Variable, core.VarBase, core.eager.Tensor)): return cls(tensor.shape, tensor.dtype, name or tensor.name) else: raise ValueError( diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 998a9425d4121b4871246417e0578f13e55c54af..37b1536e3f2989dfeee7746f0ceec47e2d8c69ef 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -141,6 +141,14 @@ output : Tensor invoke : full_like(x, 1, dtype, place) +- api : pool2d + args : (Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) + output : Tensor(out) + infer_meta : + func : PoolInferMeta + kernel: + func : pool2d + - api : reshape args : (Tensor x, ScalarArray shape) output : Tensor(out) diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index d91b76bb70314a2d516b8a384cf3406b7f9e4d0d..bf3d7b3d19eab806706f1d2d654957aac5b33434 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -698,7 +698,7 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self self.outputs['types'], 'SetKernelOutput', code_indent, inplace_flag) api_func_name = self.get_api_func_name() + ('_' if inplace_flag else '') return f""" -{code_indent} auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( +{code_indent} const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( {code_indent} "{self.kernel['func'][0]}", {{kernel_backend, kernel_layout, kernel_data_type}}); {code_indent} VLOG(6) << "{self.api} API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]"; {code_indent} VLOG(6) << "{self.api} API kernel: " << kernel; diff --git 
a/tools/infrt/custom_pdop.td b/tools/infrt/custom_pdop.td index f754767259563f2cd64bac92adf76249b18af11f..861b31941200fd8a7482482cb683ff969bd05a18 100644 --- a/tools/infrt/custom_pdop.td +++ b/tools/infrt/custom_pdop.td @@ -23,16 +23,6 @@ def PD_FetchOp : PD_Op<"fetch", [Terminator]> { let arguments = (ins PD_Tensor :$inputs, StrAttr:$name); } -def PD_ReturnOp : PD_Op<"return", [Terminator]> { - let summary = "return Op"; - - let description = [{ - Fetch tensor from the graph. - }]; - - let arguments = (ins Variadic:$inputs); -} - def PD_GraphOp : PD_Op<"graph", [SingleBlockImplicitTerminator<"::infrt::ReturnOp">]> { let summary = "paddle graph Op"; let description = [{ diff --git a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py index 027dfe4328a55ff246928cbc9ab6d3d36f15e1fd..8855e1eee38717a6cffc14e9c1762af36e94fa84 100644 --- a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py +++ b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py @@ -16,8 +16,6 @@ import paddle.fluid.framework as framework from paddle.fluid import core from paddle import compat as cpt -ops_having_canonicalization = {"elementwise_add", } - # collect original ops: op which has both inference and grid defination def get_original_ops(): @@ -186,7 +184,7 @@ def generate_all_ops_inputs_outputs_map(op_descs): cpp_style_ops_outputs_map_str = start_ + ops_outputs_str + "\n};" # 3. Write to header file - dst_head_file = "../../paddle/infrt/dialect/pd_ops_info.h" + dst_head_file = "../../paddle/infrt/dialect/pd/common/pd_ops_info.h" with open(dst_head_file, 'w') as ops_inputs_outputs_head_file: ops_inputs_outputs_head_file.write(cpp_style_ops_inputs_map_str) ops_inputs_outputs_head_file.write("\n\n") @@ -195,7 +193,7 @@ def generate_all_ops_inputs_outputs_map(op_descs): # funtion to generate paddle op dialect file def convert_op_proto_into_mlir(op_descs): - dst_dialect_file = "../../paddle/infrt/dialect/pd_ops.td" + dst_dialect_file = "../../paddle/infrt/dialect/pd/ir/pd_ops.td" custom_dialect_file = "custom_pdop.td" # 1. Head files @@ -214,7 +212,7 @@ def convert_op_proto_into_mlir(op_descs): "include \"mlir/Interfaces/InferTypeOpInterface.td\"", "include \"mlir/Interfaces/LoopLikeInterface.td\"", "include \"mlir/IR/OpBase.td\"", - "include \"paddle/infrt/dialect/pd_op_base.td\"", + "include \"paddle/infrt/dialect/pd/ir/pd_op_base.td\"", "", ] @@ -245,7 +243,6 @@ def convert_op_proto_into_mlir(op_descs): op_type=op_type, left_brace="{") SUMMARY = ' let summary = "{} op";\n'.format(op_type) - CANONICALIZATION = "let hasCanonicalizer = 1;" if op_type in ops_having_canonicalization else "" # 2.2 Description contents = "" @@ -348,7 +345,6 @@ def convert_op_proto_into_mlir(op_descs): ops_mlir_file.write(DESCRIPTION) ops_mlir_file.write(ARGUMENTS) ops_mlir_file.write(RESULTS) - ops_mlir_file.write(CANONICALIZATION) ops_mlir_file.write("}\n") print("Skipped ops num: " + str(len(skipped_op_list)))
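For reference, the segment pooling semantics documented in python/paddle/incubate/tensor/math.py above (now covering int32/int64 data as well as float32/float64) can be sketched in a few lines of NumPy. naive_segment_sum is a hypothetical reference helper for illustration only, not part of the Paddle API, and it assumes segment_ids is sorted and non-negative:

import numpy as np

def naive_segment_sum(data, segment_ids):
    # out[i] = sum of data[j] over all j with segment_ids[j] == i
    num_segments = int(segment_ids[-1]) + 1
    out = np.zeros((num_segments,) + data.shape[1:], dtype=data.dtype)
    for j, seg in enumerate(segment_ids):
        out[seg] += data[j]
    return out

data = np.array([[1, 2, 3], [3, 2, 1], [4, 5, 6]], dtype=np.float32)
segment_ids = np.array([0, 0, 1], dtype=np.int64)
print(naive_segment_sum(data, segment_ids))
# [[4. 4. 4.]
#  [4. 5. 6.]]

The MEAN/MIN/MAX variants only swap the reduction applied over each segment; in this sketch an id that never occurs simply leaves out[i] at zero.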
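Similarly, the offset-based to length-based LoD conversion repeated throughout the sequence_conv tests above is just an adjacent difference over each level of offsets. A minimal sketch with a hypothetical helper name, assuming plain Python lists of sorted offsets:

def offset_lod_to_length_lod(offset_lod):
    # offsets [0, 4, 5, 8, 11] describe sequences covering rows [0,4), [4,5),
    # [5,8), [8,11); the length-based form stores the sizes instead
    return [[level[i + 1] - level[i] for i in range(len(level) - 1)]
            for level in offset_lod]

print(offset_lod_to_length_lod([[0, 4, 5, 8, 11]]))  # [[4, 1, 3, 3]]

This mirrors the loop each init_test_case runs before assigning self.lod.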