Commit c72cf5fa, authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into move_temporal_shift_to_phi

......@@ -61,6 +61,7 @@ set(PADDLE2ONNX_OPTIONAL_ARGS
-DONNX_CUSTOM_PROTOC_PATH=${PROTOC_BIN_PATH}
-DWITH_STATIC=OFF
-DCMAKE_INSTALL_PREFIX=${PADDLE2ONNX_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
......
......@@ -4,7 +4,7 @@ if(WITH_PYTHON)
endif()
proto_library(interceptor_message_proto SRCS interceptor_message.proto)
if(WITH_DISTRIBUTE AND WITH_PSCORE AND NOT (WITH_ASCEND OR WITH_ASCEND_CL))
if(WITH_DISTRIBUTE AND WITH_PSCORE)
set(BRPC_DEPS brpc ssl crypto protobuf zlib leveldb snappy gflags glog)
else()
set(BRPC_DEPS "")
......
......@@ -67,8 +67,7 @@ bool MessageBus::IsInit() const { return is_init_; }
MessageBus::~MessageBus() {
VLOG(3) << "Message bus releases resource.";
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
!defined(PADDLE_WITH_ASCEND_CL)
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
server_.Stop(1000);
server_.Join();
#endif
......@@ -87,8 +86,7 @@ bool MessageBus::Send(int64_t dst_rank,
IsInit(), true,
platform::errors::PreconditionNotMet(
"Using message bus since it has not been initialized."));
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
!defined(PADDLE_WITH_ASCEND_CL)
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
int retry_time = 0; // message bus will retry sending up to 10 times
while (retry_time < 10) {
++retry_time;
......@@ -173,8 +171,7 @@ void MessageBus::ListenPort() {
LOG(INFO) << "No need listen to port since training on single card.";
return;
}
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
!defined(PADDLE_WITH_ASCEND_CL)
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
// this function keeps listening on the port and handles incoming messages
PADDLE_ENFORCE_EQ(
server_.AddService(&message_service_, brpc::SERVER_DOESNT_OWN_SERVICE), 0,
......@@ -203,8 +200,7 @@ void MessageBus::ListenPort() {
#endif
}
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
!defined(PADDLE_WITH_ASCEND_CL)
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
bool MessageBus::SendInterRank(int64_t dst_rank,
const InterceptorMessage& interceptor_message) {
const auto& dst_addr = GetAddr(dst_rank);
......
......@@ -20,8 +20,7 @@
#include <thread>
#include <unordered_map>
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
!defined(PADDLE_WITH_ASCEND_CL)
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
#include "brpc/channel.h"
#include "brpc/server.h"
#include "paddle/fluid/distributed/fleet_executor/message_service.h"
......@@ -64,8 +63,7 @@ class MessageBus final {
const std::string& GetAddr(int64_t rank) const;
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
!defined(PADDLE_WITH_ASCEND_CL)
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
// send the message across ranks (dst rank differs from src rank)
bool SendInterRank(int64_t dst_rank,
const InterceptorMessage& interceptor_message);
......@@ -81,8 +79,7 @@ class MessageBus final {
// the ip address to listen on
std::string addr_;
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
!defined(PADDLE_WITH_ASCEND_CL)
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
MessageServiceImpl message_service_;
// brpc server
brpc::Server server_;
......
......@@ -11,8 +11,7 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
!defined(PADDLE_WITH_ASCEND_CL)
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
#include "paddle/fluid/distributed/fleet_executor/message_service.h"
#include "brpc/server.h"
#include "paddle/fluid/distributed/fleet_executor/global.h"
......
......@@ -11,8 +11,7 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
!defined(PADDLE_WITH_ASCEND_CL)
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
#pragma once
#include "brpc/server.h"
......
......@@ -115,6 +115,7 @@ message TableParameter {
optional CommonAccessorParameter common = 6;
optional TableType type = 7;
optional bool compress_in_save = 8 [ default = false ];
optional GraphParameter graph_parameter = 9;
}
message TableAccessorParameter {
......@@ -211,3 +212,25 @@ message SparseAdamSGDParameter { // SparseAdamSGDRule
optional double ada_epsilon = 5 [ default = 1e-08 ];
repeated float weight_bounds = 6;
}
message GraphParameter {
optional int32 task_pool_size = 1 [ default = 24 ];
optional bool gpups_mode = 2 [ default = false ];
optional string gpups_graph_sample_class = 3
[ default = "CompleteGraphSampler" ];
optional string gpups_graph_sample_args = 4 [ default = "" ];
optional bool use_cache = 5 [ default = true ];
optional float cache_ratio = 6 [ default = 0.3 ];
optional int32 cache_ttl = 7 [ default = 5 ];
optional GraphFeature graph_feature = 8;
optional string table_name = 9 [ default = "" ];
optional string table_type = 10 [ default = "" ];
optional int32 gpups_mode_shard_num = 11 [ default = 127 ];
optional int32 gpu_num = 12 [ default = 1 ];
}
message GraphFeature {
repeated string name = 1;
repeated string dtype = 2;
repeated int32 shape = 3;
}
\ No newline at end of file
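The two new messages above are consumed later in this change by GraphTable::initialize(const GraphParameter&) and by GraphPyService. Below is a minimal sketch of filling them through the generated protobuf setters; the include path and the field values are assumptions for illustration, not part of this diff.

// Illustrative sketch, not part of this diff: populating the new
// GraphParameter / GraphFeature messages via the generated protobuf API.
#include "paddle/fluid/distributed/ps.pb.h"  // generated header; path assumed

void FillGraphParameter(::paddle::distributed::TableParameter *table_proto) {
  ::paddle::distributed::GraphParameter *graph_proto =
      table_proto->mutable_graph_parameter();
  graph_proto->set_task_pool_size(24);
  graph_proto->set_gpups_mode(false);
  graph_proto->set_table_name("example_edge_table");  // placeholder value
  graph_proto->set_table_type("edge_table");          // placeholder value
  graph_proto->set_use_cache(false);

  ::paddle::distributed::GraphFeature *feature =
      graph_proto->mutable_graph_feature();
  // name / dtype / shape are parallel repeated fields, one entry per feature.
  feature->add_name("feat");  // placeholder feature name
  feature->add_dtype("float32");
  feature->add_shape(64);
}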
......@@ -44,7 +44,7 @@ void GraphPsService_Stub::service(
}
}
int GraphBrpcClient::get_server_index_by_id(uint64_t id) {
int GraphBrpcClient::get_server_index_by_id(int64_t id) {
int shard_num = get_shard_num();
int shard_per_server = shard_num % server_size == 0
? shard_num / server_size
......@@ -53,7 +53,7 @@ int GraphBrpcClient::get_server_index_by_id(uint64_t id) {
}
std::future<int32_t> GraphBrpcClient::get_node_feat(
const uint32_t &table_id, const std::vector<uint64_t> &node_ids,
const uint32_t &table_id, const std::vector<int64_t> &node_ids,
const std::vector<std::string> &feature_names,
std::vector<std::vector<std::string>> &res) {
std::vector<int> request2server;
......@@ -66,7 +66,7 @@ std::future<int32_t> GraphBrpcClient::get_node_feat(
}
}
size_t request_call_num = request2server.size();
std::vector<std::vector<uint64_t>> node_id_buckets(request_call_num);
std::vector<std::vector<int64_t>> node_id_buckets(request_call_num);
std::vector<std::vector<int>> query_idx_buckets(request_call_num);
for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) {
int server_index = get_server_index_by_id(node_ids[query_idx]);
......@@ -129,7 +129,7 @@ std::future<int32_t> GraphBrpcClient::get_node_feat(
closure->request(request_idx)
->add_params((char *)node_id_buckets[request_idx].data(),
sizeof(uint64_t) * node_num);
sizeof(int64_t) * node_num);
std::string joint_feature_name =
paddle::string::join_strings(feature_names, '\t');
closure->request(request_idx)
......@@ -179,9 +179,9 @@ std::future<int32_t> GraphBrpcClient::clear_nodes(uint32_t table_id) {
return fut;
}
std::future<int32_t> GraphBrpcClient::add_graph_node(
uint32_t table_id, std::vector<uint64_t> &node_id_list,
uint32_t table_id, std::vector<int64_t> &node_id_list,
std::vector<bool> &is_weighted_list) {
std::vector<std::vector<uint64_t>> request_bucket;
std::vector<std::vector<int64_t>> request_bucket;
std::vector<std::vector<bool>> is_weighted_bucket;
bool add_weight = is_weighted_list.size() > 0;
std::vector<int> server_index_arr;
......@@ -191,7 +191,7 @@ std::future<int32_t> GraphBrpcClient::add_graph_node(
if (index_mapping[server_index] == -1) {
index_mapping[server_index] = request_bucket.size();
server_index_arr.push_back(server_index);
request_bucket.push_back(std::vector<uint64_t>());
request_bucket.push_back(std::vector<int64_t>());
if (add_weight) is_weighted_bucket.push_back(std::vector<bool>());
}
request_bucket[index_mapping[server_index]].push_back(
......@@ -229,7 +229,7 @@ std::future<int32_t> GraphBrpcClient::add_graph_node(
size_t node_num = request_bucket[request_idx].size();
closure->request(request_idx)
->add_params((char *)request_bucket[request_idx].data(),
sizeof(uint64_t) * node_num);
sizeof(int64_t) * node_num);
if (add_weight) {
bool weighted[is_weighted_bucket[request_idx].size() + 1];
for (size_t j = 0; j < is_weighted_bucket[request_idx].size(); j++)
......@@ -248,8 +248,8 @@ std::future<int32_t> GraphBrpcClient::add_graph_node(
return fut;
}
std::future<int32_t> GraphBrpcClient::remove_graph_node(
uint32_t table_id, std::vector<uint64_t> &node_id_list) {
std::vector<std::vector<uint64_t>> request_bucket;
uint32_t table_id, std::vector<int64_t> &node_id_list) {
std::vector<std::vector<int64_t>> request_bucket;
std::vector<int> server_index_arr;
std::vector<int> index_mapping(server_size, -1);
for (size_t query_idx = 0; query_idx < node_id_list.size(); ++query_idx) {
......@@ -257,7 +257,7 @@ std::future<int32_t> GraphBrpcClient::remove_graph_node(
if (index_mapping[server_index] == -1) {
index_mapping[server_index] = request_bucket.size();
server_index_arr.push_back(server_index);
request_bucket.push_back(std::vector<uint64_t>());
request_bucket.push_back(std::vector<int64_t>());
}
request_bucket[index_mapping[server_index]].push_back(
node_id_list[query_idx]);
......@@ -291,7 +291,7 @@ std::future<int32_t> GraphBrpcClient::remove_graph_node(
closure->request(request_idx)
->add_params((char *)request_bucket[request_idx].data(),
sizeof(uint64_t) * node_num);
sizeof(int64_t) * node_num);
// PsService_Stub rpc_stub(get_cmd_channel(server_index));
GraphPsService_Stub rpc_stub =
getServiceStub(get_cmd_channel(server_index));
......@@ -303,9 +303,9 @@ std::future<int32_t> GraphBrpcClient::remove_graph_node(
}
// char* &buffer,int &actual_size
std::future<int32_t> GraphBrpcClient::batch_sample_neighbors(
uint32_t table_id, std::vector<uint64_t> node_ids, int sample_size,
// std::vector<std::vector<std::pair<uint64_t, float>>> &res,
std::vector<std::vector<uint64_t>> &res,
uint32_t table_id, std::vector<int64_t> node_ids, int sample_size,
// std::vector<std::vector<std::pair<int64_t, float>>> &res,
std::vector<std::vector<int64_t>> &res,
std::vector<std::vector<float>> &res_weight, bool need_weight,
int server_index) {
if (server_index != -1) {
......@@ -337,7 +337,7 @@ std::future<int32_t> GraphBrpcClient::batch_sample_neighbors(
int start = 0;
while (start < actual_size) {
res[node_idx].emplace_back(
*(uint64_t *)(node_buffer + offset + start));
*(int64_t *)(node_buffer + offset + start));
start += GraphNode::id_size;
if (need_weight) {
res_weight[node_idx].emplace_back(
......@@ -358,7 +358,7 @@ std::future<int32_t> GraphBrpcClient::batch_sample_neighbors(
closure->request(0)->set_table_id(table_id);
closure->request(0)->set_client_id(_client_id);
closure->request(0)->add_params((char *)node_ids.data(),
sizeof(uint64_t) * node_ids.size());
sizeof(int64_t) * node_ids.size());
closure->request(0)->add_params((char *)&sample_size, sizeof(int));
closure->request(0)->add_params((char *)&need_weight, sizeof(bool));
;
......@@ -380,14 +380,14 @@ std::future<int32_t> GraphBrpcClient::batch_sample_neighbors(
server2request[server_index] = request2server.size();
request2server.push_back(server_index);
}
// res.push_back(std::vector<std::pair<uint64_t, float>>());
// res.push_back(std::vector<std::pair<int64_t, float>>());
res.push_back({});
if (need_weight) {
res_weight.push_back({});
}
}
size_t request_call_num = request2server.size();
std::vector<std::vector<uint64_t>> node_id_buckets(request_call_num);
std::vector<std::vector<int64_t>> node_id_buckets(request_call_num);
std::vector<std::vector<int>> query_idx_buckets(request_call_num);
for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) {
int server_index = get_server_index_by_id(node_ids[query_idx]);
......@@ -428,7 +428,7 @@ std::future<int32_t> GraphBrpcClient::batch_sample_neighbors(
int start = 0;
while (start < actual_size) {
res[query_idx].emplace_back(
*(uint64_t *)(node_buffer + offset + start));
*(int64_t *)(node_buffer + offset + start));
start += GraphNode::id_size;
if (need_weight) {
res_weight[query_idx].emplace_back(
......@@ -459,7 +459,7 @@ std::future<int32_t> GraphBrpcClient::batch_sample_neighbors(
closure->request(request_idx)
->add_params((char *)node_id_buckets[request_idx].data(),
sizeof(uint64_t) * node_num);
sizeof(int64_t) * node_num);
closure->request(request_idx)
->add_params((char *)&sample_size, sizeof(int));
closure->request(request_idx)
......@@ -476,7 +476,7 @@ std::future<int32_t> GraphBrpcClient::batch_sample_neighbors(
}
std::future<int32_t> GraphBrpcClient::random_sample_nodes(
uint32_t table_id, int server_index, int sample_size,
std::vector<uint64_t> &ids) {
std::vector<int64_t> &ids) {
DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) {
int ret = 0;
auto *closure = (DownpourBrpcClosure *)done;
......@@ -490,7 +490,7 @@ std::future<int32_t> GraphBrpcClient::random_sample_nodes(
auto size = io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size);
int index = 0;
while (index < bytes_size) {
ids.push_back(*(uint64_t *)(buffer + index));
ids.push_back(*(int64_t *)(buffer + index));
index += GraphNode::id_size;
}
delete[] buffer;
......@@ -633,7 +633,7 @@ std::future<int32_t> GraphBrpcClient::pull_graph_list(
}
std::future<int32_t> GraphBrpcClient::set_node_feat(
const uint32_t &table_id, const std::vector<uint64_t> &node_ids,
const uint32_t &table_id, const std::vector<int64_t> &node_ids,
const std::vector<std::string> &feature_names,
const std::vector<std::vector<std::string>> &features) {
std::vector<int> request2server;
......@@ -646,7 +646,7 @@ std::future<int32_t> GraphBrpcClient::set_node_feat(
}
}
size_t request_call_num = request2server.size();
std::vector<std::vector<uint64_t>> node_id_buckets(request_call_num);
std::vector<std::vector<int64_t>> node_id_buckets(request_call_num);
std::vector<std::vector<int>> query_idx_buckets(request_call_num);
std::vector<std::vector<std::vector<std::string>>> features_idx_buckets(
request_call_num);
......@@ -696,7 +696,7 @@ std::future<int32_t> GraphBrpcClient::set_node_feat(
closure->request(request_idx)
->add_params((char *)node_id_buckets[request_idx].data(),
sizeof(uint64_t) * node_num);
sizeof(int64_t) * node_num);
std::string joint_feature_name =
paddle::string::join_strings(feature_names, '\t');
closure->request(request_idx)
......
......@@ -63,8 +63,8 @@ class GraphBrpcClient : public BrpcPsClient {
virtual ~GraphBrpcClient() {}
// given a batch of nodes, sample graph_neighbors for each of them
virtual std::future<int32_t> batch_sample_neighbors(
uint32_t table_id, std::vector<uint64_t> node_ids, int sample_size,
std::vector<std::vector<uint64_t>>& res,
uint32_t table_id, std::vector<int64_t> node_ids, int sample_size,
std::vector<std::vector<int64_t>>& res,
std::vector<std::vector<float>>& res_weight, bool need_weight,
int server_index = -1);
......@@ -75,20 +75,20 @@ class GraphBrpcClient : public BrpcPsClient {
virtual std::future<int32_t> random_sample_nodes(uint32_t table_id,
int server_index,
int sample_size,
std::vector<uint64_t>& ids);
std::vector<int64_t>& ids);
virtual std::future<int32_t> get_node_feat(
const uint32_t& table_id, const std::vector<uint64_t>& node_ids,
const uint32_t& table_id, const std::vector<int64_t>& node_ids,
const std::vector<std::string>& feature_names,
std::vector<std::vector<std::string>>& res);
virtual std::future<int32_t> set_node_feat(
const uint32_t& table_id, const std::vector<uint64_t>& node_ids,
const uint32_t& table_id, const std::vector<int64_t>& node_ids,
const std::vector<std::string>& feature_names,
const std::vector<std::vector<std::string>>& features);
virtual std::future<int32_t> clear_nodes(uint32_t table_id);
virtual std::future<int32_t> add_graph_node(
uint32_t table_id, std::vector<uint64_t>& node_id_list,
uint32_t table_id, std::vector<int64_t>& node_id_list,
std::vector<bool>& is_weighted_list);
virtual std::future<int32_t> use_neighbors_sample_cache(uint32_t table_id,
size_t size_limit,
......@@ -96,11 +96,11 @@ class GraphBrpcClient : public BrpcPsClient {
virtual std::future<int32_t> load_graph_split_config(uint32_t table_id,
std::string path);
virtual std::future<int32_t> remove_graph_node(
uint32_t table_id, std::vector<uint64_t>& node_id_list);
uint32_t table_id, std::vector<int64_t>& node_id_list);
virtual int32_t initialize();
int get_shard_num() { return shard_num; }
void set_shard_num(int shard_num) { this->shard_num = shard_num; }
int get_server_index_by_id(uint64_t id);
int get_server_index_by_id(int64_t id);
void set_local_channel(int index) {
this->local_channel = get_cmd_channel(index);
}
......
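With the client header above switched from uint64_t to int64_t, callers now pass signed 64-bit node ids throughout. The following caller-side sketch of the updated batch_sample_neighbors signature is illustrative only; obtaining a connected client and the exact include path are assumed.

// Illustrative caller-side sketch, not part of this diff.
#include <cstdint>
#include <vector>
#include "paddle/fluid/distributed/ps/service/graph_brpc_client.h"  // path assumed

void SampleNeighborsExample(paddle::distributed::GraphBrpcClient &client,
                            uint32_t table_id) {
  std::vector<int64_t> node_ids = {1, 37, 10240};  // node ids are now int64_t
  std::vector<std::vector<int64_t>> neighbors;     // one neighbor list per input id
  std::vector<std::vector<float>> weights;         // filled when need_weight is true

  auto status = client.batch_sample_neighbors(
      table_id, node_ids, /*sample_size=*/10, neighbors, weights,
      /*need_weight=*/true);
  int32_t ret = status.get();  // std::future<int32_t>; 0 indicates success by convention
  (void)ret;
}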
......@@ -140,9 +140,9 @@ int32_t GraphBrpcService::add_graph_node(Table *table,
return 0;
}
size_t node_num = request.params(0).size() / sizeof(uint64_t);
uint64_t *node_data = (uint64_t *)(request.params(0).c_str());
std::vector<uint64_t> node_ids(node_data, node_data + node_num);
size_t node_num = request.params(0).size() / sizeof(int64_t);
int64_t *node_data = (int64_t *)(request.params(0).c_str());
std::vector<int64_t> node_ids(node_data, node_data + node_num);
std::vector<bool> is_weighted_list;
if (request.params_size() == 2) {
size_t weight_list_size = request.params(1).size() / sizeof(bool);
......@@ -165,9 +165,9 @@ int32_t GraphBrpcService::remove_graph_node(Table *table,
"graph_get_node_feat request requires at least 1 argument");
return 0;
}
size_t node_num = request.params(0).size() / sizeof(uint64_t);
uint64_t *node_data = (uint64_t *)(request.params(0).c_str());
std::vector<uint64_t> node_ids(node_data, node_data + node_num);
size_t node_num = request.params(0).size() / sizeof(int64_t);
int64_t *node_data = (int64_t *)(request.params(0).c_str());
std::vector<int64_t> node_ids(node_data, node_data + node_num);
((GraphTable *)table)->remove_graph_node(node_ids);
return 0;
......@@ -386,9 +386,9 @@ int32_t GraphBrpcService::graph_random_sample_neighbors(
"graph_random_sample_neighbors request requires at least 3 arguments");
return 0;
}
size_t node_num = request.params(0).size() / sizeof(uint64_t);
uint64_t *node_data = (uint64_t *)(request.params(0).c_str());
int sample_size = *(uint64_t *)(request.params(1).c_str());
size_t node_num = request.params(0).size() / sizeof(int64_t);
int64_t *node_data = (int64_t *)(request.params(0).c_str());
int sample_size = *(int64_t *)(request.params(1).c_str());
bool need_weight = *(bool *)(request.params(2).c_str());
std::vector<std::shared_ptr<char>> buffers(node_num);
std::vector<int> actual_sizes(node_num, 0);
......@@ -407,7 +407,7 @@ int32_t GraphBrpcService::graph_random_sample_neighbors(
int32_t GraphBrpcService::graph_random_sample_nodes(
Table *table, const PsRequestMessage &request, PsResponseMessage &response,
brpc::Controller *cntl) {
size_t size = *(uint64_t *)(request.params(0).c_str());
size_t size = *(int64_t *)(request.params(0).c_str());
std::unique_ptr<char[]> buffer;
int actual_size;
if (((GraphTable *)table)->random_sample_nodes(size, buffer, actual_size) ==
......@@ -430,9 +430,9 @@ int32_t GraphBrpcService::graph_get_node_feat(Table *table,
"graph_get_node_feat request requires at least 2 arguments");
return 0;
}
size_t node_num = request.params(0).size() / sizeof(uint64_t);
uint64_t *node_data = (uint64_t *)(request.params(0).c_str());
std::vector<uint64_t> node_ids(node_data, node_data + node_num);
size_t node_num = request.params(0).size() / sizeof(int64_t);
int64_t *node_data = (int64_t *)(request.params(0).c_str());
std::vector<int64_t> node_ids(node_data, node_data + node_num);
std::vector<std::string> feature_names =
paddle::string::split_string<std::string>(request.params(1), "\t");
......@@ -464,16 +464,16 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers(
"at least 3 arguments");
return 0;
}
size_t node_num = request.params(0).size() / sizeof(uint64_t),
size_t node_num = request.params(0).size() / sizeof(int64_t),
size_of_size_t = sizeof(size_t);
uint64_t *node_data = (uint64_t *)(request.params(0).c_str());
int sample_size = *(uint64_t *)(request.params(1).c_str());
bool need_weight = *(uint64_t *)(request.params(2).c_str());
// std::vector<uint64_t> res = ((GraphTable
int64_t *node_data = (int64_t *)(request.params(0).c_str());
int sample_size = *(int64_t *)(request.params(1).c_str());
bool need_weight = *(int64_t *)(request.params(2).c_str());
// std::vector<int64_t> res = ((GraphTable
// *)table).filter_out_non_exist_nodes(node_data, sample_size);
std::vector<int> request2server;
std::vector<int> server2request(server_size, -1);
std::vector<uint64_t> local_id;
std::vector<int64_t> local_id;
std::vector<int> local_query_idx;
size_t rank = get_rank();
for (int query_idx = 0; query_idx < node_num; ++query_idx) {
......@@ -496,7 +496,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers(
std::vector<std::shared_ptr<char>> local_buffers;
std::vector<int> local_actual_sizes;
std::vector<size_t> seq;
std::vector<std::vector<uint64_t>> node_id_buckets(request_call_num);
std::vector<std::vector<int64_t>> node_id_buckets(request_call_num);
std::vector<std::vector<int>> query_idx_buckets(request_call_num);
for (int query_idx = 0; query_idx < node_num; ++query_idx) {
int server_index =
......@@ -583,7 +583,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers(
closure->request(request_idx)
->add_params((char *)node_id_buckets[request_idx].data(),
sizeof(uint64_t) * node_num);
sizeof(int64_t) * node_num);
closure->request(request_idx)
->add_params((char *)&sample_size, sizeof(int));
closure->request(request_idx)
......@@ -618,9 +618,9 @@ int32_t GraphBrpcService::graph_set_node_feat(Table *table,
"graph_set_node_feat request requires at least 3 arguments");
return 0;
}
size_t node_num = request.params(0).size() / sizeof(uint64_t);
uint64_t *node_data = (uint64_t *)(request.params(0).c_str());
std::vector<uint64_t> node_ids(node_data, node_data + node_num);
size_t node_num = request.params(0).size() / sizeof(int64_t);
int64_t *node_data = (int64_t *)(request.params(0).c_str());
std::vector<int64_t> node_ids(node_data, node_data + node_num);
std::vector<std::string> feature_names =
paddle::string::split_string<std::string>(request.params(1), "\t");
......
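Every add_params/size() pair touched above follows the same wire convention: node ids travel as raw int64_t bytes in params(0), and the receiver derives the count from the byte length. The standalone sketch below restates that pack/unpack step without any brpc types, only to clarify why each sizeof(uint64_t) becomes sizeof(int64_t).

// Standalone sketch, not part of this diff.
#include <cstdint>
#include <string>
#include <vector>

// Client side: the buffer handed to
// add_params((char *)ids.data(), sizeof(int64_t) * ids.size()).
std::string PackNodeIds(const std::vector<int64_t> &ids) {
  return std::string(reinterpret_cast<const char *>(ids.data()),
                     sizeof(int64_t) * ids.size());
}

// Server side: mirrors request.params(0).size() / sizeof(int64_t) followed by
// the pointer cast used in add_graph_node / graph_get_node_feat above.
std::vector<int64_t> UnpackNodeIds(const std::string &param) {
  size_t node_num = param.size() / sizeof(int64_t);
  const int64_t *node_data = reinterpret_cast<const int64_t *>(param.data());
  return std::vector<int64_t>(node_data, node_data + node_num);
}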
......@@ -44,9 +44,9 @@ void GraphPyService::add_table_feat_conf(std::string table_name,
}
}
void add_graph_node(std::vector<uint64_t> node_ids,
void add_graph_node(std::vector<int64_t> node_ids,
std::vector<bool> weight_list) {}
void remove_graph_node(std::vector<uint64_t> node_ids) {}
void remove_graph_node(std::vector<int64_t> node_ids) {}
void GraphPyService::set_up(std::string ips_str, int shard_num,
std::vector<std::string> node_types,
std::vector<std::string> edge_types) {
......@@ -260,7 +260,7 @@ void GraphPyClient::clear_nodes(std::string name) {
}
void GraphPyClient::add_graph_node(std::string name,
std::vector<uint64_t>& node_ids,
std::vector<int64_t>& node_ids,
std::vector<bool>& weight_list) {
if (this->table_id_map.count(name)) {
uint32_t table_id = this->table_id_map[name];
......@@ -271,7 +271,7 @@ void GraphPyClient::add_graph_node(std::string name,
}
void GraphPyClient::remove_graph_node(std::string name,
std::vector<uint64_t>& node_ids) {
std::vector<int64_t>& node_ids) {
if (this->table_id_map.count(name)) {
uint32_t table_id = this->table_id_map[name];
auto status = get_ps_client()->remove_graph_node(table_id, node_ids);
......@@ -290,13 +290,12 @@ void GraphPyClient::load_node_file(std::string name, std::string filepath) {
}
}
std::pair<std::vector<std::vector<uint64_t>>, std::vector<float>>
std::pair<std::vector<std::vector<int64_t>>, std::vector<float>>
GraphPyClient::batch_sample_neighbors(std::string name,
std::vector<uint64_t> node_ids,
std::vector<int64_t> node_ids,
int sample_size, bool return_weight,
bool return_edges) {
// std::vector<std::vector<std::pair<uint64_t, float>>> v;
std::vector<std::vector<uint64_t>> v;
std::vector<std::vector<int64_t>> v;
std::vector<std::vector<float>> v1;
if (this->table_id_map.count(name)) {
uint32_t table_id = this->table_id_map[name];
......@@ -309,7 +308,7 @@ GraphPyClient::batch_sample_neighbors(std::string name,
// res.first[1]: slice index
// res.first[2]: src nodes
// res.second: edges weight
std::pair<std::vector<std::vector<uint64_t>>, std::vector<float>> res;
std::pair<std::vector<std::vector<int64_t>>, std::vector<float>> res;
res.first.push_back({});
res.first.push_back({});
if (return_edges) res.first.push_back({});
......@@ -342,10 +341,10 @@ void GraphPyClient::use_neighbors_sample_cache(std::string name,
status.wait();
}
}
std::vector<uint64_t> GraphPyClient::random_sample_nodes(std::string name,
int server_index,
int sample_size) {
std::vector<uint64_t> v;
std::vector<int64_t> GraphPyClient::random_sample_nodes(std::string name,
int server_index,
int sample_size) {
std::vector<int64_t> v;
if (this->table_id_map.count(name)) {
uint32_t table_id = this->table_id_map[name];
auto status =
......@@ -357,7 +356,7 @@ std::vector<uint64_t> GraphPyClient::random_sample_nodes(std::string name,
// (name, dtype, ndarray)
std::vector<std::vector<std::string>> GraphPyClient::get_node_feat(
std::string node_type, std::vector<uint64_t> node_ids,
std::string node_type, std::vector<int64_t> node_ids,
std::vector<std::string> feature_names) {
std::vector<std::vector<std::string>> v(
feature_names.size(), std::vector<std::string>(node_ids.size()));
......@@ -371,7 +370,7 @@ std::vector<std::vector<std::string>> GraphPyClient::get_node_feat(
}
void GraphPyClient::set_node_feat(
std::string node_type, std::vector<uint64_t> node_ids,
std::string node_type, std::vector<int64_t> node_ids,
std::vector<std::string> feature_names,
const std::vector<std::vector<std::string>> features) {
if (this->table_id_map.count(node_type)) {
......
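The pair returned by GraphPyClient::batch_sample_neighbors packs several arrays, as the comments above note (res.first[0]: neighbors, res.first[1]: slice index, res.first[2]: src nodes when return_edges, res.second: edge weights). The sketch below shows calling and unpacking it; client setup, the table name, and the include path are assumed.

// Illustrative sketch, not part of this diff.
#include <cstdint>
#include <utility>
#include <vector>
#include "paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h"  // path assumed

void PySampleExample(paddle::distributed::GraphPyClient &client) {
  std::vector<int64_t> node_ids = {0, 1, 2};
  auto res = client.batch_sample_neighbors("edge_table",  // placeholder table name
                                           node_ids, /*sample_size=*/5,
                                           /*return_weight=*/true,
                                           /*return_edges=*/false);
  const std::vector<int64_t> &neighbors = res.first[0];    // sampled neighbor ids
  const std::vector<int64_t> &slice_index = res.first[1];  // slice index (see comments above)
  const std::vector<float> &weights = res.second;          // edge weights
  (void)neighbors; (void)slice_index; (void)weights;
}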
......@@ -70,18 +70,34 @@ class GraphPyService {
::paddle::distributed::TableAccessorParameter* accessor_proto =
sparse_table_proto->mutable_accessor();
::paddle::distributed::CommonAccessorParameter* common_proto =
sparse_table_proto->mutable_common();
// ::paddle::distributed::CommonAccessorParameter* common_proto =
// sparse_table_proto->mutable_common();
::paddle::distributed::GraphParameter* graph_proto =
sparse_table_proto->mutable_graph_parameter();
::paddle::distributed::GraphFeature* graph_feature =
graph_proto->mutable_graph_feature();
graph_proto->set_task_pool_size(24);
graph_proto->set_table_name(table_name);
graph_proto->set_table_type(table_type);
graph_proto->set_use_cache(false);
// Set GraphTable Parameter
common_proto->set_table_name(table_name);
common_proto->set_name(table_type);
// common_proto->set_table_name(table_name);
// common_proto->set_name(table_type);
// for (size_t i = 0; i < feat_name.size(); i++) {
// common_proto->add_params(feat_dtype[i]);
// common_proto->add_dims(feat_shape[i]);
// common_proto->add_attributes(feat_name[i]);
// }
for (size_t i = 0; i < feat_name.size(); i++) {
common_proto->add_params(feat_dtype[i]);
common_proto->add_dims(feat_shape[i]);
common_proto->add_attributes(feat_name[i]);
graph_feature->add_dtype(feat_dtype[i]);
graph_feature->add_shape(feat_shape[i]);
graph_feature->add_name(feat_name[i]);
}
accessor_proto->set_accessor_class("CommMergeAccessor");
}
......@@ -143,24 +159,24 @@ class GraphPyClient : public GraphPyService {
void load_edge_file(std::string name, std::string filepath, bool reverse);
void load_node_file(std::string name, std::string filepath);
void clear_nodes(std::string name);
void add_graph_node(std::string name, std::vector<uint64_t>& node_ids,
void add_graph_node(std::string name, std::vector<int64_t>& node_ids,
std::vector<bool>& weight_list);
void remove_graph_node(std::string name, std::vector<uint64_t>& node_ids);
void remove_graph_node(std::string name, std::vector<int64_t>& node_ids);
int get_client_id() { return client_id; }
void set_client_id(int client_id) { this->client_id = client_id; }
void start_client();
std::pair<std::vector<std::vector<uint64_t>>, std::vector<float>>
batch_sample_neighbors(std::string name, std::vector<uint64_t> node_ids,
std::pair<std::vector<std::vector<int64_t>>, std::vector<float>>
batch_sample_neighbors(std::string name, std::vector<int64_t> node_ids,
int sample_size, bool return_weight,
bool return_edges);
std::vector<uint64_t> random_sample_nodes(std::string name, int server_index,
int sample_size);
std::vector<int64_t> random_sample_nodes(std::string name, int server_index,
int sample_size);
std::vector<std::vector<std::string>> get_node_feat(
std::string node_type, std::vector<uint64_t> node_ids,
std::string node_type, std::vector<int64_t> node_ids,
std::vector<std::string> feature_names);
void use_neighbors_sample_cache(std::string name, size_t total_size_limit,
size_t ttl);
void set_node_feat(std::string node_type, std::vector<uint64_t> node_ids,
void set_node_feat(std::string node_type, std::vector<int64_t> node_ids,
std::vector<std::string> feature_names,
const std::vector<std::vector<std::string>> features);
std::vector<FeatureNode> pull_graph_list(std::string name, int server_index,
......
......@@ -53,7 +53,6 @@ cc_library(memory_sparse_table SRCS memory_sparse_table.cc DEPS ps_framework_pro
set_source_files_properties(memory_sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_library(memory_sparse_geo_table SRCS memory_sparse_geo_table.cc DEPS ps_framework_proto ${TABLE_DEPS} common_table)
cc_library(table SRCS table.cc DEPS memory_sparse_table memory_sparse_geo_table common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost)
target_link_libraries(table -fopenmp)
......@@ -27,6 +27,288 @@
namespace paddle {
namespace distributed {
#ifdef PADDLE_WITH_HETERPS
int CompleteGraphSampler::run_graph_sampling() {
pthread_rwlock_t *rw_lock = graph_table->rw_lock.get();
pthread_rwlock_rdlock(rw_lock);
std::cout << "in graph sampling" << std::endl;
sample_nodes.clear();
sample_neighbors.clear();
sample_res.clear();
sample_nodes.resize(gpu_num);
sample_neighbors.resize(gpu_num);
sample_res.resize(gpu_num);
std::vector<std::vector<std::vector<paddle::framework::GpuPsGraphNode>>>
sample_nodes_ex(graph_table->task_pool_size_);
std::vector<std::vector<std::vector<int64_t>>> sample_neighbors_ex(
graph_table->task_pool_size_);
for (int i = 0; i < graph_table->task_pool_size_; i++) {
sample_nodes_ex[i].resize(gpu_num);
sample_neighbors_ex[i].resize(gpu_num);
}
std::vector<std::future<int>> tasks;
for (size_t i = 0; i < graph_table->shards.size(); ++i) {
tasks.push_back(
graph_table->_shards_task_pool[i % graph_table->task_pool_size_]
->enqueue([&, i, this]() -> int {
if (this->status == GraphSamplerStatus::terminating) return 0;
paddle::framework::GpuPsGraphNode node;
std::vector<Node *> &v =
this->graph_table->shards[i]->get_bucket();
size_t ind = i % this->graph_table->task_pool_size_;
for (size_t j = 0; j < v.size(); j++) {
size_t location = v[j]->get_id() % this->gpu_num;
node.node_id = v[j]->get_id();
node.neighbor_size = v[j]->get_neighbor_size();
node.neighbor_offset =
(int)sample_neighbors_ex[ind][location].size();
sample_nodes_ex[ind][location].emplace_back(node);
for (int k = 0; k < node.neighbor_size; k++)
sample_neighbors_ex[ind][location].push_back(
v[j]->get_neighbor_id(k));
}
return 0;
}));
}
for (size_t i = 0; i < tasks.size(); i++) tasks[i].get();
tasks.clear();
for (size_t i = 0; i < gpu_num; i++) {
tasks.push_back(
graph_table->_shards_task_pool[i % graph_table->task_pool_size_]
->enqueue([&, i, this]() -> int {
if (this->status == GraphSamplerStatus::terminating) return 0;
int total_offset = 0;
size_t ind = i % this->graph_table->task_pool_size_;
for (int j = 0; j < this->graph_table->task_pool_size_; j++) {
for (size_t k = 0; k < sample_nodes_ex[j][ind].size(); k++) {
sample_nodes[ind].push_back(sample_nodes_ex[j][ind][k]);
sample_nodes[ind].back().neighbor_offset += total_offset;
}
size_t neighbor_size = sample_neighbors_ex[j][ind].size();
total_offset += neighbor_size;
for (size_t k = 0; k < neighbor_size; k++) {
sample_neighbors[ind].push_back(
sample_neighbors_ex[j][ind][k]);
}
}
return 0;
}));
}
for (size_t i = 0; i < tasks.size(); i++) tasks[i].get();
if (this->status == GraphSamplerStatus::terminating) {
pthread_rwlock_unlock(rw_lock);
return 0;
}
for (size_t i = 0; i < gpu_num; i++) {
sample_res[i].node_list = sample_nodes[i].data();
sample_res[i].neighbor_list = sample_neighbors[i].data();
sample_res[i].node_size = sample_nodes[i].size();
sample_res[i].neighbor_size = sample_neighbors[i].size();
}
pthread_rwlock_unlock(rw_lock);
if (this->status == GraphSamplerStatus::terminating) {
return 0;
}
callback(sample_res);
return 0;
}
void CompleteGraphSampler::init(size_t gpu_num, GraphTable *graph_table,
std::vector<std::string> args) {
this->gpu_num = gpu_num;
this->graph_table = graph_table;
}
int BasicBfsGraphSampler::run_graph_sampling() {
pthread_rwlock_t *rw_lock = graph_table->rw_lock.get();
pthread_rwlock_rdlock(rw_lock);
while (rounds > 0 && status == GraphSamplerStatus::running) {
for (size_t i = 0; i < sample_neighbors_map.size(); i++) {
sample_neighbors_map[i].clear();
}
sample_neighbors_map.clear();
std::vector<int> nodes_left(graph_table->shards.size(),
node_num_for_each_shard);
std::promise<int> prom;
std::future<int> fut = prom.get_future();
sample_neighbors_map.resize(graph_table->task_pool_size_);
int task_size = 0;
std::vector<std::future<int>> tasks;
int init_size = 0;
//__sync_fetch_and_add
std::function<int(int, int64_t)> bfs = [&, this](int i, int id) -> int {
VLOG(0) << "in bfs " << i << " " << id;
if (this->status == GraphSamplerStatus::terminating) {
int task_left = __sync_sub_and_fetch(&task_size, 1);
if (task_left == 0) {
prom.set_value(0);
}
return 0;
}
size_t ind = i % this->graph_table->task_pool_size_;
if (nodes_left[i] > 0) {
nodes_left[i]--;
auto iter = sample_neighbors_map[ind].find(id);
if (iter == sample_neighbors_map[ind].end()) {
sample_neighbors_map[ind][id] = std::vector<int64_t>();
iter = sample_neighbors_map[ind].find(id);
Node *node = graph_table->shards[i]->find_node(id);
if (node != NULL) {
size_t edge_fetch_size =
std::min((size_t) this->edge_num_for_each_node,
node->get_neighbor_size());
for (size_t k = 0; k < edge_fetch_size; k++) {
int64_t neighbor_id = node->get_neighbor_id(k);
int node_location = neighbor_id % this->graph_table->shard_num %
this->graph_table->task_pool_size_;
__sync_add_and_fetch(&task_size, 1);
graph_table->_shards_task_pool[node_location]->enqueue(
bfs, neighbor_id % this->graph_table->shard_num, neighbor_id);
iter->second.push_back(neighbor_id);
}
}
}
}
int task_left = __sync_sub_and_fetch(&task_size, 1);
if (task_left == 0) {
prom.set_value(0);
}
return 0;
};
for (size_t i = 0; i < graph_table->shards.size(); ++i) {
std::vector<Node *> &v = graph_table->shards[i]->get_bucket();
if (v.size() > 0) {
init_size++;
__sync_add_and_fetch(&task_size, 1);
int64_t id = v[0]->get_id();
graph_table->_shards_task_pool[i % graph_table->task_pool_size_]
->enqueue(bfs, i, id);
} // if
}
if (init_size == 0) {
prom.set_value(0);
}
fut.get();
if (this->status == GraphSamplerStatus::terminating) {
pthread_rwlock_unlock(rw_lock);
return 0;
}
std::cout << "bfs over" << std::endl;
sample_nodes.clear();
sample_neighbors.clear();
sample_res.clear();
sample_nodes.resize(gpu_num);
sample_neighbors.resize(gpu_num);
sample_res.resize(gpu_num);
std::vector<std::vector<std::vector<paddle::framework::GpuPsGraphNode>>>
sample_nodes_ex(graph_table->task_pool_size_);
std::vector<std::vector<std::vector<int64_t>>> sample_neighbors_ex(
graph_table->task_pool_size_);
for (int i = 0; i < graph_table->task_pool_size_; i++) {
sample_nodes_ex[i].resize(gpu_num);
sample_neighbors_ex[i].resize(gpu_num);
}
tasks.clear();
for (size_t i = 0; i < (size_t)graph_table->task_pool_size_; ++i) {
tasks.push_back(
graph_table->_shards_task_pool[i]->enqueue([&, i, this]() -> int {
if (this->status == GraphSamplerStatus::terminating) {
return 0;
}
paddle::framework::GpuPsGraphNode node;
auto iter = sample_neighbors_map[i].begin();
size_t ind = i;
for (; iter != sample_neighbors_map[i].end(); iter++) {
size_t location = iter->first % this->gpu_num;
node.node_id = iter->first;
node.neighbor_size = iter->second.size();
node.neighbor_offset =
(int)sample_neighbors_ex[ind][location].size();
sample_nodes_ex[ind][location].emplace_back(node);
for (auto k : iter->second)
sample_neighbors_ex[ind][location].push_back(k);
}
return 0;
}));
}
for (size_t i = 0; i < tasks.size(); i++) {
tasks[i].get();
sample_neighbors_map[i].clear();
}
tasks.clear();
if (this->status == GraphSamplerStatus::terminating) {
pthread_rwlock_unlock(rw_lock);
return 0;
}
for (size_t i = 0; i < gpu_num; i++) {
tasks.push_back(
graph_table->_shards_task_pool[i % graph_table->task_pool_size_]
->enqueue([&, i, this]() -> int {
if (this->status == GraphSamplerStatus::terminating) {
pthread_rwlock_unlock(rw_lock);
return 0;
}
int total_offset = 0;
size_t ind = i % graph_table->task_pool_size_;
for (int j = 0; j < this->graph_table->task_pool_size_; j++) {
for (size_t k = 0; k < sample_nodes_ex[j][ind].size(); k++) {
sample_nodes[i].push_back(sample_nodes_ex[j][ind][k]);
sample_nodes[i].back().neighbor_offset += total_offset;
// neighbor_offset[i].push_back(total_offset +
// neighbor_offset_ex[j][i][k]);
}
size_t neighbor_size = sample_neighbors_ex[j][ind].size();
total_offset += neighbor_size;
for (size_t k = 0; k < neighbor_size; k++) {
sample_neighbors[ind].push_back(
sample_neighbors_ex[j][ind][k]);
}
}
return 0;
}));
}
for (size_t i = 0; i < tasks.size(); i++) tasks[i].get();
if (this->status == GraphSamplerStatus::terminating) {
pthread_rwlock_unlock(rw_lock);
return 0;
}
// int64_t total_neighbors =
// std::accumulate(shard_neighbor_size.begin(),shard_neighbor_size.end(),0);
for (size_t i = 0; i < gpu_num; i++) {
sample_res[i].node_list = sample_nodes[i].data();
sample_res[i].neighbor_list = sample_neighbors[i].data();
sample_res[i].node_size = sample_nodes[i].size();
sample_res[i].neighbor_size = sample_neighbors[i].size();
}
pthread_rwlock_unlock(rw_lock);
if (this->status == GraphSamplerStatus::terminating) {
return 0;
}
callback(sample_res);
rounds--;
if (rounds > 0) {
for (int i = 0;
i < interval && this->status == GraphSamplerStatus::running; i++) {
std::this_thread::sleep_for(std::chrono::seconds(1));
}
}
}
return 0;
}
void BasicBfsGraphSampler::init(size_t gpu_num, GraphTable *graph_table,
std::vector<std::string> args) {
this->gpu_num = gpu_num;
this->graph_table = graph_table;
node_num_for_each_shard = args.size() > 0 ? std::stoi(args[0]) : 10;
edge_num_for_each_node = args.size() > 1 ? std::stoi(args[1]) : 10;
rounds = args.size() > 2 ? std::stoi(args[2]) : 1;
interval = args.size() > 3 ? std::stoi(args[3]) : 60;
}
#endif
std::vector<Node *> GraphShard::get_batch(int start, int end, int step) {
if (start < 0) start = 0;
std::vector<Node *> res;
......@@ -38,10 +320,10 @@ std::vector<Node *> GraphShard::get_batch(int start, int end, int step) {
size_t GraphShard::get_size() { return bucket.size(); }
int32_t GraphTable::add_graph_node(std::vector<uint64_t> &id_list,
int32_t GraphTable::add_graph_node(std::vector<int64_t> &id_list,
std::vector<bool> &is_weight_list) {
size_t node_size = id_list.size();
std::vector<std::vector<std::pair<uint64_t, bool>>> batch(task_pool_size_);
std::vector<std::vector<std::pair<int64_t, bool>>> batch(task_pool_size_);
for (size_t i = 0; i < node_size; i++) {
size_t shard_id = id_list[i] % shard_num;
if (shard_id >= shard_end || shard_id < shard_start) {
......@@ -65,9 +347,9 @@ int32_t GraphTable::add_graph_node(std::vector<uint64_t> &id_list,
return 0;
}
int32_t GraphTable::remove_graph_node(std::vector<uint64_t> &id_list) {
int32_t GraphTable::remove_graph_node(std::vector<int64_t> &id_list) {
size_t node_size = id_list.size();
std::vector<std::vector<uint64_t>> batch(task_pool_size_);
std::vector<std::vector<int64_t>> batch(task_pool_size_);
for (size_t i = 0; i < node_size; i++) {
size_t shard_id = id_list[i] % shard_num;
if (shard_id >= shard_end || shard_id < shard_start) continue;
......@@ -98,7 +380,7 @@ void GraphShard::clear() {
GraphShard::~GraphShard() { clear(); }
void GraphShard::delete_node(uint64_t id) {
void GraphShard::delete_node(int64_t id) {
auto iter = node_location.find(id);
if (iter == node_location.end()) return;
int pos = iter->second;
......@@ -110,7 +392,7 @@ void GraphShard::delete_node(uint64_t id) {
node_location.erase(id);
bucket.pop_back();
}
GraphNode *GraphShard::add_graph_node(uint64_t id) {
GraphNode *GraphShard::add_graph_node(int64_t id) {
if (node_location.find(id) == node_location.end()) {
node_location[id] = bucket.size();
bucket.push_back(new GraphNode(id));
......@@ -126,7 +408,7 @@ GraphNode *GraphShard::add_graph_node(Node *node) {
}
return (GraphNode *)bucket[node_location[id]];
}
FeatureNode *GraphShard::add_feature_node(uint64_t id) {
FeatureNode *GraphShard::add_feature_node(int64_t id) {
if (node_location.find(id) == node_location.end()) {
node_location[id] = bucket.size();
bucket.push_back(new FeatureNode(id));
......@@ -134,11 +416,11 @@ FeatureNode *GraphShard::add_feature_node(uint64_t id) {
return (FeatureNode *)bucket[node_location[id]];
}
void GraphShard::add_neighbor(uint64_t id, uint64_t dst_id, float weight) {
void GraphShard::add_neighbor(int64_t id, int64_t dst_id, float weight) {
find_node(id)->add_edge(dst_id, weight);
}
Node *GraphShard::find_node(uint64_t id) {
Node *GraphShard::find_node(int64_t id) {
auto iter = node_location.find(id);
return iter == node_location.end() ? nullptr : bucket[iter->second];
}
......@@ -185,14 +467,14 @@ int32_t GraphTable::load(const std::string &path, const std::string &param) {
}
int32_t GraphTable::get_nodes_ids_by_ranges(
std::vector<std::pair<int, int>> ranges, std::vector<uint64_t> &res) {
std::vector<std::pair<int, int>> ranges, std::vector<int64_t> &res) {
int start = 0, end, index = 0, total_size = 0;
res.clear();
std::vector<std::future<std::vector<uint64_t>>> tasks;
std::vector<std::future<std::vector<int64_t>>> tasks;
for (size_t i = 0; i < shards.size() && index < (int)ranges.size(); i++) {
end = total_size + shards[i]->get_size();
start = total_size;
while (start < end && index < ranges.size()) {
while (start < end && index < (int)ranges.size()) {
if (ranges[index].second <= start)
index++;
else if (ranges[index].first >= end) {
......@@ -204,7 +486,7 @@ int32_t GraphTable::get_nodes_ids_by_ranges(
first -= total_size;
second -= total_size;
tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue(
[this, first, second, i]() -> std::vector<uint64_t> {
[this, first, second, i]() -> std::vector<int64_t> {
return shards[i]->get_ids_by_range(first, second);
}));
}
......@@ -276,6 +558,9 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) {
}
int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) {
#ifdef PADDLE_WITH_HETERPS
if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get());
#endif
auto paths = paddle::string::split_string<std::string>(path, ";");
int64_t count = 0;
std::string sample_type = "random";
......@@ -351,6 +636,13 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) {
/*-----------------------
relocate the duplicate nodes to make them distributed evenly among threads.
*/
if (!use_duplicate_nodes) {
#ifdef PADDLE_WITH_HETERPS
if (gpups_mode) pthread_rwlock_unlock(rw_lock.get());
#endif
return 0;
}
for (auto &shard : extra_shards) {
auto bucket = shard->get_bucket();
for (size_t i = 0; i < bucket.size(); i++) {
......@@ -360,13 +652,13 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) {
int size = extra_nodes_to_thread_index.size();
if (size == 0) return 0;
std::vector<int> index;
for (int i = 0; i < used.size(); i++) index.push_back(i);
for (int i = 0; i < (int)used.size(); i++) index.push_back(i);
sort(index.begin(), index.end(),
[&](int &a, int &b) { return used[a] < used[b]; });
std::vector<int> alloc(index.size(), 0), has_alloc(index.size(), 0);
int t = 1, aim = 0, mod = 0;
for (; t < used.size(); t++) {
for (; t < (int)used.size(); t++) {
if ((used[index[t]] - used[index[t - 1]]) * t >= size) {
break;
} else {
......@@ -380,7 +672,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) {
if (t - x <= mod) alloc[index[x]]++;
alloc[index[x]] -= used[index[x]];
}
std::vector<uint64_t> vec[index.size()];
std::vector<int64_t> vec[index.size()];
for (auto p : extra_nodes_to_thread_index) {
has_alloc[p.second]++;
vec[p.second].push_back(p.first);
......@@ -395,7 +687,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) {
has_alloc[index[right]] - alloc[index[right]]);
has_alloc[index[left]] += x;
has_alloc[index[right]] -= x;
uint64_t id;
int64_t id;
while (x--) {
id = vec[index[right]].back();
vec[index[right]].pop_back();
......@@ -424,10 +716,13 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) {
delete extra_shards[i];
extra_shards[i] = extra_shards_copy[i];
}
#ifdef PADDLE_WITH_HETERPS
if (gpups_mode) pthread_rwlock_unlock(rw_lock.get());
#endif
return 0;
}
Node *GraphTable::find_node(uint64_t id) {
Node *GraphTable::find_node(int64_t id) {
size_t shard_id = id % shard_num;
if (shard_id >= shard_end || shard_id < shard_start) {
if (use_duplicate_nodes == false || extra_nodes_to_thread_index.size() == 0)
......@@ -443,7 +738,7 @@ Node *GraphTable::find_node(uint64_t id) {
Node *node = shards[index]->find_node(id);
return node;
}
uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) {
uint32_t GraphTable::get_thread_pool_index(int64_t node_id) {
if (use_duplicate_nodes == false || extra_nodes_to_thread_index.size() == 0)
return node_id % shard_num % shard_num_per_server % task_pool_size_;
size_t src_shard_id = node_id % shard_num;
......@@ -456,8 +751,7 @@ uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) {
return src_shard_id % shard_num_per_server % task_pool_size_;
}
uint32_t GraphTable::get_thread_pool_index_by_shard_index(
uint64_t shard_index) {
uint32_t GraphTable::get_thread_pool_index_by_shard_index(int64_t shard_index) {
return shard_index % shard_num_per_server % task_pool_size_;
}
......@@ -484,7 +778,7 @@ int32_t GraphTable::random_sample_nodes(int sample_size,
std::unique_ptr<char[]> &buffer,
int &actual_size) {
int total_size = 0;
for (int i = 0; i < shards.size(); i++) {
for (int i = 0; i < (int)shards.size(); i++) {
total_size += shards[i]->get_size();
}
if (sample_size > total_size) sample_size = total_size;
......@@ -537,16 +831,16 @@ int32_t GraphTable::random_sample_nodes(int sample_size,
}
}
for (auto &pair : first_half) second_half.push_back(pair);
std::vector<uint64_t> res;
std::vector<int64_t> res;
get_nodes_ids_by_ranges(second_half, res);
actual_size = res.size() * sizeof(uint64_t);
actual_size = res.size() * sizeof(int64_t);
buffer.reset(new char[actual_size]);
char *pointer = buffer.get();
memcpy(pointer, res.data(), actual_size);
return 0;
}
int32_t GraphTable::random_sample_neighbors(
uint64_t *node_ids, int sample_size,
int64_t *node_ids, int sample_size,
std::vector<std::shared_ptr<char>> &buffers, std::vector<int> &actual_sizes,
bool need_weight) {
size_t node_num = buffers.size();
......@@ -560,10 +854,10 @@ int32_t GraphTable::random_sample_neighbors(
seq_id[index].emplace_back(idx);
id_list[index].emplace_back(node_ids[idx], sample_size, need_weight);
}
for (int i = 0; i < seq_id.size(); i++) {
for (int i = 0; i < (int)seq_id.size(); i++) {
if (seq_id[i].size() == 0) continue;
tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int {
uint64_t node_id;
int64_t node_id;
std::vector<std::pair<SampleKey, SampleResult>> r;
LRUResponse response = LRUResponse::blocked;
if (use_cache) {
......@@ -576,7 +870,7 @@ int32_t GraphTable::random_sample_neighbors(
std::vector<SampleKey> sample_keys;
auto &rng = _shards_task_rng_pool[i];
for (size_t k = 0; k < id_list[i].size(); k++) {
if (index < r.size() &&
if (index < (int)r.size() &&
r[index].first.node_key == id_list[i][k].node_key) {
idx = seq_id[i][k];
actual_sizes[idx] = r[index].second.actual_size;
......@@ -597,7 +891,7 @@ int32_t GraphTable::random_sample_neighbors(
res.size() * (need_weight ? (Node::id_size + Node::weight_size)
: Node::id_size);
int offset = 0;
uint64_t id;
int64_t id;
float weight;
char *buffer_addr = new char[actual_size];
if (response == LRUResponse::ok) {
......@@ -632,13 +926,13 @@ int32_t GraphTable::random_sample_neighbors(
return 0;
}
int32_t GraphTable::get_node_feat(const std::vector<uint64_t> &node_ids,
int32_t GraphTable::get_node_feat(const std::vector<int64_t> &node_ids,
const std::vector<std::string> &feature_names,
std::vector<std::vector<std::string>> &res) {
size_t node_num = node_ids.size();
std::vector<std::future<int>> tasks;
for (size_t idx = 0; idx < node_num; ++idx) {
uint64_t node_id = node_ids[idx];
int64_t node_id = node_ids[idx];
tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue(
[&, idx, node_id]() -> int {
Node *node = find_node(node_id);
......@@ -646,7 +940,8 @@ int32_t GraphTable::get_node_feat(const std::vector<uint64_t> &node_ids,
if (node == nullptr) {
return 0;
}
for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) {
for (int feat_idx = 0; feat_idx < (int)feature_names.size();
++feat_idx) {
const std::string &feature_name = feature_names[feat_idx];
if (feat_id_map.find(feature_name) != feat_id_map.end()) {
// res[feat_idx][idx] =
......@@ -665,19 +960,20 @@ int32_t GraphTable::get_node_feat(const std::vector<uint64_t> &node_ids,
}
int32_t GraphTable::set_node_feat(
const std::vector<uint64_t> &node_ids,
const std::vector<int64_t> &node_ids,
const std::vector<std::string> &feature_names,
const std::vector<std::vector<std::string>> &res) {
size_t node_num = node_ids.size();
std::vector<std::future<int>> tasks;
for (size_t idx = 0; idx < node_num; ++idx) {
uint64_t node_id = node_ids[idx];
int64_t node_id = node_ids[idx];
tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue(
[&, idx, node_id]() -> int {
size_t index = node_id % this->shard_num - this->shard_start;
auto node = shards[index]->add_feature_node(node_id);
node->set_feature_size(this->feat_name.size());
for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) {
for (int feat_idx = 0; feat_idx < (int)feature_names.size();
++feat_idx) {
const std::string &feature_name = feature_names[feat_idx];
if (feat_id_map.find(feature_name) != feat_id_map.end()) {
node->set_feature(feat_id_map[feature_name], res[feat_idx][idx]);
......@@ -771,35 +1067,68 @@ int32_t GraphTable::pull_graph_list(int start, int total_size,
return 0;
}
int32_t GraphTable::get_server_index_by_id(uint64_t id) {
int32_t GraphTable::get_server_index_by_id(int64_t id) {
return id % shard_num / shard_num_per_server;
}
int32_t GraphTable::initialize(const TableParameter &config,
const FsClientParameter &fs_config) {
LOG(INFO) << "in graphTable initialize";
_config = config;
if (initialize_accessor() != 0) {
LOG(WARNING) << "Table accessor initialize failed";
return -1;
}
int32_t GraphTable::initialize() {
if (_afs_client.initialize(fs_config) != 0) {
LOG(WARNING) << "Table fs_client initialize failed";
// return -1;
}
auto graph = config.graph_parameter();
shard_num = _config.shard_num();
LOG(INFO) << "in graphTable initialize over";
return initialize(graph);
}
int32_t GraphTable::initialize(const GraphParameter &graph) {
#ifdef PADDLE_WITH_HETERPS
if (graph.gpups_mode()) {
gpups_mode = true;
if (shard_num == 0) {
shard_num = graph.gpups_mode_shard_num();
server_num = 1;
_shard_idx = 0;
}
auto *sampler =
CREATE_PSCORE_CLASS(GraphSampler, graph.gpups_graph_sample_class());
auto slices =
string::split_string<std::string>(graph.gpups_graph_sample_args(), ",");
std::cout << "slices" << std::endl;
for (auto x : slices) std::cout << x << std::endl;
sampler->init(graph.gpu_num(), this, slices);
graph_sampler.reset(sampler);
}
#endif
task_pool_size_ = graph.task_pool_size();
_shards_task_pool.resize(task_pool_size_);
for (size_t i = 0; i < _shards_task_pool.size(); ++i) {
_shards_task_pool[i].reset(new ::ThreadPool(1));
_shards_task_rng_pool.push_back(paddle::framework::GetCPURandomEngine(0));
}
server_num = _shard_num;
// VLOG(0) << "in init graph table server num = " << server_num;
/*
_shard_num is actually the server number here.
When a server initializes its tables, it sets each table's _shard_num to
server_num, and _shard_idx to the server rank.
*/
auto common = _config.common();
this->table_name = common.table_name();
this->table_type = common.name();
auto graph_feature = graph.graph_feature();
// this->table_name = common.table_name();
// this->table_type = common.name();
this->table_name = graph.table_name();
this->table_type = graph.table_type();
VLOG(0) << " init graph table type " << this->table_type << " table name "
<< this->table_name;
int feat_conf_size = static_cast<int>(common.attributes().size());
// int feat_conf_size = static_cast<int>(common.attributes().size());
int feat_conf_size = static_cast<int>(graph_feature.name().size());
for (int i = 0; i < feat_conf_size; i++) {
auto &f_name = common.attributes()[i];
auto &f_shape = common.dims()[i];
auto &f_dtype = common.params()[i];
// auto &f_name = common.attributes()[i];
// auto &f_shape = common.dims()[i];
// auto &f_dtype = common.params()[i];
auto &f_name = graph_feature.name()[i];
auto &f_shape = graph_feature.shape()[i];
auto &f_dtype = graph_feature.dtype()[i];
this->feat_name.push_back(f_name);
this->feat_shape.push_back(f_shape);
this->feat_dtype.push_back(f_dtype);
......@@ -807,8 +1136,6 @@ int32_t GraphTable::initialize() {
VLOG(0) << "init graph table feat conf name:" << f_name
<< " shape:" << f_shape << " dtype:" << f_dtype;
}
shard_num = _config.shard_num();
VLOG(0) << "in init graph table shard num = " << shard_num << " shard_idx"
<< _shard_idx;
shard_num_per_server = sparse_local_shard_num(shard_num, server_num);
......@@ -826,5 +1153,6 @@ int32_t GraphTable::initialize() {
return 0;
}
} // namespace distributed
}; // namespace paddle
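BasicBfsGraphSampler::init above parses four positional knobs from gpups_graph_sample_args, which GraphTable::initialize splits on ',' before calling sampler->init. The configuration sketch below is illustrative only: the include path and values are placeholders, and the sampler is assumed to be registered under its class name for CREATE_PSCORE_CLASS.

// Illustrative configuration sketch, not part of this diff.
#include "paddle/fluid/distributed/ps.pb.h"  // generated header; path assumed

::paddle::distributed::GraphParameter MakeBfsSamplerConfig() {
  ::paddle::distributed::GraphParameter graph_proto;
  graph_proto.set_gpups_mode(true);
  graph_proto.set_gpups_mode_shard_num(127);
  graph_proto.set_gpu_num(1);
  graph_proto.set_gpups_graph_sample_class("BasicBfsGraphSampler");
  // Positional args read by BasicBfsGraphSampler::init:
  // node_num_for_each_shard, edge_num_for_each_node, rounds, interval.
  graph_proto.set_gpups_graph_sample_args("10000,10,1,60");
  return graph_proto;
}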
......@@ -38,10 +38,14 @@
#include <vector>
#include "paddle/fluid/distributed/ps/table/accessor.h"
#include "paddle/fluid/distributed/ps/table/common_table.h"
#include "paddle/fluid/distributed/ps/table/graph/class_macro.h"
#include "paddle/fluid/distributed/ps/table/graph/graph_node.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/phi/core/utils/rw_lock.h"
#ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h"
#endif
namespace paddle {
namespace distributed {
class GraphShard {
......@@ -51,37 +55,37 @@ class GraphShard {
~GraphShard();
std::vector<Node *> &get_bucket() { return bucket; }
std::vector<Node *> get_batch(int start, int end, int step);
std::vector<uint64_t> get_ids_by_range(int start, int end) {
std::vector<uint64_t> res;
std::vector<int64_t> get_ids_by_range(int start, int end) {
std::vector<int64_t> res;
for (int i = start; i < end && i < (int)bucket.size(); i++) {
res.push_back(bucket[i]->get_id());
}
return res;
}
GraphNode *add_graph_node(uint64_t id);
GraphNode *add_graph_node(int64_t id);
GraphNode *add_graph_node(Node *node);
FeatureNode *add_feature_node(uint64_t id);
Node *find_node(uint64_t id);
void delete_node(uint64_t id);
FeatureNode *add_feature_node(int64_t id);
Node *find_node(int64_t id);
void delete_node(int64_t id);
void clear();
void add_neighbor(uint64_t id, uint64_t dst_id, float weight);
std::unordered_map<uint64_t, int> &get_node_location() {
void add_neighbor(int64_t id, int64_t dst_id, float weight);
std::unordered_map<int64_t, int> &get_node_location() {
return node_location;
}
private:
std::unordered_map<uint64_t, int> node_location;
std::unordered_map<int64_t, int> node_location;
std::vector<Node *> bucket;
};
enum LRUResponse { ok = 0, blocked = 1, err = 2 };
struct SampleKey {
uint64_t node_key;
int64_t node_key;
size_t sample_size;
bool is_weighted;
SampleKey(uint64_t _node_key, size_t _sample_size, bool _is_weighted)
SampleKey(int64_t _node_key, size_t _sample_size, bool _is_weighted)
: node_key(_node_key),
sample_size(_sample_size),
is_weighted(_is_weighted) {}
......@@ -300,7 +304,7 @@ class ScaledLRU {
node_size += lru_pool[i].node_size - lru_pool[i].remove_count;
}
if (node_size <= size_t(1.1 * size_limit) + 1) return 0;
if ((size_t)node_size <= size_t(1.1 * size_limit) + 1) return 0;
if (pthread_rwlock_wrlock(&rwlock) == 0) {
// VLOG(0)<"in shrink\n";
global_count = 0;
......@@ -308,9 +312,9 @@ class ScaledLRU {
global_count += lru_pool[i].node_size - lru_pool[i].remove_count;
}
// VLOG(0)<<"global_count "<<global_count<<"\n";
if (global_count > size_limit) {
if ((size_t)global_count > size_limit) {
size_t remove = global_count - size_limit;
for (int i = 0; i < lru_pool.size(); i++) {
for (size_t i = 0; i < lru_pool.size(); i++) {
lru_pool[i].total_diff = 0;
lru_pool[i].remove_count +=
1.0 * (lru_pool[i].node_size - lru_pool[i].remove_count) /
......@@ -352,9 +356,69 @@ class ScaledLRU {
friend class RandomSampleLRU<K, V>;
};
#ifdef PADDLE_WITH_HETERPS
enum GraphSamplerStatus { waiting = 0, running = 1, terminating = 2 };
class GraphTable;
class GraphSampler {
public:
GraphSampler() {
status = GraphSamplerStatus::waiting;
thread_pool.reset(new ::ThreadPool(1));
callback = [](std::vector<paddle::framework::GpuPsCommGraph> &res) {
return;
};
}
virtual int run_graph_sampling() = 0;
virtual int start_graph_sampling() {
if (status != GraphSamplerStatus::waiting) {
return -1;
}
std::promise<int> prom;
std::future<int> fut = prom.get_future();
graph_sample_task_over = thread_pool->enqueue([&prom, this]() {
prom.set_value(0);
status = GraphSamplerStatus::running;
return run_graph_sampling();
});
return fut.get();
}
virtual void init(size_t gpu_num, GraphTable *graph_table,
std::vector<std::string> args) = 0;
virtual void set_graph_sample_callback(
std::function<void(std::vector<paddle::framework::GpuPsCommGraph> &)>
callback) {
this->callback = callback;
}
virtual int end_graph_sampling() {
if (status == GraphSamplerStatus::running) {
status = GraphSamplerStatus::terminating;
return graph_sample_task_over.get();
}
return -1;
}
virtual GraphSamplerStatus get_graph_sampler_status() { return status; }
protected:
std::function<void(std::vector<paddle::framework::GpuPsCommGraph> &)>
callback;
std::shared_ptr<::ThreadPool> thread_pool;
GraphSamplerStatus status;
std::future<int> graph_sample_task_over;
std::vector<paddle::framework::GpuPsCommGraph> sample_res;
};
#endif
class GraphTable : public SparseTable {
public:
GraphTable() { use_cache = false; }
GraphTable() {
use_cache = false;
shard_num = 0;
#ifdef PADDLE_WITH_HETERPS
gpups_mode = false;
#endif
rw_lock.reset(new pthread_rwlock_t());
}
virtual ~GraphTable();
virtual int32_t pull_graph_list(int start, int size,
std::unique_ptr<char[]> &buffer,
......@@ -362,7 +426,7 @@ class GraphTable : public SparseTable {
int step);
virtual int32_t random_sample_neighbors(
uint64_t *node_ids, int sample_size,
int64_t *node_ids, int sample_size,
std::vector<std::shared_ptr<char>> &buffers,
std::vector<int> &actual_sizes, bool need_weight);
......@@ -370,9 +434,11 @@ class GraphTable : public SparseTable {
int &actual_sizes);
virtual int32_t get_nodes_ids_by_ranges(
std::vector<std::pair<int, int>> ranges, std::vector<uint64_t> &res);
virtual int32_t initialize();
std::vector<std::pair<int, int>> ranges, std::vector<int64_t> &res);
virtual int32_t initialize() { return 0; }
virtual int32_t initialize(const TableParameter &config,
const FsClientParameter &fs_config);
virtual int32_t initialize(const GraphParameter &config);
int32_t load(const std::string &path, const std::string &param);
int32_t load_graph_split_config(const std::string &path);
......@@ -380,13 +446,13 @@ class GraphTable : public SparseTable {
int32_t load_nodes(const std::string &path, std::string node_type);
int32_t add_graph_node(std::vector<uint64_t> &id_list,
int32_t add_graph_node(std::vector<int64_t> &id_list,
std::vector<bool> &is_weight_list);
int32_t remove_graph_node(std::vector<uint64_t> &id_list);
int32_t remove_graph_node(std::vector<int64_t> &id_list);
int32_t get_server_index_by_id(uint64_t id);
Node *find_node(uint64_t id);
int32_t get_server_index_by_id(int64_t id);
Node *find_node(int64_t id);
virtual int32_t pull_sparse(float *values,
const PullSparseValue &pull_value) {
......@@ -407,16 +473,27 @@ class GraphTable : public SparseTable {
return 0;
}
virtual int32_t initialize_shard() { return 0; }
virtual uint32_t get_thread_pool_index_by_shard_index(uint64_t shard_index);
virtual uint32_t get_thread_pool_index(uint64_t node_id);
virtual int32_t set_shard(size_t shard_idx, size_t server_num) {
_shard_idx = shard_idx;
/*
_shard_num is not used in graph_table; the following assignment exists only
for compatibility with the base Table class.
*/
_shard_num = server_num;
this->server_num = server_num;
return 0;
}
virtual uint32_t get_thread_pool_index_by_shard_index(int64_t shard_index);
virtual uint32_t get_thread_pool_index(int64_t node_id);
virtual std::pair<int32_t, std::string> parse_feature(std::string feat_str);
virtual int32_t get_node_feat(const std::vector<uint64_t> &node_ids,
virtual int32_t get_node_feat(const std::vector<int64_t> &node_ids,
const std::vector<std::string> &feature_names,
std::vector<std::vector<std::string>> &res);
virtual int32_t set_node_feat(
const std::vector<uint64_t> &node_ids,
const std::vector<int64_t> &node_ids,
const std::vector<std::string> &feature_names,
const std::vector<std::vector<std::string>> &res);
......@@ -433,11 +510,25 @@ class GraphTable : public SparseTable {
}
return 0;
}
#ifdef PADDLE_WITH_HETERPS
virtual int32_t start_graph_sampling() {
return this->graph_sampler->start_graph_sampling();
}
virtual int32_t end_graph_sampling() {
return this->graph_sampler->end_graph_sampling();
}
virtual int32_t set_graph_sample_callback(
std::function<void(std::vector<paddle::framework::GpuPsCommGraph> &)>
callback) {
graph_sampler->set_graph_sample_callback(callback);
return 0;
}
// virtual GraphSampler *get_graph_sampler() { return graph_sampler.get(); }
#endif
protected:
std::vector<GraphShard *> shards, extra_shards;
size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num;
const int task_pool_size_ = 24;
int task_pool_size_ = 24;
const int random_sample_nodes_ranges = 3;
std::vector<std::string> feat_name;
......@@ -450,11 +541,61 @@ class GraphTable : public SparseTable {
std::vector<std::shared_ptr<::ThreadPool>> _shards_task_pool;
std::vector<std::shared_ptr<std::mt19937_64>> _shards_task_rng_pool;
std::shared_ptr<ScaledLRU<SampleKey, SampleResult>> scaled_lru;
std::unordered_set<uint64_t> extra_nodes;
std::unordered_map<uint64_t, size_t> extra_nodes_to_thread_index;
std::unordered_set<int64_t> extra_nodes;
std::unordered_map<int64_t, size_t> extra_nodes_to_thread_index;
bool use_cache, use_duplicate_nodes;
mutable std::mutex mutex_;
std::shared_ptr<pthread_rwlock_t> rw_lock;
#ifdef PADDLE_WITH_HETERPS
// paddle::framework::GpuPsGraphTable gpu_graph_table;
bool gpups_mode;
// std::shared_ptr<::ThreadPool> graph_sample_pool;
std::shared_ptr<GraphSampler> graph_sampler;
REGISTER_GRAPH_FRIEND_CLASS(2, CompleteGraphSampler, BasicBfsGraphSampler)
#endif
};
#ifdef PADDLE_WITH_HETERPS
REGISTER_PSCORE_REGISTERER(GraphSampler);
class CompleteGraphSampler : public GraphSampler {
public:
CompleteGraphSampler() {}
~CompleteGraphSampler() {}
// virtual pthread_rwlock_t *export_rw_lock();
virtual int run_graph_sampling();
virtual void init(size_t gpu_num, GraphTable *graph_table,
std::vector<std::string> args_);
protected:
GraphTable *graph_table;
std::vector<std::vector<paddle::framework::GpuPsGraphNode>> sample_nodes;
std::vector<std::vector<int64_t>> sample_neighbors;
// std::vector<GpuPsCommGraph> sample_res;
// std::shared_ptr<std::mt19937_64> random;
int gpu_num;
};
class BasicBfsGraphSampler : public GraphSampler {
public:
BasicBfsGraphSampler() {}
~BasicBfsGraphSampler() {}
// virtual pthread_rwlock_t *export_rw_lock();
virtual int run_graph_sampling();
virtual void init(size_t gpu_num, GraphTable *graph_table,
std::vector<std::string> args_);
protected:
GraphTable *graph_table;
// std::vector<std::vector<GpuPsGraphNode>> sample_nodes;
std::vector<std::vector<paddle::framework::GpuPsGraphNode>> sample_nodes;
std::vector<std::vector<int64_t>> sample_neighbors;
size_t gpu_num;
int node_num_for_each_shard, edge_num_for_each_node;
int rounds, interval;
std::vector<std::unordered_map<int64_t, std::vector<int64_t>>>
sample_neighbors_map;
};
#endif
} // namespace distributed
}; // namespace paddle
......
......@@ -23,6 +23,7 @@
#include "gflags/gflags.h"
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/operators/truncated_gaussian_random_op.h"
namespace paddle {
......@@ -117,13 +118,9 @@ class TruncatedGaussianInitializer : public Initializer {
seed_ = static_cast<unsigned int>(std::stoi(attrs[1]));
mean_ = std::stof(attrs[2]);
std_ = std::stof(attrs[3]);
auto normal_cdf = [](float x) {
return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0;
};
float a_normal_cdf = normal_cdf((-2.0 - mean_) / std_);
float b_normal_cdf = normal_cdf((2.0 - mean_) / std_);
std::uniform_real_distribution<float> dist_(2.0 * a_normal_cdf - 1.0,
2.0 * b_normal_cdf - 1.0);
std::uniform_real_distribution<float> dist_(
std::numeric_limits<float>::min(), 1.0);
random_engine_ = framework::GetCPURandomEngine(seed_);
}
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#define DECLARE_GRAPH_FRIEND_CLASS(a) friend class a;
#define DECLARE_1_FRIEND_CLASS(a, ...) DECLARE_GRAPH_FRIEND_CLASS(a)
#define DECLARE_2_FRIEND_CLASS(a, ...) \
DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_1_FRIEND_CLASS(__VA_ARGS__)
#define DECLARE_3_FRIEND_CLASS(a, ...) \
DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_2_FRIEND_CLASS(__VA_ARGS__)
#define DECLARE_4_FRIEND_CLASS(a, ...) \
DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_3_FRIEND_CLASS(__VA_ARGS__)
#define DECLARE_5_FRIEND_CLASS(a, ...) \
DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_4_FRIEND_CLASS(__VA_ARGS__)
#define DECLARE_6_FRIEND_CLASS(a, ...) \
DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_5_FRIEND_CLASS(__VA_ARGS__)
#define DECLARE_7_FRIEND_CLASS(a, ...) \
DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_6_FRIEND_CLASS(__VA_ARGS__)
#define DECLARE_8_FRIEND_CLASS(a, ...) \
DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_7_FRIEND_CLASS(__VA_ARGS__)
#define DECLARE_9_FRIEND_CLASS(a, ...) \
DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_8_FRIEND_CLASS(__VA_ARGS__)
#define DECLARE_10_FRIEND_CLASS(a, ...) \
DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_9_FRIEND_CLASS(__VA_ARGS__)
#define DECLARE_11_FRIEND_CLASS(a, ...) \
DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_10_FRIEND_CLASS(__VA_ARGS__)
#define REGISTER_GRAPH_FRIEND_CLASS(n, ...) \
DECLARE_##n##_FRIEND_CLASS(__VA_ARGS__)
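// Illustrative note (a sketch of how the dispatch above expands): the
// REGISTER_GRAPH_FRIEND_CLASS(n, ...) macro pastes `n` into the matching
// DECLARE_n_FRIEND_CLASS, which peels off one class name per level.
// For example, the line used inside GraphTable,
//   REGISTER_GRAPH_FRIEND_CLASS(2, CompleteGraphSampler, BasicBfsGraphSampler)
// expands to
//   DECLARE_2_FRIEND_CLASS(CompleteGraphSampler, BasicBfsGraphSampler)
// and finally to
//   friend class CompleteGraphSampler; friend class BasicBfsGraphSampler;
// giving both samplers access to GraphTable's protected members.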
......@@ -17,11 +17,11 @@
namespace paddle {
namespace distributed {
void GraphEdgeBlob::add_edge(uint64_t id, float weight = 1) {
void GraphEdgeBlob::add_edge(int64_t id, float weight = 1) {
id_arr.push_back(id);
}
void WeightedGraphEdgeBlob::add_edge(uint64_t id, float weight = 1) {
void WeightedGraphEdgeBlob::add_edge(int64_t id, float weight = 1) {
id_arr.push_back(id);
weight_arr.push_back(weight);
}
......
......@@ -24,19 +24,20 @@ class GraphEdgeBlob {
GraphEdgeBlob() {}
virtual ~GraphEdgeBlob() {}
size_t size() { return id_arr.size(); }
virtual void add_edge(uint64_t id, float weight);
uint64_t get_id(int idx) { return id_arr[idx]; }
virtual void add_edge(int64_t id, float weight);
int64_t get_id(int idx) { return id_arr[idx]; }
virtual float get_weight(int idx) { return 1; }
std::vector<int64_t>& export_id_array() { return id_arr; }
protected:
std::vector<uint64_t> id_arr;
std::vector<int64_t> id_arr;
};
class WeightedGraphEdgeBlob : public GraphEdgeBlob {
public:
WeightedGraphEdgeBlob() {}
virtual ~WeightedGraphEdgeBlob() {}
virtual void add_edge(uint64_t id, float weight);
virtual void add_edge(int64_t id, float weight);
virtual float get_weight(int idx) { return weight_arr[idx]; }
protected:
......
......@@ -48,6 +48,7 @@ class Node {
virtual void set_feature(int idx, std::string str) {}
virtual void set_feature_size(int size) {}
virtual int get_feature_size() { return 0; }
virtual size_t get_neighbor_size() { return 0; }
protected:
uint64_t id;
......@@ -70,6 +71,7 @@ class GraphNode : public Node {
}
virtual uint64_t get_neighbor_id(int idx) { return edges->get_id(idx); }
virtual float get_neighbor_weight(int idx) { return edges->get_weight(idx); }
virtual size_t get_neighbor_size() { return edges->size(); }
protected:
Sampler *sampler;
......
......@@ -37,6 +37,8 @@ REGISTER_PSCORE_CLASS(Table, CommonDenseTable);
REGISTER_PSCORE_CLASS(Table, CommonSparseTable);
#ifdef PADDLE_WITH_HETERPS
REGISTER_PSCORE_CLASS(Table, SSDSparseTable);
REGISTER_PSCORE_CLASS(GraphSampler, CompleteGraphSampler);
REGISTER_PSCORE_CLASS(GraphSampler, BasicBfsGraphSampler);
#endif
REGISTER_PSCORE_CLASS(Table, SparseGeoTable);
REGISTER_PSCORE_CLASS(Table, BarrierTable);
......
......@@ -24,6 +24,9 @@ cc_test(graph_node_test SRCS graph_node_test.cc DEPS graph_py_service scope serv
set_source_files_properties(graph_node_split_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(graph_node_split_test SRCS graph_node_split_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS})
set_source_files_properties(graph_table_sample_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(graph_table_sample_test SRCS graph_table_sample_test.cc DEPS scope server communicator ps_service boost table ps_framework_proto ${COMMON_DEPS})
set_source_files_properties(feature_value_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(feature_value_test SRCS feature_value_test.cc DEPS ${COMMON_DEPS} boost table)
......
......@@ -236,7 +236,7 @@ void RunGraphSplit() {
sleep(2);
std::map<uint64_t, std::vector<paddle::distributed::Region>> dense_regions;
dense_regions.insert(
std::pair<uint64_t, std::vector<paddle::distributed::Region>>(0, {}));
std::pair<int64_t, std::vector<paddle::distributed::Region>>(0, {}));
auto regions = dense_regions[0];
RunClient(dense_regions, 0, pserver_ptr_->get_service());
......@@ -250,16 +250,16 @@ void RunGraphSplit() {
worker_ptr_->load(0, std::string(edge_file_name), std::string("e>"));
srand(time(0));
pull_status.wait();
std::vector<std::vector<uint64_t>> _vs;
std::vector<std::vector<int64_t>> _vs;
std::vector<std::vector<float>> vs;
pull_status = worker_ptr_->batch_sample_neighbors(
0, std::vector<uint64_t>(1, 10240001024), 4, _vs, vs, true);
0, std::vector<int64_t>(1, 10240001024), 4, _vs, vs, true);
pull_status.wait();
ASSERT_EQ(0, _vs[0].size());
_vs.clear();
vs.clear();
pull_status = worker_ptr_->batch_sample_neighbors(
0, std::vector<uint64_t>(1, 97), 4, _vs, vs, true);
0, std::vector<int64_t>(1, 97), 4, _vs, vs, true);
pull_status.wait();
ASSERT_EQ(3, _vs[0].size());
std::remove(edge_file_name);
......
......@@ -48,10 +48,10 @@ namespace distributed = paddle::distributed;
void testSampleNodes(
std::shared_ptr<paddle::distributed::GraphBrpcClient>& worker_ptr_) {
std::vector<uint64_t> ids;
std::vector<int64_t> ids;
auto pull_status = worker_ptr_->random_sample_nodes(0, 0, 6, ids);
std::unordered_set<uint64_t> s;
std::unordered_set<uint64_t> s1 = {37, 59};
std::unordered_set<int64_t> s;
std::unordered_set<int64_t> s1 = {37, 59};
pull_status.wait();
for (auto id : ids) s.insert(id);
ASSERT_EQ(true, s.size() == s1.size());
......@@ -106,14 +106,14 @@ void testFeatureNodeSerializeFloat64() {
void testSingleSampleNeighboor(
std::shared_ptr<paddle::distributed::GraphBrpcClient>& worker_ptr_) {
std::vector<std::vector<uint64_t>> vs;
std::vector<std::vector<int64_t>> vs;
std::vector<std::vector<float>> vs1;
auto pull_status = worker_ptr_->batch_sample_neighbors(
0, std::vector<uint64_t>(1, 37), 4, vs, vs1, true);
0, std::vector<int64_t>(1, 37), 4, vs, vs1, true);
pull_status.wait();
std::unordered_set<uint64_t> s;
std::unordered_set<uint64_t> s1 = {112, 45, 145};
std::unordered_set<int64_t> s;
std::unordered_set<int64_t> s1 = {112, 45, 145};
for (auto g : vs[0]) {
s.insert(g);
}
......@@ -126,7 +126,7 @@ void testSingleSampleNeighboor(
vs.clear();
vs1.clear();
pull_status = worker_ptr_->batch_sample_neighbors(
0, std::vector<uint64_t>(1, 96), 4, vs, vs1, true);
0, std::vector<int64_t>(1, 96), 4, vs, vs1, true);
pull_status.wait();
s1 = {111, 48, 247};
for (auto g : vs[0]) {
......@@ -147,30 +147,30 @@ void testAddNode(
std::shared_ptr<paddle::distributed::GraphBrpcClient>& worker_ptr_) {
worker_ptr_->clear_nodes(0);
int total_num = 270000;
uint64_t id;
std::unordered_set<uint64_t> id_set;
int64_t id;
std::unordered_set<int64_t> id_set;
for (int i = 0; i < total_num; i++) {
while (id_set.find(id = rand()) != id_set.end())
;
id_set.insert(id);
}
std::vector<uint64_t> id_list(id_set.begin(), id_set.end());
std::vector<int64_t> id_list(id_set.begin(), id_set.end());
std::vector<bool> weight_list;
auto status = worker_ptr_->add_graph_node(0, id_list, weight_list);
status.wait();
std::vector<uint64_t> ids[2];
std::vector<int64_t> ids[2];
for (int i = 0; i < 2; i++) {
auto sample_status =
worker_ptr_->random_sample_nodes(0, i, total_num, ids[i]);
sample_status.wait();
}
std::unordered_set<uint64_t> id_set_check(ids[0].begin(), ids[0].end());
std::unordered_set<int64_t> id_set_check(ids[0].begin(), ids[0].end());
for (auto x : ids[1]) id_set_check.insert(x);
ASSERT_EQ(id_set.size(), id_set_check.size());
for (auto x : id_set) {
ASSERT_EQ(id_set_check.find(x) != id_set_check.end(), true);
}
std::vector<uint64_t> remove_ids;
std::vector<int64_t> remove_ids;
for (auto p : id_set_check) {
if (remove_ids.size() == 0)
remove_ids.push_back(p);
......@@ -187,7 +187,7 @@ void testAddNode(
worker_ptr_->random_sample_nodes(0, i, total_num, ids[i]);
sample_status.wait();
}
std::unordered_set<uint64_t> id_set_check1(ids[0].begin(), ids[0].end());
std::unordered_set<int64_t> id_set_check1(ids[0].begin(), ids[0].end());
for (auto x : ids[1]) id_set_check1.insert(x);
ASSERT_EQ(id_set_check1.size(), id_set_check.size());
for (auto x : id_set_check1) {
......@@ -196,14 +196,14 @@ void testAddNode(
}
void testBatchSampleNeighboor(
std::shared_ptr<paddle::distributed::GraphBrpcClient>& worker_ptr_) {
std::vector<std::vector<uint64_t>> vs;
std::vector<std::vector<int64_t>> vs;
std::vector<std::vector<float>> vs1;
std::vector<std::uint64_t> v = {37, 96};
std::vector<std::int64_t> v = {37, 96};
auto pull_status =
worker_ptr_->batch_sample_neighbors(0, v, 4, vs, vs1, false);
pull_status.wait();
std::unordered_set<uint64_t> s;
std::unordered_set<uint64_t> s1 = {112, 45, 145};
std::unordered_set<int64_t> s;
std::unordered_set<int64_t> s1 = {112, 45, 145};
for (auto g : vs[0]) {
s.insert(g);
}
......@@ -417,7 +417,7 @@ void RunBrpcPushSparse() {
std::map<uint64_t, std::vector<paddle::distributed::Region>> dense_regions;
dense_regions.insert(
std::pair<uint64_t, std::vector<paddle::distributed::Region>>(0, {}));
std::pair<int64_t, std::vector<paddle::distributed::Region>>(0, {}));
auto regions = dense_regions[0];
RunClient(dense_regions, 0, pserver_ptr_->get_service());
......@@ -427,14 +427,14 @@ void RunBrpcPushSparse() {
worker_ptr_->load(0, std::string(edge_file_name), std::string("e>"));
srand(time(0));
pull_status.wait();
std::vector<std::vector<uint64_t>> _vs;
std::vector<std::vector<int64_t>> _vs;
std::vector<std::vector<float>> vs;
testSampleNodes(worker_ptr_);
sleep(5);
testSingleSampleNeighboor(worker_ptr_);
testBatchSampleNeighboor(worker_ptr_);
pull_status = worker_ptr_->batch_sample_neighbors(
0, std::vector<uint64_t>(1, 10240001024), 4, _vs, vs, true);
0, std::vector<int64_t>(1, 10240001024), 4, _vs, vs, true);
pull_status.wait();
ASSERT_EQ(0, _vs[0].size());
paddle::distributed::GraphTable* g =
......@@ -445,14 +445,14 @@ void RunBrpcPushSparse() {
while (round--) {
vs.clear();
pull_status = worker_ptr_->batch_sample_neighbors(
0, std::vector<uint64_t>(1, 37), 1, _vs, vs, false);
0, std::vector<int64_t>(1, 37), 1, _vs, vs, false);
pull_status.wait();
for (int i = 0; i < ttl; i++) {
std::vector<std::vector<uint64_t>> vs1;
std::vector<std::vector<int64_t>> vs1;
std::vector<std::vector<float>> vs2;
pull_status = worker_ptr_->batch_sample_neighbors(
0, std::vector<uint64_t>(1, 37), 1, vs1, vs2, false);
0, std::vector<int64_t>(1, 37), 1, vs1, vs2, false);
pull_status.wait();
ASSERT_EQ(_vs[0].size(), vs1[0].size());
......@@ -540,7 +540,7 @@ void RunBrpcPushSparse() {
// Test Pull by step
std::unordered_set<uint64_t> count_item_nodes;
std::unordered_set<int64_t> count_item_nodes;
// pull by step 2
for (int test_step = 1; test_step < 4; test_step++) {
count_item_nodes.clear();
......@@ -558,18 +558,18 @@ void RunBrpcPushSparse() {
ASSERT_EQ(count_item_nodes.size(), 12);
}
std::pair<std::vector<std::vector<uint64_t>>, std::vector<float>> res;
std::pair<std::vector<std::vector<int64_t>>, std::vector<float>> res;
res = client1.batch_sample_neighbors(
std::string("user2item"), std::vector<uint64_t>(1, 96), 4, true, false);
std::string("user2item"), std::vector<int64_t>(1, 96), 4, true, false);
ASSERT_EQ(res.first[0].size(), 3);
std::vector<uint64_t> node_ids;
std::vector<int64_t> node_ids;
node_ids.push_back(96);
node_ids.push_back(37);
res = client1.batch_sample_neighbors(std::string("user2item"), node_ids, 4,
true, false);
ASSERT_EQ(res.first[1].size(), 1);
std::vector<uint64_t> nodes_ids = client2.random_sample_nodes("user", 0, 6);
std::vector<int64_t> nodes_ids = client2.random_sample_nodes("user", 0, 6);
ASSERT_EQ(nodes_ids.size(), 2);
ASSERT_EQ(true, (nodes_ids[0] == 59 && nodes_ids[1] == 37) ||
(nodes_ids[0] == 37 && nodes_ids[1] == 59));
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <unistd.h>
#include <condition_variable> // NOLINT
#include <fstream>
#include <iomanip>
#include <string>
#include <thread> // NOLINT
#include <unordered_set>
#include <vector>
#include "google/protobuf/text_format.h"
#include <chrono>
#include "gtest/gtest.h"
#include "paddle/fluid/distributed/ps.pb.h"
#include "paddle/fluid/distributed/ps/service/env.h"
#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h"
#include "paddle/fluid/distributed/ps/table/common_graph_table.h"
#include "paddle/fluid/distributed/ps/table/graph/graph_node.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace framework = paddle::framework;
namespace platform = paddle::platform;
namespace operators = paddle::operators;
namespace memory = paddle::memory;
namespace distributed = paddle::distributed;
std::vector<std::string> edges = {
std::string("37\t45\t0.34"), std::string("37\t145\t0.31"),
std::string("37\t112\t0.21"), std::string("96\t48\t1.4"),
std::string("96\t247\t0.31"), std::string("96\t111\t1.21"),
std::string("59\t45\t0.34"), std::string("59\t145\t0.31"),
std::string("59\t122\t0.21"), std::string("97\t48\t0.34"),
std::string("97\t247\t0.31"), std::string("97\t111\t0.21")};
// odd id:96 48 122 112
char edge_file_name[] = "edges.txt";
std::vector<std::string> nodes = {
std::string("user\t37\ta 0.34\tb 13 14\tc hello\td abc"),
std::string("user\t96\ta 0.31\tb 15 10\tc 96hello\td abcd"),
std::string("user\t59\ta 0.11\tb 11 14"),
std::string("user\t97\ta 0.11\tb 12 11"),
std::string("item\t45\ta 0.21"),
std::string("item\t145\ta 0.21"),
std::string("item\t112\ta 0.21"),
std::string("item\t48\ta 0.21"),
std::string("item\t247\ta 0.21"),
std::string("item\t111\ta 0.21"),
std::string("item\t46\ta 0.21"),
std::string("item\t146\ta 0.21"),
std::string("item\t122\ta 0.21"),
std::string("item\t49\ta 0.21"),
std::string("item\t248\ta 0.21"),
std::string("item\t113\ta 0.21")};
char node_file_name[] = "nodes.txt";
void prepare_file(char file_name[], std::vector<std::string> data) {
std::ofstream ofile;
ofile.open(file_name);
for (auto x : data) {
ofile << x << std::endl;
}
ofile.close();
}
void testGraphSample() {
#ifdef PADDLE_WITH_HETERPS
::paddle::distributed::GraphParameter table_proto;
table_proto.set_gpups_mode(true);
table_proto.set_gpups_mode_shard_num(127);
table_proto.set_gpu_num(2);
distributed::GraphTable graph_table, graph_table1;
graph_table.initialize(table_proto);
prepare_file(edge_file_name, edges);
graph_table.load(std::string(edge_file_name), std::string("e>"));
std::vector<paddle::framework::GpuPsCommGraph> res;
std::promise<int> prom;
std::future<int> fut = prom.get_future();
graph_table.set_graph_sample_callback(
[&res, &prom](std::vector<paddle::framework::GpuPsCommGraph> &res0) {
res = res0;
prom.set_value(0);
});
graph_table.start_graph_sampling();
fut.get();
graph_table.end_graph_sampling();
ASSERT_EQ(2, res.size());
// 37 59 97
for (int i = 0; i < (int)res[1].node_size; i++) {
std::cout << res[1].node_list[i].node_id << std::endl;
}
ASSERT_EQ(3, res[1].node_size);
::paddle::distributed::GraphParameter table_proto1;
table_proto1.set_gpups_mode(true);
table_proto1.set_gpups_mode_shard_num(127);
table_proto1.set_gpu_num(2);
table_proto1.set_gpups_graph_sample_class("BasicBfsGraphSampler");
table_proto1.set_gpups_graph_sample_args("5,5,1,1");
graph_table1.initialize(table_proto1);
graph_table1.load(std::string(edge_file_name), std::string("e>"));
std::vector<paddle::framework::GpuPsCommGraph> res1;
std::promise<int> prom1;
std::future<int> fut1 = prom1.get_future();
graph_table1.set_graph_sample_callback(
[&res1, &prom1](std::vector<paddle::framework::GpuPsCommGraph> &res0) {
res1 = res0;
prom1.set_value(0);
});
graph_table1.start_graph_sampling();
fut1.get();
graph_table1.end_graph_sampling();
// distributed::BasicBfsGraphSampler *sampler1 =
// (distributed::BasicBfsGraphSampler *)graph_table1.get_graph_sampler();
// sampler1->start_graph_sampling();
// std::this_thread::sleep_for (std::chrono::seconds(1));
// std::vector<paddle::framework::GpuPsCommGraph> res1;// =
// sampler1->fetch_sample_res();
ASSERT_EQ(2, res1.size());
// odd id:96 48 122 112
for (int i = 0; i < (int)res1[0].node_size; i++) {
std::cout << res1[0].node_list[i].node_id << std::endl;
}
ASSERT_EQ(4, res1[0].node_size);
#endif
}
TEST(testGraphSample, Run) { testGraphSample(); }
......@@ -370,7 +370,7 @@ std::vector<paddle::experimental::Tensor> RunBackward(
if (grad_tensors[i].is_initialized()) {
// Deep copy
paddle::experimental::Tensor tmp_tensor;
tmp_tensor.copy_(grad_tensors[i], true);
tmp_tensor.copy_(grad_tensors[i], grad_tensors[i].inner_place(), true);
node_input_buffers_dict[grad_node]->add(input_info.first,
input_info.second, tmp_tensor);
} else {
......
......@@ -128,6 +128,6 @@ TEST(Generated, ElementwiseAdd) {
} // namespace egr
USE_OP(sigmoid);
USE_OP_ITSELF(sigmoid);
USE_OP_ITSELF(elementwise_add);
USE_OP_ITSELF(matmul_v2);
......@@ -255,6 +255,6 @@ TEST(Hook_intermidiate, Matmul_v2) {
}
} // namespace egr
USE_OP(sigmoid);
USE_OP_ITSELF(sigmoid);
USE_OP_ITSELF(elementwise_add);
USE_OP_ITSELF(matmul_v2);
......@@ -10,8 +10,9 @@ IF(WITH_GPU)
nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS})
nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm)
nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm)
nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table)
nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps)
nv_test(test_cpu_graph_sample SRCS test_cpu_graph_sample.cu DEPS graph_gpu_ps)
ENDIF()
IF(WITH_ROCM)
hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context)
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_HETERPS
namespace paddle {
namespace framework {
struct GpuPsGraphNode {
int64_t node_id;
int neighbor_size, neighbor_offset;
// this node's neighbor is stored on [neighbor_offset,neighbor_offset +
// neighbor_size) of int64_t *neighbor_list;
};
struct GpuPsCommGraph {
int64_t *neighbor_list;
GpuPsGraphNode *node_list;
int neighbor_size, node_size;
// the size of neighbor array and graph_node_list array
GpuPsCommGraph()
: neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {}
GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_,
int neighbor_size_, int node_size_)
: neighbor_list(neighbor_list_),
node_list(node_list_),
neighbor_size(neighbor_size_),
node_size(node_size_) {}
};
/*
suppose we have a graph like this
0----3-----5----7
\ |\ |\
17 8 9 1 2
we save the nodes in arbitrary order,
in this example,the order is
[0,5,1,2,7,3,8,9,17]
let us name this array u_id;
we record each node's neighbors:
0:3,17
5:3,7
1:7
2:7
7:1,2,5
3:0,5,8,9
8:3
9:3
17:0
by concatenating each node's neighbor_list in the order we save the node id,
we get [3,17,3,7,7,7,1,2,5,0,5,8,9,3,3,0]
this is the neighbor_list of GpuPsCommGraph
given this neighbor_list and the order to save node id,
we know,
node 0's neighbors are in the range [0,1] of neighbor_list
node 5's neighbors are in the range [2,3] of neighbor_list
node 1's neighbors are in the range [4,4] of neighbor_list
node 2:[5,5]
node 7:[6,6]
node 3:[9,12]
node 8:[13,13]
node 9:[14,14]
node 17:[15,15]
...
by the above information,
we generate a node_list:GpuPsGraphNode *graph_node_list in GpuPsCommGraph
of size 9,
where node_list[i].id = u_id[i]
then we have:
node_list[0]-> node_id:0, neighbor_size:2, neighbor_offset:0
node_list[1]-> node_id:5, neighbor_size:2, neighbor_offset:2
node_list[2]-> node_id:1, neighbor_size:1, neighbor_offset:4
node_list[3]-> node_id:2, neighbor_size:1, neighbor_offset:5
node_list[4]-> node_id:7, neighbor_size:3, neighbor_offset:6
node_list[5]-> node_id:3, neighbor_size:4, neighbor_offset:9
node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13
node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14
node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15
*/
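// A minimal hand-filled sketch (added for illustration) of the layout
// described above, for a two-node graph 0--1 where each node is the other's
// only neighbor and the nodes are saved in the order [0, 1]:
//
//   int64_t neighbors[2] = {1, 0};  // node 0's list, then node 1's list
//   GpuPsGraphNode nodes[2];
//   nodes[0].node_id = 0; nodes[0].neighbor_size = 1; nodes[0].neighbor_offset = 0;
//   nodes[1].node_id = 1; nodes[1].neighbor_size = 1; nodes[1].neighbor_offset = 1;
//   GpuPsCommGraph g(neighbors, nodes, 2, 2);  // neighbor_size = 2, node_size = 2
//
// node i's neighbors then live in neighbors[offset, offset + size), with
// offset = nodes[i].neighbor_offset and size = nodes[i].neighbor_size.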
struct NeighborSampleResult {
int64_t *val;
int *actual_sample_size, sample_size, key_size;
NeighborSampleResult(int _sample_size, int _key_size)
: sample_size(_sample_size), key_size(_key_size) {
actual_sample_size = NULL;
val = NULL;
};
~NeighborSampleResult() {
if (val != NULL) cudaFree(val);
if (actual_sample_size != NULL) cudaFree(actual_sample_size);
}
};
struct NodeQueryResult {
int64_t *val;
int actual_sample_size;
NodeQueryResult() {
val = NULL;
actual_sample_size = 0;
};
~NodeQueryResult() {
if (val != NULL) cudaFree(val);
}
};
}
};
#endif
......@@ -14,114 +14,25 @@
#pragma once
#include "heter_comm.h"
#include "paddle/fluid/distributed/ps/table/common_graph_table.h"
#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_HETERPS
namespace paddle {
namespace framework {
struct GpuPsGraphNode {
int64_t node_id;
int neighbor_size, neighbor_offset;
// this node's neighbor is stored on [neighbor_offset,neighbor_offset +
// neighbor_size) of int64_t *neighbor_list;
};
struct GpuPsCommGraph {
int64_t *neighbor_list;
GpuPsGraphNode *node_list;
int neighbor_size, node_size;
// the size of neighbor array and graph_node_list array
GpuPsCommGraph()
: neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {}
GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_,
int neighbor_size_, int node_size_)
: neighbor_list(neighbor_list_),
node_list(node_list_),
neighbor_size(neighbor_size_),
node_size(node_size_) {}
};
/*
suppose we have a graph like this
0----3-----5----7
\ |\ |\
17 8 9 1 2
we save the nodes in arbitrary order,
in this example,the order is
[0,5,1,2,7,3,8,9,17]
let us name this array u_id;
we record each node's neighbors:
0:3,17
5:3,7
1:7
2:7
7:1,2,5
3:0,5,8,9
8:3
9:3
17:0
by concatenating each node's neighbor_list in the order we save the node id,
we get [3,17,3,7,7,7,1,2,5,0,5,8,9,3,3,0]
this is the neighbor_list of GpuPsCommGraph
given this neighbor_list and the order to save node id,
we know,
node 0's neighbors are in the range [0,1] of neighbor_list
node 5's neighbors are in the range [2,3] of neighbor_list
node 1's neighbors are in the range [4,4] of neighbor_list
node 2:[5,5]
node 7:[6,6]
node 3:[9,12]
node 8:[13,13]
node 9:[14,14]
node 17:[15,15]
...
by the above information,
we generate a node_list:GpuPsGraphNode *graph_node_list in GpuPsCommGraph
of size 9,
where node_list[i].id = u_id[i]
then we have:
node_list[0]-> node_id:0, neighbor_size:2, neighbor_offset:0
node_list[1]-> node_id:5, neighbor_size:2, neighbor_offset:2
node_list[2]-> node_id:1, neighbor_size:1, neighbor_offset:4
node_list[3]-> node_id:2, neighbor_size:1, neighbor_offset:5
node_list[4]-> node_id:7, neighbor_size:3, neighbor_offset:6
node_list[5]-> node_id:3, neighbor_size:4, neighbor_offset:9
node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13
node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14
node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15
*/
struct NeighborSampleResult {
int64_t *val;
int *actual_sample_size, sample_size, key_size;
NeighborSampleResult(int _sample_size, int _key_size)
: sample_size(_sample_size), key_size(_key_size) {
actual_sample_size = NULL;
val = NULL;
};
~NeighborSampleResult() {
if (val != NULL) cudaFree(val);
if (actual_sample_size != NULL) cudaFree(actual_sample_size);
}
};
struct NodeQueryResult {
int64_t *val;
int actual_sample_size;
NodeQueryResult() {
val = NULL;
actual_sample_size = 0;
};
~NodeQueryResult() {
if (val != NULL) cudaFree(val);
}
};
class GpuPsGraphTable : public HeterComm<int64_t, int, int> {
public:
GpuPsGraphTable(std::shared_ptr<HeterPsResource> resource)
: HeterComm<int64_t, int, int>(1, resource) {
load_factor_ = 0.25;
rw_lock.reset(new pthread_rwlock_t());
cpu_table_status = -1;
}
~GpuPsGraphTable() {
if (cpu_table_status != -1) {
end_graph_sampling();
}
}
void build_graph_from_cpu(std::vector<GpuPsCommGraph> &cpu_node_list);
NodeQueryResult *graph_node_sample(int gpu_id, int sample_size);
......@@ -134,9 +45,19 @@ class GpuPsGraphTable : public HeterComm<int64_t, int, int> {
int *h_right,
int64_t *src_sample_res,
int *actual_sample_size);
int init_cpu_table(const paddle::distributed::GraphParameter &graph);
int load(const std::string &path, const std::string &param);
virtual int32_t end_graph_sampling() {
return cpu_graph_table->end_graph_sampling();
}
private:
std::vector<GpuPsCommGraph> gpu_graph_list;
std::shared_ptr<paddle::distributed::GraphTable> cpu_graph_table;
std::shared_ptr<pthread_rwlock_t> rw_lock;
mutable std::mutex mutex_;
std::condition_variable cv_;
int cpu_table_status;
};
}
};
......
......@@ -14,6 +14,7 @@
#pragma once
#ifdef PADDLE_WITH_HETERPS
//#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h"
namespace paddle {
namespace framework {
/*
......@@ -45,6 +46,33 @@ __global__ void neighbor_sample_example(GpuPsCommGraph graph, int* index,
}
}
int GpuPsGraphTable::init_cpu_table(
const paddle::distributed::GraphParameter& graph) {
cpu_graph_table.reset(new paddle::distributed::GraphTable);
cpu_table_status = cpu_graph_table->initialize(graph);
if (cpu_table_status != 0) return cpu_table_status;
std::function<void(std::vector<GpuPsCommGraph>&)> callback =
[this](std::vector<GpuPsCommGraph>& res) {
pthread_rwlock_wrlock(this->rw_lock.get());
this->clear_graph_info();
this->build_graph_from_cpu(res);
pthread_rwlock_unlock(this->rw_lock.get());
cv_.notify_one();
};
cpu_graph_table->set_graph_sample_callback(callback);
return cpu_table_status;
}
int GpuPsGraphTable::load(const std::string& path, const std::string& param) {
int status = cpu_graph_table->load(path, param);
if (status != 0) {
return status;
}
std::unique_lock<std::mutex> lock(mutex_);
cpu_graph_table->start_graph_sampling();
cv_.wait(lock);
return 0;
}
/*
comment 1
......@@ -68,6 +96,7 @@ __global__ void neighbor_sample_example(GpuPsCommGraph graph, int* index,
that's what fill_dvals does.
*/
void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu(
int gpu_id, int gpu_num, int sample_size, int* h_left, int* h_right,
int64_t* src_sample_res, int* actual_sample_size) {
......@@ -258,7 +287,7 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
auto d_shard_keys = memory::Alloc(place, len * sizeof(int64_t));
int64_t* d_shard_keys_ptr = reinterpret_cast<int64_t*>(d_shard_keys->ptr());
auto d_shard_vals = memory::Alloc(place, len * sizeof(int64_t));
auto d_shard_vals = memory::Alloc(place, sample_size * len * sizeof(int64_t));
int64_t* d_shard_vals_ptr = reinterpret_cast<int64_t*>(d_shard_vals->ptr());
auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int));
int* d_shard_actual_sample_size_ptr =
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_HETERPS
//#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h"
#include <queue>
namespace paddle {
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h"
#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h"
#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h"
#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
using namespace paddle::framework;
void prepare_file(char file_name[], std::vector<std::string> data) {
std::ofstream ofile;
ofile.open(file_name);
for (auto x : data) {
ofile << x << std::endl;
}
ofile.close();
}
char edge_file_name[] = "edges.txt";
TEST(TEST_FLEET, graph_sample) {
std::vector<std::string> edges;
int gpu_count = 3;
std::vector<int> dev_ids;
dev_ids.push_back(0);
dev_ids.push_back(1);
dev_ids.push_back(2);
std::shared_ptr<HeterPsResource> resource =
std::make_shared<HeterPsResource>(dev_ids);
resource->enable_p2p();
GpuPsGraphTable g(resource);
int node_count = 10;
std::vector<std::vector<int64_t>> neighbors(node_count);
int ind = 0;
int64_t node_id = 0;
// std::vector<GpuPsCommGraph> graph_list(gpu_count);
while (ind < node_count) {
int neighbor_size = ind + 1;
while (neighbor_size--) {
edges.push_back(std::to_string(ind) + "\t" + std::to_string(node_id) +
"\t1.0");
node_id++;
}
ind++;
}
/*
gpu 0:
0,3,6,9
gpu 1:
1,4,7
gpu 2:
2,5,8
query(2,6) returns nodes [6,9,1,4,7,2]
*/
::paddle::distributed::GraphParameter table_proto;
table_proto.set_gpups_mode(true);
table_proto.set_gpups_mode_shard_num(127);
table_proto.set_gpu_num(3);
table_proto.set_gpups_graph_sample_class("BasicBfsGraphSampler");
table_proto.set_gpups_graph_sample_args("5,5,1,1");
prepare_file(edge_file_name, edges);
g.init_cpu_table(table_proto);
g.load(std::string(edge_file_name), std::string("e>"));
/*
node x's neighbor list = [(1+x)*x/2,(1+x)*x/2 + 1,.....,(1+x)*x/2 + x]
so node 6's neighbors are [21,22...,27]
node 7's neighbors are [28,29,..35]
node 0's neighbors are [0]
query([7,0,6],sample_size=3) should return [28,29,30,0,x,x,21,22,23]
6 --index-->2
0 --index--->0
7 --index-->2
*/
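// Worked check of the formula above (added for readability): the
// edge-generating loop gives node `ind` exactly `ind + 1` consecutive
// neighbor ids, so node x's first neighbor id is 1 + 2 + ... + x =
// (1 + x) * x / 2. For x = 7 that is 28, for x = 6 it is 21, and node 0's
// single neighbor is 0, which matches the expected sample values asserted
// below.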
int64_t cpu_key[3] = {7, 0, 6};
void *key;
cudaMalloc((void **)&key, 3 * sizeof(int64_t));
cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice);
auto neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 3, 3);
int64_t *res = new int64_t[9];
cudaMemcpy(res, neighbor_sample_res->val, 72, cudaMemcpyDeviceToHost);
std::sort(res, res + 3);
std::sort(res + 6, res + 9);
int64_t expected_sample_val[] = {28, 29, 30, 0, -1, -1, 21, 22, 23};
for (int i = 0; i < 9; i++) {
if (expected_sample_val[i] != -1) {
ASSERT_EQ(res[i], expected_sample_val[i]);
}
}
delete[] res;
delete neighbor_sample_res;
}
......@@ -78,6 +78,11 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext {
return var_types[0] == proto::VarType::SELECTED_ROWS;
}
bool IsDenseTensorVectorInput(const std::string& name) const override {
auto var_types = ctx_.GetInputsVarType(name);
return var_types[0] == proto::VarType::LOD_TENSOR_ARRAY;
}
bool IsDenseTensorOutput(const std::string& name) const override {
auto var_types = ctx_.GetOutputsVarType(name);
return var_types[0] == proto::VarType::LOD_TENSOR;
......@@ -125,9 +130,14 @@ class CompatMetaTensor : public phi::MetaTensor {
return var->Get<phi::DenseTensor>().dims();
} else if (var->IsType<phi::SelectedRows>()) {
return var->Get<phi::SelectedRows>().dims();
} else if (var->IsType<framework::LoDTensorArray>()) {
// use tensor array size as dims
auto& tensor_array = var->Get<framework::LoDTensorArray>();
return phi::make_ddim({static_cast<int64_t>(tensor_array.size())});
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Currently, only can get dims from DenseTensor or SelectedRows."));
"Currently, only can get dims from DenseTensor or SelectedRows or "
"DenseTensorArray."));
}
} else {
auto* var = BOOST_GET_CONST(VarDesc*, var_);
......@@ -144,6 +154,10 @@ class CompatMetaTensor : public phi::MetaTensor {
return var->Get<phi::DenseTensor>().dtype();
} else if (var->IsType<phi::SelectedRows>()) {
return var->Get<phi::SelectedRows>().dtype();
} else if (var->IsType<framework::LoDTensorArray>()) {
// NOTE(chenweihang): do nothing
// Unsupported get dtype from LoDTensorArray now
return phi::DataType::UNDEFINED;
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Currently, only can get dtype from DenseTensor or SelectedRows."));
......@@ -157,7 +171,19 @@ class CompatMetaTensor : public phi::MetaTensor {
DataLayout layout() const override {
if (is_runtime_) {
auto* var = BOOST_GET_CONST(Variable*, var_);
return var->Get<LoDTensor>().layout();
if (var->IsType<phi::DenseTensor>()) {
return var->Get<phi::DenseTensor>().layout();
} else if (var->IsType<phi::SelectedRows>()) {
return var->Get<phi::SelectedRows>().layout();
} else if (var->IsType<framework::LoDTensorArray>()) {
// NOTE(chenweihang): do nothing
// Unsupported get layout from LoDTensorArray now
return phi::DataLayout::UNDEFINED;
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Currently, only can get layout from DenseTensor or "
"SelectedRows."));
}
} else {
// NOTE(chenweihang): do nothing
// Unsupported get layout for VarDesc now
......@@ -174,6 +200,16 @@ class CompatMetaTensor : public phi::MetaTensor {
} else if (var->IsType<phi::SelectedRows>()) {
auto* tensor = var->GetMutable<phi::SelectedRows>()->mutable_value();
phi::DenseTensorUtils::GetMutableMeta(tensor)->dims = dims;
} else if (var->IsType<framework::LoDTensorArray>()) {
auto* tensor_array = var->GetMutable<framework::LoDTensorArray>();
// Note: Ideally we would enforce `tensor_array->size() == 0UL` here, because
// in-place use of a LoDTensorArray is dangerous, but the unittest
// `test_list` relies on this behavior
PADDLE_ENFORCE_EQ(dims.size(), 1UL,
platform::errors::InvalidArgument(
"LoDTensorArray can only have one dimension."));
// only set the array size for LoDTensorArray input
tensor_array->resize(dims[0]);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Currently, only can set dims from DenseTensor or SelectedRows."));
......@@ -193,6 +229,9 @@ class CompatMetaTensor : public phi::MetaTensor {
} else if (var->IsType<phi::SelectedRows>()) {
auto* tensor = var->GetMutable<phi::SelectedRows>()->mutable_value();
phi::DenseTensorUtils::GetMutableMeta(tensor)->dtype = dtype;
} else if (var->IsType<framework::LoDTensorArray>()) {
// NOTE(chenweihang): do nothing
// Unsupported set dtype for LoDTensorArray now
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Currently, only can set dtype from DenseTensor or SelectedRows."));
......@@ -206,10 +245,20 @@ class CompatMetaTensor : public phi::MetaTensor {
void set_layout(DataLayout layout) override {
if (is_runtime_) {
auto* var = BOOST_GET(Variable*, var_);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
phi::DenseTensorUtils::GetMutableMeta(
static_cast<phi::DenseTensor*>(tensor))
->layout = layout;
if (var->IsType<phi::DenseTensor>()) {
auto* tensor = var->GetMutable<phi::DenseTensor>();
phi::DenseTensorUtils::GetMutableMeta(tensor)->layout = layout;
} else if (var->IsType<phi::SelectedRows>()) {
auto* tensor = var->GetMutable<phi::SelectedRows>()->mutable_value();
phi::DenseTensorUtils::GetMutableMeta(tensor)->layout = layout;
} else if (var->IsType<framework::LoDTensorArray>()) {
// NOTE(chenweihang): do nothing
// Unsupported set layout for LoDTensorArray now
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Currently, only can set layout from DenseTensor or "
"SelectedRows."));
}
} else {
// NOTE(chenweihang): do nothing
// Unsupported set layout for VarDesc now
......@@ -251,9 +300,7 @@ class CompatMetaTensor : public phi::MetaTensor {
void share_meta(const MetaTensor& meta_tensor) override {
share_dims(meta_tensor);
set_dtype(meta_tensor.dtype());
// VarDesc doesn't contains layout, so we cannot share layout
// set_layout(meta_tensor.layout());
set_layout(meta_tensor.layout());
// special case: share lod of LoDTensor
share_lod(meta_tensor);
}
......@@ -442,6 +489,51 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
attr_name, infershape_input.size()));
}
}
} else if (attr_defs[i].type_index ==
std::type_index(typeid(std::vector<phi::Scalar>))) {
auto& attr = attr_reader.GetAttr(attr_name);
if (std::type_index(attr.type()) ==
std::type_index(typeid(std::vector<int32_t>))) {
const auto& vec = BOOST_GET_CONST(std::vector<int32_t>, attr);
std::vector<phi::Scalar> scalar_list;
scalar_list.reserve(vec.size());
for (const auto& val : vec) {
scalar_list.emplace_back(val);
}
infer_meta_context.EmplaceBackAttr(std::move(scalar_list));
} else if (std::type_index(attr.type()) ==
std::type_index(typeid(std::vector<int64_t>))) {
const auto& vec = BOOST_GET_CONST(std::vector<int64_t>, attr);
std::vector<phi::Scalar> scalar_list;
scalar_list.reserve(vec.size());
for (const auto& val : vec) {
scalar_list.emplace_back(val);
}
infer_meta_context.EmplaceBackAttr(std::move(scalar_list));
} else if (std::type_index(attr.type()) ==
std::type_index(typeid(std::vector<float>))) {
const auto& vec = BOOST_GET_CONST(std::vector<float>, attr);
std::vector<phi::Scalar> scalar_list;
scalar_list.reserve(vec.size());
for (const auto& val : vec) {
scalar_list.emplace_back(val);
}
infer_meta_context.EmplaceBackAttr(std::move(scalar_list));
} else if (std::type_index(attr.type()) ==
std::type_index(typeid(std::vector<double>))) {
const auto& vec = BOOST_GET_CONST(std::vector<double>, attr);
std::vector<phi::Scalar> scalar_list;
scalar_list.reserve(vec.size());
for (const auto& val : vec) {
scalar_list.emplace_back(val);
}
infer_meta_context.EmplaceBackAttr(std::move(scalar_list));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported cast op attribute `%s` to vector<Scalar> when "
"construct InferMetaContext.",
attr_names[i]));
}
} else if (ctx->HasAttr(attr_name)) {
// Emplace Back Attr according to the type of attr.
auto& attr = attr_reader.GetAttr(attr_name);
......
......@@ -73,8 +73,10 @@ static void ShareVarInfoToCinnLaunch(
varinfo_maps.at(cinn_launch_op->GetScopeIdx());
// collect all MemOptVarInfos of external variables
// that would be eager deleted after the cinn_launch subgraph executed,
// and store them as attribute of the subgraph
// that would be eagerly deleted after the cinn_launch subgraph executes;
// since we will delete them in advance through the eager_deletion_ops
// inside the cinn_launch subgraph, store them as an attribute of the
// subgraph to pass to the inner eager_deletion_ops.
for (const auto& var_name : vars_to_delete) {
auto it = src_varinfo_map.find(var_name);
PADDLE_ENFORCE_NE(it, src_varinfo_map.end(),
......@@ -82,6 +84,8 @@ static void ShareVarInfoToCinnLaunch(
"MemOptVarInfo of var[%s] not found", var_name));
dst_varinfo_map.emplace(var_name, it->second);
}
// skip running of the following eager_deletion_op
followed_eager_deletion_op->SetSkipRunning(true);
}
static void TakeVarInfoFromMainGraph(
......
......@@ -31,7 +31,7 @@ USE_OP(slice);
USE_OP(concat);
USE_OP(matmul);
USE_OP_ITSELF(elementwise_add);
USE_OP(sigmoid);
USE_OP_ITSELF(sigmoid);
USE_OP_ITSELF(tanh);
USE_OP(elementwise_mul);
USE_OP(softmax_with_cross_entropy);
......@@ -47,7 +47,7 @@ USE_OP(square);
USE_OP(transpose2_grad);
USE_OP(concat_grad);
USE_OP_ITSELF(elementwise_mul_grad);
USE_OP(sigmoid_grad);
USE_OP_ITSELF(sigmoid_grad);
USE_OP_ITSELF(tanh_grad);
USE_OP(sum);
USE_OP(slice_grad);
......
......@@ -2103,16 +2103,25 @@ void OperatorWithKernel::BuildPhiKernelContext(
auto* var = ins_vector[offset];
if (var->IsType<framework::LoDTensor>()) {
tensor_in = &(var->Get<framework::LoDTensor>());
pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in);
} else if (var->IsType<phi::SelectedRows>()) {
tensor_in = &(var->Get<phi::SelectedRows>());
pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in);
} else if (var->IsType<framework::LoDTensorArray>()) {
paddle::SmallVector<const phi::TensorBase*> tensor_vector;
auto& tensor_array = var->Get<framework::LoDTensorArray>();
for (auto& t : tensor_array) {
tensor_vector.emplace_back(&t);
}
pt_kernel_context->EmplaceBackInputsWithoutSetRange(tensor_vector);
end_idx += tensor_array.size() - 1;
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported input `%s` type when call pt kernel.",
framework::ToTypeName(var->Type())));
}
pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in);
}
// Note: we cannot handle vector<LoDTensorArray> inputs here
pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i);
}
VLOG(4) << "Done inputs";
......@@ -2140,22 +2149,33 @@ void OperatorWithKernel::BuildPhiKernelContext(
for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
phi::TensorBase* tensor_out = nullptr;
auto* var = outs_vector[offset];
if (var) {
if (var->template IsType<framework::LoDTensor>()) {
tensor_out = var->template GetMutable<framework::LoDTensor>();
pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
} else if (var->template IsType<phi::SelectedRows>()) {
tensor_out = var->template GetMutable<phi::SelectedRows>();
pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
} else if (var->template IsType<framework::LoDTensorArray>()) {
paddle::SmallVector<phi::TensorBase*> tensor_vector;
auto* tensor_array =
var->template GetMutable<framework::LoDTensorArray>();
// Note: If the input LoDTensorArray size is 0, the output
// LoDTensorArray is also 0
for (auto& t : *tensor_array) {
tensor_vector.emplace_back(&t);
}
pt_kernel_context->EmplaceBackOutputsWithoutSetRange(tensor_vector);
end_idx += tensor_array->size() - 1;
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported output `%s` type when call pt kernel.",
framework::ToTypeName(var->Type())));
}
} else {
pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
}
pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
}
pt_kernel_context->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
}
VLOG(4) << "Done outputs";
......
......@@ -483,6 +483,10 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext {
return ctx_.InputVar(name)->IsType<phi::SelectedRows>();
}
bool IsDenseTensorVectorInput(const std::string& name) const override {
return ctx_.InputVar(name)->IsType<framework::LoDTensorArray>();
}
bool IsDenseTensorOutput(const std::string& name) const override {
return ctx_.OutputVar(name)->IsType<framework::LoDTensor>();
}
......
......@@ -423,7 +423,7 @@ void TensorAdd(const VarType& src, VarType* dst) {
}
if (data_type == framework::proto::VarType::BF16) {
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return TensorAddImpl<platform::CUDADeviceContext, platform::bfloat16>(
src_tensor, dst_tensor, place);
#else
......
......@@ -289,14 +289,23 @@ void BuildDygraphPhiKernelContext(
auto& var = ins_vector[offset]->Var();
if (var.template IsType<phi::DenseTensor>()) {
tensor_in = &(var.template Get<phi::DenseTensor>());
kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in);
} else if (var.template IsType<phi::SelectedRows>()) {
tensor_in = &(var.template Get<phi::SelectedRows>());
kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in);
} else if (var.template IsType<framework::LoDTensorArray>()) {
paddle::SmallVector<const phi::TensorBase*> tensor_vector;
auto& tensor_array = var.template Get<framework::LoDTensorArray>();
for (auto& t : tensor_array) {
tensor_vector.emplace_back(&t);
}
kernel_ctx->EmplaceBackInputsWithoutSetRange(tensor_vector);
end_idx += tensor_array.size() - 1;
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported input `%s` type when call pt kernel.",
framework::ToTypeName(var.Type())));
}
kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in);
}
kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i);
}
......@@ -326,16 +335,27 @@ void BuildDygraphPhiKernelContext(
if (var) {
if (var->template IsType<phi::DenseTensor>()) {
tensor_out = var->template GetMutable<phi::DenseTensor>();
kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out);
} else if (var->template IsType<phi::SelectedRows>()) {
tensor_out = var->template GetMutable<phi::SelectedRows>();
kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out);
} else if (var->template IsType<framework::LoDTensorArray>()) {
paddle::SmallVector<phi::TensorBase*> tensor_vector;
auto* tensor_array =
var->template GetMutable<framework::LoDTensorArray>();
for (auto& t : *tensor_array) {
tensor_vector.emplace_back(&t);
}
kernel_ctx->EmplaceBackOutputsWithoutSetRange(tensor_vector);
end_idx += tensor_array->size() - 1;
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported output `%s` type when call pt kernel.",
framework::ToTypeName(var->Type())));
}
} else {
kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out);
}
kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out);
}
kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
}
......
......@@ -50,8 +50,7 @@
#include "paddle/phi/api/ext/op_meta_info.h"
#include "paddle/utils/string/split.h"
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
!defined(PADDLE_WITH_ASCEND_CL)
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h"
#include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h"
#include "paddle/fluid/distributed/fleet_executor/task_node.h"
......@@ -374,8 +373,7 @@ static void DisablePrepareDataOpt(
}
bool AnalysisPredictor::PrepareExecutor() {
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
!defined(PADDLE_WITH_ASCEND_CL)
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
if (config_.dist_config().use_dist_model()) {
VLOG(3) << "use_dist_model is enabled, will init FleetExecutor.";
return PrepareFleetExecutor();
......@@ -393,8 +391,7 @@ bool AnalysisPredictor::PrepareExecutor() {
return true;
}
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
!defined(PADDLE_WITH_ASCEND_CL)
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
bool AnalysisPredictor::PrepareFleetExecutor() {
VLOG(3) << "AnalysisPredictor::PrepareFleetExecutor()";
if (config_.dist_config().nranks() > 1 && !CommInit()) {
......@@ -1194,8 +1191,7 @@ std::vector<std::string> AnalysisPredictor::GetOutputNames() {
std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
const std::string &name) {
framework::Scope *scope;
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
!defined(PADDLE_WITH_ASCEND_CL)
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
if (config_.dist_config().use_dist_model()) {
scope = scope_.get();
} else {
......@@ -1244,8 +1240,7 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
const std::string &name) {
framework::Scope *scope;
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
!defined(PADDLE_WITH_ASCEND_CL)
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
if (config_.dist_config().use_dist_model()) {
scope = scope_.get();
} else {
......@@ -1292,8 +1287,7 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
}
bool AnalysisPredictor::ZeroCopyRun() {
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
!defined(PADDLE_WITH_ASCEND_CL)
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
if (config_.dist_config().use_dist_model()) {
VLOG(3) << "ZeroCopyRun will use the fleet executor.";
inference::Timer timer;
......
......@@ -18,8 +18,7 @@
#include <memory>
#include <string>
#include <vector>
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
!defined(PADDLE_WITH_ASCEND_CL)
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h"
#endif
#include "paddle/fluid/framework/naive_executor.h"
......@@ -395,8 +394,7 @@ class AnalysisPredictor : public PaddlePredictor {
void StatisticShapeRangeInfo();
void CollectShapeRangeInfo();
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
!defined(PADDLE_WITH_ASCEND_CL)
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
// fleet exe related
///
......@@ -488,8 +486,7 @@ class AnalysisPredictor : public PaddlePredictor {
std::map<std::string, std::vector<std::vector<int32_t>>> shape_info_;
static int clone_num_;
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
!defined(PADDLE_WITH_ASCEND_CL)
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
// fleet executor related
distributed::FleetExecutorDesc executor_desc_;
std::shared_ptr<distributed::FleetExecutor> fleet_exe_;
......
......@@ -14,7 +14,11 @@
#
cc_library(reset_tensor_array SRCS reset_tensor_array.cc DEPS lod_tensor scope)
cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce)
if (WITH_ONNXRUNTIME)
cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce onnxruntime)
else (WITH_ONNXRUNTIME)
cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce)
endif (WITH_ONNXRUNTIME)
cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc)
cc_test(zero_copy_tensor_test SRCS zero_copy_tensor_test.cc DEPS paddle_inference_api)
......@@ -22,12 +22,22 @@
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/core/allocator.h"
#ifdef PADDLE_WITH_ONNXRUNTIME
#include "paddle/fluid/inference/api/onnxruntime_predictor.h"
#endif
namespace paddle_infer {
using float16 = paddle::platform::float16;
void Tensor::Reshape(const std::vector<int> &shape) {
#ifdef PADDLE_WITH_ONNXRUNTIME
if (is_ort_tensor_) {
shape_.assign(shape.begin(), shape.end());
return;
}
#endif
PADDLE_ENFORCE_EQ(
name_.empty(), false,
paddle::platform::errors::PreconditionNotMet(
......@@ -123,6 +133,11 @@ T *Tensor::data(PlaceType *place, int *size) const {
}
DataType Tensor::type() const {
#ifdef PADDLE_WITH_ONNXRUNTIME
if (is_ort_tensor_) {
return dtype_;
}
#endif
EAGER_GET_TENSOR(paddle::framework::LoDTensor);
auto type = paddle::framework::TransToProtoVarType(tensor->dtype());
if (type == paddle::framework::proto::VarType::FP32) {
......@@ -145,6 +160,13 @@ PlaceType Tensor::place() const { return place_; }
template <typename T>
void Tensor::CopyFromCpu(const T *data) {
#ifdef PADDLE_WITH_ONNXRUNTIME
if (is_ort_tensor_) {
ORTCopyFromCpu<T>(data);
return;
}
#endif
EAGER_GET_TENSOR(paddle::framework::LoDTensor);
PADDLE_ENFORCE_GE(tensor->numel(), 0,
paddle::platform::errors::PreconditionNotMet(
......@@ -382,6 +404,13 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
template <typename T>
void Tensor::CopyToCpu(T *data) const {
#ifdef PADDLE_WITH_ONNXRUNTIME
if (is_ort_tensor_) {
ORTCopyToCpu<T>(data);
return;
}
#endif
CopyToCpuImpl<T>(data, nullptr, nullptr, nullptr);
}
......@@ -489,12 +518,7 @@ template PD_INFER_DECL uint8_t *Tensor::mutable_data<uint8_t>(PlaceType place);
template PD_INFER_DECL int8_t *Tensor::mutable_data<int8_t>(PlaceType place);
template PD_INFER_DECL float16 *Tensor::mutable_data<float16>(PlaceType place);
Tensor::Tensor(void *scope) : scope_{scope} {
PADDLE_ENFORCE_NOT_NULL(scope_,
paddle::platform::errors::PreconditionNotMet(
"The `scope` can not be nullptr. It should be "
"set to the pointer of scope."));
}
Tensor::Tensor(void *scope) : scope_{scope} {}
template <typename T>
void *Tensor::FindTensor() const {
......@@ -513,6 +537,26 @@ void *Tensor::FindTensor() const {
}
std::vector<int> Tensor::shape() const {
#ifdef PADDLE_WITH_ONNXRUNTIME
if (is_ort_tensor_) {
std::vector<int> shape;
// input handle
if (idx_ < 0) {
shape.assign(shape_.begin(), shape_.end());
} else { // output handle
auto binding = binding_.lock();
PADDLE_ENFORCE_NOT_NULL(binding,
paddle::platform::errors::PreconditionNotMet(
"output tensor [%s] no binding ptr", name_));
std::vector<Ort::Value> outputs = binding->GetOutputValues();
Ort::Value &value = outputs[idx_];
auto info = value.GetTensorTypeAndShapeInfo();
auto ort_shape = info.GetShape();
shape.assign(ort_shape.begin(), ort_shape.end());
}
return shape;
}
#endif
EAGER_GET_TENSOR(paddle::framework::LoDTensor);
PADDLE_ENFORCE_NOT_NULL(
tensor_, paddle::platform::errors::PreconditionNotMet(
......@@ -573,4 +617,99 @@ void Tensor::SetPlace(PlaceType place, int device) {
device_ = device;
}
#ifdef PADDLE_WITH_ONNXRUNTIME
void Tensor::SetOrtMark(bool is_ort_tensor) { is_ort_tensor_ = is_ort_tensor; }
void Tensor::SetOrtBinding(const std::shared_ptr<Ort::IoBinding> binding) {
binding_ = binding;
}
Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, float *data,
size_t size, const int64_t *shape, size_t shape_len) {
return Ort::Value::CreateTensor<float>(memory_info, data, size, shape,
shape_len);
}
Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int64_t *data,
size_t size, const int64_t *shape, size_t shape_len) {
return Ort::Value::CreateTensor<int64_t>(memory_info, data, size, shape,
shape_len);
}
Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int32_t *data,
size_t size, const int64_t *shape, size_t shape_len) {
return Ort::Value::CreateTensor<int32_t>(memory_info, data, size, shape,
shape_len);
}
Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, uint8_t *data,
size_t size, const int64_t *shape, size_t shape_len) {
return Ort::Value::CreateTensor<uint8_t>(memory_info, data, size, shape,
shape_len);
}
Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int8_t *data,
size_t size, const int64_t *shape, size_t shape_len) {
return Ort::Value::CreateTensor<int8_t>(memory_info, data, size, shape,
shape_len);
}
Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, float16 *data,
size_t size, const int64_t *shape, size_t shape_len) {
return Ort::Value::CreateTensor(memory_info, static_cast<void *>(data),
size * sizeof(float16), shape, shape_len,
ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16);
}
template <typename T>
void Tensor::ORTCopyFromCpu(const T *data) {
auto binding = binding_.lock();
PADDLE_ENFORCE_NOT_NULL(binding,
paddle::platform::errors::PreconditionNotMet(
"input tensor [%s] no binding ptr", name_));
const char *device_name = place_ == PlaceType::kCPU ? "Cpu" : "Cuda";
Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, device_,
OrtMemTypeDefault);
size_t size = std::accumulate(begin(shape_), end(shape_), 1UL,
std::multiplies<size_t>());
auto ort_value = GetOrtVaule(memory_info, const_cast<T *>(data), size,
shape_.data(), shape_.size());
binding->BindInput(name_.c_str(), ort_value);
}
template <typename T>
void Tensor::ORTCopyToCpu(T *data) const {
auto binding = binding_.lock();
PADDLE_ENFORCE_NOT_NULL(binding,
paddle::platform::errors::PreconditionNotMet(
"output tensor [%s] no binding ptr", name_));
std::vector<Ort::Value> outputs = binding->GetOutputValues();
Ort::Value &value = outputs[idx_];
auto info = value.GetTensorTypeAndShapeInfo();
size_t size = info.GetElementCount() * sizeof(T);
if (place_ == PlaceType::kCPU) {
std::memcpy(static_cast<void *>(data), value.GetTensorData<void *>(), size);
} else {
paddle::memory::Copy(paddle::platform::CPUPlace(),
static_cast<void *>(data),
paddle::platform::CUDAPlace(device_),
value.GetTensorData<void>(), size, nullptr);
}
}
template void Tensor::ORTCopyFromCpu<float>(const float *data);
template void Tensor::ORTCopyFromCpu<int64_t>(const int64_t *data);
template void Tensor::ORTCopyFromCpu<int32_t>(const int32_t *data);
template void Tensor::ORTCopyFromCpu<uint8_t>(const uint8_t *data);
template void Tensor::ORTCopyFromCpu<int8_t>(const int8_t *data);
template void Tensor::ORTCopyFromCpu<float16>(const float16 *data);
template void Tensor::ORTCopyToCpu<float>(float *data) const;
template void Tensor::ORTCopyToCpu<int32_t>(int32_t *data) const;
template void Tensor::ORTCopyToCpu<uint8_t>(uint8_t *data) const;
template void Tensor::ORTCopyToCpu<int8_t>(int8_t *data) const;
template void Tensor::ORTCopyToCpu<float16>(float16 *data) const;
#endif
} // namespace paddle_infer
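For orientation, the ORTCopyFromCpu/ORTCopyToCpu path added above reduces to the plain ONNX Runtime C++ IoBinding API. A hedged, CPU-only sketch; the model path "model.onnx" and the tensor names "x"/"y" are placeholders, not taken from this patch:

#include <cstdint>
#include <cstdio>
#include <vector>

#include "onnxruntime_cxx_api.h"

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "demo");
  Ort::SessionOptions opts;
  Ort::Session session(env, "model.onnx", opts);  // placeholder model file
  Ort::IoBinding binding(session);

  // CopyFromCpu(): wrap the caller's host buffer in an Ort::Value and bind it.
  std::vector<float> input(3, 1.0f);
  std::vector<int64_t> shape = {1, 3};
  auto mem = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
  Ort::Value value = Ort::Value::CreateTensor<float>(
      mem, input.data(), input.size(), shape.data(), shape.size());
  binding.BindInput("x", value);

  // Outputs are bound to a device, not to a preallocated buffer.
  binding.BindOutput("y", mem);

  // ZeroCopyRun() now boils down to a single bound run.
  session.Run(Ort::RunOptions{}, binding);

  // CopyToCpu(): fetch the bound output value and read it back.
  std::vector<Ort::Value> outputs = binding.GetOutputValues();
  auto info = outputs[0].GetTensorTypeAndShapeInfo();
  std::printf("output elements: %zu\n", info.GetElementCount());
  return 0;
}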
......@@ -25,11 +25,7 @@
#include <vector>
#include "paddle/fluid//platform/device/gpu/gpu_types.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/framework/version.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
......@@ -45,24 +41,23 @@
namespace paddle {
framework::proto::VarType::Type ConvertONNXType(
ONNXTensorElementDataType type) {
paddle_infer::DataType ConvertONNXType(ONNXTensorElementDataType type) {
switch (type) {
case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT:
return framework::proto::VarType::FP32;
// case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16:
// return DataType::FP16;
return paddle_infer::DataType::FLOAT32;
case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16:
return paddle_infer::DataType::FLOAT16;
case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8:
return framework::proto::VarType::INT8;
return paddle_infer::DataType::INT8;
case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32:
return framework::proto::VarType::INT32;
return paddle_infer::DataType::INT32;
case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64:
return framework::proto::VarType::INT64;
return paddle_infer::DataType::INT64;
case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8:
return framework::proto::VarType::UINT8;
return paddle_infer::DataType::UINT8;
default:
LOG(ERROR) << "unsupported ONNX Tensor Type: " << static_cast<int>(type);
return framework::proto::VarType::FP32;
return paddle_infer::DataType::FLOAT32;
}
}
......@@ -87,13 +82,12 @@ bool ONNXRuntimePredictor::Init() {
VLOG(3) << "ONNXRuntime Predictor::init()";
// Now ONNXRuntime only supports CPU
const char *device_name = config_.use_gpu() ? "Cuda" : "Cpu";
if (config_.use_gpu()) {
place_ = paddle::platform::CUDAPlace(config_.gpu_device_id());
} else {
place_ = paddle::platform::CPUPlace();
}
scope_.reset(new paddle::framework::Scope());
sub_scope_ = &scope_->NewScope();
std::string onnx_proto;
paddle2onnx::Export(config_.prog_file(), config_.params_file(), &onnx_proto,
......@@ -125,13 +119,12 @@ bool ONNXRuntimePredictor::Init() {
"generated.";
}
session_ = {env_, onnx_proto.data(), onnx_proto.size(), session_options};
binding_ = std::make_shared<Ort::IoBinding>(session_);
auto memory_info =
Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator,
place_.GetDeviceId(), OrtMemTypeDefault);
Ort::Allocator allocator(session_, memory_info);
framework::proto::VarType::Type proto_type =
framework::proto::VarType::LOD_TENSOR;
size_t n_inputs = session_.GetInputCount();
for (size_t i = 0; i < n_inputs; ++i) {
auto input_name = session_.GetInputName(i, allocator);
......@@ -141,8 +134,6 @@ bool ONNXRuntimePredictor::Init() {
ONNXTensorElementDataType data_type =
type_info.GetTensorTypeAndShapeInfo().GetElementType();
input_desc_.emplace_back(ONNXDesc{input_name, shape, data_type});
auto *ptr = scope_->Var(input_name);
framework::InitializeVariable(ptr, proto_type);
allocator.Free(input_name);
}
......@@ -155,11 +146,13 @@ bool ONNXRuntimePredictor::Init() {
ONNXTensorElementDataType data_type =
type_info.GetTensorTypeAndShapeInfo().GetElementType();
output_desc_.emplace_back(ONNXDesc{output_name, shape, data_type});
auto *ptr = scope_->Var(output_name);
framework::InitializeVariable(ptr, proto_type);
Ort::MemoryInfo out_memory_info(device_name, OrtDeviceAllocator,
place_.GetDeviceId(), OrtMemTypeDefault);
binding_->BindOutput(output_name, out_memory_info);
allocator.Free(output_name);
}
return true;
}
......@@ -216,15 +209,26 @@ std::vector<std::string> ONNXRuntimePredictor::GetOutputNames() {
return output_names;
}
bool ONNXRuntimePredictor::FindONNXDesc(const std::string &name,
bool is_input) {
if (is_input) {
for (auto i : input_desc_)
if (i.name == name) return true;
} else {
for (auto i : output_desc_)
if (i.name == name) return true;
}
return false;
}
std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetInputTensor(
const std::string &name) {
PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name),
platform::errors::PreconditionNotMet(
"The in variable named %s is not found in the "
"scope of the ONNXPredictor.",
name));
std::unique_ptr<ZeroCopyTensor> res(
new ZeroCopyTensor(static_cast<void *>(scope_.get())));
PADDLE_ENFORCE_EQ(FindONNXDesc(name, true), true,
platform::errors::PreconditionNotMet(
"The in variable named %s is not found in the "
"ONNXPredictor.",
name));
std::unique_ptr<ZeroCopyTensor> res(new ZeroCopyTensor(nullptr));
res->input_or_output_ = true;
res->SetName(name);
if (platform::is_cpu_place(place_)) {
......@@ -233,18 +237,19 @@ std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetInputTensor(
auto gpu_place = place_;
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
}
res->SetOrtMark(true);
res->SetOrtBinding(binding_);
return res;
}
std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetOutputTensor(
const std::string &name) {
PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name),
platform::errors::PreconditionNotMet(
"The out variable named %s is not found in the "
"scope of the ONNXPredictor.",
name));
std::unique_ptr<ZeroCopyTensor> res(
new ZeroCopyTensor(static_cast<void *>(scope_.get())));
PADDLE_ENFORCE_EQ(FindONNXDesc(name, false), true,
platform::errors::PreconditionNotMet(
"The out variable named %s is not found in the "
"ONNXPredictor.",
name));
std::unique_ptr<ZeroCopyTensor> res(new ZeroCopyTensor(nullptr));
res->input_or_output_ = false;
res->SetName(name);
if (platform::is_cpu_place(place_)) {
......@@ -253,46 +258,18 @@ std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetOutputTensor(
auto gpu_place = place_;
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
}
res->SetOrtMark(true);
res->SetOrtBinding(binding_);
int size = output_desc_.size();
for (int i = 0; i < size; ++i)
if (output_desc_[i].name == name) {
res->idx_ = i;
res->dtype_ = ConvertONNXType(output_desc_[i].dtype);
break;
}
return res;
}
Ort::Value ONNXRuntimePredictor::GetOrtValue(const ONNXDesc &desc,
const char *device_name) {
Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator,
place_.GetDeviceId(), OrtMemTypeDefault);
auto *var = scope_->FindVar(desc.name);
auto *tensor = var->GetMutable<framework::LoDTensor>();
size_t size =
tensor->numel() *
framework::SizeOfType(framework::TransToProtoVarType(tensor->dtype()));
std::vector<int64_t> shape = phi::vectorize<int64_t>(tensor->dims());
return Ort::Value::CreateTensor(memory_info,
static_cast<void *>(tensor->data()), size,
shape.data(), shape.size(), desc.dtype);
}
void ONNXRuntimePredictor::AsTensor(const Ort::Value &value,
const ONNXDesc &desc) {
auto info = value.GetTensorTypeAndShapeInfo();
auto *var = scope_->FindVar(desc.name);
auto *tensor = var->GetMutable<framework::LoDTensor>();
tensor->Resize(phi::make_ddim(info.GetShape()));
auto dtype = ConvertONNXType(info.GetElementType());
auto *ptr = tensor->mutable_data(place_, dtype);
if (platform::is_cpu_place(place_)) {
std::memcpy(ptr, const_cast<void *>(value.GetTensorData<void>()),
tensor->numel() * framework::SizeOfType(dtype));
} else {
auto src_place = place_;
auto dst_place = place_;
memory::Copy(dst_place, ptr, src_place,
const_cast<void *>(value.GetTensorData<void>()),
tensor->numel() * framework::SizeOfType(dtype));
}
}
bool ONNXRuntimePredictor::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data,
int batch_size) {
......@@ -302,31 +279,7 @@ bool ONNXRuntimePredictor::Run(const std::vector<PaddleTensor> &inputs,
bool ONNXRuntimePredictor::ZeroCopyRun() {
try {
Ort::IoBinding binding(session_);
std::vector<Ort::Value> inputs;
std::vector<Ort::Value> outputs;
Ort::RunOptions options;
inputs.reserve(input_desc_.size());
const char *device_name = config_.use_gpu() ? "Cuda" : "Cpu";
for (auto desc : input_desc_) {
inputs.push_back(GetOrtValue(desc, device_name));
binding.BindInput(desc.name.c_str(), inputs.back());
}
// TODO(heliqi): Optimization —— move to Init()
for (auto desc : output_desc_) {
Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator,
place_.GetDeviceId(), OrtMemTypeDefault);
binding.BindOutput(desc.name.c_str(), memory_info);
}
session_.Run({}, binding);
outputs = binding.GetOutputValues();
for (size_t i = 0; i < output_desc_.size(); ++i) {
AsTensor(outputs[i], output_desc_[i]);
}
session_.Run({}, *(binding_.get()));
} catch (const std::exception &e) {
LOG(ERROR) << e.what();
return false;
......@@ -345,9 +298,9 @@ uint64_t ONNXRuntimePredictor::TryShrinkMemory() {
}
ONNXRuntimePredictor::~ONNXRuntimePredictor() {
if (sub_scope_) {
scope_->DeleteScope(sub_scope_);
}
binding_->ClearBoundInputs();
binding_->ClearBoundOutputs();
memory::Release(place_);
}
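With the scope removed, a user-level run against the ONNXRuntime backend keeps the familiar paddle_infer flow; only the storage behind the tensor handles changes (the shared Ort::IoBinding). A hedged usage sketch; the model file names are placeholders and EnableONNXRuntime() is assumed to be the config switch for this backend:

#include <vector>

#include "paddle_inference_api.h"

int main() {
  paddle_infer::Config config;
  config.SetModel("model.pdmodel", "model.pdiparams");  // placeholder files
  config.EnableONNXRuntime();  // assumed switch for the ORT backend

  auto predictor = paddle_infer::CreatePredictor(config);

  auto input = predictor->GetInputHandle(predictor->GetInputNames()[0]);
  std::vector<float> data(1 * 3 * 224 * 224, 0.f);
  input->Reshape({1, 3, 224, 224});
  input->CopyFromCpu(data.data());  // binds an Ort::Value, no LoDTensor staging

  predictor->Run();  // internally: session_.Run({}, *binding_)

  auto output = predictor->GetOutputHandle(predictor->GetOutputNames()[0]);
  std::vector<int> shape = output->shape();  // read from the bound output
  int numel = 1;
  for (int d : shape) numel *= d;
  std::vector<float> result(numel);
  output->CopyToCpu(result.data());
  return 0;
}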
......
......@@ -94,9 +94,8 @@ class ONNXRuntimePredictor : public PaddlePredictor {
/// \param[in] AnalysisConfig config
///
explicit ONNXRuntimePredictor(const AnalysisConfig &config)
: config_(config) {
: config_(config), env_(ORT_LOGGING_LEVEL_WARNING, "onnx") {
predictor_id_ = inference::GetUniqueId();
env_ = Ort::Env(ORT_LOGGING_LEVEL_INFO, "onnx");
}
///
/// \brief Destroy the ONNXRuntime Predictor object
......@@ -177,30 +176,17 @@ class ONNXRuntimePredictor : public PaddlePredictor {
///
std::unique_ptr<PaddlePredictor> Clone() override;
std::shared_ptr<framework::Scope> scope_;
private:
///
/// \brief get the Ort Value(input Tensor).
///
/// \param[in] desc ONNXDesce(name、shape、dtype)
///
/// \param[in] device_name "cpu" or "gpu" of device
///
/// \return get a Ort::Value
///
Ort::Value GetOrtValue(const ONNXDesc &desc, const char *device_name);
///
/// \brief Ort::Value to Paddle::ZeroCopyTensor.
/// \brief Whether an input/output ONNXDesc with the given name exists.
///
/// \param[in] value Ort::Value(output Tensor)
/// \param[in] name input or output name
///
/// \param[in] desc a ONNXDesce(name、shape、dtype)
/// \param[in] is_input input(true) or output(false)
///
/// \return get a Ort::Value
/// \return True if a matching ONNXDesc is found.
///
void AsTensor(const Ort::Value &value, const ONNXDesc &desc);
bool FindONNXDesc(const std::string &name, bool is_input);
private:
AnalysisConfig config_;
......@@ -208,9 +194,9 @@ class ONNXRuntimePredictor : public PaddlePredictor {
// ONNXRuntime
Ort::Env env_;
Ort::Session session_{nullptr};
std::shared_ptr<Ort::IoBinding> binding_;
platform::Place place_;
framework::Scope *sub_scope_{nullptr};
std::vector<ONNXDesc> input_desc_;
std::vector<ONNXDesc> output_desc_;
int predictor_id_;
......
......@@ -18,6 +18,11 @@
#include "paddle_infer_declare.h" // NOLINT
#ifdef PADDLE_WITH_ONNXRUNTIME
#include "onnxruntime_c_api.h" // NOLINT
#include "onnxruntime_cxx_api.h" // NOLINT
#endif
namespace paddle_infer {
/// \brief Experimental.
......@@ -175,6 +180,23 @@ class PD_INFER_DECL Tensor {
PlaceType place_;
int device_;
#ifdef PADDLE_WITH_ONNXRUNTIME
bool is_ort_tensor_{false};
std::vector<int64_t> shape_;
std::weak_ptr<Ort::IoBinding> binding_;
int idx_{-1};
void SetOrtMark(bool is_ort_tensor);
void SetOrtBinding(const std::shared_ptr<Ort::IoBinding> binding);
template <typename T>
void ORTCopyFromCpu(const T* data);
template <typename T>
void ORTCopyToCpu(T* data) const;
#endif
friend class paddle_infer::contrib::TensorUtils;
#if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST)
friend class paddle_infer::InferApiTesterUtils;
......
......@@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/layer_norm_op.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h"
......
......@@ -53,6 +53,6 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); }
} // namespace paddle
USE_OP_ITSELF(relu);
USE_OP(sigmoid);
USE_OP_ITSELF(sigmoid);
USE_OP_ITSELF(tanh);
USE_OP(relu6);
......@@ -17,7 +17,7 @@
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h"
#include "paddle/fluid/operators/layer_norm_op.h"
#include "paddle/phi/kernels/layer_norm_kernel.h"
namespace paddle {
namespace inference {
......@@ -83,7 +83,7 @@ int LayerNormPlugin::enqueue(int batch_size, const void *const *inputs,
cudaMemcpyAsync(bias_d, bias_.data(), sizeof(float) * feature_size,
cudaMemcpyHostToDevice, stream);
paddle::operators::LayerNormDirectCUDAFunctor<float> layer_norm;
phi::LayerNormDirectCUDAFunctor<float> layer_norm;
layer_norm(stream, input, input_shape, bias_d, scale_d, output, mean_d,
variance_d, begin_norm_axis, eps);
return cudaGetLastError() != cudaSuccess;
......@@ -177,7 +177,7 @@ int LayerNormPluginDynamic::enqueue(
cudaMemcpyAsync(bias_d, bias_.data(), sizeof(float) * feature_size,
cudaMemcpyHostToDevice, stream);
paddle::operators::LayerNormDirectCUDAFunctor<float> layer_norm;
phi::LayerNormDirectCUDAFunctor<float> layer_norm;
layer_norm(stream, input, input_shape, bias_d, scale_d, output, mean_d,
variance_d, begin_norm_axis, eps);
} else {
......
......@@ -1492,6 +1492,10 @@ REGISTER_ACTIVATION_OP(softshrink, SoftShrink, SoftShrinkFunctor,
REGISTER_ACTIVATION_OP(tanh_shrink, TanhShrink, TanhShrinkFunctor,
TanhShrinkGradFunctor);
REGISTER_ACTIVATION_OP(silu, Silu, SiluFunctor, SiluGradFunctor);
REGISTER_ACTIVATION_OP(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,
HardSigmoidGradFunctor);
REGISTER_ACTIVATION_OP(logsigmoid, LogSigmoid, LogSigmoidFunctor,
LogSigmoidGradFunctor);
/* ========================== sigmoid register =============================
*/
......@@ -1526,30 +1530,6 @@ REGISTER_OPERATOR(sigmoid_triple_grad,
ops::SigmoidTripleGradFunctor<float>::FwdDeps()>,
ops::ActivationTripleGradOpInplaceInferer);
// Register Sigmoid/GradSigmoid Kernels
REGISTER_ACTIVATION_CPU_KERNEL(sigmoid, Sigmoid, SigmoidFunctor,
SigmoidGradFunctor);
// Register DoubleGrad Kernel
REGISTER_OP_CPU_KERNEL(
sigmoid_grad_grad,
ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
ops::SigmoidGradGradFunctor<float>>,
ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
ops::SigmoidGradGradFunctor<double>>,
ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
ops::SigmoidGradGradFunctor<plat::float16>>);
// Register TripleGrad Kernel
REGISTER_OP_CPU_KERNEL(
sigmoid_triple_grad,
ops::SigmoidTripleGradKernel<plat::CPUDeviceContext,
ops::SigmoidTripleGradFunctor<float>>,
ops::SigmoidTripleGradKernel<plat::CPUDeviceContext,
ops::SigmoidTripleGradFunctor<double>>,
ops::SigmoidTripleGradKernel<plat::CPUDeviceContext,
ops::SigmoidTripleGradFunctor<plat::float16>>);
/* ========================================================================== */
/* ========================== tanh register ============================= */
......
......@@ -238,15 +238,6 @@ struct BaseActivationFunctor {
AttrPair GetAttrs() { return AttrPair(); }
};
// sigmoid(x) = 1 / (1 + exp(-x))
template <typename T>
struct SigmoidFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
}
};
#define USE_PHI_FUNCTOR(name) \
template <typename T> \
using name##Functor = phi::funcs::name##Functor<T>; \
......@@ -286,6 +277,11 @@ USE_PHI_FUNCTOR(TanhShrink)
USE_PHI_FUNCTOR(Silu)
USE_PHI_FUNCTOR(ELU)
USE_PHI_DOUBLE_GRAD_FUNCTOR(ELU)
USE_PHI_FUNCTOR(Sigmoid)
USE_PHI_DOUBLE_GRAD_FUNCTOR(Sigmoid)
USE_PHI_TRIPLE_GRAD_FUNCTOR(Sigmoid)
USE_PHI_FUNCTOR(LogSigmoid)
USE_PHI_FUNCTOR(HardSigmoid)
USE_PHI_FUNCTOR(Expm1)
USE_PHI_FUNCTOR(Mish)
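The USE_PHI_FUNCTOR lines above only introduce template aliases, so existing fluid call sites keep compiling while the math itself now lives in phi::funcs. A self-contained toy illustration of that aliasing pattern (the phi_like namespace and the macro name are invented for the sketch):

#include <cmath>
#include <cstdio>

namespace phi_like {  // stands in for phi::funcs in this sketch
template <typename T>
struct SigmoidFunctor {
  T operator()(T x) const {
    return static_cast<T>(1) / (static_cast<T>(1) + std::exp(-x));
  }
};
}  // namespace phi_like

// Analogue of USE_PHI_FUNCTOR(name): the fluid-side name becomes an alias.
#define USE_PHI_LIKE_FUNCTOR(name) \
  template <typename T>            \
  using name##Functor = phi_like::name##Functor<T>;

USE_PHI_LIKE_FUNCTOR(Sigmoid)  // SigmoidFunctor<T> now refers to the phi-side one

int main() {
  SigmoidFunctor<double> sigmoid;
  std::printf("%f\n", sigmoid(0.0));  // 0.5
  return 0;
}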
......@@ -300,157 +296,7 @@ USE_PHI_FUNCTOR(Softsign)
template <typename T>
using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor<T>;
template <typename T>
struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * out * (static_cast<T>(1) - out);
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
/*
Out
DOut -> SigmoidGradGrad -> DOutNew
DDX DDOut
DDOut = (1-Out)*Out*DDX
DOutNew = (1-2*Out)*DOut*DDX
*/
template <typename T>
struct SigmoidGradGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
void operator()(const Device& dev, const framework::Tensor* Out,
const framework::Tensor* ddX, const framework::Tensor* dOut,
framework::Tensor* dOutNew, framework::Tensor* ddOut) const {
auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidGradGrad"));
auto out = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad"));
if (dOutNew) {
auto dout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad"));
auto dout_new = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad"));
dout_new.device(*d) =
(static_cast<T>(1) - static_cast<T>(2) * out) * dout * ddx;
}
if (ddOut) {
auto ddout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad"));
ddout.device(*d) = (static_cast<T>(1) - out) * out * ddx;
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
/*
Out
DOut D_Dout
DDx -> SigmoidTripleGrad -> D_DDx
D_DDout d_OutNew
D_Dout_new
D_Dout = (1-2*Out)*DDx*D_Dout_new
D_DDx = (1-Out)*Out*D_DDout + (1-2*Out)*DOut*D_Dout_new
D_OutNew = (DDx-2*Out*DDx)*D_DDout - 2*DOut*DDx*D_Dout_new
Out, DDX, DOut, D_DDOut, D_DOut_New // input
D_OutNew, D_DOut, D_DDx // output
*/
template <typename T>
struct SigmoidTripleGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
void operator()(const Device& dev, const framework::Tensor* Out,
const framework::Tensor* ddX, const framework::Tensor* dOut,
const framework::Tensor* d_DDOut,
const framework::Tensor* d_dOut_New,
framework::Tensor* d_d_Out, framework::Tensor* d_Out_New,
framework::Tensor* d_DDx) const {
auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidTripleGrad"));
auto out = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidTripleGrad"));
auto dout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidTripleGrad"));
auto d_ddOut = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "SigmoidTripleGrad"));
auto d_dOutNew = framework::EigenVector<T>::Flatten(GET_DATA_SAFELY(
d_dOut_New, "Input", "D_DOut_New", "SigmoidTripleGrad"));
if (d_Out_New) {
auto d_OutNew = framework::EigenVector<T>::Flatten(GET_DATA_SAFELY(
d_Out_New, "Output", "D_OutNew", "SigmoidTripleGrad"));
d_OutNew.device(*d) = (ddx - static_cast<T>(2) * out * ddx) * d_ddOut -
static_cast<T>(2) * dout * ddx * d_dOutNew;
}
if (d_d_Out) {
auto d_dOut = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "SigmoidTripleGrad"));
d_dOut.device(*d) =
(static_cast<T>(1) - static_cast<T>(2) * out) * ddx * d_dOutNew;
}
if (d_DDx) {
auto d_ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "SigmoidTripleGrad"));
d_ddx.device(*d) =
(static_cast<T>(1) - out) * out * d_ddOut +
(static_cast<T>(1) - static_cast<T>(2) * out) * dout * d_dOutNew;
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
// Originally: logsigmoid(x) = -log (1 + exp(-x))
// For numerical stability, we can use the log-sum-exp trick:
// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
// We can rewrite the above equation as:
// out = -log( exp(0) + exp(-x)) [since exp(0) = 1]
// = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0)))
// = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) + exp(max(-x, 0)) * exp(-x -
// max(-x, 0)))
// = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
// = -log(exp(max(-x, 0))) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))
//
// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0))
// + exp(-x - max(-x, 0))))
template <typename T>
struct LogSigmoidFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
auto temp = (-x).cwiseMax(static_cast<T>(0)); // temp = max(-x, 0)
out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log());
}
};
// Originally: f' = exp(-x) / (1 + exp(-x))
// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) +
// exp(-x - max(-x, 0)))
template <typename T>
struct LogSigmoidGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp = (-x).cwiseMax(static_cast<T>(0)); // temp = max(-x, 0)
dx.device(d) =
dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp()));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// relu(x) = max(x, 0)
template <typename T>
using ReluCPUFunctor = phi::funcs::ReluCPUFunctor<T>;
......@@ -872,22 +718,66 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
};
template <typename T>
struct LogitFunctor {
template <typename Device, typename X, typename Out, typename P>
void operator()(Device d, X x, Out out, P p, float eps) const {
// logit(x) = ln(x/(1-x))
auto tmp_x =
(x.cwiseMin(static_cast<T>(1.0 - eps))).cwiseMax(static_cast<T>(eps));
if (!eps) {
out.device(d) = (x < static_cast<T>(0.0) || x > static_cast<T>(1.0))
.select(p.constant(static_cast<T>(NAN)),
(tmp_x / (static_cast<T>(1) - tmp_x)).log());
} else {
out.device(d) = (tmp_x / (static_cast<T>(1) - tmp_x)).log();
}
}
};
template <typename T>
struct LogitGradFunctor {
template <typename Device, typename X, typename dOut, typename dX, typename P>
void operator()(Device d, X x, dOut dout, dX dx, P p, float eps) const {
// logit(x)' = 1/(x*(1-x))
dx.device(d) =
(x < static_cast<T>(eps) || x > static_cast<T>(1.0 - eps))
.select(p.constant(static_cast<T>(0)),
dout * (static_cast<T>(1) / ((static_cast<T>(1) - x) * x)));
}
};
template <typename T>
struct STanhFunctor : public BaseActivationFunctor<T> {
float scale_a;
float scale_b;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) =
static_cast<T>(scale_b) * (static_cast<T>(scale_a) * x).tanh();
}
};
template <typename T>
struct STanhGradFunctor : public BaseActivationFunctor<T> {
float scale_a;
float scale_b;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
}
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto a = static_cast<T>(scale_a);
auto b = static_cast<T>(scale_b);
auto temp = (a * x).tanh() * (a * x).tanh();
dx.device(d) = dout * a * b * (static_cast<T>(1) - temp);
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
......@@ -1073,211 +982,6 @@ inline void ExtractDoubleGradTensorWithInputDOut(
}
}
template <typename DeviceContext, typename Functor>
class SigmoidDoubleGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
using T = typename Functor::ELEMENT_TYPE;
void Compute(const framework::ExecutionContext& ctx) const override {
const framework::Tensor *Out, *ddX, *dOut;
framework::Tensor *dOutNew, *ddOut;
Out = ddX = dOut = nullptr;
dOutNew = ddOut = nullptr;
// extract ddx(input) and out(input)
ddX = ctx.Input<framework::Tensor>("DDX");
Out = ctx.Input<framework::Tensor>("Out");
PADDLE_ENFORCE_NOT_NULL(
ddX, platform::errors::NotFound(
"Cannot get input Variable ddX, variable name = %s",
ctx.InputName("DDX")));
PADDLE_ENFORCE_NOT_NULL(
Out, platform::errors::NotFound(
"Cannot get input Variable Out, variable name = %s",
ctx.InputName("Out")));
// set output ddout
ddOut = ctx.Output<framework::Tensor>("DDOut");
// extract dOut(input)
dOut = ctx.Input<framework::Tensor>("DOut");
PADDLE_ENFORCE_NOT_NULL(
dOut, platform::errors::NotFound(
"Cannot get input Variable dOut, variable name = %s",
ctx.InputName("DOut")));
dOutNew = ctx.Output<framework::Tensor>("DOutNew");
if (dOutNew) dOutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
if (ddOut) ddOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
auto& place = ctx.template device_context<DeviceContext>();
Functor functor;
functor(place, Out, ddX, dOut, dOutNew, ddOut);
}
};
// Out, DDX, DOut, D_DDOut, D_DOut_New // input
// D_OutNew, D_DOut, D_DDx // output
template <typename DeviceContext, typename Functor>
class SigmoidTripleGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
using T = typename Functor::ELEMENT_TYPE;
void Compute(const framework::ExecutionContext& ctx) const override {
const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew;
framework::Tensor *d_OutNew, *d_dOut, *d_ddx;
Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr;
d_OutNew = d_dOut = d_ddx = nullptr;
// extract ddx(input), out(input), dOut(input), d_ddOut(input),
// d_dOutNew(input)
ddX = ctx.Input<framework::Tensor>("DDX");
Out = ctx.Input<framework::Tensor>("Out");
dOut = ctx.Input<framework::Tensor>("DOut");
d_ddOut = ctx.Input<framework::Tensor>("D_DDOut");
d_dOutNew = ctx.Input<framework::Tensor>("D_DOut_New");
PADDLE_ENFORCE_NOT_NULL(
ddX, platform::errors::NotFound(
"Cannot get input Variable ddX, variable name = %s",
ctx.InputName("DDX")));
PADDLE_ENFORCE_NOT_NULL(
Out, platform::errors::NotFound(
"Cannot get input Variable Out, variable name = %s",
ctx.InputName("Out")));
PADDLE_ENFORCE_NOT_NULL(
dOut, platform::errors::NotFound(
"Cannot get input Variable dOut, variable name = %s",
ctx.InputName("DOut")));
PADDLE_ENFORCE_NOT_NULL(
d_ddOut, platform::errors::NotFound(
"Cannot get input Variable d_ddOut, variable name = %s",
ctx.InputName("D_DDOut")));
PADDLE_ENFORCE_NOT_NULL(
d_dOutNew,
platform::errors::NotFound(
"Cannot get input Variable d_dOutNew, variable name = %s",
ctx.InputName("D_DOutNew")));
// set output d_OutNew、d_dOut、d_ddx
d_dOut = ctx.Output<framework::Tensor>("D_DOut");
d_OutNew = ctx.Output<framework::Tensor>("D_OutNew");
d_ddx = ctx.Output<framework::Tensor>("D_DDx");
if (d_dOut) d_dOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
if (d_OutNew) d_OutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
if (d_ddx) d_ddx->mutable_data<T>(ddX->dims(), ctx.GetPlace());
auto& place = ctx.template device_context<DeviceContext>();
Functor functor;
functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew, // input
d_dOut, d_OutNew, d_ddx); // output
}
};
template <typename DeviceContext, typename Functor>
class TanhDoubleGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
using T = typename Functor::ELEMENT_TYPE;
void Compute(const framework::ExecutionContext& ctx) const override {
const framework::Tensor *Out, *ddX, *dOut;
framework::Tensor *dOutNew, *ddOut;
Out = ddX = dOut = nullptr;
dOutNew = ddOut = nullptr;
// extract ddx(input) and out(input)
auto ddx_var = ctx.InputVar("DDX");
auto out_var = ctx.InputVar("Out");
PADDLE_ENFORCE_NOT_NULL(
ddx_var, platform::errors::NotFound(
"Cannot get input Variable ddx, variable name = %s",
ctx.InputName("DDX")));
PADDLE_ENFORCE_NOT_NULL(
out_var, platform::errors::NotFound(
"Cannot get input Variable out, variable name = %s",
ctx.InputName("Out")));
ddX = ctx.Input<framework::Tensor>("DDX");
Out = ctx.Input<framework::Tensor>("Out");
// set output ddout
auto ddout_var = ctx.OutputVar("DDOut");
if (ddout_var) {
ddOut = ctx.Output<framework::Tensor>("DDOut");
}
// extract dOut(input)
auto dout_var = ctx.InputVar("DOut");
PADDLE_ENFORCE_NOT_NULL(
dout_var, platform::errors::NotFound(
"Cannot get input Variable dout_var, variable name = %s",
ctx.InputName("DOut")));
dOut = ctx.Input<framework::Tensor>("DOut");
// set output dout_new
auto dout_new_var = ctx.OutputVar("DOutNew");
if (dout_new_var) {
dOutNew = ctx.Output<framework::Tensor>("DOutNew");
}
if (dOutNew) dOutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
if (ddOut) ddOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
auto& place = ctx.template device_context<DeviceContext>();
Functor functor;
functor(place, Out, ddX, dOut, dOutNew, ddOut);
}
};
template <typename DeviceContext, typename Functor>
class TanhTripeGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
using T = typename Functor::ELEMENT_TYPE;
void Compute(const framework::ExecutionContext& ctx) const override {
const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew;
framework::Tensor *d_OutNew, *d_dOut, *d_ddx;
Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr;
d_OutNew = d_dOut = d_ddx = nullptr;
// extract ddx(input), out(input), dOut(input), d_ddOut(input),
// d_dOutNew(input)
ddX = ctx.Input<framework::Tensor>("DDX");
Out = ctx.Input<framework::Tensor>("Out");
dOut = ctx.Input<framework::Tensor>("DOut");
d_ddOut = ctx.Input<framework::Tensor>("D_DDOut");
d_dOutNew = ctx.Input<framework::Tensor>("D_DOut_New");
PADDLE_ENFORCE_NOT_NULL(
ddX, platform::errors::NotFound(
"Cannot get input Variable ddX, variable name = %s",
ctx.InputName("DDX")));
PADDLE_ENFORCE_NOT_NULL(
Out, platform::errors::NotFound(
"Cannot get input Variable Out, variable name = %s",
ctx.InputName("Out")));
PADDLE_ENFORCE_NOT_NULL(
dOut, platform::errors::NotFound(
"Cannot get input Variable dOut, variable name = %s",
ctx.InputName("DOut")));
PADDLE_ENFORCE_NOT_NULL(
d_ddOut, platform::errors::NotFound(
"Cannot get input Variable d_ddOut, variable name = %s",
ctx.InputName("D_DDOut")));
PADDLE_ENFORCE_NOT_NULL(
d_dOutNew,
platform::errors::NotFound(
"Cannot get input Variable d_dOutNew, variable name = %s",
ctx.InputName("D_DOutNew")));
// set output d_OutNew、d_dOut、d_ddx
d_dOut = ctx.Output<framework::Tensor>("D_DOut");
d_OutNew = ctx.Output<framework::Tensor>("D_OutNew");
d_ddx = ctx.Output<framework::Tensor>("D_DDx");
if (d_dOut) d_dOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
if (d_OutNew) d_OutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
if (d_ddx) d_ddx->mutable_data<T>(ddX->dims(), ctx.GetPlace());
auto& place = ctx.template device_context<DeviceContext>();
Functor functor;
functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew, // input
d_dOut, d_OutNew, d_ddx); // output
}
};
template <typename DeviceContext, typename Functor>
class SquareDoubleGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
......@@ -1623,7 +1327,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
} // namespace paddle
#define FOR_EACH_ACTIVATION_OP(__macro) \
__macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \
__macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \
__macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \
__macro(round, Round, RoundFunctor, ZeroGradFunctor); \
......@@ -1632,7 +1335,5 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
__macro(log10, Log10, Log10Functor, Log10GradFunctor); \
__macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \
__macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor); \
__macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, \
HardSigmoidGradFunctor); \
__macro(swish, Swish, SwishFunctor, SwishGradFunctor); \
__macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor);
......@@ -20,69 +20,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename T>
struct CudaSigmoidFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType one = static_cast<MPType>(1.0f);
// sigmoid(x) = 1 / (1 + exp(-x))
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(one / (one + exp(-x)));
}
};
template <typename T>
struct CudaSigmoidGradFunctor : public BaseActivationFunctor<T> {
T one = static_cast<T>(1.0f);
// dx = dout * out * (1 - out)
__device__ __forceinline__ T operator()(const T dout, const T out) const {
return dout * out * (one - out);
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct CudaLogSigmoidFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType zero = static_cast<MPType>(0.0f);
// logsigmoid(x) = log(1 / (1 + exp(-x)))
// For numerical stability,
// logsigmoid(x) =
// - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
MPType temp = x > zero ? zero : -x;
return static_cast<T>(-temp - log(exp(-temp) + exp(-x - temp)));
}
};
template <typename T>
struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType zero = static_cast<MPType>(0.0f);
// dx = dout * exp(-x) / (1 + exp(-x))
// For numerical stability:
// dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x,
// 0)))
__device__ __forceinline__ T operator()(const T arg_dout,
const T arg_x) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType x = static_cast<MPType>(arg_x);
MPType temp1 = x > zero ? zero : -x;
MPType temp2 = exp(-x - temp1);
return static_cast<T>(dout * (temp2 / (exp(-temp1) + temp2)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaCeilFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
......@@ -304,49 +241,6 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor<T> {
}
};
template <typename T>
struct CudaHardSigmoidFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
T one = static_cast<T>(1.0f);
float slope;
float offset;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"slope", &slope}, {"offset", &offset}};
}
// hard_sigmoid(x) = 0, when x <= -3
// 1, when x >= 3
// x * slope + offset, otherwise
__device__ __forceinline__ T operator()(const T x) const {
T temp = x * static_cast<T>(slope) + static_cast<T>(offset);
T temp_max = temp > zero ? temp : zero;
T temp_min = temp_max < one ? temp_max : one;
return temp_min;
}
};
template <typename T>
struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
T one = static_cast<T>(1.0f);
float slope;
float offset;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"slope", &slope}, {"offset", &offset}};
}
// dx = (out > 0 && out < 1) ? dout * slope : 0
__device__ __forceinline__ T operator()(const T dout, const T out) const {
return (out > zero && out < one) ? dout * static_cast<T>(slope) : zero;
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct CudaSwishFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
......@@ -580,6 +474,9 @@ USE_PHI_FUNCTOR(CudaSoftShrink)
USE_PHI_FUNCTOR(CudaTanhShrink)
USE_PHI_FUNCTOR(CudaSilu)
USE_PHI_FUNCTOR(CudaELU)
USE_PHI_FUNCTOR(CudaSigmoid)
USE_PHI_FUNCTOR(CudaLogSigmoid)
USE_PHI_FUNCTOR(CudaHardSigmoid)
template <typename T>
using CudaELUGradNegativeAlphaFunctor =
......@@ -658,35 +555,6 @@ REGISTER_OP_CUDA_KERNEL(
ops::CELUGradGradFunctor<plat::float16>>);
/* ========================================================================== */
/* =========================== sigmoid register ============================
*/
REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor,
CudaSigmoidGradFunctor);
REGISTER_OP_CUDA_KERNEL(
sigmoid_grad_grad,
ops::SigmoidDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::SigmoidGradGradFunctor<float>>,
ops::SigmoidDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::SigmoidGradGradFunctor<double>>,
ops::SigmoidDoubleGradKernel<plat::CUDADeviceContext,
ops::SigmoidGradGradFunctor<plat::float16>>,
ops::SigmoidDoubleGradKernel<plat::CUDADeviceContext,
ops::SigmoidGradGradFunctor<plat::bfloat16>>);
REGISTER_OP_CUDA_KERNEL(
sigmoid_triple_grad,
ops::SigmoidTripleGradKernel<paddle::platform::CUDADeviceContext,
ops::SigmoidTripleGradFunctor<float>>,
ops::SigmoidTripleGradKernel<paddle::platform::CUDADeviceContext,
ops::SigmoidTripleGradFunctor<double>>,
ops::SigmoidTripleGradKernel<plat::CUDADeviceContext,
ops::SigmoidTripleGradFunctor<plat::float16>>,
ops::SigmoidTripleGradKernel<
plat::CUDADeviceContext,
ops::SigmoidTripleGradFunctor<plat::bfloat16>>);
/* ========================================================================== */
/* =========================== sqrt register ============================= */
REGISTER_OP_CUDA_KERNEL(
......@@ -772,8 +640,6 @@ REGISTER_OP_CUDA_KERNEL(
/* ========================================================================== */
#define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \
__macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, \
CudaLogSigmoidGradFunctor); \
__macro(softshrink, SoftShrink, CudaSoftShrinkFunctor, \
CudaSoftShrinkGradFunctor); \
__macro(ceil, Ceil, CudaCeilFunctor, CudaZeroGradFunctor); \
......@@ -788,8 +654,6 @@ REGISTER_OP_CUDA_KERNEL(
CudaTanhShrinkGradFunctor); \
__macro(hard_shrink, HardShrink, CudaHardShrinkFunctor, \
CudaHardShrinkGradFunctor); \
__macro(hard_sigmoid, HardSigmoid, CudaHardSigmoidFunctor, \
CudaHardSigmoidGradFunctor); \
__macro(swish, Swish, CudaSwishFunctor, CudaSwishGradFunctor); \
__macro(hard_swish, HardSwish, CudaHardSwishFunctor, \
CudaHardSwishGradFunctor);
......
......@@ -16,6 +16,9 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace framework {
class OpDesc;
......@@ -36,26 +39,6 @@ class AssignOp : public framework::OperatorWithKernel {
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
if (ctx->HasInput("X")) {
auto type = ctx->GetInputsVarType("X")[0];
if (type == framework::proto::VarType::SELECTED_ROWS ||
type == framework::proto::VarType::LOD_TENSOR) {
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
if (type == framework::proto::VarType::LOD_TENSOR) {
ctx->ShareLoD("X", /*->*/ "Out");
}
} else if (type == framework::proto::VarType::LOD_TENSOR_ARRAY) {
if (ctx->IsRuntime()) {
// The runtime output shape is determined in kernel.
return;
} else {
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
}
}
}
}
protected:
framework::OpKernelType GetKernelTypeForVar(
const std::string &var_name, const framework::Tensor &tensor,
......@@ -91,24 +74,6 @@ class AssignInferVarType : public framework::VarTypeInference {
}
};
class AssignKernel {
public:
void operator()(const framework::ExecutionContext &ctx) const {
auto *x = ctx.InputVar("X");
if (x == nullptr) {
return;
}
PADDLE_ENFORCE_EQ(
ctx.HasOutput("Out"), true,
platform::errors::NotFound("Output(Out) of assign_op is not found."));
auto *out = ctx.OutputVar("Out");
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(ctx.GetPlace());
framework::VisitVarType(*x, AssignFunctor(out, dev_ctx));
}
};
class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
......@@ -147,23 +112,11 @@ DECLARE_INPLACE_OP_INFERER(AssignOpInplaceInferer, {"X", "Out"});
namespace ops = paddle::operators;
namespace plat = paddle::platform;
DECLARE_INFER_SHAPE_FUNCTOR(assign, AssignInferShapeFunctor,
PD_INFER_META(phi::UnchangedInferMeta));
REGISTER_OPERATOR(assign, ops::AssignOp,
ops::AssignGradMaker<paddle::framework::OpDesc>,
ops::AssignGradMaker<paddle::imperative::OpBase>,
ops::AssignOpProtoMaker, ops::AssignOpInplaceInferer,
ops::AssignInferVarType);
REGISTER_OP_CPU_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double,
ops::AssignKernel, int, ops::AssignKernel,
int64_t, ops::AssignKernel, uint8_t,
ops::AssignKernel, bool, ops::AssignKernel,
plat::float16, ops::AssignKernel, plat::bfloat16,
ops::AssignKernel);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
REGISTER_OP_CUDA_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double,
ops::AssignKernel, int, ops::AssignKernel,
int64_t, ops::AssignKernel, uint8_t,
ops::AssignKernel, bool, ops::AssignKernel,
plat::float16, ops::AssignKernel);
#endif
ops::AssignInferVarType, AssignInferShapeFunctor);
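DECLARE_INFER_SHAPE_FUNCTOR above routes assign's shape inference to phi::UnchangedInferMeta instead of a hand-written InferShape override. A standalone toy analogue of what that shared meta function does (MetaTensor here is a stand-in, not the real phi type):

#include <cstdint>
#include <cstdio>
#include <vector>

struct MetaTensor {  // toy stand-in for phi::MetaTensor
  std::vector<int64_t> dims;
};

// Analogue of phi::UnchangedInferMeta: the output simply mirrors the input.
void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) {
  out->dims = x.dims;
}

int main() {
  MetaTensor x{{4, 16}};
  MetaTensor out;
  UnchangedInferMeta(x, &out);  // what assign's InferShape now delegates to
  std::printf("out dims: %lld x %lld\n", static_cast<long long>(out.dims[0]),
              static_cast<long long>(out.dims[1]));
  return 0;
}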
......@@ -29,7 +29,7 @@ limitations under the License. */
namespace f = paddle::framework;
namespace p = paddle::platform;
USE_OP(assign);
USE_OP_ITSELF(assign);
USE_OP_DEVICE_KERNEL(assign, NPU);
template <typename T>
......
......@@ -22,6 +22,7 @@
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/kernels/complex_kernel.h"
#include "paddle/phi/kernels/elementwise_kernel.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
#include "paddle/phi/kernels/funcs/diag_functor.h"
......@@ -30,7 +31,6 @@
#include "paddle/phi/kernels/funcs/unsqueeze.h"
#include "paddle/phi/kernels/impl/determinant_grad_kernel_impl.h"
#include "paddle/phi/kernels/impl/determinant_kernel_impl.h"
#include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/kernels/matmul_kernel.h"
#include "paddle/phi/kernels/transpose_kernel.h"
......
......@@ -21,13 +21,13 @@
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/kernels/complex_kernel.h"
#include "paddle/phi/kernels/elementwise_kernel.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
#include "paddle/phi/kernels/funcs/diag_functor.h"
#include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/slice.h"
#include "paddle/phi/kernels/funcs/unsqueeze.h"
#include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/kernels/matmul_kernel.h"
#include "paddle/phi/kernels/transpose_kernel.h"
......
......@@ -27,7 +27,7 @@ limitations under the License. */
// only can include the headers in paddle/phi/include dirs
#include "paddle/phi/kernels/elementwise_grad_kernel.h"
#include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/kernels/elementwise_kernel.h"
#endif
namespace paddle {
......
......@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/kernels/elementwise_kernel.h"
namespace paddle {
namespace operators {
......
......@@ -12,7 +12,9 @@ limitations under the License. */
#include "paddle/fluid/operators/expand_as_v2_op.h"
#include <memory>
#include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle {
namespace operators {
......@@ -22,27 +24,6 @@ using framework::Tensor;
class ExpandAsV2Op : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandAsV2");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ExpandAsV2");
auto x_dims = ctx->GetInputDim("X");
auto target_shape = ctx->Attrs().Get<std::vector<int>>("target_shape");
PADDLE_ENFORCE_GE(
target_shape.size(), static_cast<size_t>(x_dims.size()),
platform::errors::InvalidArgument(
"The rank of target_shape must be greater than or equal "
"to the rank of Input(X). But received Input(X): input "
"rank %u; received target_shape: rank %u.",
x_dims.size(), target_shape.size()));
PADDLE_ENFORCE_LE(target_shape.size(), MAX_RANK_SUPPORTED,
platform::errors::InvalidArgument(
"The rank of target_shape must be less than or equal "
"to %d. But received: rank %u.",
MAX_RANK_SUPPORTED, target_shape.size()));
ctx->SetOutputDim("Out", phi::make_ddim(target_shape));
}
};
class ExpandAsV2OpMaker : public framework::OpProtoAndCheckerMaker {
......@@ -116,9 +97,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandAsV2GradNoNeedBufVarsInferer, "X");
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(expand_as_v2, ExpandAsInferShapeFunctor,
PD_INFER_META(phi::ExpandAsInferMeta));
REGISTER_OPERATOR(expand_as_v2, ops::ExpandAsV2Op, ops::ExpandAsV2OpMaker,
ops::ExpandAsV2GradOpMaker<paddle::framework::OpDesc>,
ops::ExpandAsV2GradOpMaker<paddle::imperative::OpBase>);
ops::ExpandAsV2GradOpMaker<paddle::imperative::OpBase>,
ExpandAsInferShapeFunctor);
REGISTER_OPERATOR(expand_as_v2_grad, ops::ExpandAsV2GradOp,
ops::ExpandAsV2GradNoNeedBufVarsInferer);
......
......@@ -25,14 +25,16 @@ limitations under the License. */
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/layer_norm_kernel.cu.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/layer_norm_kernel.h"
namespace framework = paddle::framework;
namespace platform = paddle::platform;
namespace memory = paddle::memory;
USE_OP_ITSELF(dropout);
USE_OP(layer_norm);
USE_OP_ITSELF(layer_norm);
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;
......@@ -136,18 +138,23 @@ void LayerNorm(const std::vector<LayerNormParamType<T>> &scale,
const platform::CUDADeviceContext &ctx) {
framework::Scope scope;
auto place = ctx.GetPlace();
paddle::optional<const framework::LoDTensor &> scale_opt = paddle::none;
if (scale.size() > 0) {
auto var_scale = scope.Var("Scale");
auto tensor_scale = var_scale->GetMutable<framework::LoDTensor>();
framework::TensorFromVector(scale, ctx, tensor_scale);
tensor_scale->Resize({cols});
scale_opt = *tensor_scale;
}
paddle::optional<const framework::LoDTensor &> bias_opt = paddle::none;
if (bias.size() > 0) {
auto var_bias = scope.Var("Bias");
auto tensor_bias = var_bias->GetMutable<framework::LoDTensor>();
framework::TensorFromVector(bias, ctx, tensor_bias);
tensor_bias->Resize({cols});
bias_opt = *tensor_bias;
}
auto var_x = scope.Var("X");
......@@ -157,20 +164,19 @@ void LayerNorm(const std::vector<LayerNormParamType<T>> &scale,
auto var_y = scope.Var("Y");
auto tensor_y = var_y->GetMutable<framework::LoDTensor>();
tensor_y->Resize({rows, cols});
auto var_mean = scope.Var("Mean");
auto tensor_mean = var_mean->GetMutable<framework::LoDTensor>();
tensor_mean->Resize({rows});
auto var_variance = scope.Var("Variance");
auto tensor_variance = var_variance->GetMutable<framework::LoDTensor>();
framework::AttributeMap attrs;
attrs.insert({"epsilon", epsilon});
auto op = framework::OpRegistry::CreateOp(
"layer_norm", {{"X", {"X"}}, {"Scale", {"Scale"}}, {"Bias", {"Bias"}}},
{{"Y", {"Y"}}, {"Mean", {"Mean"}}, {"Variance", {"Variance"}}}, attrs);
op->Run(scope, place);
tensor_variance->Resize({rows});
ctx.Wait();
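// Call the phi layer_norm kernel directly instead of building and running a
// layer_norm operator through the framework; the trailing arguments are
// epsilon = 1e-5, begin_norm_axis = 1 and a test-mode flag set to false.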
phi::LayerNormKernel<T>(static_cast<const phi::GPUContext &>(ctx), *tensor_x,
scale_opt, bias_opt, 1e-5, 1, false, tensor_y,
tensor_mean, tensor_variance);
framework::TensorToVector(*tensor_y, ctx, y);
framework::TensorToVector(*tensor_mean, ctx, means);
framework::TensorToVector(*tensor_variance, ctx, vars);
......
......@@ -198,7 +198,6 @@ struct TestFusedLayernormResidualDropoutBias {
residual_vec[i * cols + j] + out2[i * cols + j];
}
}
LayerNorm<T>(scale_vec, layernorm_bias_vec, correct_out, &correct_means,
&correct_vars, &correct_layernorm_out, epsilon, rows, cols,
*ctx);
......
......@@ -17,7 +17,9 @@ limitations under the License. */
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle {
namespace operators {
......@@ -26,27 +28,6 @@ class KronOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "kron");
OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "kron");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "kron");
auto dim_x = ctx->GetInputDim("X");
auto dim_y = ctx->GetInputDim("Y");
auto rank_x = dim_x.size();
auto rank_y = dim_y.size();
auto rank = (rank_x > rank_y) ? rank_x : rank_y;
std::vector<int64_t> dim_out;
dim_out.reserve(rank);
for (int i = 0; i < rank; i++) {
int64_t dim_xi = (i < rank - rank_x) ? 1 : dim_x.at(i - (rank - rank_x));
int64_t dim_yi = (i < rank - rank_y) ? 1 : dim_y.at(i - (rank - rank_y));
dim_out.push_back(dim_xi == -1 || dim_yi == -1 ? -1 : dim_xi * dim_yi);
}
ctx->SetOutputDim("Out", phi::make_ddim(dim_out));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
......@@ -173,7 +154,10 @@ class KronGradOpMaker : public framework::SingleGradOpMaker<T> {
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(kron, KronInferShapeFunctor,
PD_INFER_META(phi::KronInferMeta));
REGISTER_OPERATOR(kron, ops::KronOp, ops::KronOpMaker,
ops::KronGradOpMaker<paddle::framework::OpDesc>,
ops::KronGradOpMaker<paddle::imperative::OpBase>);
ops::KronGradOpMaker<paddle::imperative::OpBase>,
KronInferShapeFunctor);
REGISTER_OPERATOR(kron_grad, ops::KronGradOp);
......@@ -758,12 +758,14 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel(
*/
template <typename T, typename U, typename ScaleT = U,
typename MaskType = uint8_t>
void ln_bwd_1024_kernel_driver(
const platform::CUDADeviceContext &dev_ctx, const int rows, const int cols,
float epsilon, const T *x_ptr, const ScaleT *scale_ptr, const U *mean_ptr,
const U *var_ptr, const T *dout_ptr, T *dx_ptr, ScaleT *dscale_ptr,
ScaleT *dbias_ptr, const MaskType *mask_ptr = nullptr,
T factor = static_cast<T>(0), T *d_dropout_src_ptr = nullptr) {
void ln_bwd_1024_kernel_driver(const phi::GPUContext &dev_ctx, const int rows,
const int cols, float epsilon, const T *x_ptr,
const ScaleT *scale_ptr, const U *mean_ptr,
const U *var_ptr, const T *dout_ptr, T *dx_ptr,
ScaleT *dscale_ptr, ScaleT *dbias_ptr,
const MaskType *mask_ptr = nullptr,
T factor = static_cast<T>(0),
T *d_dropout_src_ptr = nullptr) {
auto stream = dev_ctx.stream();
if (cols == 1024) {
// step-1: compute dx and reduced part results of dscale and dbias.
......@@ -1334,8 +1336,7 @@ static void LayerNormBackward(
const U *mean, const U *var, T *d_x,
LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX> *d_scale,
LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX> *d_bias, float epsilon,
int64_t batch_size, int64_t feature_size,
const platform::CUDADeviceContext &dev_ctx) {
int64_t batch_size, int64_t feature_size, const phi::GPUContext &dev_ctx) {
auto stream = dev_ctx.stream();
#ifdef __HIPCC__
const int kMaxBlockDim = 256;
......
......@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/layer_norm_op.h"
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
......@@ -278,10 +277,3 @@ REGISTER_OPERATOR(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
ops::LayerNormGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(layer_norm_grad, ops::LayerNormGradOp,
ops::LayerNormGradNoNeedBufferVarInferer);
REGISTER_OP_CPU_KERNEL(
layer_norm, ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>,
ops::LayerNormKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
layer_norm_grad,
ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/operators/layer_norm_kernel.cu.h"
#include "paddle/fluid/operators/layer_norm_op.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
template <typename T>
void LayerNormDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
const T *input,
std::vector<int> input_shape,
const T *bias, const T *scale,
T *output, T *mean, T *variance,
int begin_norm_axis, float eps) {
const auto x_dims = phi::make_ddim(input_shape);
auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
int64_t batch_size = static_cast<int64_t>(matrix_dim[0]);
int64_t feature_size = static_cast<int64_t>(matrix_dim[1]);
switch (GetDesiredBlockDim(feature_size)) {
FIXED_BLOCK_DIM_CASE(
LayerNormForward<T, T, kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
input, scale, bias, output, mean, variance, eps, feature_size));
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Product from begin_norm_axis to end in layer_norm must be larger "
"than 1"));
break;
}
}
template <typename T>
class LayerNormKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
using U = LayerNormParamType<T>;
const float epsilon = ctx.Attr<float>("epsilon");
auto *scale = ctx.Input<Tensor>("Scale");
auto *bias = ctx.Input<Tensor>("Bias");
auto *x = ctx.Input<Tensor>("X");
auto *y = ctx.Output<Tensor>("Y");
auto *mean = ctx.Output<Tensor>("Mean");
auto *var = ctx.Output<Tensor>("Variance");
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
const auto x_dims = x->dims();
auto *x_data = x->data<T>();
auto *y_data = y->mutable_data<T>(ctx.GetPlace());
auto *mean_data = mean->mutable_data<U>(ctx.GetPlace());
auto *var_data = var->mutable_data<U>(ctx.GetPlace());
auto *void_scale_data = (scale == nullptr ? nullptr : scale->data());
auto *void_bias_data = (bias == nullptr ? nullptr : bias->data());
framework::proto::VarType::Type x_dtype =
framework::TransToProtoVarType(x->dtype());
framework::proto::VarType::Type scale_bias_dtype;
if (void_scale_data != nullptr) {
scale_bias_dtype = framework::TransToProtoVarType(scale->dtype());
if (void_bias_data != nullptr) {
PADDLE_ENFORCE_EQ(scale_bias_dtype,
framework::TransToProtoVarType(bias->dtype()),
platform::errors::InvalidArgument(
"Thie Scale and Bias of layer_norm op "
"should have the same data type."));
}
} else {
scale_bias_dtype = (void_bias_data != nullptr
? framework::TransToProtoVarType(bias->dtype())
: x_dtype);
}
bool is_scale_bias_same_dtype_with_x = x_dtype == scale_bias_dtype;
if (!is_scale_bias_same_dtype_with_x) {
PADDLE_ENFORCE_EQ(scale_bias_dtype,
framework::DataTypeTrait<U>::DataType(),
platform::errors::InvalidArgument(
"Unsupported data type of Scale and Bias: %s",
framework::DataTypeToString(scale_bias_dtype)));
}
auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
int64_t batch_size = static_cast<int64_t>(matrix_dim[0]);
int64_t feature_size = static_cast<int64_t>(matrix_dim[1]);
auto stream = ctx.cuda_device_context().stream();
#define PADDLE_LAUNCH_LAYERNORM_FWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \
do { \
switch (GetDesiredBlockDim(feature_size)) { \
FIXED_BLOCK_DIM_CASE( \
LayerNormForward<T, U, kBlockDim, IsScaleBiasSameDTypeWithX><<< \
batch_size, kBlockDim, 0, stream>>>( \
x_data, static_cast<const ScaleBiasT *>(void_scale_data), \
static_cast<const ScaleBiasT *>(void_bias_data), y_data, \
mean_data, var_data, epsilon, feature_size)); \
default: \
PADDLE_THROW(platform::errors::InvalidArgument( \
"Product from begin_norm_axis to end must be larger than 1")); \
break; \
} \
} while (0)
#ifdef PADDLE_WITH_CUDA
bool can_call_1024_kernel = false;
if (feature_size == 1024 && scale != nullptr && bias != nullptr) {
can_call_1024_kernel = true;
}
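// Fast path: when feature_size == 1024 and both Scale and Bias are given, a
// specialized kernel is launched in which each CTA runs WARPS_M warps of 32
// threads, covers ROWS_PER_CTA rows, and uses 16-byte vectorized loads, so the
// grid size is ceil(batch_size / ROWS_PER_CTA).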
if (can_call_1024_kernel) {
const int WARPS_M = 4;
const int WARPS_N = 1;
const int THREADS_PER_WARP = 32;
const int BYTES_PER_LDG = 16;
const int VecSize = BYTES_PER_LDG / sizeof(T);
const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M;
const int ROWS_PER_CTA = WARPS_M;
const int grid = static_cast<int>(
std::ceil(batch_size / static_cast<float>(ROWS_PER_CTA)));
if (is_scale_bias_same_dtype_with_x) {
ln_fwd_1024_kernel<T, U, T, VecSize, WARPS_M, WARPS_N,
BYTES_PER_LDG><<<grid, THREADS_PER_CTA, 0, stream>>>(
batch_size, feature_size, epsilon, x_data,
static_cast<const T *>(void_scale_data),
static_cast<const T *>(void_bias_data), mean_data, var_data,
y_data);
} else {
ln_fwd_1024_kernel<T, U, U, VecSize, WARPS_M, WARPS_N,
BYTES_PER_LDG><<<grid, THREADS_PER_CTA, 0, stream>>>(
batch_size, feature_size, epsilon, x_data,
static_cast<const U *>(void_scale_data),
static_cast<const U *>(void_bias_data), mean_data, var_data,
y_data);
}
} else {
#endif
if (is_scale_bias_same_dtype_with_x) {
PADDLE_LAUNCH_LAYERNORM_FWD(T, true);
} else {
PADDLE_LAUNCH_LAYERNORM_FWD(U, false);
}
#ifdef PADDLE_WITH_CUDA
}
#endif
#undef PADDLE_LAUNCH_LAYERNORM_FWD
}
};
template <typename T>
class LayerNormGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
using U = LayerNormParamType<T>;
const float epsilon = ctx.Attr<float>("epsilon");
// d_x, d_scale, d_bias may be nullptr
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
auto *x = ctx.Input<Tensor>("X");
auto *mean = ctx.Input<Tensor>("Mean");
auto *var = ctx.Input<Tensor>("Variance");
auto *scale = ctx.Input<Tensor>("Scale");
auto *bias = ctx.Input<Tensor>("Bias");
auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto &x_dims = x->dims();
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
int64_t batch_size = static_cast<int64_t>(matrix_dim[0]);
int64_t feature_size = static_cast<int64_t>(matrix_dim[1]);
auto *x_data = x->data<T>();
auto *d_y_data = d_y->data<T>();
auto *mean_data = mean->data<U>();
auto *var_data = var->data<U>();
auto *d_x_data =
(d_x == nullptr ? nullptr : d_x->mutable_data<T>(ctx.GetPlace()));
framework::proto::VarType::Type x_dtype =
framework::TransToProtoVarType(x->dtype());
framework::proto::VarType::Type scale_bias_dtype;
if (scale != nullptr) {
scale_bias_dtype = framework::TransToProtoVarType(scale->dtype());
} else {
// FIXME(zengjinle): do not find a better way to get the right
// data type of the d_scale and d_bias if scale == nullptr.
auto *bias = ctx.Input<Tensor>("Bias");
if (bias != nullptr) {
scale_bias_dtype = framework::TransToProtoVarType(bias->dtype());
} else {
scale_bias_dtype = x_dtype;
}
}
#define PADDLE_LAUNCH_LAYERNORM_BWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \
do { \
auto *scale_data = \
(scale == nullptr ? nullptr : scale->data<ScaleBiasT>()); \
auto *d_scale_data = \
(d_scale == nullptr ? nullptr : d_scale->mutable_data<ScaleBiasT>( \
ctx.GetPlace())); \
auto *d_bias_data = \
(d_bias == nullptr ? nullptr : d_bias->mutable_data<ScaleBiasT>( \
ctx.GetPlace())); \
auto *d_x_data = \
(d_x == nullptr ? nullptr : d_x->mutable_data<T>(ctx.GetPlace())); \
LayerNormBackward<T, U, IsScaleBiasSameDTypeWithX>( \
x_data, d_y_data, scale_data, mean_data, var_data, d_x_data, \
d_scale_data, d_bias_data, epsilon, batch_size, feature_size, \
ctx.cuda_device_context()); \
} while (0)
if (scale_bias_dtype == x_dtype) {
PADDLE_LAUNCH_LAYERNORM_BWD(T, true);
} else {
PADDLE_LAUNCH_LAYERNORM_BWD(U, false);
}
#undef PADDLE_LAUNCH_LAYERNORM_BWD
}
};
template class LayerNormDirectCUDAFunctor<float>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
#ifdef PADDLE_WITH_HIP
// MIOPEN do not support double
REGISTER_OP_CUDA_KERNEL(
layer_norm,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, float>,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
layer_norm_grad,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>);
#elif CUDNN_VERSION_MIN(8, 1, 0)
REGISTER_OP_CUDA_KERNEL(
layer_norm,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, float>,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, double>,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, plat::bfloat16>);
REGISTER_OP_CUDA_KERNEL(
layer_norm_grad,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext,
plat::bfloat16>);
#else
REGISTER_OP_CUDA_KERNEL(
layer_norm,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, float>,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, double>,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
layer_norm_grad,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>);
#endif
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \
!defined(__OSX__)
#include "paddle/fluid/operators/jit/kernels.h"
#endif
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace platform {
class CPUDeviceContext;
class CUDADeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
// Wrap RowwiseMean and ColwiseMean.
// Reuse the cpu codes and replace the gpu codes with cublas_gemv, which is
// significantly faster. Unlike the RowwiseMean and ColwiseMean, the
// implementation only considers 2D.
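// With divisor_ filled with 1 / right, GEMV(false, left, right, 1., input, divisor_,
// 0., out) yields out_i = sum_j input_ij / right, i.e. the row-wise mean; ColwiseSum2D
// uses a vector of ones and a transposed GEMV in the same way to obtain column sums.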
template <typename DeviceContext, typename T>
struct RowwiseMean2D {
RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx);
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor* vec);
};
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename T>
class RowwiseMean2D<platform::CUDADeviceContext, T> {
public:
RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx)
: left_(left), right_(right) {
framework::DDim ones_dim({right_});
divisor_.mutable_data<T>(ones_dim, dev_ctx.GetPlace());
phi::funcs::set_constant(dev_ctx, &divisor_, 1.0 / right);
}
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
phi::funcs::GetBlas<platform::CUDADeviceContext, T>(context).GEMV(
false, left_, right_, 1., input.data<T>(), divisor_.data<T>(), 0.,
out->data<T>());
}
private:
int left_;
int right_;
framework::Tensor divisor_;
};
#endif
template <typename T>
class RowwiseMean2D<platform::CPUDeviceContext, T> {
public:
RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx) {}
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
row_mean_(context, input, out);
}
private:
phi::funcs::RowwiseMean<platform::CPUDeviceContext, T> row_mean_;
};
template <typename DeviceContext, typename T>
struct ColwiseSum2D {
ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx);
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor* vec);
};
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename T>
class ColwiseSum2D<platform::CUDADeviceContext, T> {
public:
ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx)
: left_(left), right_(right) {
framework::DDim ones_dim({left_});
divisor_.mutable_data<T>(ones_dim, dev_ctx.GetPlace());
phi::funcs::set_constant(dev_ctx, &divisor_, 1.0);
}
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
phi::funcs::GetBlas<platform::CUDADeviceContext, T>(context).GEMV(
true, left_, right_, 1., input.data<T>(), divisor_.data<T>(), 0.,
out->data<T>());
}
private:
int left_;
int right_;
framework::Tensor divisor_;
};
#endif
template <typename T>
class ColwiseSum2D<platform::CPUDeviceContext, T> {
public:
ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx) {}
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
col_wise_(context, input, out);
}
private:
phi::funcs::ColwiseSum<platform::CPUDeviceContext, T> col_wise_;
};
template <typename T>
struct SubAndSquareFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); }
};
template <typename T>
struct DivAndSqrtFunctor {
explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; }
inline HOSTDEVICE T operator()(T a, T b) const {
return a / (sqrt(b + epsilon_));
}
private:
T epsilon_;
};
template <typename T>
struct MulInvVarFunctor {
inline HOSTDEVICE T operator()(T a, T b) const {
return a * std::sqrt(1.0 / b);
}
};
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename T>
class LayerNormDirectCUDAFunctor {
public:
void operator()(gpuStream_t stream, const T* input,
std::vector<int> input_shape, const T* bias, const T* scale,
T* output, T* mean, T* variance, int begin_norm_axis,
float eps);
};
#endif
template <typename DeviceContext, typename T>
class LayerNormKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon");
auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto x = *ctx.Input<Tensor>("X");
auto* y = ctx.Output<Tensor>("Y");
auto* mean = ctx.Output<Tensor>("Mean");
auto* var = ctx.Output<Tensor>("Variance");
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
const auto x_dims = x.dims();
y->mutable_data<T>(ctx.GetPlace());
mean->mutable_data<T>(ctx.GetPlace());
var->mutable_data<T>(ctx.GetPlace());
auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]);
framework::DDim matrix_shape({left, right});
x.Resize(matrix_shape);
Tensor out;
out.ShareDataWith(*y);
out.Resize(matrix_shape);
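// Flatten X to a [left, right] matrix; the kernel computes the per-row mean and
// variance, normalizes as (x - mean) / sqrt(var + epsilon), and applies the optional
// Scale and Bias along the feature axis, either via elementwise ops or the fused JIT
// layer_norm kernel below.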
#if defined(PADDLE_WITH_CUDA) || defined(_WIN32) || defined(__APPLE__) || \
defined(__OSX__)
auto& dev_ctx = ctx.template device_context<DeviceContext>();
RowwiseMean2D<DeviceContext, T> row_mean(left, right, ctx.device_context());
// get mean
row_mean(dev_ctx, x, mean);
// get variance
ElementwiseComputeEx<SubAndSquareFunctor<T>, DeviceContext, T>(
ctx, &x, mean, /*axis*/ 0, SubAndSquareFunctor<T>(), &out);
row_mean(dev_ctx, out, var);
// get x_norm
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
ctx, &x, mean, /*axis*/ 0, SubFunctor<T>(), &out);
ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
ctx, &out, var, /*axis*/ 0,
DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), &out);
if (scale) {
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
ctx, &out, scale, /*axis*/ 1, MulFunctor<T>(), &out);
}
if (bias) {
ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(
ctx, &out, bias, /*axis*/ 1, AddFunctor<T>(), &out);
}
#else
PADDLE_ENFORCE_EQ(mean->numel(), left,
platform::errors::InvalidArgument(
"mean's length (%d) is not equal with expected (%d).",
mean->numel(), left));
PADDLE_ENFORCE_EQ(var->numel(), left,
platform::errors::InvalidArgument(
"var's length (%d) is not equal with expected (%d).",
var->numel(), left));
if (scale) {
PADDLE_ENFORCE_EQ(
scale->numel(), right,
platform::errors::InvalidArgument(
"scale's length (%d) is not equal with expected (%d).",
scale->numel(), right));
}
if (bias) {
PADDLE_ENFORCE_EQ(
bias->numel(), right,
platform::errors::InvalidArgument(
"bias's length (%d) is not equal with expected (%d).",
bias->numel(), right));
}
auto ker =
jit::KernelFuncs<jit::LayerNormTuple<T>, platform::CPUPlace>::Cache()
.At(right);
ker(x.data<T>(), out.data<T>(), mean->data<T>(), var->data<T>(),
scale ? scale->data<T>() : nullptr, bias ? bias->data<T>() : nullptr,
static_cast<int>(left), static_cast<const float>(epsilon), right);
#endif
}
};
template <typename DeviceContext, typename T>
class LayerNormGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon");
auto x = *ctx.Input<Tensor>("X");
auto* mean = ctx.Input<Tensor>("Mean");
auto* var = ctx.Input<Tensor>("Variance");
auto* scale = ctx.Input<Tensor>("Scale");
auto d_y = *ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
// init output
auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
const auto& x_dims = x.dims();
auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]);
framework::DDim matrix_shape({left, right});
d_y.Resize(matrix_shape);
auto& dev_ctx = ctx.template device_context<DeviceContext>();
ColwiseSum2D<DeviceContext, T> colwise_sum(left, right,
ctx.device_context());
Tensor temp;
Tensor temp_norm;
if (d_scale || d_x) {
x.Resize(matrix_shape);
temp.mutable_data<T>(matrix_shape, ctx.GetPlace());
temp_norm.mutable_data<T>(matrix_shape, ctx.GetPlace());
// get x_norm
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
ctx, &x, mean, /*axis*/ 0, SubFunctor<T>(), &temp_norm);
ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
ctx, &temp_norm, var, /*axis*/ 0,
DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), &temp_norm);
}
if (d_bias) {
d_bias->mutable_data<T>(ctx.GetPlace());
colwise_sum(dev_ctx, d_y, d_bias);
}
if (d_scale) {
d_scale->mutable_data<T>(ctx.GetPlace());
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
ctx, &temp_norm, &d_y, /*axis*/ 0, MulFunctor<T>(), &temp);
colwise_sum(dev_ctx, temp, d_scale);
}
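// The d_x branch below assembles
//   d_x = (g - rowmean(g) - x_norm * rowmean(g * x_norm)) / sqrt(var + epsilon),
// where g is d_y * scale when d_scale is computed and plain d_y otherwise.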
if (d_x) {
framework::DDim vec_shape({left});
d_x->mutable_data<T>(ctx.GetPlace());
auto dx_dim = d_x->dims();
Tensor temp_vec;
temp_vec.mutable_data<T>(vec_shape, ctx.GetPlace());
RowwiseMean2D<DeviceContext, T> row_mean(left, right,
ctx.device_context());
if (d_scale) {
// dy_dx
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
ctx, &d_y, scale, /*axis*/ 1, MulFunctor<T>(), &temp);
framework::TensorCopy(temp, ctx.GetPlace(), ctx.device_context(), d_x);
// dy_dmean_dx
row_mean(dev_ctx, temp, &temp_vec);
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor<T>(), d_x);
// dy_var_dx
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
ctx, &temp, &temp_norm, /*axis*/ 0, MulFunctor<T>(), &temp);
} else {
// dy_dx
framework::TensorCopy(d_y, ctx.GetPlace(), ctx.device_context(), d_x);
// dy_dmean_dx
row_mean(dev_ctx, d_y, &temp_vec);
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor<T>(), d_x);
// dy_var_dx
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
ctx, &d_y, &temp_norm, /*axis*/ 0, MulFunctor<T>(), &temp);
}
// dy_var_dx
row_mean(dev_ctx, temp, &temp_vec);
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
ctx, &temp_norm, &temp_vec, /*axis*/ 0, MulFunctor<T>(), &temp);
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
ctx, d_x, &temp, /*axis*/ 0, SubFunctor<T>(), d_x);
ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
ctx, d_x, var, /*axis*/ 0,
DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), d_x);
d_x->Resize(dx_dim);
}
}
};
} // namespace operators
} // namespace paddle
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/layer_norm_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/layer_norm_op.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
......
......@@ -18,9 +18,9 @@ limitations under the License. */
#include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/operators/set_value_op.h"
#include "paddle/fluid/operators/svd_helper.h"
#include "paddle/phi/kernels/elementwise_kernel.h"
#include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
#include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/kernels/triangular_solve_kernel.h"
namespace paddle {
......
......@@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/layer_norm_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
#include "paddle/phi/common/data_type.h"
namespace paddle {
namespace operators {
......@@ -139,7 +140,7 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
layer_norm_p->execute(astream, args);
astream.wait();
y->set_layout(DataLayout::kMKLDNN);
y->set_layout(phi::DataLayout::kMKLDNN);
y->set_format(platform::GetMKLDNNFormat(*dst_memory));
}
};
......
......@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/roi_pool_op.h"
#include <memory>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/phi/kernels/roi_pool_kernel.h"
namespace paddle {
namespace operators {
......@@ -57,7 +58,7 @@ class ROIPoolOp : public framework::OperatorWithKernel {
"%d-dimensional LoDTensor",
rois_dims.size()));
PADDLE_ENFORCE_EQ(
rois_dims[1], kROISize,
rois_dims[1], phi::kROISize,
platform::errors::InvalidArgument(
"ROIs should be a 2-D LoDTensor with shape (num_rois, 4)"
"given as [[x1, y1, x2, y2], ...]. But the second dimension of "
......@@ -216,16 +217,7 @@ REGISTER_OPERATOR(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker,
ops::ROIPoolGradMaker<paddle::framework::OpDesc>,
ops::ROIPoolGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp);
REGISTER_OP_CPU_KERNEL(
roi_pool,
ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, int>);
REGISTER_OP_CPU_KERNEL(
roi_pool_grad,
ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, int>);
REGISTER_OP_VERSION(roi_pool)
.AddCheckpoint(
R"ROC(
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/roi_pool_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaxinumNumBlocks = 4096;
static inline int NumBlocks(const int N) {
return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
kNumMaxinumNumBlocks);
}
template <typename T>
__global__ void GPUROIPoolForward(
const int nthreads, const T* input_data, const T* input_rois,
const float spatial_scale, const int channels, const int height,
const int width, const int pooled_height, const int pooled_width,
int* roi_batch_id_data, T* output_data, int64_t* argmax_data) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
for (size_t i = index; i < nthreads; i += offset) {
int pw = i % pooled_width;
int ph = (i / pooled_width) % pooled_height;
int c = (i / pooled_width / pooled_height) % channels;
int n = i / pooled_width / pooled_height / channels;
const T* offset_input_rois = input_rois + n * kROISize;
int roi_batch_ind = roi_batch_id_data[n];
int roi_start_w = round(offset_input_rois[0] * spatial_scale);
int roi_start_h = round(offset_input_rois[1] * spatial_scale);
int roi_end_w = round(offset_input_rois[2] * spatial_scale);
int roi_end_h = round(offset_input_rois[3] * spatial_scale);
int roi_width = max(roi_end_w - roi_start_w + 1, 1);
int roi_height = max(roi_end_h - roi_start_h + 1, 1);
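// Map the output cell (ph, pw) back to an input bin: [hstart, hend) x [wstart, wend)
// covers the (ph, pw)-th fraction of the ROI, offset by the ROI start and clipped to
// the feature map, mirroring the bin computation in the CPU kernel.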
int hstart = static_cast<int>(floor(static_cast<double>(ph) *
static_cast<double>(roi_height) /
static_cast<double>(pooled_height)));
int wstart = static_cast<int>(floor(static_cast<double>(pw) *
static_cast<double>(roi_width) /
static_cast<double>(pooled_width)));
int hend = static_cast<int>(ceil(static_cast<double>(ph + 1) *
static_cast<double>(roi_height) /
static_cast<double>(pooled_height)));
int wend = static_cast<int>(ceil(static_cast<double>(pw + 1) *
static_cast<double>(roi_width) /
static_cast<double>(pooled_width)));
hstart = min(max(hstart + roi_start_h, 0), height);
hend = min(max(hend + roi_start_h, 0), height);
wstart = min(max(wstart + roi_start_w, 0), width);
wend = min(max(wend + roi_start_w, 0), width);
bool is_empty = (hend <= hstart) || (wend <= wstart);
T maxval = is_empty ? 0 : -std::numeric_limits<T>::max();
int maxidx = -1;
const T* offset_input_data =
input_data + (roi_batch_ind * channels + c) * height * width;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
int input_data_index = h * width + w;
if (offset_input_data[input_data_index] > maxval) {
maxval = offset_input_data[input_data_index];
maxidx = input_data_index;
}
}
}
output_data[i] = maxval;
if (argmax_data) {
argmax_data[i] = maxidx;
}
}
}
template <typename T>
__global__ void GPUROIPoolBackward(
const int nthreads, const T* input_rois, const T* output_grad,
const int64_t* argmax_data, const int num_rois, const float spatial_scale,
const int channels, const int height, const int width,
const int pooled_height, const int pooled_width, int* roi_batch_id_data,
T* input_grad) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
for (int i = index; i < nthreads; i += offset) {
int pw = i % pooled_width;
int ph = (i / pooled_width) % pooled_height;
int c = (i / pooled_width / pooled_height) % channels;
int n = i / pooled_width / pooled_height / channels;
int roi_batch_ind = roi_batch_id_data[n];
int input_offset = (roi_batch_ind * channels + c) * height * width;
int output_offset = (n * channels + c) * pooled_height * pooled_width;
const T* offset_output_grad = output_grad + output_offset;
T* offset_input_grad = input_grad + input_offset;
const int64_t* offset_argmax_data = argmax_data + output_offset;
int argmax = offset_argmax_data[ph * pooled_width + pw];
if (argmax != -1) {
platform::CudaAtomicAdd(
offset_input_grad + argmax,
static_cast<T>(offset_output_grad[ph * pooled_width + pw]));
}
}
}
template <typename Place, typename T>
class GPUROIPoolOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<Tensor>("X");
auto* rois = ctx.Input<LoDTensor>("ROIs");
auto* out = ctx.Output<Tensor>("Out");
auto* argmax = ctx.Output<Tensor>("Argmax");
auto pooled_height = ctx.Attr<int>("pooled_height");
auto pooled_width = ctx.Attr<int>("pooled_width");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto in_dims = in->dims();
int batch_size = in_dims[0];
auto in_stride = phi::stride(in_dims);
int channels = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
int rois_num = rois->dims()[0];
if (rois_num == 0) return;
int output_size = out->numel();
int blocks = NumBlocks(output_size);
int threads = kNumCUDAThreads;
framework::Tensor roi_batch_id_list;
roi_batch_id_list.Resize({rois_num});
auto cplace = platform::CPUPlace();
int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
auto& dev_ctx = ctx.cuda_device_context();
auto gplace = ctx.GetPlace();
if (ctx.HasInput("RoisNum")) {
auto* rois_num_t = ctx.Input<Tensor>("RoisNum");
int rois_batch_size = rois_num_t->numel();
PADDLE_ENFORCE_EQ(
rois_batch_size, batch_size,
platform::errors::InvalidArgument(
"The batch size of input(ROIs) and input(X) must be the same but "
"received batch size of input(ROIs) and input(X) is %d and %d "
"respectively.",
rois_batch_size, batch_size));
std::vector<int> rois_num_list(rois_batch_size);
memory::Copy(cplace, rois_num_list.data(), gplace,
rois_num_t->data<int>(), sizeof(int) * rois_batch_size, 0);
int start = 0;
for (int n = 0; n < rois_batch_size; ++n) {
for (int i = start; i < start + rois_num_list[n]; ++i) {
roi_batch_id_data[i] = n;
}
start += rois_num_list[n];
}
} else {
auto rois_lod = rois->lod().back();
int rois_batch_size = rois_lod.size() - 1;
PADDLE_ENFORCE_EQ(
rois_batch_size, batch_size,
platform::errors::InvalidArgument(
"The batch size of input(ROIs) and input(X) must be the same but "
"received batch size of input(ROIs) and input(X) is %d and %d "
"respectively.",
rois_batch_size, batch_size));
int rois_num_with_lod = rois_lod[rois_batch_size];
PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
platform::errors::InvalidArgument(
"The number of rois from input(ROIs) and its LOD "
"must be the same. Received rois %d of input(ROIs) "
"but the number of rois %d from its LOD is %d",
rois_num, rois_num_with_lod));
for (int n = 0; n < rois_batch_size; ++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
}
int bytes = roi_batch_id_list.numel() * sizeof(int);
auto roi_ptr = memory::Alloc(dev_ctx, bytes);
int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes,
dev_ctx.stream());
GPUROIPoolForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
output_size, in->data<T>(), rois->data<T>(), spatial_scale, channels,
height, width, pooled_height, pooled_width, roi_id_data,
out->mutable_data<T>(ctx.GetPlace()),
argmax->mutable_data<int64_t>(ctx.GetPlace()));
}
};
template <typename Place, typename T>
class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<Tensor>("X");
auto* rois = ctx.Input<LoDTensor>("ROIs");
auto* rois_lod = ctx.Input<Tensor>("RoisNum");
auto* argmax = ctx.Input<Tensor>("Argmax");
auto* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
auto pooled_height = ctx.Attr<int>("pooled_height");
auto pooled_width = ctx.Attr<int>("pooled_width");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
int rois_num = rois->dims()[0];
int channels = in->dims()[1];
int height = in->dims()[2];
int width = in->dims()[3];
if (x_grad) {
framework::Tensor roi_batch_id_list;
roi_batch_id_list.Resize({rois_num});
auto cplace = platform::CPUPlace();
int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
auto& dev_ctx = ctx.cuda_device_context();
auto gplace = ctx.GetPlace();
if (ctx.HasInput("RoisNum")) {
auto* rois_num_t = ctx.Input<Tensor>("RoisNum");
int rois_batch_size = rois_num_t->numel();
std::vector<int> rois_num_list(rois_batch_size);
memory::Copy(cplace, rois_num_list.data(), gplace,
rois_num_t->data<int>(), sizeof(int) * rois_batch_size, 0);
int start = 0;
for (int n = 0; n < rois_batch_size; ++n) {
for (int i = start; i < start + rois_num_list[n]; ++i) {
roi_batch_id_data[i] = n;
}
start += rois_num_list[n];
}
} else {
auto rois_lod = rois->lod().back();
int rois_batch_size = rois_lod.size() - 1;
for (int n = 0; n < rois_batch_size; ++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
}
int bytes = roi_batch_id_list.numel() * sizeof(int);
auto roi_ptr = memory::Alloc(dev_ctx, bytes);
int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes,
dev_ctx.stream());
x_grad->mutable_data<T>(ctx.GetPlace());
phi::funcs::SetConstant<Place, T> set_zero;
set_zero(dev_ctx, x_grad, static_cast<T>(0));
int output_grad_size = out_grad->numel();
int blocks = NumBlocks(output_grad_size);
int threads = kNumCUDAThreads;
if (output_grad_size > 0) {
GPUROIPoolBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
output_grad_size, rois->data<T>(), out_grad->data<T>(),
argmax->data<int64_t>(), rois_num, spatial_scale, channels, height,
width, pooled_height, pooled_width, roi_id_data,
x_grad->mutable_data<T>(ctx.GetPlace()));
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
roi_pool,
ops::GPUROIPoolOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::GPUROIPoolOpKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
roi_pool_grad,
ops::GPUROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::GPUROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <limits>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
static constexpr int kROISize = 4;
template <typename DeviceContext, typename T>
class CPUROIPoolOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
auto* out = ctx.Output<framework::Tensor>("Out");
auto* argmax = ctx.Output<framework::Tensor>("Argmax");
auto pooled_height = ctx.Attr<int>("pooled_height");
auto pooled_width = ctx.Attr<int>("pooled_width");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto in_dims = in->dims();
int batch_size = in_dims[0];
int channels = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
int rois_num = rois->dims()[0];
auto in_stride = phi::stride(in_dims);
auto argmax_stride = phi::stride(argmax->dims());
auto roi_stride = phi::stride(rois->dims());
auto out_stride = phi::stride(out->dims());
const T* input_data = in->data<T>();
framework::Tensor roi_batch_id_list;
roi_batch_id_list.Resize({rois_num});
int* roi_batch_id_data =
roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
int rois_batch_size;
if (ctx.HasInput("RoisNum")) {
auto* rois_num_t = ctx.Input<framework::Tensor>("RoisNum");
rois_batch_size = rois_num_t->numel();
PADDLE_ENFORCE_EQ(
rois_batch_size, batch_size,
platform::errors::InvalidArgument("The rois_batch_size and imgs "
"batch_size must be the same."));
auto* rois_num_data = rois_num_t->data<int>();
int start = 0;
for (int n = 0; n < rois_batch_size; ++n) {
for (int i = start; i < start + rois_num_data[n]; ++i) {
roi_batch_id_data[i] = n;
}
start += rois_num_data[n];
}
} else {
auto rois_lod = rois->lod().back();
rois_batch_size = rois_lod.size() - 1;
PADDLE_ENFORCE_EQ(
rois_batch_size, batch_size,
platform::errors::InvalidArgument("The rois_batch_size and imgs "
"batch_size must be the same."));
int rois_num_with_lod = rois_lod[rois_batch_size];
PADDLE_ENFORCE_EQ(
rois_num, rois_num_with_lod,
platform::errors::InvalidArgument("The rois_num from input "
"and lod must be the same."));
for (int n = 0; n < rois_batch_size; ++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
}
T* output_data = out->mutable_data<T>(ctx.GetPlace());
int64_t* argmax_data = argmax->mutable_data<int64_t>(ctx.GetPlace());
const T* rois_data = rois->data<T>();
for (int n = 0; n < rois_num; ++n) {
int roi_batch_id = roi_batch_id_data[n];
int roi_start_w = round(rois_data[0] * spatial_scale);
int roi_start_h = round(rois_data[1] * spatial_scale);
int roi_end_w = round(rois_data[2] * spatial_scale);
int roi_end_h = round(rois_data[3] * spatial_scale);
// Force malformed ROIs to be 1x1
int roi_height = std::max(roi_end_h - roi_start_h + 1, 1);
int roi_width = std::max(roi_end_w - roi_start_w + 1, 1);
const float bin_size_h =
static_cast<float>(roi_height) / static_cast<float>(pooled_height);
const float bin_size_w =
static_cast<float>(roi_width) / static_cast<float>(pooled_width);
const T* batch_data = input_data + roi_batch_id * in_stride[0];
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
// Compute pooling region for this output unit:
// start (included) = floor(ph * roi_height / pooled_height_)
// end (excluded) = ceil((ph + 1) * roi_height / pooled_height_)
int hstart =
static_cast<int>(floor(static_cast<float>(ph) * bin_size_h));
int wstart =
static_cast<int>(floor(static_cast<float>(pw) * bin_size_w));
int hend =
static_cast<int>(ceil(static_cast<float>(ph + 1) * bin_size_h));
int wend =
static_cast<int>(ceil(static_cast<float>(pw + 1) * bin_size_w));
hstart = std::min(std::max(hstart + roi_start_h, 0), height);
hend = std::min(std::max(hend + roi_start_h, 0), height);
wstart = std::min(std::max(wstart + roi_start_w, 0), width);
wend = std::min(std::max(wend + roi_start_w, 0), width);
const int pool_index = ph * pooled_width + pw;
// Define an empty pooling region to be zero
bool is_empty = (hend <= hstart) || (wend <= wstart);
output_data[pool_index] =
is_empty ? 0 : -std::numeric_limits<T>::max();
argmax_data[pool_index] = -1;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int index = h * width + w;
if (batch_data[index] > output_data[pool_index]) {
output_data[pool_index] = batch_data[index];
argmax_data[pool_index] = index;
}
}
}
}
}
batch_data += in_stride[1];
output_data += out_stride[1];
argmax_data += argmax_stride[1];
}
// Increment ROI data pointer
rois_data += roi_stride[0];
}
return;
}
};
template <typename DeviceContext, typename T>
class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
auto* argmax = ctx.Input<framework::Tensor>("Argmax");
auto* out_grad =
ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
auto pooled_height = ctx.Attr<int>("pooled_height");
auto pooled_width = ctx.Attr<int>("pooled_width");
if (in_grad) {
int rois_num = rois->dims()[0];
framework::Tensor roi_batch_id_list;
roi_batch_id_list.Resize({rois_num});
int* roi_batch_id_data =
roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
int rois_batch_size;
if (ctx.HasInput("RoisNum")) {
auto* rois_num_t = ctx.Input<framework::Tensor>("RoisNum");
rois_batch_size = rois_num_t->numel();
auto* rois_num_data = rois_num_t->data<int>();
int start = 0;
for (int n = 0; n < rois_batch_size; ++n) {
for (int i = start; i < start + rois_num_data[n]; ++i) {
roi_batch_id_data[i] = n;
}
start += rois_num_data[n];
}
} else {
auto rois_lod = rois->lod().back();
rois_batch_size = rois_lod.size() - 1;
for (int n = 0; n < rois_batch_size; ++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
}
const T* rois_data = rois->data<T>();
const T* out_grad_data = out_grad->data<T>();
const int64_t* argmax_data = argmax->data<int64_t>();
T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
phi::funcs::SetConstant<DeviceContext, T> set_zero;
set_zero(ctx.template device_context<DeviceContext>(), in_grad,
static_cast<T>(0));
auto in_stride = phi::stride(in->dims());
auto argmax_stride = phi::stride(argmax->dims());
auto roi_stride = phi::stride(rois->dims());
auto out_stride = phi::stride(out_grad->dims());
int channels = in->dims()[1];
for (int n = 0; n < rois_num; ++n) {
int roi_batch_idx = roi_batch_id_data[n];
T* batch_grad_data = in_grad_data + roi_batch_idx * in_stride[0];
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int pool_index = ph * pooled_width + pw;
if (argmax_data[pool_index] >= 0) {
auto index = argmax_data[pool_index];
batch_grad_data[index] += out_grad_data[pool_index];
}
}
}
batch_grad_data += in_stride[1];
out_grad_data += out_stride[1];
argmax_data += argmax_stride[1];
}
rois_data += roi_stride[0];
}
}
}
};
} // namespace operators
} // namespace paddle
......@@ -12,8 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle {
namespace operators {
......@@ -21,60 +23,6 @@ namespace operators {
class SearchSortedOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
static bool SearchsortedDimsMatchedBeforeLastDim(
const framework::DDim& sequences_dims,
const framework::DDim& values_dims) {
if (sequences_dims.size() != values_dims.size()) {
return false;
}
const auto& sequences_dims_size = sequences_dims.size();
for (int64_t dim = 0; dim < sequences_dims_size - 1; ++dim) {
if (sequences_dims[dim] != values_dims[dim]) {
return false;
}
}
return true;
}
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("SortedSequence"), "Input", "SortedSequence",
"searchsorted");
OP_INOUT_CHECK(ctx->HasInput("Values"), "Input", "Values", "searchsorted");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "searchsorted");
auto sequences_dims = ctx->GetInputDim("SortedSequence");
auto values_dims = ctx->GetInputDim("Values");
auto out_int32 = ctx->Attrs().Get<bool>("out_int32");
if (sequences_dims.size() != 1) {
PADDLE_ENFORCE_EQ(
SearchsortedDimsMatchedBeforeLastDim(sequences_dims, values_dims),
true,
platform::errors::Unavailable(
"The dimensions of sorted_sequence tensor ( %s ) and values "
"tensor ( %s ) can not match. Because the input sorted_sequence "
"tensor must be 1 dimension or the first N-1 dimensions of "
"sorted_sequence tensor and input values tensor must match. "
"Please input appropriate sorted_sequence and values again! ",
sequences_dims, values_dims));
}
if (out_int32) {
PADDLE_ENFORCE_LT(
sequences_dims[sequences_dims.size() - 1],
std::numeric_limits<int>::max(),
platform::errors::Unavailable(
"The size of sorted_sequence %d exceed the maximum limit d%. "
"Because the size of sorted_sequence should be less than the "
"output maximum value for int32 bit. Please set appropriate "
"sorted_sequence to meet this requirement! ",
sequences_dims[sequences_dims.size() - 1],
std::numeric_limits<int>::max()));
}
ctx->SetOutputDim("Out", values_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
......@@ -115,4 +63,7 @@ class SearchSortedOpMaker : public framework::OpProtoAndCheckerMaker {
namespace ops = paddle::operators;
REGISTER_OPERATOR(searchsorted, ops::SearchSortedOp, ops::SearchSortedOpMaker);
DECLARE_INFER_SHAPE_FUNCTOR(searchsorted, SearchsortedInferShapeFunctor,
PD_INFER_META(phi::SearchsortedInferMeta));
REGISTER_OPERATOR(searchsorted, ops::SearchSortedOp, ops::SearchSortedOpMaker,
SearchsortedInferShapeFunctor);
......@@ -13,9 +13,15 @@
// limitations under the License.
#include "paddle/fluid/operators/set_value_op.h"
#include <string>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace framework {
class InferShapeContext;
......@@ -34,6 +40,8 @@ class CPUDeviceContext;
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
class SetValue : public framework::OperatorWithKernel {
public:
SetValue(const std::string &type, const framework::VariableNameMap &inputs,
......@@ -41,17 +49,6 @@ class SetValue : public framework::OperatorWithKernel {
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "SetValue");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SetValue");
auto in_dims = ctx->GetInputDim("Input");
PADDLE_ENFORCE_LT(
in_dims.size(), 7,
platform::errors::InvalidArgument(
"The rank of input should be less than 7, but received %d.",
in_dims.size()));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
......@@ -236,10 +233,13 @@ DECLARE_INPLACE_OP_INFERER(SetValueOpInplaceInferer, {"Input", "Out"});
namespace ops = paddle::operators;
namespace plat = paddle::platform;
DECLARE_INFER_SHAPE_FUNCTOR(set_value, SetValueInferShapeFunctor,
PD_INFER_META(phi::SetValueInferMeta));
REGISTER_OPERATOR(set_value, ops::SetValue, ops::SetValueMaker,
ops::SetValueGradMaker<paddle::framework::OpDesc>,
ops::SetValueGradMaker<paddle::imperative::OpBase>,
ops::SetValueOpInplaceInferer);
ops::SetValueOpInplaceInferer, SetValueInferShapeFunctor);
REGISTER_OPERATOR(set_value_grad, ops::SetValueGrad);
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#include <memory>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
......@@ -23,56 +25,6 @@ class TopkV2Op : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "topk_v2");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "topk_v2");
OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "topk_v2");
auto input_dims = ctx->GetInputDim("X");
const int& dim_size = input_dims.size();
int axis = static_cast<int>(ctx->Attrs().Get<int>("axis"));
PADDLE_ENFORCE_EQ(
(axis < dim_size) && (axis >= (-1 * dim_size)), true,
paddle::platform::errors::InvalidArgument(
"the axis of topk must be [-%d, %d), but you set axis is %d",
dim_size, dim_size, axis));
if (axis < 0) axis += dim_size;
int k;
auto k_is_tensor = ctx->HasInput("K");
if (k_is_tensor) {
k = -1;
} else {
k = static_cast<int>(ctx->Attrs().Get<int>("k"));
PADDLE_ENFORCE_EQ(k >= 1, true,
paddle::platform::errors::InvalidArgument(
"the attribute of k in the topk must >= 1 or be a "
"Tensor, but received %d .",
k));
}
PADDLE_ENFORCE_GE(input_dims.size(), 1,
paddle::platform::errors::InvalidArgument(
"input of topk must have >= 1d shape"));
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_GE(
input_dims[axis], k,
paddle::platform::errors::InvalidArgument(
"input of topk op must have >= %d columns in axis of %d", k,
axis));
}
framework::DDim dims = input_dims;
dims[axis] = k;
ctx->SetOutputDim("Out", dims);
ctx->SetOutputDim("Indices", dims);
ctx->ShareLoD("X", "Out");
ctx->ShareLoD("X", "Indices");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
......@@ -169,8 +121,11 @@ class TopkV2GradOpMaker : public framework::SingleGradOpMaker<T> {
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(top_k_v2, TopKInferShapeFunctor,
PD_INFER_META(phi::TopKInferMeta));
REGISTER_OPERATOR(top_k_v2, ops::TopkV2Op, ops::TopkV2OpMaker,
ops::TopkV2GradOpMaker<paddle::framework::OpDesc>,
ops::TopkV2GradOpMaker<paddle::imperative::OpBase>);
ops::TopkV2GradOpMaker<paddle::imperative::OpBase>,
TopKInferShapeFunctor);
REGISTER_OPERATOR(top_k_v2_grad, ops::TopkV2OpGrad);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -137,9 +140,19 @@ T Erfinv(T x) {
template <typename T>
struct TruncatedNormal {
T mean, std;
TruncatedNormal(T mean, T std) : mean(mean), std(std) {}
T a_normal_cdf;
T b_normal_cdf;
TruncatedNormal(T mean, T std) : mean(mean), std(std) {
auto normal_cdf = [](T x) {
return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0;
};
a_normal_cdf = normal_cdf(-2.0);
b_normal_cdf = normal_cdf(2.0);
}
T operator()(T value) const {
return std::sqrt(2.0) * Erfinv(value) * std + mean;
auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
return std::sqrt(2.0) * Erfinv(2 * p - 1) * std + mean;
}
};
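// Illustrative sketch (not part of the op): with mean = 0 and std = 1, a
// uniform sample value = 0.5 maps to p = a_normal_cdf + (b_normal_cdf -
// a_normal_cdf) * 0.5 = 0.5, so Erfinv(2 * p - 1) = Erfinv(0) = 0 and the
// functor returns 0, the midpoint of the truncation interval [-2, 2].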
......
......@@ -84,13 +84,8 @@ class NPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
Tensor cpu_tensor(tensor->dtype());
cpu_tensor.Resize(tensor->dims());
T* cpu_data = cpu_tensor.mutable_data<T>(platform::CPUPlace());
auto normal_cdf = [](float x) {
return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0;
};
float a_normal_cdf = normal_cdf((-2.0 - mean) / std);
float b_normal_cdf = normal_cdf((2.0 - mean) / std);
std::uniform_real_distribution<float> dist(2.0 * a_normal_cdf - 1.0,
2.0 * b_normal_cdf - 1.0);
std::uniform_real_distribution<T> dist(std::numeric_limits<float>::min(),
1.0);
TruncatedNormal<T> truncated_normal(mean, std);
int64_t size = tensor->numel();
......
......@@ -32,13 +32,8 @@ class XPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
auto* tensor = context.Output<framework::Tensor>("Out");
T* data = tensor->mutable_data<T>(context.GetPlace());
auto normal_cdf = [](float x) {
return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0;
};
float a_normal_cdf = normal_cdf((-2.0 - mean) / std);
float b_normal_cdf = normal_cdf((2.0 - mean) / std);
std::uniform_real_distribution<float> dist(2.0 * a_normal_cdf - 1.0,
2.0 * b_normal_cdf - 1.0);
std::uniform_real_distribution<T> dist(std::numeric_limits<float>::min(),
1.0);
TruncatedNormal<T> truncated_normal(mean, std);
int64_t size = tensor->numel();
......
......@@ -225,7 +225,7 @@ void BindGraphPyClient(py::module* m) {
.def("stop_server", &GraphPyClient::stop_server)
.def("get_node_feat",
[](GraphPyClient& self, std::string node_type,
std::vector<uint64_t> node_ids,
std::vector<int64_t> node_ids,
std::vector<std::string> feature_names) {
auto feats =
self.get_node_feat(node_type, node_ids, feature_names);
......@@ -239,7 +239,7 @@ void BindGraphPyClient(py::module* m) {
})
.def("set_node_feat",
[](GraphPyClient& self, std::string node_type,
std::vector<uint64_t> node_ids,
std::vector<int64_t> node_ids,
std::vector<std::string> feature_names,
std::vector<std::vector<py::bytes>> bytes_feats) {
std::vector<std::vector<std::string>> feats(bytes_feats.size());
......
......@@ -60,6 +60,10 @@ bool ProtoArgumentMappingContext::IsSelectedRowsInput(
const std::string& name) const {
return false;
}
bool ProtoArgumentMappingContext::IsDenseTensorVectorInput(
const std::string& name) const {
return false;
}
bool ProtoArgumentMappingContext::IsDenseTensorOutput(
const std::string& name) const {
......
......@@ -42,6 +42,7 @@ class ProtoArgumentMappingContext : public ::phi::ArgumentMappingContext {
bool IsDenseTensorInput(const std::string& name) const override;
bool IsSelectedRowsInput(const std::string& name) const override;
bool IsDenseTensorVectorInput(const std::string& name) const override;
bool IsDenseTensorOutput(const std::string& name) const override;
bool IsSelectedRowsOutput(const std::string& name) const override;
......
......@@ -89,6 +89,8 @@ class ArgumentMappingContext {
virtual bool IsDenseTensorInput(const std::string& name) const = 0;
virtual bool IsSelectedRowsInput(const std::string& name) const = 0;
// For compatibility with LoDTensorArray
virtual bool IsDenseTensorVectorInput(const std::string& name) const = 0;
virtual bool IsDenseTensorOutput(const std::string& name) const = 0;
virtual bool IsSelectedRowsOutput(const std::string& name) const = 0;
......
......@@ -37,6 +37,13 @@ void KernelContext::EmplaceBackInputs(
std::make_move_iterator(inputs.end()));
}
void KernelContext::EmplaceBackInputsWithoutSetRange(
paddle::SmallVector<const TensorBase*> inputs) {
inputs_.insert(inputs_.end(),
std::make_move_iterator(inputs.begin()),
std::make_move_iterator(inputs.end()));
}
void KernelContext::EmplaceBackOutput(TensorBase* output) {
int index = outputs_.size();
outputs_.emplace_back(output);
......@@ -59,6 +66,13 @@ void KernelContext::EmplaceBackOutputs(
std::make_move_iterator(outputs.end()));
}
void KernelContext::EmplaceBackOutputsWithoutSetRange(
paddle::SmallVector<TensorBase*> outputs) {
outputs_.insert(outputs_.end(),
std::make_move_iterator(outputs.begin()),
std::make_move_iterator(outputs.end()));
}
void KernelContext::EmplaceBackAttr(paddle::any attr) {
attrs_.emplace_back(std::move(attr));
}
......
......@@ -52,12 +52,18 @@ class KernelContext {
void EmplaceBackInputs(paddle::SmallVector<const TensorBase*> inputs);
void EmplaceBackInputsWithoutSetRange(
paddle::SmallVector<const TensorBase*> inputs);
void EmplaceBackOutput(TensorBase* output);
void EmplaceBackOutputWithoutSetRange(TensorBase* output);
void EmplaceBackOutputs(paddle::SmallVector<TensorBase*> outputs);
void EmplaceBackOutputsWithoutSetRange(
paddle::SmallVector<TensorBase*> outputs);
void EmplaceBackAttr(paddle::any attr);
const std::pair<int, int>& InputRangeAt(size_t idx) const;
......
......@@ -476,6 +476,33 @@ void ElementwiseRawInferMeta(const MetaTensor& x,
out->share_lod(x);
}
void ExpandAsInferMeta(const MetaTensor& x,
paddle::optional<const MetaTensor&> y,
const std::vector<int>& target_shape,
MetaTensor* out) {
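// The output simply takes target_shape as its dims; e.g. an input of rank 2
// with target_shape = {2, 3, 4} yields an output of dims [2, 3, 4].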
#define MAX_RANK_SUPPORTED 6
auto x_dims = x.dims();
PADDLE_ENFORCE_GE(
target_shape.size(),
static_cast<size_t>(x_dims.size()),
phi::errors::InvalidArgument(
"The rank of target_shape must be greater than or equal "
"to the rank of Input(X). But received Input(X): input "
"rank %u; received target_shape: rank %u.",
x_dims.size(),
target_shape.size()));
PADDLE_ENFORCE_LE(target_shape.size(),
MAX_RANK_SUPPORTED,
phi::errors::InvalidArgument(
"The rank of target_shape must be less than or equal "
"to %d. But received: rank %u.",
MAX_RANK_SUPPORTED,
target_shape.size()));
out->set_dims(phi::make_ddim(target_shape));
out->set_dtype(x.dtype());
#undef MAX_RANK_SUPPORTED
}
void GatherInferMeta(const MetaTensor& x,
const MetaTensor& index,
const Scalar& axis,
......@@ -728,6 +755,24 @@ void IndexSelectInferMeta(const MetaTensor& x,
output->share_lod(x);
}
void KronInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) {
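// Each output dim is the product of the rank-aligned input dims; e.g.
// x.dims() = [3] and y.dims() = [2, 4] give out->dims() = [2, 12], and any
// unknown (-1) input dim propagates as -1.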
auto dim_x = x.dims();
auto dim_y = y.dims();
auto rank_x = dim_x.size();
auto rank_y = dim_y.size();
auto rank = (rank_x > rank_y) ? rank_x : rank_y;
std::vector<int64_t> dim_out;
dim_out.reserve(rank);
for (int i = 0; i < rank; i++) {
int64_t dim_xi = (i < rank - rank_x) ? 1 : dim_x.at(i - (rank - rank_x));
int64_t dim_yi = (i < rank - rank_y) ? 1 : dim_y.at(i - (rank - rank_y));
dim_out.push_back(dim_xi == -1 || dim_yi == -1 ? -1 : dim_xi * dim_yi);
}
out->set_dims(phi::make_ddim(dim_out));
out->set_dtype(x.dtype());
}
void LogLossInferMeta(const MetaTensor& input,
const MetaTensor& label,
float epsilon,
......@@ -873,6 +918,60 @@ void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) {
out->share_lod(x);
}
void SearchsortedInferMeta(const MetaTensor& sorted_sequence,
const MetaTensor& value,
bool out_int32,
bool right,
MetaTensor* out) {
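// Shape rule: sorted_sequence is either 1-D (searched against every value),
// or its first N-1 dims must match those of values; e.g. sorted_sequence of
// dims [4, 10] with values of dims [4, 6] gives an output of dims [4, 6].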
auto sequences_dims = sorted_sequence.dims();
auto values_dims = value.dims();
bool flag = true;
if (sequences_dims.size() != values_dims.size()) {
flag = false;
}
const auto& sequences_dims_size = sequences_dims.size();
for (int64_t dim = 0; dim < sequences_dims_size - 1; ++dim) {
if (sequences_dims[dim] != values_dims[dim]) {
flag = false;
break;
}
}
if (sequences_dims.size() != 1) {
PADDLE_ENFORCE_EQ(
flag,
true,
phi::errors::Unavailable(
"The dimensions of sorted_sequence tensor ( %s ) and values "
"tensor ( %s ) can not match. Because the input sorted_sequence "
"tensor must be 1 dimension or the first N-1 dimensions of "
"sorted_sequence tensor and input values tensor must match. "
"Please input appropriate sorted_sequence and values again! ",
sequences_dims,
values_dims));
}
if (out_int32) {
PADDLE_ENFORCE_LT(
sequences_dims[sequences_dims.size() - 1],
std::numeric_limits<int>::max(),
phi::errors::Unavailable(
"The size of sorted_sequence %d exceed the maximum limit d%. "
"Because the size of sorted_sequence should be less than the "
"output maximum value for int32 bit. Please set appropriate "
"sorted_sequence to meet this requirement! ",
sequences_dims[sequences_dims.size() - 1],
std::numeric_limits<int>::max()));
}
out->set_dims(values_dims);
if (out_int32) {
out->set_dtype(DataType::INT32);
} else {
out->set_dtype(DataType::INT64);
}
}
void SegmentPoolInferMeta(const MetaTensor& x,
const MetaTensor& segment_ids,
const std::string& pooltype,
......
......@@ -90,6 +90,11 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta,
int axis,
MetaTensor* out);
void ExpandAsInferMeta(const MetaTensor& x,
paddle::optional<const MetaTensor&> y,
const std::vector<int>& target_shape,
MetaTensor* out);
void GatherInferMeta(const MetaTensor& x,
const MetaTensor& index,
const Scalar& axis,
......@@ -125,6 +130,8 @@ void IndexSelectInferMeta(const MetaTensor& x,
int dim,
MetaTensor* output);
void KronInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out);
void LogLossInferMeta(const MetaTensor& input,
const MetaTensor& label,
float epsilon,
......@@ -139,6 +146,12 @@ void MatmulInferMeta(const MetaTensor& x,
void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out);
void SearchsortedInferMeta(const MetaTensor& sorted_sequence,
const MetaTensor& value,
bool out_int32,
bool right,
MetaTensor* out);
void SegmentPoolInferMeta(const MetaTensor& x,
const MetaTensor& segment_ids,
const std::string& pooltype,
......
......@@ -1090,6 +1090,16 @@ void RollInferMeta(const MetaTensor& x,
out->set_dtype(x.dtype());
}
void SetValueInferMeta(const MetaTensor& x, MetaTensor* out) {
auto in_dims = x.dims();
PADDLE_ENFORCE_LT(
in_dims.size(),
7,
phi::errors::InvalidArgument(
"The rank of input should be less than 7, but received %d.",
in_dims.size()));
}
void ShapeInferMeta(const MetaTensor& input, MetaTensor* out) {
auto in_dim = input.dims();
out->set_dims(phi::make_ddim({in_dim.size()}));
......@@ -1384,6 +1394,55 @@ void TileInferMeta(const MetaTensor& x,
}
}
void TopKInferMeta(const MetaTensor& x,
const Scalar& k_scalar,
int axis,
bool largest,
bool sorted,
MetaTensor* out,
MetaTensor* indices,
MetaConfig config) {
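// The output keeps the input dims except along `axis`, which becomes k; e.g.
// x.dims() = [2, 6] with k = 3 and axis = -1 gives out dims [2, 3]. When k
// comes from a tensor its value is unknown here, so -1 is used instead.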
auto input_dims = x.dims();
const int& dim_size = input_dims.size();
PADDLE_ENFORCE_EQ(
(axis < dim_size) && (axis >= (-1 * dim_size)),
true,
phi::errors::InvalidArgument(
"the axis of topk must be [-%d, %d), but you set axis is %d",
dim_size,
dim_size,
axis));
if (axis < 0) axis += dim_size;
int k = k_scalar.to<int>();
if (k_scalar.FromTensor()) {
k = -1;
} else {
PADDLE_ENFORCE_EQ(k >= 1,
true,
phi::errors::InvalidArgument(
"the attribute of k in the topk must >= 1 or be a "
"Tensor, but received %d .",
k));
}
PADDLE_ENFORCE_GE(
input_dims.size(),
1,
phi::errors::InvalidArgument("input of topk must have >= 1d shape"));
phi::DDim dims = input_dims;
dims[axis] = k;
out->set_dims(dims);
out->share_lod(x);
out->set_dtype(x.dtype());
indices->set_dims(dims);
indices->share_lod(x);
indices->set_dtype(DataType::INT64);
}
void TraceInferMeta(
const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out) {
int dim1 = axis1;
......
......@@ -177,6 +177,8 @@ void RollInferMeta(const MetaTensor& x,
const std::vector<int64_t>& axis,
MetaTensor* out);
void SetValueInferMeta(const MetaTensor& x, MetaTensor* out);
void ShapeInferMeta(const MetaTensor& input, MetaTensor* out);
void ShardIndexInferMeta(const MetaTensor& in,
......@@ -215,6 +217,15 @@ void TileInferMeta(const MetaTensor& x,
MetaTensor* out,
MetaConfig config = MetaConfig());
void TopKInferMeta(const MetaTensor& x,
const Scalar& k_scalar,
int axis,
bool largest,
bool sorted,
MetaTensor* out,
MetaTensor* indices,
MetaConfig config = MetaConfig());
void TraceInferMeta(
const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out);
......
......@@ -27,7 +27,7 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel)
# Some kernels depend on some targets that are not commonly used.
# These targets are not suitable for common dependencies.
# In this case, you need to manually generate them here.
set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel math_kernel
set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel
matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel
put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel
softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel
......@@ -35,7 +35,6 @@ set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_k
kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function)
kernel_library(gumbel_softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax)
kernel_library(gumbel_softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax)
kernel_library(math_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel)
kernel_library(matrix_power_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse)
kernel_library(matrix_power_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse)
kernel_library(maxout_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting)
......
......@@ -19,14 +19,14 @@ limitations under the License. */
namespace phi {
#define DECLARE_ACTIVATION_GRAD_KERNEL_DepX(name) \
#define DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(name) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& dout, \
DenseTensor* dx);
#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(name, attr) \
#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(name, attr) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& x, \
......@@ -34,7 +34,7 @@ namespace phi {
float attr, \
DenseTensor* dx);
#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(name, attr1, attr2) \
#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(name, attr1, attr2) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& x, \
......@@ -43,19 +43,28 @@ namespace phi {
float attr2, \
DenseTensor* dx);
#define DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(name) \
#define DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(name) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& out, \
const DenseTensor& dout, \
DenseTensor* dx);
#define DECLARE_ACTIVATION_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut(name, attr) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& out, \
const DenseTensor& dout, \
float attr, \
#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(name, attr) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& out, \
const DenseTensor& dout, \
float attr, \
DenseTensor* dx);
#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(name, attr1, attr2) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& out, \
const DenseTensor& dout, \
float attr1, \
float attr2, \
DenseTensor* dx);
template <typename T, typename Context>
......@@ -131,5 +140,37 @@ DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, alpha)
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(HardShrink, threshold)
DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, t_min, t_max)
template <typename T, typename Context>
void SigmoidDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& out,
const DenseTensor& ddx,
const DenseTensor& dout,
DenseTensor* dout_new,
DenseTensor* ddout);
template <typename T, typename Context>
void SigmoidTripleGradKernel(const Context& dev_ctx,
const DenseTensor& out,
const DenseTensor& ddx,
const DenseTensor& dout,
const DenseTensor& d_ddout,
const DenseTensor& d_dout_new,
DenseTensor* d_out_new,
DenseTensor* d_dout,
DenseTensor* d_ddx);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid);
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, alpha);
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu, threshold);
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, lambda);
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, threshold);
DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, t_min, t_max);
DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, slope, offset);
} // namespace phi
......@@ -57,6 +57,8 @@ DECLARE_ACTIVATION_KERNEL(Expm1)
DECLARE_ACTIVATION_KERNEL(Softsign)
DECLARE_ACTIVATION_KERNEL(TanhShrink)
DECLARE_ACTIVATION_KERNEL(Silu)
DECLARE_ACTIVATION_KERNEL(Sigmoid)
DECLARE_ACTIVATION_KERNEL(LogSigmoid)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold)
......@@ -79,4 +81,5 @@ void MishKernel(const Context& dev_ctx,
float threshold,
DenseTensor* out);
DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardSigmoid, slope, offset)
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/assign_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/utils/optional.h"
namespace phi {
template <typename Context>
void AssignKernel(const Context& dev_ctx,
paddle::optional<const DenseTensor&> x,
DenseTensor* out) {
if (!x.is_initialized()) {
return;
}
auto& x_tensor = *x.get_ptr();
Copy<Context>(dev_ctx, x_tensor, x_tensor.place(), false, out);
}
// Note: use `const paddle::optional<std::vector<const DenseTensor*>&> x`
// as input if needed
template <typename Context>
void AssignArrayKernel(const Context& dev_ctx,
const std::vector<const DenseTensor*>& x,
std::vector<DenseTensor*> out) {
for (size_t i = 0; i < x.size(); ++i) {
AssignKernel<Context>(dev_ctx, *x[i], out.at(i));
}
}
} // namespace phi
PD_REGISTER_GENERAL_KERNEL(
assign, CPU, ALL_LAYOUT, phi::AssignKernel<phi::CPUContext>, ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(assign_array,
CPU,
ALL_LAYOUT,
phi::AssignArrayKernel<phi::CPUContext>,
ALL_DTYPE) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_GENERAL_KERNEL(
assign, GPU, ALL_LAYOUT, phi::AssignKernel<phi::GPUContext>, ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(assign_array,
GPU,
ALL_LAYOUT,
phi::AssignArrayKernel<phi::GPUContext>,
ALL_DTYPE) {}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
// In order to be compatible with the `AsDispensable` input in the original
// assign op maker, the input parameter here needs to be dispensable, but
// this looks weird
template <typename Context>
void AssignKernel(const Context& dev_ctx,
paddle::optional<const DenseTensor&> x,
DenseTensor* out);
template <typename Context>
void AssignArrayKernel(const Context& dev_ctx,
const std::vector<const DenseTensor*>& x,
std::vector<DenseTensor*> out);
} // namespace phi
......@@ -90,6 +90,23 @@ namespace phi {
dev_ctx, nullptr, &out, &dout, dx, functor); \
}
#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT( \
name, functor_class, attr1, attr2) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& out, \
const DenseTensor& dout, \
float attr1, \
float attr2, \
DenseTensor* dx) { \
funcs::functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr1; \
*(attrs[1].second) = attr2; \
ActivationGradImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, nullptr, &out, &dout, dx, functor); \
}
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CosGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, TanGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, AcosGradFunctor);
......@@ -111,9 +128,11 @@ DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1, Expm1GradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Reciprocal, ReciprocalGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, SqrtGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Rsqrt, RsqrtGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, LogSigmoidGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, SigmoidGradFunctor);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
LeakyReluGradFunctor,
......@@ -146,6 +165,10 @@ DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(Softplus,
SoftplusGradFunctor,
beta,
threshold);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid,
HardSigmoidGradFunctor,
slope,
offset);
template <typename T, typename Context>
void EluGradKernel(const Context& dev_ctx,
......@@ -261,3 +284,8 @@ PD_REGISTER_KERNEL(square_grad,
double,
int,
int64_t) {}
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel)
......@@ -80,6 +80,8 @@ DEFINE_CPU_ACTIVATION_KERNEL(Square, SquareFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Sqrt, SqrtFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Softsign, SoftsignFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Rsqrt, RsqrtFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Sigmoid, SigmoidFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(LogSigmoid, LogSigmoidFunctor)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha)
......@@ -94,6 +96,12 @@ DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, HardShrinkFunctor, threshold)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, SoftShrinkFunctor, lambda)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, ELUFunctor, alpha)
DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max)
DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid,
HardSigmoidFunctor,
slope,
offset)
} // namespace phi
PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {}
......@@ -140,3 +148,6 @@ PD_REGISTER_KERNEL(expm1,
PD_REGISTER_KERNEL(logit, CPU, ALL_LAYOUT, phi::LogitKernel, float, double) {}
PD_REGISTER_KERNEL(
square, CPU, ALL_LAYOUT, phi::SquareKernel, float, double, int, int64_t) {}
PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel)
PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel)
PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel)
......@@ -38,7 +38,7 @@ void Copy(const Context& dev_ctx,
<< src_place;
dst->Resize(src.dims());
auto* dst_ptr = dev_ctx.Alloc(dst, src.dtype());
auto* dst_ptr = dev_ctx.HostAlloc(dst, src.dtype());
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
......
......@@ -12,10 +12,81 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/cpu/elementwise.h"
#include "paddle/phi/api/ext/dispatch.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h"
namespace phi {
#define DEFINE_CPU_ELEMENTWISE_OP(name) \
template <typename T, typename Context> \
void name##RawKernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& y, \
int axis, \
DenseTensor* out) { \
dev_ctx.template Alloc<T>(out); \
if (x.dims() == y.dims()) { \
SameDimsElementwiseCompute<SameDims##name##Functor<CPUContext, T>>()( \
dev_ctx, x, y, out); \
} else { \
auto x_dims = x.dims(); \
auto y_dims = y.dims(); \
if (x_dims.size() >= y_dims.size()) { \
funcs::ElementwiseCompute<funcs::name##Functor<T>, T>( \
dev_ctx, x, y, axis, funcs::name##Functor<T>(), out); \
} else { \
funcs::ElementwiseCompute<funcs::Inverse##name##Functor<T>, T>( \
dev_ctx, x, y, axis, funcs::Inverse##name##Functor<T>(), out); \
} \
} \
}
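// The macro above takes the same-dims fast path when x and y have identical
// shapes, and otherwise broadcasts via ElementwiseCompute, selecting the
// Inverse##name functor when y has the higher rank.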
template <typename T, typename Context>
void DivideRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out) {
// allocate memory for out
dev_ctx.template Alloc<T>(out);
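// The same-dims fast path is only taken for floating-point element types;
// integral types always go through the broadcasted ElementwiseCompute below.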
if (x.dims() == y.dims() && std::is_floating_point<T>::value) {
SameDimsElementwiseCompute<SameDimsDivideFunctor<CPUContext, T>>()(
dev_ctx, x, y, out);
} else {
auto x_dims = x.dims();
auto y_dims = y.dims();
if (x_dims.size() >= y_dims.size()) {
funcs::ElementwiseCompute<funcs::DivideFunctor<T>, T>(
dev_ctx, x, y, axis, funcs::DivideFunctor<T>(), out);
} else {
funcs::ElementwiseCompute<funcs::InverseDivideFunctor<T>, T>(
dev_ctx, x, y, axis, funcs::InverseDivideFunctor<T>(), out);
}
}
}
// Create the definition of Add
DEFINE_CPU_ELEMENTWISE_OP(Add)
// Create the definition of Subtract
DEFINE_CPU_ELEMENTWISE_OP(Subtract)
// Create the definition of Multiply
DEFINE_CPU_ELEMENTWISE_OP(Multiply)
} // namespace phi
using complex64 = ::phi::dtype::complex<float>;
using complex128 = ::phi::dtype::complex<double>;
// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16
// using bfloat16 = ::phi::dtype::bfloat16;
PD_REGISTER_KERNEL(elementwise_fmax,
CPU,
ALL_LAYOUT,
......@@ -33,3 +104,49 @@ PD_REGISTER_KERNEL(elementwise_fmin,
double,
int,
int64_t) {}
PD_REGISTER_KERNEL(add_raw,
CPU,
ALL_LAYOUT,
phi::AddRawKernel,
float,
double,
int16_t,
int,
int64_t,
complex64,
complex128) {}
PD_REGISTER_KERNEL(subtract_raw,
CPU,
ALL_LAYOUT,
phi::SubtractRawKernel,
float,
double,
int16_t,
int,
int64_t,
complex64,
complex128,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(divide_raw,
CPU,
ALL_LAYOUT,
phi::DivideRawKernel,
float,
double,
int,
int64_t,
complex64,
complex128) {}
PD_REGISTER_KERNEL(multiply_raw,
CPU,
ALL_LAYOUT,
phi::MultiplyRawKernel,
float,
double,
int,
int64_t,
bool,
complex64,
complex128,
phi::dtype::bfloat16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/layer_norm_grad_kernel.h"
#include "paddle/phi/kernels/cpu/elementwise.h"
#include "paddle/phi/kernels/funcs/layer_norm_util.h"
#if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \
!defined(__OSX__)
#include "paddle/fluid/operators/jit/kernels.h"
#endif
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <typename T, typename Context>
void LayerNormGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& mean,
const DenseTensor& variance,
paddle::optional<const DenseTensor&> scale_opt,
paddle::optional<const DenseTensor&> bias_opt,
const DenseTensor& out_grad,
float epsilon,
int begin_norm_axis,
bool is_test,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad) {
auto* scale = scale_opt.get_ptr();
auto d_y = out_grad;
// init output
auto* d_x = x_grad;
auto* d_scale = scale_grad;
auto* d_bias = bias_grad;
const auto& x_dims = x.dims();
auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]);
DDim matrix_shape({left, right});
d_y.Resize(matrix_shape);
funcs::ColwiseSum2D<phi::CPUContext, T> colwise_sum(left, right, dev_ctx);
DenseTensor x_tmp = x;
DenseTensor temp;
DenseTensor temp_norm;
if (d_scale || d_x) {
x_tmp.Resize(matrix_shape);
temp.Resize(matrix_shape);
dev_ctx.template Alloc<T>(&temp);
temp_norm.Resize(matrix_shape);
dev_ctx.template Alloc<T>(&temp_norm);
// get x_norm
phi::funcs::ElementwiseCompute<funcs::SubtractFunctor<T>, T, T>(
dev_ctx,
x_tmp,
mean,
/*axis*/ 0,
funcs::SubtractFunctor<T>(),
&temp_norm);
phi::funcs::ElementwiseCompute<funcs::DivAndSqrtFunctor<T>, T, T>(
dev_ctx,
temp_norm,
variance,
/*axis*/ 0,
funcs::DivAndSqrtFunctor<T>(static_cast<T>(epsilon)),
&temp_norm);
}
if (d_bias) {
dev_ctx.template Alloc<T>(d_bias);
colwise_sum(dev_ctx, d_y, d_bias);
}
if (d_scale) {
dev_ctx.template Alloc<T>(d_scale);
phi::funcs::ElementwiseCompute<funcs::MultiplyFunctor<T>, T, T>(
dev_ctx, temp_norm, d_y, 0, funcs::MultiplyFunctor<T>(), &temp);
colwise_sum(dev_ctx, temp, d_scale);
}
if (d_x) {
DDim vec_shape({left});
dev_ctx.template Alloc<T>(d_x);
auto dx_dim = d_x->dims();
DenseTensor temp_vec;
temp_vec.Resize(vec_shape);
dev_ctx.template Alloc<T>(&temp_vec);
funcs::RowwiseMean2D<phi::CPUContext, T> row_mean(left, right, dev_ctx);
if (d_scale) {
// dy_dx
phi::funcs::ElementwiseCompute<funcs::MultiplyFunctor<T>, T, T>(
dev_ctx, d_y, *scale, /*axis*/ 1, funcs::MultiplyFunctor<T>(), &temp);
phi::Copy<Context>(dev_ctx, temp, dev_ctx.GetPlace(), false, d_x);
// dy_dmean_dx
row_mean(dev_ctx, temp, &temp_vec);
phi::funcs::ElementwiseCompute<funcs::SubtractFunctor<T>, T, T>(
dev_ctx,
*d_x,
temp_vec,
/*axis*/ 0,
funcs::SubtractFunctor<T>(),
d_x);
// dy_var_dx
phi::funcs::ElementwiseCompute<funcs::MultiplyFunctor<T>, T, T>(
dev_ctx,
temp,
temp_norm,
/*axis*/ 0,
funcs::MultiplyFunctor<T>(),
&temp);
} else {
// dy_dx
phi::Copy<Context>(dev_ctx, d_y, dev_ctx.GetPlace(), false, d_x);
// dy_dmean_dx
row_mean(dev_ctx, d_y, &temp_vec);
phi::funcs::ElementwiseCompute<funcs::SubtractFunctor<T>, T, T>(
dev_ctx,
*d_x,
temp_vec,
/*axis*/ 0,
funcs::SubtractFunctor<T>(),
d_x);
// dy_var_dx
phi::funcs::ElementwiseCompute<funcs::MultiplyFunctor<T>, T, T>(
dev_ctx,
d_y,
temp_norm,
/*axis*/ 0,
funcs::MultiplyFunctor<T>(),
&temp);
}
// dy_var_dx
row_mean(dev_ctx, temp, &temp_vec);
phi::funcs::ElementwiseCompute<funcs::MultiplyFunctor<T>, T, T>(
dev_ctx,
temp_norm,
temp_vec,
/*axis*/ 0,
funcs::MultiplyFunctor<T>(),
&temp);
phi::funcs::ElementwiseCompute<funcs::SubtractFunctor<T>, T, T>(
dev_ctx, *d_x, temp, /*axis*/ 0, funcs::SubtractFunctor<T>(), d_x);
phi::funcs::ElementwiseCompute<funcs::DivAndSqrtFunctor<T>, T, T>(
dev_ctx,
*d_x,
variance,
/*axis*/ 0,
funcs::DivAndSqrtFunctor<T>(static_cast<T>(epsilon)),
d_x);
d_x->Resize(dx_dim);
}
}
} // namespace phi
PD_REGISTER_KERNEL(
layer_norm_grad, CPU, ALL_LAYOUT, phi::LayerNormGradKernel, float, double) {
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/layer_norm_kernel.h"
#include "paddle/phi/kernels/cpu/elementwise.h"
#include "paddle/phi/kernels/funcs/layer_norm_util.h"
#if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \
!defined(__OSX__)
#include "paddle/fluid/operators/jit/kernels.h"
#endif
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <typename T, typename Context>
void LayerNormKernel(const Context& dev_ctx,
const DenseTensor& x,
paddle::optional<const DenseTensor&> scale_opt,
paddle::optional<const DenseTensor&> bias_opt,
float epsilon,
int begin_norm_axis,
bool is_test,
DenseTensor* y,
DenseTensor* mean,
DenseTensor* var) {
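// Sketch of the computation: x is flattened to [left, right] at
// begin_norm_axis and each row is normalized as roughly
// (x - mean) / sqrt(var + epsilon), then optionally scaled and shifted by
// `scale` and `bias` along the last dimension.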
const auto x_dims = x.dims();
auto* scale = scale_opt.get_ptr();
auto* bias = bias_opt.get_ptr();
dev_ctx.template Alloc<T>(y);
dev_ctx.template Alloc<T>(mean);
dev_ctx.template Alloc<T>(var);
auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]);
DDim matrix_shape({left, right});
auto x_tmp = x;
x_tmp.Resize(matrix_shape);
DenseTensor out;
out.ShareDataWith(*y);
out.Resize(matrix_shape);
#if defined(PADDLE_WITH_CUDA) || defined(_WIN32) || defined(__APPLE__) || \
defined(__OSX__)
funcs::RowwiseMean2D<phi::CPUContext, T> row_mean(left, right, dev_ctx);
// get mean
row_mean(dev_ctx, x_tmp, mean);
// get variance
phi::funcs::ElementwiseCompute<funcs::SubAndSquareFunctor<T>, T, T>(
dev_ctx, x_tmp, *mean, 0, funcs::SubAndSquareFunctor<T>(), &out);
row_mean(dev_ctx, out, var);
// get x_norm
phi::funcs::ElementwiseCompute<funcs::SubtractFunctor<T>, T, T>(
dev_ctx, x_tmp, *mean, 0, funcs::SubtractFunctor<T>(), &out);
phi::funcs::ElementwiseCompute<funcs::DivAndSqrtFunctor<T>, T, T>(
dev_ctx,
out,
*var,
0,
funcs::DivAndSqrtFunctor<T>(static_cast<T>(epsilon)),
&out);
if (scale) {
phi::funcs::ElementwiseCompute<funcs::MultiplyFunctor<T>, T, T>(
dev_ctx, out, *scale, 1, funcs::MultiplyFunctor<T>(), &out);
}
if (bias) {
phi::funcs::ElementwiseCompute<funcs::AddFunctor<T>, T, T>(
dev_ctx, out, *bias, 1, funcs::AddFunctor<T>(), &out);
}
#else
PADDLE_ENFORCE_EQ(mean->numel(),
left,
phi::errors::InvalidArgument(
"mean's length (%d) is not equal with expected (%d).",
mean->numel(),
left));
PADDLE_ENFORCE_EQ(var->numel(),
left,
phi::errors::InvalidArgument(
"var's length (%d) is not equal with expected (%d).",
var->numel(),
left));
if (scale) {
PADDLE_ENFORCE_EQ(
scale->numel(),
right,
phi::errors::InvalidArgument(
"scale's length (%d) is not equal with expected (%d).",
scale->numel(),
right));
}
if (bias) {
PADDLE_ENFORCE_EQ(bias->numel(),
right,
phi::errors::InvalidArgument(
"bias's length (%d) is not equal with expected (%d).",
bias->numel(),
right));
}
auto ker = paddle::operators::jit::KernelFuncs<
paddle::operators::jit::LayerNormTuple<T>,
phi::CPUPlace>::Cache()
.At(right);
ker(x_tmp.data<T>(),
out.data<T>(),
mean->data<T>(),
var->data<T>(),
scale ? scale->data<T>() : nullptr,
bias ? bias->data<T>() : nullptr,
static_cast<int>(left),
static_cast<const float>(epsilon),
right);
#endif
}
} // namespace phi
PD_REGISTER_KERNEL(
layer_norm, CPU, ALL_LAYOUT, phi::LayerNormKernel, float, double) {}
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/api/ext/dispatch.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/elementwise.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/framework/eigen.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/complex.h"
namespace phi {
#define DEFINE_CPU_ELEMENTWISE_OP(name) \
template <typename T, typename Context> \
void name##RawKernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& y, \
int axis, \
DenseTensor* out) { \
dev_ctx.template Alloc<T>(out); \
if (x.dims() == y.dims()) { \
SameDimsElementwiseCompute<SameDims##name##Functor<CPUContext, T>>()( \
dev_ctx, x, y, out); \
} else { \
auto x_dims = x.dims(); \
auto y_dims = y.dims(); \
if (x_dims.size() >= y_dims.size()) { \
funcs::ElementwiseCompute<funcs::name##Functor<T>, T>( \
dev_ctx, x, y, axis, funcs::name##Functor<T>(), out); \
} else { \
funcs::ElementwiseCompute<funcs::Inverse##name##Functor<T>, T>( \
dev_ctx, x, y, axis, funcs::Inverse##name##Functor<T>(), out); \
} \
} \
}
template <typename T, typename Context>
void DivideRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out) {
// allocate memory for out
dev_ctx.template Alloc<T>(out);
if (x.dims() == y.dims() && std::is_floating_point<T>::value) {
SameDimsElementwiseCompute<SameDimsDivideFunctor<CPUContext, T>>()(
dev_ctx, x, y, out);
} else {
auto x_dims = x.dims();
auto y_dims = y.dims();
if (x_dims.size() >= y_dims.size()) {
funcs::ElementwiseCompute<funcs::DivideFunctor<T>, T>(
dev_ctx, x, y, axis, funcs::DivideFunctor<T>(), out);
} else {
funcs::ElementwiseCompute<funcs::InverseDivideFunctor<T>, T>(
dev_ctx, x, y, axis, funcs::InverseDivideFunctor<T>(), out);
}
}
}
// Create the definition of Add
DEFINE_CPU_ELEMENTWISE_OP(Add)
// Create the definition of Subtract
DEFINE_CPU_ELEMENTWISE_OP(Subtract)
// Create the definition of Multiply
DEFINE_CPU_ELEMENTWISE_OP(Multiply)
} // namespace phi
using complex64 = ::phi::dtype::complex<float>;
using complex128 = ::phi::dtype::complex<double>;
// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16
// using bfloat16 = ::phi::dtype::bfloat16;
PD_REGISTER_KERNEL(add_raw,
CPU,
ALL_LAYOUT,
phi::AddRawKernel,
float,
double,
int16_t,
int,
int64_t,
complex64,
complex128) {}
PD_REGISTER_KERNEL(subtract_raw,
CPU,
ALL_LAYOUT,
phi::SubtractRawKernel,
float,
double,
int16_t,
int,
int64_t,
complex64,
complex128,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(divide_raw,
CPU,
ALL_LAYOUT,
phi::DivideRawKernel,
float,
double,
int,
int64_t,
complex64,
complex128) {}
PD_REGISTER_KERNEL(multiply_raw,
CPU,
ALL_LAYOUT,
phi::MultiplyRawKernel,
float,
double,
int,
int64_t,
bool,
complex64,
complex128,
phi::dtype::bfloat16) {}
......@@ -17,12 +17,12 @@
#include <Eigen/Dense>
#include <Eigen/SVD>
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/elementwise_kernel.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/funcs/compare_functors.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h"
#include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/kernels/reduce_kernel.h"
namespace phi {
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/roi_pool_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <typename T, typename Context>
void RoiPoolGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& boxes,
paddle::optional<const DenseTensor&> boxes_num,
const DenseTensor& arg_max,
const DenseTensor& out_grad,
int pooled_height,
int pooled_width,
float spatial_scale,
DenseTensor* dx) {
if (dx) {
int rois_num = boxes.dims()[0];
DenseTensor box_batch_id_list = Empty<int>(dev_ctx, {rois_num});
int* box_batch_id_data = box_batch_id_list.data<int>();
int boxes_batch_size;
if (boxes_num) {
boxes_batch_size = boxes_num->numel();
auto* boxes_num_data = boxes_num->data<int>();
int start = 0;
for (int n = 0; n < boxes_batch_size; ++n) {
for (int i = start; i < start + boxes_num_data[n]; ++i) {
box_batch_id_data[i] = n;
}
start += boxes_num_data[n];
}
} else {
auto boxes_lod = boxes.lod().back();
boxes_batch_size = boxes_lod.size() - 1;
for (int n = 0; n < boxes_batch_size; ++n) {
for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) {
box_batch_id_data[i] = n;
}
}
}
const T* boxes_data = boxes.data<T>();
const T* out_grad_data = out_grad.data<T>();
const int64_t* arg_max_data = arg_max.data<int64_t>();
T* dx_data = dev_ctx.template Alloc<T>(dx);
phi::funcs::SetConstant<Context, T> set_zero;
set_zero(dev_ctx, dx, static_cast<T>(0));
auto in_stride = phi::stride(x.dims());
auto arg_max_stride = phi::stride(arg_max.dims());
auto roi_stride = phi::stride(boxes.dims());
auto out_stride = phi::stride(out_grad.dims());
int channels = x.dims()[1];
for (int n = 0; n < rois_num; ++n) {
int roi_batch_idx = box_batch_id_data[n];
T* batch_grad_data = dx_data + roi_batch_idx * in_stride[0];
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int pool_index = ph * pooled_width + pw;
if (arg_max_data[pool_index] >= 0) {
auto index = arg_max_data[pool_index];
batch_grad_data[index] += out_grad_data[pool_index];
}
}
}
batch_grad_data += in_stride[1];
out_grad_data += out_stride[1];
arg_max_data += arg_max_stride[1];
}
boxes_data += roi_stride[0];
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(roi_pool_grad,
CPU,
ALL_LAYOUT,
phi::RoiPoolGradKernel,
float,
double,
int) {
kernel->InputAt(3).SetDataType(phi::DataType::INT64);
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/roi_pool_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
namespace phi {
template <typename T, typename Context>
void RoiPoolKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& boxes,
paddle::optional<const DenseTensor&> boxes_num,
int pooled_height,
int pooled_width,
float spatial_scale,
DenseTensor* out,
DenseTensor* arg_max) {
auto x_dims = x.dims();
int batch_size = x_dims[0];
int channels = x_dims[1];
int height = x_dims[2];
int width = x_dims[3];
int rois_num = boxes.dims()[0];
auto in_stride = phi::stride(x_dims);
auto arg_max_stride = phi::stride(arg_max->dims());
auto box_stride = phi::stride(boxes.dims());
auto out_stride = phi::stride(out->dims());
const T* input_data = x.data<T>();
DenseTensor box_batch_id_list = Empty<int>(dev_ctx, {rois_num});
int* box_batch_id_data = box_batch_id_list.data<int>();
int boxes_batch_size;
if (boxes_num) {
boxes_batch_size = boxes_num->numel();
PADDLE_ENFORCE_EQ(
boxes_batch_size,
batch_size,
phi::errors::InvalidArgument("The boxes_batch_size and imgs "
"batch_size must be the same."));
auto* boxes_num_data = boxes_num->data<int>();
int start = 0;
for (int n = 0; n < boxes_batch_size; ++n) {
for (int i = start; i < start + boxes_num_data[n]; ++i) {
box_batch_id_data[i] = n;
}
start += boxes_num_data[n];
}
} else {
auto boxes_lod = boxes.lod().back();
boxes_batch_size = boxes_lod.size() - 1;
PADDLE_ENFORCE_EQ(
boxes_batch_size,
batch_size,
phi::errors::InvalidArgument("The boxes_batch_size and imgs "
"batch_size must be the same."));
int rois_num_with_lod = boxes_lod[boxes_batch_size];
PADDLE_ENFORCE_EQ(
rois_num,
rois_num_with_lod,
phi::errors::InvalidArgument("The rois_num from input "
"and lod must be the same."));
for (int n = 0; n < boxes_batch_size; ++n) {
for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) {
box_batch_id_data[i] = n;
}
}
}
T* output_data = dev_ctx.template Alloc<T>(out);
int64_t* arg_max_data = dev_ctx.template Alloc<int64_t>(arg_max);
const T* boxes_data = boxes.data<T>();
for (int n = 0; n < rois_num; ++n) {
int box_batch_id = box_batch_id_data[n];
int box_start_w = round(boxes_data[0] * spatial_scale);
int box_start_h = round(boxes_data[1] * spatial_scale);
int box_end_w = round(boxes_data[2] * spatial_scale);
int box_end_h = round(boxes_data[3] * spatial_scale);
// Force malformed ROIs to be 1x1
int box_height = std::max(box_end_h - box_start_h + 1, 1);
int box_width = std::max(box_end_w - box_start_w + 1, 1);
const float bin_size_h =
static_cast<float>(box_height) / static_cast<float>(pooled_height);
const float bin_size_w =
static_cast<float>(box_width) / static_cast<float>(pooled_width);
const T* batch_data = input_data + box_batch_id * in_stride[0];
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
// Compute pooling region for this output unit:
// start (included) = floor(ph * box_height / pooled_height_)
// end (excluded) = ceil((ph + 1) * box_height / pooled_height_)
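// e.g. box_height = 5, pooled_height = 2: ph = 0 covers rows [0, 3) and
// ph = 1 covers rows [2, 5) of the (offset) box.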
int hstart =
static_cast<int>(floor(static_cast<float>(ph) * bin_size_h));
int wstart =
static_cast<int>(floor(static_cast<float>(pw) * bin_size_w));
int hend =
static_cast<int>(ceil(static_cast<float>(ph + 1) * bin_size_h));
int wend =
static_cast<int>(ceil(static_cast<float>(pw + 1) * bin_size_w));
hstart = std::min(std::max(hstart + box_start_h, 0), height);
hend = std::min(std::max(hend + box_start_h, 0), height);
wstart = std::min(std::max(wstart + box_start_w, 0), width);
wend = std::min(std::max(wend + box_start_w, 0), width);
const int pool_index = ph * pooled_width + pw;
// Define an empty pooling region to be zero
bool is_empty = (hend <= hstart) || (wend <= wstart);
output_data[pool_index] =
is_empty ? 0 : -std::numeric_limits<T>::max();
arg_max_data[pool_index] = -1;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int index = h * width + w;
if (batch_data[index] > output_data[pool_index]) {
output_data[pool_index] = batch_data[index];
arg_max_data[pool_index] = index;
}
}
}
}
}
batch_data += in_stride[1];
output_data += out_stride[1];
arg_max_data += arg_max_stride[1];
}
// Increment ROI data pointer
boxes_data += box_stride[0];
}
}
} // namespace phi
PD_REGISTER_KERNEL(
roi_pool, CPU, ALL_LAYOUT, phi::RoiPoolKernel, float, double, int) {
kernel->OutputAt(1).SetDataType(phi::DataType::INT64);
}
......@@ -37,13 +37,8 @@ void TruncatedGaussianRandomKernel(const Context& dev_ctx,
T* data = dev_ctx.template Alloc<T>(tensor);
auto normal_cdf = [](float x) {
return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0;
};
float a_normal_cdf = normal_cdf((-2.0 - mean) / std);
float b_normal_cdf = normal_cdf((2.0 - mean) / std);
std::uniform_real_distribution<float> dist(2.0 * a_normal_cdf - 1.0,
2.0 * b_normal_cdf - 1.0);
std::uniform_real_distribution<T> dist(std::numeric_limits<float>::min(),
1.0);
TruncatedNormal<T> truncated_normal(mean, std);
int64_t size = tensor->numel();
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/kernels/elementwise_kernel.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/kernel_registry.h"
......
......@@ -15,7 +15,7 @@
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/infermeta/binary.h"
namespace phi {
......@@ -33,4 +33,100 @@ void ElementwiseFMinKernel(const Context& dev_ctx,
int axis,
DenseTensor* out);
template <typename T, typename Context>
void AddRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out);
template <typename T, typename Context>
void AddKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out);
template <typename T, typename Context>
void SubtractRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out);
template <typename T, typename Context>
void SubtractKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out);
template <typename T, typename Context>
void DivideRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out);
template <typename T, typename Context>
void DivideKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out);
template <typename T, typename Context>
void MultiplyRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out);
template <typename T, typename Context>
void MultiplyKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out);
template <typename T, typename Context>
DenseTensor Add(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y) {
DenseTensor dense_out;
MetaTensor meta_out(&dense_out);
ElementwiseInferMeta(x, y, &meta_out);
AddKernel<T, Context>(dev_ctx, x, y, &dense_out);
return dense_out;
}
template <typename T, typename Context>
DenseTensor Subtract(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y) {
DenseTensor dense_out;
MetaTensor meta_out(&dense_out);
ElementwiseInferMeta(x, y, &meta_out);
SubtractKernel<T, Context>(dev_ctx, x, y, &dense_out);
return dense_out;
}
template <typename T, typename Context>
DenseTensor Divide(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y) {
DenseTensor dense_out;
MetaTensor meta_out(&dense_out);
ElementwiseInferMeta(x, y, &meta_out);
DivideKernel<T, Context>(dev_ctx, x, y, &dense_out);
return dense_out;
}
template <typename T, typename Context>
DenseTensor Multiply(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y) {
DenseTensor dense_out;
MetaTensor meta_out(&dense_out);
ElementwiseInferMeta(x, y, &meta_out);
MultiplyKernel<T, Context>(dev_ctx, x, y, &dense_out);
return dense_out;
}
} // namespace phi
......@@ -1367,6 +1367,217 @@ struct SiluGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// sigmoid(x) = 1 / (1 + exp(-x))
template <typename T>
struct SigmoidFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
}
};
template <typename T>
struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * out * (static_cast<T>(1) - out);
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
/*
Out
DOut -> SigmoidGradGrad -> DOutNew
DDX DDOut
DDOut = (1-Out)*Out*DDX
DOutNew = (1-2*Out)*DOut*DDX
*/
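// These follow from the first-order grad dx = dout * out * (1 - out):
// differentiating w.r.t. dout gives DDOut = out * (1 - out) * DDX, and
// differentiating w.r.t. out gives DOutNew = (1 - 2 * out) * dout * DDX.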
template <typename T>
struct SigmoidGradGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
void operator()(const Device& dev,
const DenseTensor* Out,
const DenseTensor* ddX,
const DenseTensor* dOut,
DenseTensor* dOutNew,
DenseTensor* ddOut) const {
auto* d = dev.eigen_device();
auto ddx = EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidGradGrad"));
auto out = EigenVector<T>::Flatten(
GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad"));
if (dOutNew) {
auto dout = EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad"));
auto dout_new = EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad"));
dout_new.device(*d) =
(static_cast<T>(1) - static_cast<T>(2) * out) * dout * ddx;
}
if (ddOut) {
auto ddout = EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad"));
ddout.device(*d) = (static_cast<T>(1) - out) * out * ddx;
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
/*
Out
DOut D_Dout
DDx -> SigmoidTripleGrad -> D_DDx
D_DDout d_OutNew
D_Dout_new
D_Dout = (1-2*Out)*DDx*D_Dout_new
D_DDx = (1-Out)*Out*D_DDout + (1-2*Out)*DOut*D_Dout_new
D_OutNew = (DDx-2*Out*DDx)*D_DDout - 2*DOut*DDx*D_Dout_new
Out, DDX, DOut, D_DDOut, D_DOut_New // input
D_OutNew, D_DOut, D_DDx // output
*/
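// These are obtained by differentiating the double-grad outputs
// DDOut = (1-Out)*Out*DDX and DOutNew = (1-2*Out)*DOut*DDX with respect to
// Out, DOut and DDX, and accumulating the contributions weighted by D_DDOut
// and D_Dout_new respectively.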
template <typename T>
struct SigmoidTripleGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
void operator()(const Device& dev,
const DenseTensor* Out,
const DenseTensor* ddX,
const DenseTensor* dOut,
const DenseTensor* d_DDOut,
const DenseTensor* d_dOut_New,
DenseTensor* d_d_Out,
DenseTensor* d_Out_New,
DenseTensor* d_DDx) const {
auto* d = dev.eigen_device();
auto ddx = EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidTripleGrad"));
auto out = EigenVector<T>::Flatten(
GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidTripleGrad"));
auto dout = EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidTripleGrad"));
auto d_ddOut = EigenVector<T>::Flatten(
GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "SigmoidTripleGrad"));
auto d_dOutNew = EigenVector<T>::Flatten(GET_DATA_SAFELY(
d_dOut_New, "Input", "D_DOut_New", "SigmoidTripleGrad"));
if (d_Out_New) {
auto d_OutNew = EigenVector<T>::Flatten(GET_DATA_SAFELY(
d_Out_New, "Output", "D_OutNew", "SigmoidTripleGrad"));
d_OutNew.device(*d) = (ddx - static_cast<T>(2) * out * ddx) * d_ddOut -
static_cast<T>(2) * dout * ddx * d_dOutNew;
}
if (d_d_Out) {
auto d_dOut = EigenVector<T>::Flatten(
GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "SigmoidTripleGrad"));
d_dOut.device(*d) =
(static_cast<T>(1) - static_cast<T>(2) * out) * ddx * d_dOutNew;
}
if (d_DDx) {
auto d_ddx = EigenVector<T>::Flatten(
GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "SigmoidTripleGrad"));
d_ddx.device(*d) =
(static_cast<T>(1) - out) * out * d_ddOut +
(static_cast<T>(1) - static_cast<T>(2) * out) * dout * d_dOutNew;
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
// Originally: logsigmoid(x) = -log (1 + exp(-x))
// For numerical stability, we can use the log-sum-exp trick:
// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
// We can rewrite the above equation as:
// out = -log( exp(0) + exp(-x)) [since exp(0) = 1]
// = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0)))
//     = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) + exp(max(-x, 0)) * exp(-x -
//     max(-x, 0)))
//     = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
//     = -log(exp(max(-x, 0))) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))
//
// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0))
// + exp(-x - max(-x, 0))))
template <typename T>
struct LogSigmoidFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
auto temp = (-x).cwiseMax(static_cast<T>(0)); // temp = max(-x, 0)
out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log());
}
};
// Originally: f' = exp(-x) / (1 + exp(-x))
// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) +
// exp(-x - max(-x, 0)))
template <typename T>
struct LogSigmoidGradFunctor : public BaseActivationFunctor<T> {
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp = (-x).cwiseMax(static_cast<T>(0)); // temp = max(-x, 0)
dx.device(d) =
dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp()));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
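// Aside (illustrative sketch, not part of this file): the max(-x, 0) shift used
// by LogSigmoidFunctor is what keeps the computation finite for large |x|. The
// standalone comparison below contrasts the naive formula with the shifted
// one; names are local to the sketch.
#include <algorithm>
#include <cmath>
#include <cstdio>

static double LogSigmoidNaive(double x) { return -std::log(1.0 + std::exp(-x)); }

static double LogSigmoidStable(double x) {
  const double temp = std::max(-x, 0.0);  // same shift as the functor
  return -temp - std::log(std::exp(-temp) + std::exp(-x - temp));
}

int main() {
  const double x = -800.0;  // exp(-x) overflows in double precision
  // Naive: exp(800) -> inf, so the result degenerates to -inf.
  // Stable: returns approximately x, the correct asymptote.
  std::printf("naive=%g stable=%g\n", LogSigmoidNaive(x), LogSigmoidStable(x));
  return 0;
}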
template <typename T>
struct HardSigmoidFunctor : public BaseActivationFunctor<T> {
float slope;
float offset;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"slope", &slope}, {"offset", &offset}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
auto temp = x * static_cast<T>(slope) + static_cast<T>(offset);
out.device(d) =
temp.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(1));
}
};
template <typename T>
struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
float slope;
float offset;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"slope", &slope}, {"offset", &offset}};
}
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout *
((out > static_cast<T>(0)) * (out < static_cast<T>(1)))
.template cast<T>() *
static_cast<T>(slope);
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
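// Aside (illustrative sketch, not part of this file): HardSigmoidFunctor is a
// clamp of the affine map x*slope + offset to [0, 1], and its gradient is
// slope strictly inside that interval and 0 in the saturated regions. The
// standalone snippet below mirrors that logic on the host; slope = 1/6 and
// offset = 0.5 are example attribute values, not values taken from this file.
#include <cstdio>

static double HardSigmoidRef(double x, double slope, double offset) {
  const double t = x * slope + offset;
  return t < 0.0 ? 0.0 : (t > 1.0 ? 1.0 : t);  // clamp to [0, 1]
}

static double HardSigmoidGradRef(double out, double dout, double slope) {
  // Matches HardSigmoidGradFunctor: nonzero only where 0 < out < 1.
  return (out > 0.0 && out < 1.0) ? dout * slope : 0.0;
}

int main() {
  const double slope = 1.0 / 6.0, offset = 0.5, dout = 1.0;
  const double xs[] = {-4.0, 0.0, 4.0};
  for (double x : xs) {
    const double out = HardSigmoidRef(x, slope, offset);
    std::printf("x=%+.1f out=%.4f dx=%.4f\n",
                x, out, HardSigmoidGradRef(out, dout, slope));
  }
  return 0;
}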
#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
template <typename T>
struct CudaReluFunctor : public BaseActivationFunctor<T> {
......@@ -2304,6 +2515,112 @@ struct CudaSiluGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaSigmoidFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
MPType one = static_cast<MPType>(1.0f);
// sigmoid(x) = 1 / (1 + exp(-x))
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(one / (one + exp(-x)));
}
};
template <typename T>
struct CudaSigmoidGradFunctor : public BaseActivationFunctor<T> {
T one = static_cast<T>(1.0f);
// dx = dout * out * (1 - out)
__device__ __forceinline__ T operator()(const T dout, const T out) const {
return dout * out * (one - out);
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct CudaLogSigmoidFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
MPType zero = static_cast<MPType>(0.0f);
// logsigmoid(x) = log(1 / (1 + exp(-x)))
// For numerical stability,
// logsigmoid(x) =
// - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
MPType temp = x > zero ? zero : -x;
return static_cast<T>(-temp - log(exp(-temp) + exp(-x - temp)));
}
};
template <typename T>
struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
MPType zero = static_cast<MPType>(0.0f);
// dx = dout * exp(-x) / (1 + exp(-x))
// For numerical stability:
// dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x,
// 0)))
__device__ __forceinline__ T operator()(const T arg_dout,
const T arg_x) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType x = static_cast<MPType>(arg_x);
MPType temp1 = x > zero ? zero : -x;
MPType temp2 = exp(-x - temp1);
return static_cast<T>(dout * (temp2 / (exp(-temp1) + temp2)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaHardSigmoidFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
T one = static_cast<T>(1.0f);
float slope;
float offset;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"slope", &slope}, {"offset", &offset}};
}
  // hard_sigmoid(x) = clamp(x * slope + offset, 0, 1); with the default
  // slope = 1/6 and offset = 0.5 this is
  //   0,                  when x <= -3
  //   1,                  when x >= 3
  //   x * slope + offset, otherwise
__device__ __forceinline__ T operator()(const T x) const {
T temp = x * static_cast<T>(slope) + static_cast<T>(offset);
T temp_max = temp > zero ? temp : zero;
T temp_min = temp_max < one ? temp_max : one;
return temp_min;
}
};
template <typename T>
struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
T one = static_cast<T>(1.0f);
float slope;
float offset;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"slope", &slope}, {"offset", &offset}};
}
// dx = (out > 0 && out < 1) ? dout * slope : 0
__device__ __forceinline__ T operator()(const T dout, const T out) const {
return (out > zero && out < one) ? dout * static_cast<T>(slope) : zero;
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
#endif
} // namespace funcs
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
namespace funcs {
// Wrap RowwiseMean and ColwiseMean.
// Reuse the CPU code and replace the GPU code with cublas_gemv, which is
// significantly faster. Unlike RowwiseMean and ColwiseMean, this
// implementation only considers the 2-D case.
template <typename DeviceContext, typename T>
struct RowwiseMean2D {
RowwiseMean2D(int left, int right, const DeviceContext& dev_ctx);
void operator()(const DeviceContext& context,
const DenseTensor& input,
DenseTensor* vec);
};
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename T>
class RowwiseMean2D<phi::GPUContext, T> {
public:
RowwiseMean2D(int left, int right, const DeviceContext& dev_ctx)
: left_(left), right_(right) {
DDim ones_dim({right_});
divisor_.Resize(ones_dim);
dev_ctx.template Alloc<T>(&divisor_);
phi::funcs::set_constant(dev_ctx, &divisor_, 1.0 / right);
}
void operator()(const phi::GPUContext& context,
const DenseTensor& input,
DenseTensor* out) {
phi::funcs::GetBlas<phi::GPUContext, T>(context).GEMV(false,
left_,
right_,
1.,
input.data<T>(),
divisor_.data<T>(),
0.,
out->data<T>());
}
private:
int left_;
int right_;
DenseTensor divisor_;
};
#endif
template <typename T>
class RowwiseMean2D<phi::CPUContext, T> {
public:
RowwiseMean2D(int left, int right, const DeviceContext& dev_ctx) {}
void operator()(const phi::CPUContext& context,
const DenseTensor& input,
DenseTensor* out) {
row_mean_(context, input, out);
}
private:
phi::funcs::RowwiseMean<phi::CPUContext, T> row_mean_;
};
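// Aside (illustrative sketch, not part of this file): the GPU specialization
// above turns a row-wise mean into a single matrix-vector product, y = A * v,
// where v is filled with 1/ncols (the "divisor_" tensor). The plain-C++ loop
// below spells out that identity without BLAS; all names are local to the
// sketch.
#include <cstdio>
#include <vector>

int main() {
  const int rows = 2, cols = 3;
  const std::vector<double> a = {1, 2, 3,    // row 0
                                 4, 5, 6};   // row 1
  const std::vector<double> divisor(cols, 1.0 / cols);  // stand-in for divisor_
  for (int i = 0; i < rows; ++i) {
    double mean = 0.0;
    for (int j = 0; j < cols; ++j) {
      mean += a[i * cols + j] * divisor[j];  // GEMV: row of A dotted with divisor
    }
    std::printf("row %d mean = %.4f\n", i, mean);  // 2.0 and 5.0
  }
  return 0;
}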
template <typename DeviceContext, typename T>
struct ColwiseSum2D {
ColwiseSum2D(int left, int right, const DeviceContext& dev_ctx);
void operator()(const phi::DeviceContext& context,
const DenseTensor& input,
DenseTensor* vec);
};
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename T>
class ColwiseSum2D<phi::GPUContext, T> {
public:
ColwiseSum2D(int left, int right, const phi::GPUContext& dev_ctx)
: left_(left), right_(right) {
DDim ones_dim({left_});
divisor_.Resize(ones_dim);
dev_ctx.template Alloc<T>(&divisor_);
phi::funcs::set_constant(dev_ctx, &divisor_, 1.0);
}
void operator()(const phi::GPUContext& context,
const DenseTensor& input,
DenseTensor* out) {
phi::funcs::GetBlas<phi::GPUContext, T>(context).GEMV(true,
left_,
right_,
1.,
input.data<T>(),
divisor_.data<T>(),
0.,
out->data<T>());
}
private:
int left_;
int right_;
DenseTensor divisor_;
};
#endif
template <typename T>
class ColwiseSum2D<phi::CPUContext, T> {
public:
ColwiseSum2D(int left, int right, const phi::CPUContext& dev_ctx) {}
void operator()(const phi::CPUContext& context,
const DenseTensor& input,
DenseTensor* out) {
col_wise_(context, input, out);
}
private:
phi::funcs::ColwiseSum<phi::CPUContext, T> col_wise_;
};
template <typename T>
struct SubAndSquareFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); }
};
template <typename T>
struct DivAndSqrtFunctor {
explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; }
inline HOSTDEVICE T operator()(T a, T b) const {
return a / (sqrt(b + epsilon_));
}
private:
T epsilon_;
};
template <typename T>
struct MulInvVarFunctor {
inline HOSTDEVICE T operator()(T a, T b) const {
return a * std::sqrt(1.0 / b);
}
};
} // namespace funcs
} // namespace phi
......@@ -331,12 +331,20 @@ template struct ColwiseSum<paddle::platform::CPUDeviceContext, double>;
template struct ColwiseSum<paddle::platform::CPUDeviceContext, int>;
template struct ColwiseSum<paddle::platform::CPUDeviceContext, int64_t>;
template struct ColwiseSum<phi::CPUContext, float>;
template struct ColwiseSum<phi::CPUContext, double>;
template struct ColwiseSum<phi::CPUContext, int>;
template struct ColwiseSum<phi::CPUContext, int64_t>;
template struct RowwiseSum<paddle::platform::CPUDeviceContext, float>;
template struct RowwiseSum<paddle::platform::CPUDeviceContext, double>;
template struct RowwiseMean<paddle::platform::CPUDeviceContext, float>;
template struct RowwiseMean<paddle::platform::CPUDeviceContext, double>;
template struct RowwiseMean<phi::CPUContext, float>;
template struct RowwiseMean<phi::CPUContext, double>;
template <typename T>
struct ElementwiseAddTo<paddle::platform::CPUDeviceContext, T> {
void operator()(paddle::platform::CPUDeviceContext* ctx,
......
......@@ -14,8 +14,8 @@
#pragma once
// CUDA and HIP use same api
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// CUDA, XPU and HIP use same api
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(__xpu__)
#include <algorithm>
#include <cmath>
......@@ -220,7 +220,7 @@ struct IndexCalculator {
phi::Array<int, kMaxRank> dims;
phi::Array<int, kMaxRank> strides;
phi::Array<int, kMaxRank> reduce_strides;
#ifndef PADDLE_WITH_XPU2
#ifndef PADDLE_WITH_XPU_KP
phi::Array<paddle::platform::FastDivMod, kMaxRank> divmoders;
#endif
};
......@@ -231,81 +231,65 @@ struct ReduceIndexMapping {
HOSTDEVICE explicit ReduceIndexMapping(const kps::DimConfig& dims)
: dim(dims) {}
#ifdef PADDLE_WITH_XPU_KP
__device__ __forceinline__ int BlockIdX() {
#ifdef PADDLE_WITH_XPU2
if (ReduceLastDim) {
return (cluster_id() / dim.split_num_x % dim.split_num_y);
} else {
return cluster_id() % dim.split_num_x;
}
#else
return blockIdx.x;
#endif
}
__device__ __forceinline__ int BlockIdY() {
#ifdef PADDLE_WITH_XPU2
if (ReduceLastDim) {
return (cluster_id() % dim.split_num_x);
} else {
return (cluster_id() / dim.split_num_x % dim.split_num_y);
}
#else
return blockIdx.y;
#endif
}
__device__ __forceinline__ int BlockDimX() {
#ifdef PADDLE_WITH_XPU2
return dim.deal_size_x;
#else
return blockDim.x;
#endif
}
__device__ __forceinline__ int BlockDimX() { return dim.deal_size_x; }
__device__ __forceinline__ int BlockDimY() {
#ifdef PADDLE_WITH_XPU2
return 1;
#else
return blockDim.y;
#endif
}
__device__ __forceinline__ int BlockDimY() { return 1; }
__device__ __forceinline__ int GridDimX() {
#ifdef PADDLE_WITH_XPU2
if (ReduceLastDim) {
return dim.split_num_y;
} else {
return dim.split_num_x;
}
#else
return gridDim.x;
#endif
}
__device__ __forceinline__ int GridDimY() {
#ifdef PADDLE_WITH_XPU2
if (ReduceLastDim) {
return dim.split_num_x;
} else {
return dim.split_num_y;
}
#else
return gridDim.y;
#endif
}
__device__ __forceinline__ int GetLoopSize() {
#ifdef PADDLE_WITH_XPU2
if (ReduceLastDim) {
return dim.deal_size_y;
} else {
return dim.deal_size_x;
}
}
#else
return 1;
__device__ __forceinline__ int BlockIdX() { return blockIdx.x; }
__device__ __forceinline__ int BlockIdY() { return blockIdx.y; }
__device__ __forceinline__ int BlockDimX() { return blockDim.x; }
__device__ __forceinline__ int BlockDimY() { return blockDim.y; }
__device__ __forceinline__ int GridDimX() { return gridDim.x; }
__device__ __forceinline__ int GridDimY() { return gridDim.y; }
__device__ int GetLoopSize() { return 1; }
#endif
}
};
// when reduce_type == kReduceLastDim this struct will be used
......@@ -341,7 +325,7 @@ struct ReduceConfig {
  // when should_reduce_again is true, we need to malloc temp space for temp data
void SetOutputData(Ty* y_data,
const phi::GPUContext& dev_ctx,
const KPDevice& dev_ctx,
phi::DenseTensor* tmp) {
if (should_reduce_again) {
tmp->Resize(phi::make_ddim(
......@@ -640,9 +624,7 @@ struct ReduceConfig {
int blocking_size;
bool should_reduce_again;
bool reduce_last_dim;
Ty* output_data;
dim3 block;
dim3 grid;
};
......@@ -770,9 +752,10 @@ __global__ void ReduceAnyKernel(const Tx* x,
kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::kGlobalMode>(
&reduce_var, &reduce_var, reducer, reduce_last_dim);
if (need_store) {
y[store_offset + i] = static_cast<Ty>(reduce_var);
}
Ty result = static_cast<Ty>(reduce_var);
kps::details::WriteData<Ty>(
y + store_offset + i, &result, static_cast<int>(need_store));
}
}
......@@ -882,30 +865,18 @@ static void LaunchReduceKernel(const Tx* x_data,
dim.SetRem(config.reduce_num % config.block.x, 0, 0);
#ifdef PADDLE_WITH_XPU_KP
ReduceAnyKernel<Tx,
Ty,
MPType,
ReduceOp,
TransformOp,
OneDimIndexCal><<<8, 64, 0, stream>>>(
x_data,
config.output_data,
reducer,
transform,
init,
config.reduce_num,
config.left_num,
config.reduce_last_dim,
reduce_index_calculator,
left_index_calculator,
dim);
auto grid_num = 8;
auto block_num = 64;
#else
auto grid_num = config.grid;
auto block_num = config.block;
#endif
ReduceAnyKernel<Tx,
Ty,
MPType,
ReduceOp,
TransformOp,
OneDimIndexCal><<<config.grid, config.block, 0, stream>>>(
OneDimIndexCal><<<grid_num, block_num, 0, stream>>>(
x_data,
config.output_data,
reducer,
......@@ -917,7 +888,6 @@ static void LaunchReduceKernel(const Tx* x_data,
reduce_index_calculator,
left_index_calculator,
dim);
#endif
} else {
int reduce_rank = config.reduce_strides.size();
......@@ -938,30 +908,18 @@ static void LaunchReduceKernel(const Tx* x_data,
dim.SetRem(config.reduce_num % config.block.x, 0, 0);
#ifdef PADDLE_WITH_XPU_KP
ReduceAnyKernel<Tx,
Ty,
MPType,
ReduceOp,
TransformOp,
IndexCalculator><<<8, 64, 0, stream>>>(
x_data,
config.output_data,
reducer,
transform,
init,
config.reduce_num,
config.left_num,
config.reduce_last_dim,
reduce_index_calculator,
left_index_calculator,
dim);
auto grid_num = 8;
auto block_num = 64;
#else
auto grid_num = config.grid;
auto block_num = config.block;
#endif
ReduceAnyKernel<Tx,
Ty,
MPType,
ReduceOp,
TransformOp,
IndexCalculator><<<config.grid, config.block, 0, stream>>>(
IndexCalculator><<<grid_num, block_num, 0, stream>>>(
x_data,
config.output_data,
reducer,
......@@ -973,7 +931,6 @@ static void LaunchReduceKernel(const Tx* x_data,
reduce_index_calculator,
left_index_calculator,
dim);
#endif
}
if (config.should_reduce_again) {
......@@ -993,22 +950,9 @@ static void LaunchReduceKernel(const Tx* x_data,
kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
dim.SetRem(config.left_num % block.x, 0, 0);
#ifdef PADDLE_WITH_XPU_KP
ReduceHigherDimKernel<
Ty,
Ty,
MPType,
ReduceOp,
kps::IdentityFunctor<Ty, MPType>><<<8, 64, 0, stream>>>(
config.output_data,
y_data,
reducer,
kps::IdentityFunctor<Ty, MPType>(),
init,
config.grid.y,
config.left_num,
config.grid.y,
dim);
#else
grid = 8;
block = 64;
#endif
ReduceHigherDimKernel<
Ty,
Ty,
......@@ -1024,7 +968,6 @@ static void LaunchReduceKernel(const Tx* x_data,
config.left_num,
config.grid.y,
dim);
#endif
}
}
......@@ -1038,7 +981,7 @@ CubTensorReduceImpl(const Tx* x_data,
Ty* y_data,
const TransformOp& transform,
int reduce_num,
const phi::GPUContext& dev_ctx,
const KPDevice& dev_ctx,
KPStream stream) {
auto reducer = ReduceOp<Ty>();
cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(x_data,
......@@ -1077,7 +1020,7 @@ CubTensorReduceImpl(const Tx* x_data,
Ty* y_data,
const TransformOp& transform,
int reduce_num,
const phi::GPUContext& dev_ctx,
const KPDevice& dev_ctx,
KPStream stream) {
PADDLE_THROW(phi::errors::InvalidArgument(
"Tx should not be float16 when using cub::DeviceReduce::Reduce()."));
......@@ -1087,12 +1030,16 @@ template <typename Tx,
typename Ty,
template <typename> class ReduceOp,
typename TransformOp>
void ReduceKernel(const phi::GPUContext& dev_ctx,
void ReduceKernel(const KPDevice& dev_ctx,
const phi::DenseTensor& x,
phi::DenseTensor* y,
const TransformOp& transform,
const std::vector<int>& origin_reduce_dims) {
#ifdef PADDLE_WITH_XPU_KP
auto stream = dev_ctx.x_context()->xpu_stream;
#else
auto stream = dev_ctx.stream();
#endif
dev_ctx.Alloc<Ty>(y);
auto x_dim = phi::vectorize<int>(x.dims());
......@@ -1149,11 +1096,17 @@ void ReduceKernel(const phi::GPUContext& dev_ctx,
0);
#ifdef PADDLE_WITH_XPU_KP
auto grid_num = 8;
auto block_num = 64;
#else
auto grid_num = config.grid;
auto block_num = config.block;
#endif
ReduceHigherDimKernel<Tx,
Ty,
MPType,
ReduceOp<MPType>,
TransformOp><<<8, 64, 0, stream>>>(
TransformOp><<<grid_num, block_num, 0, stream>>>(
x_data,
config.output_data,
reducer,
......@@ -1163,23 +1116,6 @@ void ReduceKernel(const phi::GPUContext& dev_ctx,
config.left_num,
config.blocking_size,
dim);
#else
ReduceHigherDimKernel<
Tx,
Ty,
MPType,
ReduceOp<MPType>,
TransformOp><<<config.grid, config.block, 0, stream>>>(
x_data,
config.output_data,
reducer,
transform,
reducer.initial(),
config.reduce_num,
config.left_num,
config.blocking_size,
dim);
#endif
if (config.should_reduce_again) {
dim3 block = dim3(config.block.x, 1, 1);
......@@ -1189,22 +1125,9 @@ void ReduceKernel(const phi::GPUContext& dev_ctx,
dim2.SetRem(config.left_num % config.block.x, 0, 0);
#ifdef PADDLE_WITH_XPU_KP
ReduceHigherDimKernel<
Ty,
Ty,
MPType,
ReduceOp<MPType>,
kps::IdentityFunctor<Ty, MPType>><<<8, 64, 0, stream>>>(
config.output_data,
y_data,
reducer,
kps::IdentityFunctor<Ty, MPType>(config.grid.y),
reducer.initial(),
config.grid.y,
config.left_num,
config.grid.y,
dim2);
#else
grid = 8;
block = 64;
#endif
ReduceHigherDimKernel<
Ty,
Ty,
......@@ -1220,7 +1143,6 @@ void ReduceKernel(const phi::GPUContext& dev_ctx,
config.left_num,
config.grid.y,
dim2);
#endif
}
return;
}
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
// CUDA and HIP use same api
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef __NVCC__
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#include <algorithm>
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
namespace kps = phi::kps;
namespace phi {
namespace funcs {
using Mode = kps::details::ReduceMode;
/*
 * Count how many of the elements processed by the current block are true
 * 1. Load data from global memory and cast from bool to int64_t
 * 2. Get this thread's result via a thread-level reduce
 * 3. Get this block's result via a block-level reduce
 * 4. Each block stores its count; the first block also stores a leading 0
*/
template <typename T>
struct NonZeroFunctor {
HOSTDEVICE NonZeroFunctor() {}
HOSTDEVICE inline T operator()(const T in) {
if (in) {
return static_cast<T>(1);
} else {
return static_cast<T>(0);
}
}
};
template <typename InT, typename OutT, int VecSize, int IsBoundary>
__device__ void GetBlockCountImpl(const InT *in,
OutT *out,
int num,
int repeat) {
InT in_data[VecSize];
OutT temp[VecSize];
OutT result = static_cast<OutT>(0.0f);
using Add = kps::AddFunctor<OutT>;
using Cast = NonZeroFunctor<InT>;
int store_fix = BLOCK_ID_X + repeat * GRID_NUM_X;
kps::Init<InT, VecSize>(&in_data[0], static_cast<InT>(0.0f));
kps::ReadData<InT, VecSize, 1, 1, IsBoundary>(&in_data[0], in, num);
kps::ElementwiseUnary<InT, OutT, VecSize, 1, 1, Cast>(
&temp[0], &in_data[0], Cast());
kps::Reduce<OutT, VecSize, 1, 1, Add, Mode::kLocalMode>(
&result, &temp[0], Add(), true);
kps::Reduce<OutT, 1, 1, 1, Add, Mode::kGlobalMode>(
&result, &result, Add(), true);
if (store_fix == 0) {
// first block's fix_size = 0;
OutT tmp = static_cast<OutT>(0.0f);
kps::WriteData<OutT, 1, 1, 1, true>(out + store_fix, &tmp, 1);
}
// store num of this block
kps::WriteData<OutT, 1, 1, 1, true>(out + store_fix + 1, &result, 1);
}
// Count how many elements are non-zero in each block
template <typename InT, typename OutT, int VecSize>
__global__ void GetBlockCountKernel(const InT *in,
OutT *out,
int64_t numel,
int64_t main_offset) {
int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize;
int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize;
int repeat = 0;
for (; data_offset < main_offset; data_offset += stride) {
GetBlockCountImpl<InT, OutT, VecSize, false>(
in + data_offset, out, BLOCK_NUM_X * VecSize, repeat);
repeat++; // to get the real blockIdx
}
int num = numel - data_offset;
if (num > 0) {
GetBlockCountImpl<InT, OutT, VecSize, true>(
in + data_offset, out, num, repeat);
}
}
/*
 * Get the prefix sum of the block counts using one block; VecSize must be 2
 * 1. Each thread loads 2 elements: threadIdx.x and threadIdx.x + blockDim.x
 * 2. Cumsum limitation: blockDim.x must be less than 512
*/
template <typename InT,
typename OutT,
typename Functor,
int VecSize,
bool IsBoundary>
__device__ void CumsumImpl(
const InT *in, OutT *out, OutT *pre_cumsum, int num, Functor func) {
__shared__ OutT max_thread_data;
OutT temp[VecSize];
InT arg[VecSize];
OutT result[VecSize];
// init data_pr
kps::Init<InT, VecSize>(&arg[0], static_cast<InT>(0.0f));
// set pre_cumsum
kps::Init<OutT, VecSize>(&temp[0], *pre_cumsum);
// load data to arg
kps::ReadData<InT, InT, VecSize, 1, 1, IsBoundary>(
&arg[0], in, num, 1, BLOCK_NUM_X, 1);
// block cumsum
kps::Cumsum<InT, OutT, 1, Functor>(&result[0], &arg[0], func);
// result = cumsum_result + pre_cumsum
kps::ElementwiseBinary<OutT, OutT, VecSize, 1, 1, Functor>(
&result[0], &result[0], &temp[0], func);
// get the last prefix sum
if ((THREAD_ID_X == BLOCK_NUM_X - 1) && !IsBoundary) {
max_thread_data = result[VecSize - 1];
}
__syncthreads();
// update pre_cumsum
*pre_cumsum = max_thread_data;
kps::WriteData<OutT, OutT, VecSize, 1, 1, IsBoundary>(
out, &result[0], num, 1, BLOCK_NUM_X, 1);
}
// Compute the store_offset of each block
template <typename InT, typename OutT, typename Functor, int VecSize>
__global__ void CumsumOneBlock(
const InT *in, OutT *out, int numel, int main_offset, Functor func) {
int stride = BLOCK_NUM_X * VecSize;
int offset = 0;
OutT pre_cumsum = static_cast<OutT>(0);
for (; offset < main_offset; offset += stride) {
CumsumImpl<InT, OutT, Functor, VecSize, false>(
in + offset, out + offset, &pre_cumsum, BLOCK_NUM_X * VecSize, func);
}
int num = numel - offset;
if (num > 0) {
CumsumImpl<InT, OutT, Functor, VecSize, true>(
in + offset, out + offset, &pre_cumsum, num, func);
}
}
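// Aside (illustrative sketch, not part of this file): CumsumOneBlock walks the
// input in fixed-size chunks and carries the running total (pre_cumsum) from
// one chunk to the next, which is how a single block produces an inclusive
// prefix sum over an arbitrarily long array. The host-side sketch below does
// the same thing sequentially; names are local to the sketch.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<long long> counts = {3, 0, 2, 5, 1, 4, 2};
  const int chunk = 3;  // stand-in for BLOCK_NUM_X * VecSize
  std::vector<long long> prefix(counts.size());
  long long pre_cumsum = 0;  // carried across chunks, like the kernel's pre_cumsum
  for (size_t start = 0; start < counts.size(); start += chunk) {
    long long running = pre_cumsum;
    const size_t end = std::min(counts.size(), start + chunk);
    for (size_t i = start; i < end; ++i) {
      running += counts[i];
      prefix[i] = running;  // inclusive prefix sum
    }
    pre_cumsum = running;  // update the carry for the next chunk
  }
  for (long long v : prefix) std::printf("%lld ", v);  // 3 3 5 10 11 15 17
  std::printf("\n");
  return 0;
}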
template <typename OutT,
typename MT,
typename InT,
typename IdT,
typename Functor,
int VecSize,
int IsBoundary,
int IsMaskData>
struct SelectCaller {
__device__ void inline operator()(OutT *store_data,
const MT *mask_data,
const InT *in,
Functor func,
int num,
int data_offset) {
// where_index op
IdT index_reg[VecSize];
// Set data index of global
kps::InitWithDataIndex<IdT, VecSize, 1, 1>(&index_reg[0], data_offset);
// Get store data according to mask_idt
kps::OperatorTernary<MT, IdT, OutT, Functor>(
store_data, mask_data, &index_reg[0], func, VecSize);
}
};
template <typename OutT,
typename MT,
typename InT,
typename IdT,
typename Functor,
int VecSize,
int IsBoundary>
struct SelectCaller<OutT,
MT,
InT,
IdT,
Functor,
VecSize,
IsBoundary,
1> { // masked_select
__device__ void inline operator()(OutT *store_data,
const MT *mask_data,
const InT *in,
Functor func,
int num,
int data_offset) {
InT in_data[VecSize];
kps::ReadData<InT, VecSize, 1, 1, IsBoundary>(&in_data[0], in, num);
// Get store data according to mask_idt
kps::OperatorTernary<MT, InT, OutT, Functor>(
store_data, mask_data, &in_data[0], func, VecSize);
}
};
/**
 * Get the output (index or value) of each element whose mask is true
*/
template <typename InT,
typename MT,
typename OutT,
typename Functor,
int VecSize,
int MaskData,
int IsBoundary> // SelectType = 1 Mask_select else where_index
__device__ void
SelectKernelImpl(OutT *out,
const MT *mask,
const InT *in,
Functor func,
int num,
int data_offset,
int store_rank) {
const int kCVecSize = 2;
  // each thread cumsums 2 elements
using IdT = int64_t;
// Set index data type
using Add = kps::AddFunctor<IdT>; // for cumsum
using Cast = NonZeroFunctor<InT>; // for mask
IdT init_idx = static_cast<IdT>(0.0f);
MT init_mask = static_cast<MT>(0.0f);
IdT num_thread[kCVecSize];
IdT cumsum_thread[kCVecSize];
OutT store_data[VecSize * phi::DDim::kMaxRank];
MT mask_data[VecSize];
IdT mask_idt[VecSize];
// init data_pr
kps::Init<IdT, kCVecSize>(&cumsum_thread[0], init_idx);
kps::Init<IdT, kCVecSize>(&num_thread[0], init_idx);
kps::Init<MT, VecSize>(&mask_data[0], init_mask);
// Load mask
kps::ReadData<MT, VecSize, 1, 1, IsBoundary>(&mask_data[0], mask, num);
// Cast from MT to int
kps::ElementwiseUnary<MT, IdT, VecSize, 1, 1, Cast>(
&mask_idt[0], &mask_data[0], Cast());
  // Get the per-thread count; only num_thread[1] has data
kps::Reduce<IdT, VecSize, 1, 1, Add, Mode::kLocalMode>(
&num_thread[0], &mask_idt[0], Add(), true);
  // Get cumsum_thread, the cumulative sum of num_thread; cumsum_thread[0] is
  // the thread_fix
kps::Cumsum<IdT, IdT, 1, Add>(&cumsum_thread[0], &num_thread[0], Add());
// Get store data(index) according to mask_idt
SelectCaller<OutT, MT, InT, IdT, Functor, VecSize, IsBoundary, MaskData>
compute;
compute(&store_data[0], &mask_data[0], in, func, num, data_offset);
// get thread_fix
int thread_fix =
(static_cast<int>(cumsum_thread[0] - num_thread[0]) * store_rank);
  // get how many elements need to be stored
int store_num = static_cast<int>(num_thread[0]) * store_rank;
  // each thread stores store_num elements; the count may differ per thread
kps::details::WriteData<OutT>(out + thread_fix, &store_data[0], store_num);
}
template <typename MT,
typename InT,
typename CT,
typename OutT,
typename Functor,
int VecSize,
int MaskData>
__global__ void SelectKernel(OutT *out,
const MT *mask,
const InT *in,
CT *cumsum,
Functor func,
const int64_t numel,
int64_t main_offset,
int store_rank) {
int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize;
int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize;
int repeat = 0;
int size = VecSize * BLOCK_ID_X;
for (; data_offset < main_offset; data_offset += stride) {
// Cumsum index
int idx_cumsum = repeat * GRID_NUM_X + BLOCK_ID_X;
    // niuliling todo: use ReadData API
int block_store_offset = cumsum[idx_cumsum];
SelectKernelImpl<InT, MT, OutT, Functor, VecSize, MaskData, false>(
out + block_store_offset * store_rank,
mask + data_offset,
in + data_offset,
func,
size,
data_offset,
store_rank);
repeat++;
}
int num = numel - data_offset;
if (num > 0) {
// Cumsum index
int idx_cumsum = repeat * GRID_NUM_X + BLOCK_ID_X;
    // niuliling todo: use ReadData API
int block_store_offset = static_cast<int>(cumsum[idx_cumsum]);
SelectKernelImpl<InT, MT, OutT, Functor, VecSize, MaskData, true>(
out + block_store_offset * store_rank,
mask + data_offset,
in + data_offset,
func,
num,
data_offset,
store_rank);
}
}
inline int64_t Floor(int64_t in, int64_t div) { return in / div * div; }
// SelectData = 1 then masked_select; SelectData = 0 then where_index
template <typename MT,
typename InT,
typename OutT,
int SelectData,
typename Functor>
void SelectKernel(const KPDevice &dev_ctx,
const DenseTensor &condition,
const DenseTensor &in_data,
DenseTensor *out,
Functor func) {
const MT *cond_data = condition.data<MT>();
const int64_t numel = condition.numel();
auto dims = condition.dims();
int rank = SelectData ? 1 : dims.size();
const InT *in_data_ptr = SelectData ? in_data.data<InT>() : nullptr;
// calculate the inclusive prefix sum of "true_num_array"
// to get the index of "out" tensor,
// and the total number of cond_data[i]==true.
// Example:
// condition: F T T F F F T T
// before: 0 1 1 0 0 0 1 1
// after: 0 1 2 2 2 2 3 4
// out: 1 2 6 7
// alloc for cpu
using CT = int64_t; // set Count_data Type
const int t_size = sizeof(CT);
const paddle::platform::CUDAPlace &cuda_place = dev_ctx.GetPlace();
paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace();
  // 1.1 get the number of stored elements per block
int total_true_num = 0; // init
const int kVecSize = 4;
#ifdef PADDLE_WITH_XPU_KP
int block = 64;
auto stream = dev_ctx.x_context()->xpu_stream;
const int num_per_block = kVecSize * block;
const int need_grids = (numel + num_per_block - 1) / num_per_block;
const int grid = std::min(need_grids, 8);
#else
const int block = 256;
const int num_per_block = kVecSize * block;
const int need_grids = (numel + num_per_block - 1) / num_per_block;
const int grid = std::min(need_grids, 256);
auto stream = dev_ctx.stream();
#endif
const int64_t main_offset = Floor(numel, num_per_block);
  // 1.2 alloc tmp data for the per-block counts
const int size_count_block = need_grids + 1;
std::vector<int> dims_vec = {size_count_block * 2};
ScalarArray dims_array(dims_vec);
DenseTensor count_mem = phi::Empty<CT, KPDevice>(dev_ctx, dims_array);
CT *count_data = count_mem.data<CT>();
  // 1.3 launch GetBlockCountKernel
GetBlockCountKernel<MT, CT, kVecSize><<<grid, block, 0, stream>>>(
cond_data, count_data, numel, main_offset);
  // 2.1 alloc cumsum data for the per-block count prefix sum
DenseTensor cumsum_mem = phi::Empty<CT, KPDevice>(dev_ctx, dims_array);
CT *cumsum_data = cumsum_mem.data<CT>();
// 2.2 get prefix of count_data for real out_index
const int kCumVesize = 2;
const int block_c = 256;
const int main_offset_c = Floor(size_count_block, (kCumVesize * block_c));
using Add = kps::AddFunctor<CT>;
CumsumOneBlock<CT, CT, Add, kCumVesize><<<1, block_c, 0, stream>>>(
count_data, cumsum_data, size_count_block, main_offset_c, Add());
  // 3.1 set temp ptr for in and alloc for out
  // 3.1.1 get true_num for the gpu place; the last cumsum entry is the true_num
paddle::memory::Copy(cpu_place,
&total_true_num,
cuda_place,
cumsum_data + need_grids,
t_size,
dev_ctx.stream());
dev_ctx.Wait();
  // 3.1.2 alloc for out with total_true_num
std::vector<int64_t> out_dim = {static_cast<int64_t>(total_true_num)};
if (SelectData == 0) { // where_index
out_dim.push_back(rank);
}
out->Resize(phi::make_ddim(out_dim));
auto out_data = out->mutable_data<OutT>(cuda_place);
// 3.2 get true data's index according to cond_data and cumsum_data
if (total_true_num <= 0) return;
SelectKernel<MT,
InT,
CT,
OutT,
Functor,
kVecSize,
SelectData><<<grid, block, 0, stream>>>(out_data,
cond_data,
in_data_ptr,
cumsum_data,
func,
numel,
main_offset,
rank);
}
} // namespace funcs
} // namespace phi
#endif
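// Aside (illustrative sketch, not part of this file): SelectKernel above
// implements where_index / masked_select in three passes -- count the true
// elements per block, take a prefix sum of those counts to get each block's
// store offset, then scatter the selected indices or values. The sequential
// sketch below follows the same three steps for where_index on a 1-D mask;
// names are local to the sketch.
#include <cstdio>
#include <vector>

int main() {
  const std::vector<bool> mask = {false, true, true, false,
                                  false, false, true, true};
  const int block = 4;  // elements handled per "block"
  const int num_blocks = static_cast<int>((mask.size() + block - 1) / block);

  // Step 1: count true elements per block.
  std::vector<int> counts(num_blocks, 0);
  for (size_t i = 0; i < mask.size(); ++i) counts[i / block] += mask[i] ? 1 : 0;

  // Step 2: exclusive prefix sum of the counts gives each block's store offset.
  std::vector<int> offsets(num_blocks, 0);
  for (int b = 1; b < num_blocks; ++b) offsets[b] = offsets[b - 1] + counts[b - 1];
  const int total = offsets[num_blocks - 1] + counts[num_blocks - 1];

  // Step 3: each block writes its selected indices starting at its offset.
  std::vector<int> out(total);
  for (int b = 0; b < num_blocks; ++b) {
    int fix = offsets[b];
    for (size_t i = b * block; i < mask.size() && i < size_t((b + 1) * block); ++i) {
      if (mask[i]) out[fix++] = static_cast<int>(i);
    }
  }
  for (int idx : out) std::printf("%d ", idx);  // 1 2 6 7, as in the comment above
  std::printf("\n");
  return 0;
}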
......@@ -142,8 +142,27 @@ void ActivationGradGPUImpl(const Context& dev_ctx,
dev_ctx, nullptr, &out, &dout, dx, functor); \
}
#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT( \
name, functor_class, attr1, attr2) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& out, \
const DenseTensor& dout, \
float attr1, \
float attr2, \
DenseTensor* dx) { \
funcs::functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr1; \
*(attrs[1].second) = attr2; \
ActivationGradGPUImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, nullptr, &out, &dout, dx, functor); \
}
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, CudaReluGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, CudaTanhGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, CudaSigmoidGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CudaCosGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, CudaTanGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, CudaAcosGradFunctor);
......@@ -157,6 +176,7 @@ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, CudaAcoshGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, CudaAtanhGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, CudaTanhShrinkGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, CudaSiluGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Softsign, CudaSoftsignGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Square, CudaSquareGradFunctor);
......@@ -165,6 +185,9 @@ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1, CudaExpm1GradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Reciprocal, CudaReciprocalGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, CudaSqrtGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Rsqrt, CudaRsqrtGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, CudaLogSigmoidGradFunctor);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
CudaLeakyReluGradFunctor,
......@@ -188,6 +211,7 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu,
t_min,
t_max);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(STanh,
CudaSTanhGradFunctor,
scale_a,
......@@ -197,6 +221,12 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(Softplus,
CudaSoftplusGradFunctor,
beta,
threshold);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid,
CudaHardSigmoidGradFunctor,
slope,
offset);
template <typename T, typename Context>
void EluGradKernel(const Context& dev_ctx,
......@@ -326,3 +356,8 @@ PD_REGISTER_KERNEL(square_grad,
double,
int,
int64_t) {}
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel)
......@@ -99,6 +99,8 @@ DEFINE_GPU_ACTIVATION_KERNEL(Square, CudaSquareFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Sqrt, CudaSqrtFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Rsqrt, CudaRsqrtFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Softsign, CudaSoftsignFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Sigmoid, CudaSigmoidFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(LogSigmoid, CudaLogSigmoidFunctor)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
......@@ -118,6 +120,10 @@ DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(Softplus,
CudaSoftplusFunctor,
beta,
threshold)
DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid,
CudaHardSigmoidFunctor,
slope,
offset)
} // namespace phi
......@@ -190,3 +196,6 @@ PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel)
PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel)
PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel)
PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel)
......@@ -13,9 +13,50 @@
// limitations under the License.
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h"
namespace phi {
#define DEFINE_CUDA_ELEMENTWISE_OP(name) \
template <typename T, typename Context> \
void name##RawKernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& y, \
int axis, \
DenseTensor* out) { \
std::vector<const DenseTensor*> inputs; \
std::vector<DenseTensor*> outputs; \
inputs.emplace_back(&x); \
inputs.emplace_back(&y); \
outputs.emplace_back(out); \
dev_ctx.template Alloc<T>(out); \
funcs::BroadcastKernel<ElementwiseType::kBinary, T, T>( \
dev_ctx, inputs, &outputs, axis, funcs::name##Functor<T>()); \
}
/**
* Kernels
*/
// Create the definition of Add
DEFINE_CUDA_ELEMENTWISE_OP(Add)
// Create the definition of Subtract
DEFINE_CUDA_ELEMENTWISE_OP(Subtract)
// Create the definition of Multiply
DEFINE_CUDA_ELEMENTWISE_OP(Multiply)
// Create the definition of Divide
DEFINE_CUDA_ELEMENTWISE_OP(Divide)
} // namespace phi
using float16 = phi::dtype::float16;
using bfloat16 = phi::dtype::bfloat16;
using complex64 = ::phi::dtype::complex<float>;
using complex128 = ::phi::dtype::complex<double>;
PD_REGISTER_KERNEL(elementwise_fmax,
GPU,
ALL_LAYOUT,
......@@ -33,3 +74,55 @@ PD_REGISTER_KERNEL(elementwise_fmin,
double,
int,
int64_t) {}
PD_REGISTER_KERNEL(add_raw,
GPU,
ALL_LAYOUT,
phi::AddRawKernel,
float,
double,
int16_t,
int,
int64_t,
float16,
bfloat16,
complex64,
complex128) {}
PD_REGISTER_KERNEL(subtract_raw,
GPU,
ALL_LAYOUT,
phi::SubtractRawKernel,
float,
double,
int16_t,
int,
int64_t,
float16,
bfloat16,
complex64,
complex128) {}
PD_REGISTER_KERNEL(divide_raw,
GPU,
ALL_LAYOUT,
phi::DivideRawKernel,
float,
double,
int,
int64_t,
float16,
bfloat16,
complex64,
complex128) {}
PD_REGISTER_KERNEL(multiply_raw,
GPU,
ALL_LAYOUT,
phi::MultiplyRawKernel,
float,
double,
int,
int64_t,
bool,
float16,
complex64,
complex128,
bfloat16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/layer_norm_grad_kernel.h"
#include "paddle/fluid/operators/layer_norm_kernel.cu.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/layer_norm_util.h"
namespace phi {
template <typename T, typename Context>
void LayerNormGradKernel(const Context &dev_ctx,
const DenseTensor &x,
const DenseTensor &mean,
const DenseTensor &variance,
paddle::optional<const DenseTensor &> scale_opt,
paddle::optional<const DenseTensor &> bias_opt,
const DenseTensor &out_grad,
float epsilon,
int begin_norm_axis,
bool is_test,
DenseTensor *x_grad,
DenseTensor *scale_grad,
DenseTensor *bias_grad) {
using U = paddle::operators::LayerNormParamType<T>;
// d_x, d_scale, d_bias may be nullptr
auto *d_x = x_grad;
auto *d_scale = scale_grad;
auto *d_bias = bias_grad;
auto *scale = scale_opt.get_ptr();
auto *bias = bias_opt.get_ptr();
auto *d_y = &out_grad;
const auto &x_dims = x.dims();
auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
int64_t batch_size = static_cast<int64_t>(matrix_dim[0]);
int64_t feature_size = static_cast<int64_t>(matrix_dim[1]);
auto *x_data = x.data<T>();
auto *d_y_data = d_y->data<T>();
auto *mean_data = mean.data<U>();
auto *var_data = variance.data<U>();
auto *d_x_data = (d_x == nullptr ? nullptr : dev_ctx.template Alloc<T>(d_x));
auto x_dtype = x.dtype();
phi::DataType scale_bias_dtype;
if (scale != nullptr) {
scale_bias_dtype = scale->dtype();
} else {
// FIXME(zengjinle): do not find a better way to get the right
// data type of the d_scale and d_bias if scale == nullptr.
if (bias != nullptr) {
scale_bias_dtype = bias->dtype();
} else {
scale_bias_dtype = x_dtype;
}
}
#define PADDLE_LAUNCH_LAYERNORM_BWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \
do { \
auto *scale_data = \
(scale == nullptr ? nullptr : scale->data<ScaleBiasT>()); \
auto *d_scale_data = \
(d_scale == nullptr ? nullptr \
: dev_ctx.template Alloc<ScaleBiasT>(d_scale)); \
auto *d_bias_data = \
(d_bias == nullptr ? nullptr \
: dev_ctx.template Alloc<ScaleBiasT>(d_bias)); \
auto *d_x_data = \
(d_x == nullptr ? nullptr : dev_ctx.template Alloc<T>(d_x)); \
paddle::operators::LayerNormBackward<T, U, IsScaleBiasSameDTypeWithX>( \
x_data, \
d_y_data, \
scale_data, \
mean_data, \
var_data, \
d_x_data, \
d_scale_data, \
d_bias_data, \
epsilon, \
batch_size, \
feature_size, \
dev_ctx); \
} while (0)
if (scale_bias_dtype == x_dtype) {
PADDLE_LAUNCH_LAYERNORM_BWD(T, true);
} else {
PADDLE_LAUNCH_LAYERNORM_BWD(U, false);
}
#undef PADDLE_LAUNCH_LAYERNORM_BWD
}
} // namespace phi
#ifdef PADDLE_WITH_HIP
// MIOPEN do not support double
PD_REGISTER_KERNEL(layer_norm_grad,
GPU,
ALL_LAYOUT,
phi::LayerNormGradKernel,
float,
phi::dtype::float16) {}
#elif CUDNN_VERSION_MIN(8, 1, 0)
PD_REGISTER_KERNEL(layer_norm_grad,
GPU,
ALL_LAYOUT,
phi::LayerNormGradKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
#else
PD_REGISTER_KERNEL(layer_norm_grad,
GPU,
ALL_LAYOUT,
phi::LayerNormGradKernel,
float,
double,
phi::dtype::float16) {}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/layer_norm_kernel.h"
#include "paddle/fluid/operators/layer_norm_kernel.cu.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/layer_norm_util.h"
namespace phi {
template <typename T>
void LayerNormDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
const T *input,
std::vector<int> input_shape,
const T *bias,
const T *scale,
T *output,
T *mean,
T *variance,
int begin_norm_axis,
float eps) {
const auto x_dims = phi::make_ddim(input_shape);
auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
int64_t batch_size = static_cast<int64_t>(matrix_dim[0]);
int64_t feature_size = static_cast<int64_t>(matrix_dim[1]);
switch (paddle::operators::GetDesiredBlockDim(feature_size)) {
FIXED_BLOCK_DIM_CASE(paddle::operators::LayerNormForward<
T,
T,
kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
input, scale, bias, output, mean, variance, eps, feature_size));
default:
PADDLE_THROW(phi::errors::InvalidArgument(
"Product from begin_norm_axis to end in layer_norm must be larger "
"than 1"));
break;
}
}
template class LayerNormDirectCUDAFunctor<float>;
template <typename T, typename Context>
void LayerNormKernel(const Context &dev_ctx,
const DenseTensor &x,
paddle::optional<const DenseTensor &> scale_opt,
paddle::optional<const DenseTensor &> bias_opt,
float epsilon,
int begin_norm_axis,
bool is_test,
DenseTensor *y,
DenseTensor *mean,
DenseTensor *var) {
using U = paddle::operators::LayerNormParamType<T>;
auto *scale = scale_opt.get_ptr();
auto *bias = bias_opt.get_ptr();
const auto x_dims = x.dims();
auto *x_data = x.data<T>();
auto *y_data = dev_ctx.template Alloc<T>(y);
auto *mean_data = dev_ctx.template Alloc<U>(mean);
auto *var_data = dev_ctx.template Alloc<U>(var);
auto *void_scale_data = (scale == nullptr ? nullptr : scale->data());
auto *void_bias_data = (bias == nullptr ? nullptr : bias->data());
auto x_dtype = x.dtype();
phi::DataType scale_bias_dtype;
if (void_scale_data != nullptr) {
scale_bias_dtype = scale->dtype();
if (void_bias_data != nullptr) {
PADDLE_ENFORCE_EQ(
scale->dtype(),
bias->dtype(),
          phi::errors::InvalidArgument("The Scale and Bias of the layer_norm op "
"should have the same data type."));
}
} else {
scale_bias_dtype = (void_bias_data != nullptr ? bias->dtype() : x_dtype);
}
bool is_scale_bias_same_dtype_with_x = x_dtype == scale_bias_dtype;
if (!is_scale_bias_same_dtype_with_x) {
PADDLE_ENFORCE_EQ(scale_bias_dtype,
paddle::experimental::CppTypeToDataType<U>::Type(),
phi::errors::InvalidArgument(
"Unsupported data type of Scale and Bias"));
}
auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
int64_t batch_size = static_cast<int64_t>(matrix_dim[0]);
int64_t feature_size = static_cast<int64_t>(matrix_dim[1]);
auto stream = dev_ctx.stream();
#define PADDLE_LAUNCH_LAYERNORM_FWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \
do { \
switch (paddle::operators::GetDesiredBlockDim(feature_size)) { \
FIXED_BLOCK_DIM_CASE(paddle::operators::LayerNormForward< \
T, \
U, \
kBlockDim, \
IsScaleBiasSameDTypeWithX><<<batch_size, \
kBlockDim, \
0, \
stream>>>( \
x_data, \
static_cast<const ScaleBiasT *>(void_scale_data), \
static_cast<const ScaleBiasT *>(void_bias_data), \
y_data, \
mean_data, \
var_data, \
epsilon, \
feature_size)); \
default: \
PADDLE_THROW(phi::errors::InvalidArgument( \
"Product from begin_norm_axis to end must be larger than 1")); \
break; \
} \
} while (0)
#ifdef PADDLE_WITH_CUDA
bool can_call_1024_kernel = false;
if (feature_size == 1024 && scale != nullptr && bias != nullptr) {
can_call_1024_kernel = true;
}
if (can_call_1024_kernel) {
const int WARPS_M = 4;
const int WARPS_N = 1;
const int THREADS_PER_WARP = 32;
const int BYTES_PER_LDG = 16;
const int VecSize = BYTES_PER_LDG / sizeof(T);
const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M;
const int ROWS_PER_CTA = WARPS_M;
const int grid = static_cast<int>(
std::ceil(batch_size / static_cast<float>(ROWS_PER_CTA)));
if (is_scale_bias_same_dtype_with_x) {
paddle::operators::ln_fwd_1024_kernel<
T,
U,
T,
VecSize,
WARPS_M,
WARPS_N,
BYTES_PER_LDG><<<grid, THREADS_PER_CTA, 0, stream>>>(
batch_size,
feature_size,
epsilon,
x_data,
static_cast<const T *>(void_scale_data),
static_cast<const T *>(void_bias_data),
mean_data,
var_data,
y_data);
} else {
paddle::operators::ln_fwd_1024_kernel<
T,
U,
U,
VecSize,
WARPS_M,
WARPS_N,
BYTES_PER_LDG><<<grid, THREADS_PER_CTA, 0, stream>>>(
batch_size,
feature_size,
epsilon,
x_data,
static_cast<const U *>(void_scale_data),
static_cast<const U *>(void_bias_data),
mean_data,
var_data,
y_data);
}
} else {
#endif
if (is_scale_bias_same_dtype_with_x) {
PADDLE_LAUNCH_LAYERNORM_FWD(T, true);
} else {
PADDLE_LAUNCH_LAYERNORM_FWD(U, false);
}
#ifdef PADDLE_WITH_CUDA
}
#endif
#undef PADDLE_LAUNCH_LAYERNORM_FWD
}
} // namespace phi
#ifdef PADDLE_WITH_HIP
// MIOPEN do not support double
PD_REGISTER_KERNEL(layer_norm,
GPU,
ALL_LAYOUT,
phi::LayerNormKernel,
float,
phi::dtype::float16) {}
#elif CUDNN_VERSION_MIN(8, 1, 0)
PD_REGISTER_KERNEL(layer_norm,
GPU,
ALL_LAYOUT,
phi::LayerNormKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
#else
PD_REGISTER_KERNEL(layer_norm,
GPU,
ALL_LAYOUT,
phi::LayerNormKernel,
float,
double,
phi::dtype::float16) {}
#endif
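// Aside (illustrative sketch, not part of this file): the kernels above
// flatten the input to a [batch_size, feature_size] matrix at begin_norm_axis
// and, for each row, subtract the row mean, divide by sqrt(variance + eps),
// then apply the optional scale and bias. The standalone reference below
// performs the same computation for one row; names are local to the sketch.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<double> row = {1.0, 2.0, 3.0, 4.0};    // one flattened row
  const std::vector<double> scale = {1.0, 1.0, 1.0, 1.0};  // gamma
  const std::vector<double> bias = {0.0, 0.0, 0.0, 0.0};   // beta
  const double eps = 1e-5;
  const size_t n = row.size();

  double mean = 0.0, var = 0.0;
  for (double v : row) mean += v;
  mean /= n;
  for (double v : row) var += (v - mean) * (v - mean);
  var /= n;  // biased variance, as layer norm uses

  for (size_t i = 0; i < n; ++i) {
    const double y = scale[i] * (row[i] - mean) / std::sqrt(var + eps) + bias[i];
    std::printf("%.4f ", y);
  }
  std::printf("\n(mean=%.4f var=%.4f)\n", mean, var);
  return 0;
}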
......@@ -19,34 +19,27 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/select_impl.cu.h"
#include "paddle/phi/kernels/masked_select_kernel.h"
namespace phi {
__global__ void SetMaskArray(const bool* mask, int32_t* mask_array, int size) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < size; idx += blockDim.x * gridDim.x) {
if (mask[idx])
mask_array[idx] = 1;
else
mask_array[idx] = 0;
}
}
template <typename MT, typename InT, typename OutT>
struct MaskedSelectFunctor {
HOSTDEVICE MaskedSelectFunctor() {}
template <typename T>
__global__ void SelectWithPrefixMask(const int32_t* mask_prefix_sum,
const bool* mask,
const T* input,
T* out,
int size) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < size; idx += blockDim.x * gridDim.x) {
if (mask[idx]) {
int index = mask_prefix_sum[idx];
out[index] = input[idx];
HOSTDEVICE inline void operator()(OutT* out,
const MT* mask,
const InT* value,
int num) {
int store_fix = 0;
for (int idx = 0; idx < num; idx++) {
if (mask[idx]) {
out[store_fix++] = value[idx];
}
}
}
}
};
template <typename T, typename Context>
void MaskedSelectKernel(const Context& dev_ctx,
......@@ -68,42 +61,9 @@ void MaskedSelectKernel(const Context& dev_ctx,
"value.",
input_dim,
mask_dim));
thrust::device_ptr<const bool> mask_dev_ptr =
thrust::device_pointer_cast(mask_data);
thrust::device_vector<T> mask_vec(mask_dev_ptr, mask_dev_ptr + mask_size);
auto out_size = thrust::count(mask_vec.begin(), mask_vec.end(), true);
DDim out_dim{out_size};
out->Resize(out_dim);
auto out_data = out->mutable_data<T>(dev_ctx.GetPlace());
DenseTensor mask_array;
DenseTensor mask_prefix_sum;
mask_array.Resize(mask_dim);
mask_prefix_sum.Resize(mask_dim);
int32_t* mask_array_data =
mask_array.mutable_data<int32_t>(dev_ctx.GetPlace());
int32_t* mask_prefix_sum_data =
mask_prefix_sum.mutable_data<int32_t>(dev_ctx.GetPlace());
int threads = 512;
int grid = (mask_size + threads - 1) / threads;
auto stream = dev_ctx.stream();
SetMaskArray<<<grid, threads, 0, stream>>>(
mask_data, mask_array_data, mask_size);
thrust::device_ptr<int32_t> mask_array_dev_ptr =
thrust::device_pointer_cast(mask_array_data);
thrust::device_vector<int32_t> mask_array_vec(mask_array_dev_ptr,
mask_array_dev_ptr + mask_size);
thrust::exclusive_scan(thrust::device,
mask_array_vec.begin(),
mask_array_vec.end(),
mask_prefix_sum_data);
SelectWithPrefixMask<T><<<grid, threads, 0, stream>>>(
mask_prefix_sum_data, mask_data, input_data, out_data, mask_size);
using Functor = MaskedSelectFunctor<bool, T, T>;
phi::funcs::SelectKernel<bool, T, T, 1, Functor>(
dev_ctx, mask, x, out, Functor());
}
} // namespace phi
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/gpu/reduce.h"
#ifdef __NVCC__
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
#define DEFINE_CUDA_ELEMENTWISE_OP(name) \
template <typename T, typename Context> \
void name##RawKernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& y, \
int axis, \
DenseTensor* out) { \
std::vector<const DenseTensor*> inputs; \
std::vector<DenseTensor*> outputs; \
inputs.emplace_back(&x); \
inputs.emplace_back(&y); \
outputs.emplace_back(out); \
dev_ctx.template Alloc<T>(out); \
funcs::BroadcastKernel<ElementwiseType::kBinary, T, T>( \
dev_ctx, inputs, &outputs, axis, funcs::name##Functor<T>()); \
}
/**
* Kernels
*/
// Create the definition of Add
DEFINE_CUDA_ELEMENTWISE_OP(Add)
// Create the definition of Subtract
DEFINE_CUDA_ELEMENTWISE_OP(Subtract)
// Create the definition of Multiply
DEFINE_CUDA_ELEMENTWISE_OP(Multiply)
// Create the definition of Divide
DEFINE_CUDA_ELEMENTWISE_OP(Divide)
} // namespace phi
using float16 = phi::dtype::float16;
using bfloat16 = phi::dtype::bfloat16;
using complex64 = ::phi::dtype::complex<float>;
using complex128 = ::phi::dtype::complex<double>;
PD_REGISTER_KERNEL(add_raw,
GPU,
ALL_LAYOUT,
phi::AddRawKernel,
float,
double,
int16_t,
int,
int64_t,
float16,
bfloat16,
complex64,
complex128) {}
PD_REGISTER_KERNEL(subtract_raw,
GPU,
ALL_LAYOUT,
phi::SubtractRawKernel,
float,
double,
int16_t,
int,
int64_t,
float16,
bfloat16,
complex64,
complex128) {}
PD_REGISTER_KERNEL(divide_raw,
GPU,
ALL_LAYOUT,
phi::DivideRawKernel,
float,
double,
int,
int64_t,
float16,
bfloat16,
complex64,
complex128) {}
PD_REGISTER_KERNEL(multiply_raw,
GPU,
ALL_LAYOUT,
phi::MultiplyRawKernel,
float,
double,
int,
int64_t,
bool,
float16,
complex64,
complex128,
bfloat16) {}
......@@ -23,11 +23,11 @@
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/abs_kernel.h"
#include "paddle/phi/kernels/elementwise_kernel.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/compare_functors.h"
#include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h"
#include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/kernels/reduce_kernel.h"
namespace phi {
......
......@@ -18,7 +18,6 @@
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/fluid/memory/memory.h"
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/roi_pool_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace phi {
static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaxinumNumBlocks = 4096;
static inline int NumBlocks(const int N) {
return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
kNumMaxinumNumBlocks);
}
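// e.g. NumBlocks(100000) = min((100000 + 511) / 512, 4096) = 196 blocks.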
template <typename T>
__global__ void GPURoiPoolBackward(const int nthreads,
const T* input_rois,
const T* output_grad,
const int64_t* arg_max_data,
const int num_rois,
const float spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
int* box_batch_id_data,
T* input_grad) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
for (int i = index; i < nthreads; i += offset) {
int pw = i % pooled_width;
int ph = (i / pooled_width) % pooled_height;
int c = (i / pooled_width / pooled_height) % channels;
int n = i / pooled_width / pooled_height / channels;
int roi_batch_ind = box_batch_id_data[n];
int input_offset = (roi_batch_ind * channels + c) * height * width;
int output_offset = (n * channels + c) * pooled_height * pooled_width;
const T* offset_output_grad = output_grad + output_offset;
T* offset_input_grad = input_grad + input_offset;
const int64_t* offset_arg_max_data = arg_max_data + output_offset;
int arg_max = offset_arg_max_data[ph * pooled_width + pw];
if (arg_max != -1) {
paddle::platform::CudaAtomicAdd(
offset_input_grad + arg_max,
static_cast<T>(offset_output_grad[ph * pooled_width + pw]));
}
}
}
template <typename T, typename Context>
void RoiPoolGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& boxes,
paddle::optional<const DenseTensor&> boxes_num,
const DenseTensor& arg_max,
const DenseTensor& out_grad,
int pooled_height,
int pooled_width,
float spatial_scale,
DenseTensor* dx) {
auto x_dims = x.dims();
int channels = x_dims[1];
int height = x_dims[2];
int width = x_dims[3];
int rois_num = boxes.dims()[0];
if (dx) {
DenseTensor box_batch_id_list;
box_batch_id_list.Resize({rois_num});
int* box_batch_id_data =
dev_ctx.template HostAlloc<int>(&box_batch_id_list);
auto gplace = dev_ctx.GetPlace();
if (boxes_num) {
int boxes_batch_size = boxes_num->numel();
std::vector<int> boxes_num_list(boxes_batch_size);
paddle::memory::Copy(phi::CPUPlace(),
boxes_num_list.data(),
gplace,
boxes_num->data<int>(),
sizeof(int) * boxes_batch_size,
0);
int start = 0;
for (int n = 0; n < boxes_batch_size; ++n) {
for (int i = start; i < start + boxes_num_list[n]; ++i) {
box_batch_id_data[i] = n;
}
start += boxes_num_list[n];
}
} else {
auto boxes_lod = boxes.lod().back();
int boxes_batch_size = boxes_lod.size() - 1;
for (int n = 0; n < boxes_batch_size; ++n) {
for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) {
box_batch_id_data[i] = n;
}
}
}
int bytes = box_batch_id_list.numel() * sizeof(int);
auto roi_ptr = paddle::memory::Alloc(dev_ctx, bytes);
int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
paddle::memory::Copy(gplace,
roi_id_data,
phi::CPUPlace(),
box_batch_id_data,
bytes,
dev_ctx.stream());
dev_ctx.template Alloc<T>(dx);
phi::funcs::SetConstant<Context, T> set_zero;
set_zero(dev_ctx, dx, static_cast<T>(0));
int output_grad_size = out_grad.numel();
int blocks = NumBlocks(output_grad_size);
int threads = kNumCUDAThreads;
if (output_grad_size > 0) {
GPURoiPoolBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
output_grad_size,
boxes.data<T>(),
out_grad.data<T>(),
arg_max.data<int64_t>(),
rois_num,
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
roi_id_data,
dx->data<T>());
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(
roi_pool_grad, GPU, ALL_LAYOUT, phi::RoiPoolGradKernel, float, double) {
kernel->InputAt(3).SetDataType(phi::DataType::INT64);
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/roi_pool_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/memory/memory.h"
namespace phi {
static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaxinumNumBlocks = 4096;
static inline int NumBlocks(const int N) {
return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
kNumMaxinumNumBlocks);
}
template <typename T>
__global__ void GPURoiPoolForward(const int nthreads,
const T* input_data,
const T* input_rois,
const float spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
int* box_batch_id_data,
T* output_data,
int64_t* arg_max_data) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
for (size_t i = index; i < nthreads; i += offset) {
int pw = i % pooled_width;
int ph = (i / pooled_width) % pooled_height;
int c = (i / pooled_width / pooled_height) % channels;
int n = i / pooled_width / pooled_height / channels;
const T* offset_input_rois = input_rois + n * kROISize;
int box_batch_ind = box_batch_id_data[n];
int box_start_w = round(offset_input_rois[0] * spatial_scale);
int box_start_h = round(offset_input_rois[1] * spatial_scale);
int box_end_w = round(offset_input_rois[2] * spatial_scale);
int box_end_h = round(offset_input_rois[3] * spatial_scale);
int box_width = max(box_end_w - box_start_w + 1, 1);
int box_height = max(box_end_h - box_start_h + 1, 1);
int hstart = static_cast<int>(floor(static_cast<double>(ph) *
static_cast<double>(box_height) /
static_cast<double>(pooled_height)));
int wstart = static_cast<int>(floor(static_cast<double>(pw) *
static_cast<double>(box_width) /
static_cast<double>(pooled_width)));
int hend = static_cast<int>(ceil(static_cast<double>(ph + 1) *
static_cast<double>(box_height) /
static_cast<double>(pooled_height)));
int wend = static_cast<int>(ceil(static_cast<double>(pw + 1) *
static_cast<double>(box_width) /
static_cast<double>(pooled_width)));
hstart = min(max(hstart + box_start_h, 0), height);
hend = min(max(hend + box_start_h, 0), height);
wstart = min(max(wstart + box_start_w, 0), width);
wend = min(max(wend + box_start_w, 0), width);
bool is_empty = (hend <= hstart) || (wend <= wstart);
T maxval = is_empty ? 0 : -std::numeric_limits<T>::max();
int maxidx = -1;
const T* offset_input_data =
input_data + (box_batch_ind * channels + c) * height * width;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
int input_data_index = h * width + w;
if (offset_input_data[input_data_index] > maxval) {
maxval = offset_input_data[input_data_index];
maxidx = input_data_index;
}
}
}
output_data[i] = maxval;
if (arg_max_data) {
arg_max_data[i] = maxidx;
}
}
}
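// Worked example for the bin partition above (illustrative): a box of height 4
// and width 6 pooled to 2x2 gives bin (ph = 1, pw = 0)
//   hstart = floor(1 * 4 / 2) = 2, hend = ceil(2 * 4 / 2) = 4,
//   wstart = floor(0 * 6 / 2) = 0, wend = ceil(1 * 6 / 2) = 3,
// i.e. rows [2, 4) and cols [0, 3) of the box (before clamping to the image),
// from which the maximum value and its argmax index are taken.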
template <typename T, typename Context>
void RoiPoolKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& boxes,
paddle::optional<const DenseTensor&> boxes_num,
int pooled_height,
int pooled_width,
float spatial_scale,
DenseTensor* out,
DenseTensor* arg_max) {
auto x_dims = x.dims();
int batch_size = x_dims[0];
auto in_stride = phi::stride(x_dims);
int channels = x_dims[1];
int height = x_dims[2];
int width = x_dims[3];
int rois_num = boxes.dims()[0];
if (rois_num == 0) return;
int output_size = out->numel();
int blocks = NumBlocks(output_size);
int threads = kNumCUDAThreads;
DenseTensor box_batch_id_list;
box_batch_id_list.Resize({rois_num});
int* box_batch_id_data = dev_ctx.template HostAlloc<int>(&box_batch_id_list);
auto gplace = dev_ctx.GetPlace();
if (boxes_num) {
int boxes_batch_size = boxes_num->numel();
PADDLE_ENFORCE_EQ(
boxes_batch_size,
batch_size,
phi::errors::InvalidArgument(
"The batch size of input(ROIs) and input(X) must be the same but "
"received batch size of input(ROIs) and input(X) is %d and %d "
"respectively.",
boxes_batch_size,
batch_size));
std::vector<int> boxes_num_list(boxes_batch_size);
paddle::memory::Copy(phi::CPUPlace(),
boxes_num_list.data(),
gplace,
boxes_num->data<int>(),
sizeof(int) * boxes_batch_size,
0);
int start = 0;
for (int n = 0; n < boxes_batch_size; ++n) {
for (int i = start; i < start + boxes_num_list[n]; ++i) {
box_batch_id_data[i] = n;
}
start += boxes_num_list[n];
}
} else {
auto boxes_lod = boxes.lod().back();
int boxes_batch_size = boxes_lod.size() - 1;
PADDLE_ENFORCE_EQ(
boxes_batch_size,
batch_size,
phi::errors::InvalidArgument(
"The batch size of input(ROIs) and input(X) must be the same but "
"received batch size of input(ROIs) and input(X) is %d and %d "
"respectively.",
boxes_batch_size,
batch_size));
int boxes_num_with_lod = boxes_lod[boxes_batch_size];
PADDLE_ENFORCE_EQ(rois_num,
boxes_num_with_lod,
phi::errors::InvalidArgument(
"The number of rois from input(ROIs) and its LOD "
"must be the same. Received rois %d of input(ROIs) "
"but the number of rois %d from its LOD is %d",
rois_num,
boxes_num_with_lod));
for (int n = 0; n < boxes_batch_size; ++n) {
for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) {
box_batch_id_data[i] = n;
}
}
}
int bytes = box_batch_id_list.numel() * sizeof(int);
auto box_ptr = paddle::memory::Alloc(dev_ctx, bytes);
int* box_id_data = reinterpret_cast<int*>(box_ptr->ptr());
paddle::memory::Copy(gplace,
box_id_data,
phi::CPUPlace(),
box_batch_id_data,
bytes,
dev_ctx.stream());
T* output_data = dev_ctx.template Alloc<T>(out);
int64_t* arg_max_data = dev_ctx.template Alloc<int64_t>(arg_max);
GPURoiPoolForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
output_size,
x.data<T>(),
boxes.data<T>(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
box_id_data,
output_data,
arg_max_data);
}
} // namespace phi
PD_REGISTER_KERNEL(
roi_pool, GPU, ALL_LAYOUT, phi::RoiPoolKernel, float, double) {
kernel->OutputAt(1).SetDataType(phi::DataType::INT64);
}
......@@ -33,27 +33,23 @@ struct GPUTruncatedNormal {
T mean, std;
T a_normal_cdf;
T b_normal_cdf;
unsigned int seed;
T numeric_min;
__host__ __device__ GPUTruncatedNormal(T mean, T std, T numeric_min, int seed)
: mean(mean), std(std), seed(seed), numeric_min(numeric_min) {
auto normal_cdf = [](float x) {
return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0;
};
a_normal_cdf = normal_cdf((-2.0 - mean) / std);
b_normal_cdf = normal_cdf((2.0 - mean) / std);
a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0;
b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0;
}
__host__ __device__ T operator()(const unsigned int n) const {
thrust::minstd_rand rng;
rng.seed(seed);
thrust::uniform_real_distribution<T> dist(2.0 * a_normal_cdf - 1.0,
2.0 * b_normal_cdf - 1.0);
thrust::uniform_real_distribution<T> dist(numeric_min, 1);
rng.discard(n);
T value = dist(rng);
return std::sqrt(2.0) * erfinvf(value) * std + mean;
auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean;
}
};
......@@ -73,21 +69,18 @@ struct TruncatedNormalOffset {
seed(seed),
numeric_min(numeric_min),
offset_(offset) {
auto normal_cdf = [](float x) {
return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0;
};
a_normal_cdf = normal_cdf((-2.0 - mean) / std);
b_normal_cdf = normal_cdf((2.0 - mean) / std);
a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0;
b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0;
}
__host__ __device__ T operator()(const unsigned int n) const {
thrust::minstd_rand rng;
rng.seed(seed);
thrust::uniform_real_distribution<T> dist(2.0 * a_normal_cdf - 1.0,
2.0 * b_normal_cdf - 1.0);
thrust::uniform_real_distribution<T> dist(numeric_min, 1);
rng.discard(n + offset_);
T value = dist(rng);
return std::sqrt(2.0) * erfinvf(value) * std + mean;
auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean;
}
};
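// Sketch of the inverse-CDF identity the two functors above rely on (a hedged
// reading of the code, not an authoritative derivation): with
// a_normal_cdf = Phi(-2) and b_normal_cdf = Phi(2), a uniform draw
// u in (numeric_min, 1) is mapped to p = a_normal_cdf +
// (b_normal_cdf - a_normal_cdf) * u, and Phi^{-1}(p) = sqrt(2) * erfinv(2p - 1)
// yields a standard normal sample truncated to [-2, 2], which is then scaled
// by std and shifted by mean.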
......
......@@ -20,150 +20,59 @@
namespace cub = hipcub;
#endif
#include "paddle/phi/kernels/where_index_kernel.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/select_impl.cu.h"
#include "paddle/phi/kernels/where_index_kernel.h"
namespace phi {
template <typename T>
__global__ void GetTrueNum(const T *cond_data,
const int64_t numel,
int64_t *true_num_array) {
const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x;
for (int64_t idx = tid; idx < numel; idx += gridDim.x * blockDim.x) {
true_num_array[idx] =
static_cast<int64_t>(static_cast<bool>(cond_data[idx]));
template <typename T1, typename T2, typename OutT>
struct IndexFunctor {
T2 stride[phi::DDim::kMaxRank];
int dims;
explicit IndexFunctor(const phi::DDim &in_dims) {
dims = in_dims.size();
std::vector<T2> strides_in_tmp;
strides_in_tmp.resize(dims, 1);
// get strides according to in_dims
for (T2 i = 1; i < dims; i++) {
strides_in_tmp[i] = strides_in_tmp[i - 1] * in_dims[dims - i];
}
memcpy(stride, strides_in_tmp.data(), dims * sizeof(T2));
}
}
template <typename T>
__global__ void SetTrueIndex(int64_t *out_ptr,
const T *cond_data,
const int64_t numel,
const int64_t *stride_array,
const int64_t rank,
const int64_t *true_num_array) {
const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x;
for (int64_t idx = tid; idx < numel; idx += gridDim.x * blockDim.x) {
    // true_num_array is calculated by cub::InclusiveSum, so
    // true_num_array[idx] is the number of true elements up to and
    // including idx; subtract 1 to get the 0-based output index.
const int64_t true_index = true_num_array[idx] - 1;
if (static_cast<bool>(cond_data[idx])) {
int64_t rank_index = idx;
for (int j = 0; j < rank; j++) {
const int64_t out_index = rank_index / stride_array[j];
out_ptr[true_index * rank + j] = out_index;
rank_index -= out_index * stride_array[j];
HOSTDEVICE inline void operator()(OutT *out,
const T1 *mask,
const T2 *index,
const int num) {
int store_fix = 0;
for (int idx = 0; idx < num; idx++) {
if (mask[idx]) {
T2 data_index = index[idx];
// get index
for (int rank_id = dims - 1; rank_id >= 0; --rank_id) {
out[store_fix] = static_cast<OutT>(data_index / stride[rank_id]);
data_index = data_index % stride[rank_id];
store_fix++;
}
}
}
}
}
};
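// Worked example for IndexFunctor (illustrative): for in_dims = {2, 3, 4} the
// constructor builds stride = {1, 4, 12}; a true element at flat index 17 is
// unpacked from the largest stride down as 17 / 12 = 1 (rem 5), 5 / 4 = 1
// (rem 1), 1 / 1 = 1, i.e. coordinates (1, 1, 1) are written to `out`.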
template <typename T, typename Context>
void WhereIndexKernel(const Context &dev_ctx,
const DenseTensor &condition,
DenseTensor *out) {
const T *cond_data = condition.data<T>();
const int64_t numel = condition.numel();
DenseTensor in_data;
auto dims = condition.dims();
const int rank = dims.size();
auto d_array_mem =
paddle::memory::Alloc(dev_ctx, (numel + rank) * sizeof(int64_t));
auto h_array_mem =
paddle::memory::Alloc(phi::CPUPlace(), (rank + 1) * sizeof(int64_t));
// "stride_array" is an array and len(stride_array)==rank,
// each element is the stride of each dimension -- the length from i to i+1.
int64_t *h_stride_array = reinterpret_cast<int64_t *>(h_array_mem->ptr());
int64_t *d_stride_array = reinterpret_cast<int64_t *>(d_array_mem->ptr());
// "true_num_array" is an array and len(stride_array)==numel,
// at the beginning,
// "true_num_array" will set 1 if condition[i] == true else 0,
// then it will be calculated by cub::InclusiveSum,
// so that we can get the true number before i as the out index
int64_t *d_true_num_array = d_stride_array + rank;
// the total_true_num is the total number of condition[i] == true
int64_t *h_total_true_num = h_stride_array + rank;
  // allocate cub memory
size_t cub_size = 0;
cub::DeviceScan::InclusiveSum(nullptr,
cub_size,
d_true_num_array,
d_true_num_array,
numel,
dev_ctx.stream());
auto cub_mem = paddle::memory::Alloc(dev_ctx, cub_size * sizeof(int64_t));
void *cub_data = cub_mem->ptr();
// set d_true_num_array[i]=1 if cond_data[i]==true else 0
const int threads = std::min(numel, static_cast<int64_t>(128));
const int64_t need_grids = (numel + threads - 1) / threads;
const int grids = std::min(need_grids, static_cast<int64_t>(256));
GetTrueNum<T><<<grids, threads, 0, dev_ctx.stream()>>>(
cond_data, numel, d_true_num_array);
// calculate the inclusive prefix sum of "true_num_array"
// to get the index of "out" tensor,
// and the total number of cond_data[i]==true.
// Example:
// condition: F T T F F F T T
// before: 0 1 1 0 0 0 1 1
// after: 0 1 2 2 2 2 3 4
// out: 1 2 6 7
cub::DeviceScan::InclusiveSum(cub_data,
cub_size,
d_true_num_array,
d_true_num_array,
numel,
dev_ctx.stream());
// calculate each dimension's stride
h_stride_array[rank - 1] = 1;
for (int i = rank - 2; i >= 0; i--) {
h_stride_array[i] = h_stride_array[i + 1] * dims[i + 1];
}
paddle::memory::Copy(dev_ctx.GetPlace(),
d_stride_array,
phi::CPUPlace(),
h_stride_array,
rank * sizeof(int64_t),
dev_ctx.stream());
  // get the total true number and set the output size
// the last element of cub::InclusiveSum is the total number
paddle::memory::Copy(phi::CPUPlace(),
h_total_true_num,
dev_ctx.GetPlace(),
d_true_num_array + numel - 1,
sizeof(int64_t),
dev_ctx.stream());
dev_ctx.Wait();
int64_t true_num = *h_total_true_num;
out->Resize(phi::make_ddim({static_cast<int64_t>(true_num), rank}));
auto *out_data = dev_ctx.template Alloc<int64_t>(out);
if (true_num == 0) {
return;
}
// using true_num_array and stride_array to calculate the output index
SetTrueIndex<T><<<grids, threads, 0, dev_ctx.stream()>>>(
out_data, cond_data, numel, d_stride_array, rank, d_true_num_array);
using Functor = IndexFunctor<T, int64_t, int64_t>;
Functor index_functor = Functor(dims);
phi::funcs::SelectKernel<T, T, int64_t, 0, Functor>(
dev_ctx, condition, in_data, out, index_functor);
}
} // namespace phi
PD_REGISTER_KERNEL(where_index,
......
......@@ -238,6 +238,56 @@ void LogitGradKernel(const Context& dev_ctx,
funcs::LogitGradFunctor<T> functor;
functor(place, eigen_x, eigen_dout, eigen_dx, eigen_p, eps);
void SigmoidDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& out,
const DenseTensor& ddx,
const DenseTensor& dout,
DenseTensor* dout_new,
DenseTensor* ddout) {
if (dout_new) {
dout_new->Resize(out.dims());
dev_ctx.template Alloc<T>(dout_new);
}
if (ddout) {
ddout->Resize(out.dims());
dev_ctx.template Alloc<T>(ddout);
}
funcs::SigmoidGradGradFunctor<T> functor;
functor(dev_ctx, &out, &ddx, &dout, dout_new, ddout);
}
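// The identities assumed to back funcs::SigmoidGradGradFunctor used above
// (standard formulation, stated here as a hedged sketch): with y = sigmoid(x)
// and dx = dout * y * (1 - y), the double grad maps ddx to
//   ddout    = ddx * y * (1 - y)
//   dout_new = ddx * dout * (1 - 2 * y)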
template <typename T, typename Context>
void SigmoidTripleGradKernel(const Context& dev_ctx,
const DenseTensor& out,
const DenseTensor& ddx,
const DenseTensor& dout,
const DenseTensor& d_ddout,
const DenseTensor& d_dout_new,
DenseTensor* d_out_new,
DenseTensor* d_dout,
DenseTensor* d_ddx) {
if (d_dout) {
d_dout->Resize(out.dims());
dev_ctx.template Alloc<T>(d_dout);
}
if (d_out_new) {
    d_out_new->Resize(out.dims());
dev_ctx.template Alloc<T>(d_out_new);
}
if (d_ddx) {
    d_ddx->Resize(ddx.dims());
dev_ctx.template Alloc<T>(d_ddx);
}
funcs::SigmoidTripleGradFunctor<T> functor;
functor(dev_ctx,
&out,
&ddx,
&dout,
&d_ddout,
&d_dout_new,
d_dout,
d_out_new,
d_ddx);
}
} // namespace phi
......@@ -19,6 +19,7 @@
#include "paddle/phi/kernels/cholesky_solve_kernel.h"
#include "paddle/phi/kernels/complex_kernel.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/elementwise_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/expand_kernel.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
......@@ -27,7 +28,6 @@
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/matrix_reduce.h"
#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
#include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/kernels/transpose_kernel.h"
namespace phi {
......
......@@ -17,13 +17,13 @@
#include "paddle/phi/kernels/determinant_grad_kernel.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/elementwise_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/matrix_inverse.h"
#include "paddle/phi/kernels/funcs/unsqueeze.h"
#include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/kernels/transpose_kernel.h"
namespace phi {
......
......@@ -16,11 +16,11 @@
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/complex_kernel.h"
#include "paddle/phi/kernels/elementwise_kernel.h"
#include "paddle/phi/kernels/funcs/diag_functor.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/unsqueeze.h"
#include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/kernels/matmul_kernel.h"
#include "paddle/phi/kernels/transpose_kernel.h"
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void LayerNormGradKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& mean,
const DenseTensor& variance,
paddle::optional<const DenseTensor&> scale,
paddle::optional<const DenseTensor&> bias,
const DenseTensor& out_grad,
float epsilon,
int begin_norm_axis,
bool is_test,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/backends/gpu/gpu_decls.h"
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void LayerNormKernel(const Context& ctx,
const DenseTensor& x,
paddle::optional<const DenseTensor&> scale,
paddle::optional<const DenseTensor&> bias,
float epsilon,
int begin_norm_axis,
bool is_test,
DenseTensor* out,
DenseTensor* mean,
DenseTensor* variance);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename T>
class LayerNormDirectCUDAFunctor {
public:
void operator()(gpuStream_t stream,
const T* input,
std::vector<int> input_shape,
const T* bias,
const T* scale,
T* output,
T* mean,
T* variance,
int begin_norm_axis,
float eps);
};
#endif
} // namespace phi
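// Reference formula assumed by the declarations above (standard layer
// normalization; flatten x to [N, D] at begin_norm_axis and normalize each
// row over its D elements):
//
//   y = (x - mean(x)) / sqrt(variance(x) + epsilon) * scale + bias
//
// Mean and Variance are the per-row statistics also returned as outputs.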
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/infermeta/binary.h"
namespace phi {
template <typename T, typename Context>
void AddRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out);
template <typename T, typename Context>
void AddKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out);
template <typename T, typename Context>
void SubtractRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out);
template <typename T, typename Context>
void SubtractKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out);
template <typename T, typename Context>
void DivideRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out);
template <typename T, typename Context>
void DivideKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out);
template <typename T, typename Context>
void MultiplyRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out);
template <typename T, typename Context>
void MultiplyKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out);
template <typename T, typename Context>
DenseTensor Add(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y) {
DenseTensor dense_out;
MetaTensor meta_out(&dense_out);
ElementwiseInferMeta(x, y, &meta_out);
AddKernel<T, Context>(dev_ctx, x, y, &dense_out);
return dense_out;
}
template <typename T, typename Context>
DenseTensor Subtract(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y) {
DenseTensor dense_out;
MetaTensor meta_out(&dense_out);
ElementwiseInferMeta(x, y, &meta_out);
SubtractKernel<T, Context>(dev_ctx, x, y, &dense_out);
return dense_out;
}
template <typename T, typename Context>
DenseTensor Divide(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y) {
DenseTensor dense_out;
MetaTensor meta_out(&dense_out);
ElementwiseInferMeta(x, y, &meta_out);
DivideKernel<T, Context>(dev_ctx, x, y, &dense_out);
return dense_out;
}
template <typename T, typename Context>
DenseTensor Multiply(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y) {
DenseTensor dense_out;
MetaTensor meta_out(&dense_out);
ElementwiseInferMeta(x, y, &meta_out);
MultiplyKernel<T, Context>(dev_ctx, x, y, &dense_out);
return dense_out;
}
} // namespace phi
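// Minimal usage sketch for the helpers above (illustrative; assumes dev_ctx is
// an initialized phi::CPUContext or phi::GPUContext and x, y are
// broadcast-compatible DenseTensors of the same dtype):
//
//   phi::DenseTensor z = phi::Add<float>(dev_ctx, x, y);       // z = x + y
//   phi::DenseTensor w = phi::Multiply<float>(dev_ctx, x, y);  // w = x * y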
......@@ -22,7 +22,6 @@
#endif
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
// #include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
namespace phi {
......@@ -591,7 +590,7 @@ __device__ __forceinline__ void Cumsum(OutT* out,
int index = (tidx + 1) * 2 * stride - 1;
if (index < (blockDim.x * 2)) {
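      // The "+ index / 32" padding below spreads consecutive indices across
      // the 32 shared-memory banks to reduce bank conflicts; the read offset
      // is assumed to use the same padding as the store.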
temp[index + index / 32] =
compute(temp[index + index / 2],
compute(temp[index + index / 32],
temp[index - stride + (index - stride) / 32]);
}
}
......
......@@ -115,6 +115,14 @@ struct BroadcastConfig {
}
};
template <typename T>
__device__ __forceinline__ void WriteData(T* dst,
T* __restrict__ src,
int num) {
for (int i = 0; i < num; i++) {
dst[i] = src[i];
}
}
#undef INT_BITS
} // namespace details
......
......@@ -76,6 +76,16 @@ struct BroadcastConfig {
};
#pragma pack()
template <typename T>
__device__ __forceinline__ void WriteData(T* _global_ptr_ dst,
T* src,
int num) {
if (num > 0) {
LM2GM(src, dst, num * sizeof(T));
}
}
#undef INT_BITS
} // namespace details
/**
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/utils/optional.h"
namespace phi {
template <typename T, typename Context>
void RoiPoolGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& boxes,
paddle::optional<const DenseTensor&> boxes_num,
const DenseTensor& arg_max,
const DenseTensor& out_grad,
int pooled_height,
int pooled_width,
float spatial_scale,
DenseTensor* dx);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/utils/optional.h"
namespace phi {
static constexpr int kROISize = 4;
template <typename T, typename Context>
void RoiPoolKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& boxes,
paddle::optional<const DenseTensor&> boxes_num,
int pooled_height,
int pooled_width,
float spatial_scale,
DenseTensor* out,
DenseTensor* arg_max);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/selected_rows/assign_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/assign_kernel.h"
namespace phi {
namespace sr {
// Note: use `const paddle::optional<const SelectedRows&> x`
// as input if needed
template <typename Context>
void AssignKernel(const Context& dev_ctx,
const SelectedRows& x,
SelectedRows* out) {
out->set_rows(x.rows());
out->set_height(x.height());
phi::AssignKernel<Context>(dev_ctx, x.value(), out->mutable_value());
}
} // namespace sr
} // namespace phi
PD_REGISTER_GENERAL_KERNEL(assign_sr,
CPU,
ALL_LAYOUT,
phi::sr::AssignKernel<phi::CPUContext>,
ALL_DTYPE) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_GENERAL_KERNEL(assign_sr,
GPU,
ALL_LAYOUT,
phi::sr::AssignKernel<phi::GPUContext>,
ALL_DTYPE) {}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/selected_rows.h"
namespace phi {
namespace sr {
template <typename Context>
void AssignKernel(const Context& dev_ctx,
const SelectedRows& x,
SelectedRows* out);
} // namespace sr
} // namespace phi
......@@ -141,9 +141,19 @@ T Erfinv(T x) {
template <typename T>
struct TruncatedNormal {
T mean, std;
TruncatedNormal(T mean, T std) : mean(mean), std(std) {}
T a_normal_cdf;
T b_normal_cdf;
TruncatedNormal(T mean, T std) : mean(mean), std(std) {
auto normal_cdf = [](T x) {
return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0;
};
a_normal_cdf = normal_cdf(-2.0);
b_normal_cdf = normal_cdf(2.0);
}
T operator()(T value) const {
return std::sqrt(2.0) * Erfinv(value) * std + mean;
auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
return std::sqrt(2.0) * Erfinv(2 * p - 1) * std + mean;
}
};
......
......@@ -56,9 +56,14 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(SoftShrink, "soft_shrink", "lambda");
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardShrink, "hard_shrink", "threshold");
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(TanhShrink, "tanh_shrink", ); // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Silu, "silu", ); // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LogSigmoid, "logsigmoid", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sigmoid, "sigmoid", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(HardSigmoid,
"hard_sigmoid",
"slope" comma "offset"); // NOLINT
KernelSignature ReluDoubleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
......@@ -79,6 +84,20 @@ KernelSignature TanhTripleGradOpArgumentMapping(
{"D_OutNew", "D_DOut", "D_DDx"});
}
KernelSignature SigmoidDoubleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"sigmoid_double_grad", {"Out", "DDX", "DOut"}, {}, {"DOutNew", "DDOut"});
}
KernelSignature SigmoidTripleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("sigmoid_triple_grad",
{"Out", "DDX", "DOut", "D_DDOut", "D_DOut_New"},
{},
{"D_OutNew", "D_DOut", "D_DDx"});
}
KernelSignature LeakyReluDoubleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
......@@ -114,6 +133,7 @@ PD_REGISTER_BASE_KERNEL_NAME(leaky_relu_grad_grad, leaky_relu_double_grad);
PD_REGISTER_BASE_KERNEL_NAME(softshrink, soft_shrink);
PD_REGISTER_BASE_KERNEL_NAME(softshrink_grad, soft_shrink_grad);
PD_REGISTER_BASE_KERNEL_NAME(elu_grad_grad, elu_double_grad);
PD_REGISTER_BASE_KERNEL_NAME(sigmoid_grad_grad, sigmoid_double_grad);
PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping);
......@@ -152,3 +172,12 @@ PD_REGISTER_ARG_MAPPING_FN(elu, phi::EluOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elu_grad, phi::EluGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elu_grad_grad, phi::EluDoubleGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(silu_grad, phi::SiluGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad, phi::SigmoidGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad_grad,
phi::SigmoidDoubleGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(sigmoid_triple_grad,
phi::SigmoidTripleGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(logsigmoid_grad,
phi::LogSigmoidGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(hard_sigmoid_grad,
phi::HardSigmoidGradOpArgumentMapping);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature AssignOpArgumentMapping(const ArgumentMappingContext& ctx) {
if (ctx.HasInput("X")) {
if (ctx.IsDenseTensorVectorInput("X")) {
return KernelSignature("assign_array", {"X"}, {}, {"Out"});
} else if (ctx.IsSelectedRowsInput("X")) {
return KernelSignature("assign_sr", {"X"}, {}, {"Out"});
} else {
return KernelSignature("assign", {"X"}, {}, {"Out"});
}
} else {
return KernelSignature("assign", {"X"}, {}, {"Out"});
}
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(assign, phi::AssignOpArgumentMapping);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature LayerNormOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("layer_norm",
{"X", "Scale", "Bias"},
{"epsilon", "begin_norm_axis", "is_test"},
{"Y", "Mean", "Variance"});
}
KernelSignature LayerNormGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"layer_norm_grad",
{"X", "Mean", "Variance", "Scale", "Bias", GradVarName("Y")},
{"epsilon", "begin_norm_axis", "is_test"},
{GradVarName("X"), GradVarName("Scale"), GradVarName("Bias")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(layer_norm, phi::LayerNormOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(layer_norm_grad,
phi::LayerNormGradOpArgumentMapping);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature RoiPoolOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("roi_pool",
{"X", "ROIs", "RoisNum"},
{"pooled_height", "pooled_width", "spatial_scale"},
{"Out", "Argmax"});
}
KernelSignature RoiPoolOpGradArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("roi_pool_grad",
{"X", "ROIs", "RoisNum", "Argmax", GradVarName("Out")},
{"pooled_height", "pooled_width", "spatial_scale"},
{GradVarName("X")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(roi_pool, phi::RoiPoolOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(roi_pool_grad, phi::RoiPoolOpGradArgumentMapping);
......@@ -19,9 +19,9 @@ namespace phi {
KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) {
if (ctx.IsDenseTensorInput("Input")) {
if (ctx.HasInput("StartsTensorList")) {
if (ctx.HasInput("EndsTensorList")) {
if (ctx.HasInput("StepsTensorList")) {
if (ctx.InputSize("StartsTensorList") > 0) {
if (ctx.InputSize("EndsTensorList") > 0) {
if (ctx.InputSize("StepsTensorList") > 0) {
if (ctx.HasInput("ValueTensor")) {
return KernelSignature("set_value_with_tensor",
{"Input", "ValueTensor"},
......@@ -197,7 +197,7 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) {
}
}
} else {
if (ctx.HasInput("StepsTensorList")) {
if (ctx.InputSize("StepsTensorList") > 0) {
if (ctx.HasInput("ValueTensor")) {
return KernelSignature("set_value_with_tensor",
{"Input", "ValueTensor"},
......@@ -374,8 +374,8 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) {
}
}
} else {
if (ctx.HasInput("EndsTensorList")) {
if (ctx.HasInput("StepsTensorList")) {
if (ctx.InputSize("EndsTensorList") > 0) {
if (ctx.InputSize("StepsTensorList") > 0) {
if (ctx.HasInput("ValueTensor")) {
return KernelSignature("set_value_with_tensor",
{"Input", "ValueTensor"},
......@@ -551,7 +551,7 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) {
}
}
} else {
if (ctx.HasInput("StepsTensorList")) {
if (ctx.InputSize("StepsTensorList") > 0) {
if (ctx.HasInput("ValueTensor")) {
return KernelSignature("set_value_with_tensor",
{"Input", "ValueTensor"},
......@@ -734,9 +734,9 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) {
KernelSignature SetValueGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
if (ctx.HasInput("StartsTensorList")) {
if (ctx.HasInput("EndsTensorList")) {
if (ctx.HasInput("StepsTensorList")) {
if (ctx.InputSize("StartsTensorList") > 0) {
if (ctx.InputSize("EndsTensorList") > 0) {
if (ctx.InputSize("StepsTensorList") > 0) {
return KernelSignature(
"set_value_grad",
{GradVarName("Out")},
......@@ -760,7 +760,7 @@ KernelSignature SetValueGradOpArgumentMapping(
{GradVarName("Input"), GradVarName("ValueTensor")});
}
} else {
if (ctx.HasInput("StepsTensorList")) {
if (ctx.InputSize("StepsTensorList") > 0) {
return KernelSignature(
"set_value_grad",
{GradVarName("Out")},
......@@ -785,8 +785,8 @@ KernelSignature SetValueGradOpArgumentMapping(
}
}
} else {
if (ctx.HasInput("EndsTensorList")) {
if (ctx.HasInput("StepsTensorList")) {
if (ctx.InputSize("EndsTensorList") > 0) {
if (ctx.InputSize("StepsTensorList") > 0) {
return KernelSignature(
"set_value_grad",
{GradVarName("Out")},
......@@ -810,7 +810,7 @@ KernelSignature SetValueGradOpArgumentMapping(
{GradVarName("Input"), GradVarName("ValueTensor")});
}
} else {
if (ctx.HasInput("StepsTensorList")) {
if (ctx.InputSize("StepsTensorList") > 0) {
return KernelSignature(
"set_value_grad",
{GradVarName("Out")},
......
......@@ -61,6 +61,10 @@ TEST(DEV_API, copy) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
phi::Copy(
dev_ctx, *(dense_src.get()), phi::CPUPlace(), false, dense_dst.get());
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include <memory>
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/kernels/elementwise_kernel.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/phi/api/lib/utils/allocator.h"
......
......@@ -58,6 +58,10 @@ TEST(DEV_API, flatten) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
// 2. test API
......
......@@ -50,6 +50,10 @@ TEST(DEV_API, reshape) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = phi::Reshape<float>(dev_ctx, dense_x, shape);
// 3. check result
......
......@@ -72,6 +72,11 @@ class TestArgumentMappingContext : public phi::ArgumentMappingContext {
return selected_rows_inputs.count(name) > 0;
}
// add member if needed
bool IsDenseTensorVectorInput(const std::string& name) const override {
return false;
}
bool IsDenseTensorOutput(const std::string& name) const override {
return dense_tensor_outputs.count(name) > 0;
}
......
......@@ -215,6 +215,8 @@ class TestLayerNormOp(unittest.TestCase):
for name in ['x', 'scale', 'bias', 'y@GRAD']
},
fetch_list=fetch_list)
# print(y)
# print(out[0])
self.__assert_close(y, out[0], "y")
self.__assert_close(mean, out[1], "mean")
self.__assert_close(variance, out[2], "variance", 1e-3)
......@@ -238,6 +240,7 @@ class TestLayerNormOp(unittest.TestCase):
def test_check_forward_backward_with_scale_and_bias(self):
self.check_forward_backward(shape=[1, 3, 4, 5], begin_norm_axis=1)
self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1)
self.check_forward_backward(
shape=[2, 3, 4, 5],
......@@ -432,4 +435,5 @@ class TestGetSetKeepLayerNormScaleBiasFP32Flag(unittest.TestCase):
if __name__ == '__main__':
paddle.enable_static()
unittest.main()
......@@ -21,6 +21,8 @@ import random
import sys
sys.path.append("../")
from op_test_xpu import XPUOpTest
from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types
from xpu.get_test_cover_info import XPUOpTestWrapper
paddle.enable_static()
np.set_printoptions(threshold=np.inf)
......@@ -73,188 +75,198 @@ def seqconv(x,
return np.dot(col, filter)
class TestSeqProject(XPUOpTest):
def setUp(self):
self.init_test_case()
self.op_type = 'sequence_conv'
self.use_xpu = True
if self.context_length == 1 \
and self.context_start == 0 \
and self.padding_trainable:
print("If context_start is 0 " \
"and context_length is 1," \
" padding_trainable should be false.")
return
# one level, batch size
x = np.random.uniform(-6.10907e-05, 0.000104218,
[self.input_size[0],
self.input_size[1]]).astype('float32')
w = np.random.uniform(-3.17068e-05, 0.000159822, [
self.context_length * self.input_size[1], self.output_represention
]).astype('float32')
begin_pad = np.max([0, -self.context_start])
end_pad = np.max([0, self.context_start + self.context_length - 1])
total_pad = begin_pad + end_pad
padding_data = np.random.uniform(
0, 0, [total_pad, self.input_size[1]]).astype('float32')
self.pad_data = padding_data
self.inputs = {
'X': (x, self.lod),
'Filter': w,
}
self.inputs_val = ['X', 'Filter']
self.inputs_val_no_x = ['Filter']
self.inputs_val_no_f = ['X']
if total_pad != 0:
self.inputs['PaddingData'] = padding_data
self.inputs_val = ['X', 'PaddingData', 'Filter']
self.inputs_val_no_x = ['PaddingData', 'Filter']
self.inputs_val_no_f = ['PaddingData', 'X']
self.attrs = {
'contextStart': self.context_start,
'contextLength': self.context_length,
'paddingTrainable': self.padding_trainable,
'contextStride': self.context_stride
}
out = seqconv(x, self.lod, w, self.context_length, self.context_start,
self.padding_trainable, self.pad_data)
self.outputs = {'Out': out}
def test_check_output(self):
place = paddle.XPUPlace(0)
self.check_output_with_place(place)
def test_check_grad_input(self):
self.check_grad(['X'], 'Out', no_grad_set=set(self.inputs_val_no_x))
def test_check_grad_padding_data(self):
if self.padding_trainable:
class XPUTestSequenceConv(XPUOpTestWrapper):
def __init__(self):
self.op_name = 'sequence_conv'
class TestSeqProject(XPUOpTest):
def setUp(self):
self.init_test_case()
self.op_type = 'sequence_conv'
self.dtype = self.in_type
self.use_xpu = True
if self.context_length == 1 \
and self.context_start == 0 \
and self.padding_trainable:
print("If context_start is 0 " \
"and context_length is 1," \
" padding_trainable should be false.")
return
# one level, batch size
x = np.random.uniform(-6.10907e-05, 0.000104218,
[self.input_size[0],
self.input_size[1]]).astype(self.dtype)
w = np.random.uniform(-3.17068e-05, 0.000159822, [
self.context_length * self.input_size[1],
self.output_represention
]).astype(self.dtype)
begin_pad = np.max([0, -self.context_start])
end_pad = np.max([0, self.context_start + self.context_length - 1])
total_pad = begin_pad + end_pad
padding_data = np.random.uniform(
0, 0, [total_pad, self.input_size[1]]).astype(self.dtype)
self.pad_data = padding_data
self.inputs = {
'X': (x, self.lod),
'Filter': w,
}
self.inputs_val = ['X', 'Filter']
self.inputs_val_no_x = ['Filter']
self.inputs_val_no_f = ['X']
if total_pad != 0:
self.inputs['PaddingData'] = padding_data
self.inputs_val = ['X', 'PaddingData', 'Filter']
self.inputs_val_no_x = ['PaddingData', 'Filter']
self.inputs_val_no_f = ['PaddingData', 'X']
self.attrs = {
'contextStart': self.context_start,
'contextLength': self.context_length,
'paddingTrainable': self.padding_trainable,
'contextStride': self.context_stride
}
out = seqconv(x, self.lod, w, self.context_length,
self.context_start, self.padding_trainable,
self.pad_data)
self.outputs = {'Out': out}
def test_check_output(self):
place = paddle.XPUPlace(0)
self.check_output_with_place(place)
def test_check_grad_input(self):
self.check_grad(['X'], 'Out', no_grad_set=set(self.inputs_val_no_x))
def test_check_grad_padding_data(self):
if self.padding_trainable:
self.check_grad(
['PaddingData'], 'Out', no_grad_set=set(['X', 'Filter']))
def test_check_grad_Filter(self):
self.check_grad(
['PaddingData'], 'Out', no_grad_set=set(['X', 'Filter']))
def test_check_grad_Filter(self):
self.check_grad(
['Filter'], 'Out', no_grad_set=set(self.inputs_val_no_f))
def test_check_grad_input_filter(self):
if self.padding_trainable:
self.check_grad(
['X', 'Filter'], 'Out', no_grad_set=set(['PaddingData']))
def test_check_grad_padding_input(self):
if self.padding_trainable:
self.check_grad(
self.inputs_val_no_f, 'Out', no_grad_set=set(['Filter']))
def test_check_grad_padding_filter(self):
if self.padding_trainable:
self.check_grad(self.inputs_val_no_x, 'Out', no_grad_set=set(['X']))
def init_test_case(self):
self.input_row = 7
self.input_col = 25
self.context_start = -2
self.context_length = 5
self.padding_trainable = False
self.context_stride = 1
self.input_size = [self.input_row, self.input_col]
offset_lod = [[0, 1, self.input_row]]
self.lod = [[]]
# convert from offset-based lod to length-based lod
for i in range(len(offset_lod[0]) - 1):
self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
self.output_represention = 8 # output feature size
class TestSeqProjectCase1(TestSeqProject):
def init_test_case(self):
self.input_row = 11
self.context_start = -2
self.context_length = 5
self.padding_trainable = False
self.context_stride = 1
self.input_size = [self.input_row, 50]
offset_lod = [[0, 4, 5, 8, self.input_row]]
self.lod = [[]]
# convert from offset-based lod to length-based lod
for i in range(len(offset_lod[0]) - 1):
self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
self.output_represention = 8 # output feature size
class TestSeqProjectCase2Len0(TestSeqProject):
def init_test_case(self):
self.input_row = 11
self.context_start = -2
self.context_length = 5
self.padding_trainable = False
self.context_stride = 1
self.input_size = [self.input_row, 50]
offset_lod = [[0, 0, 4, 5, 5, 8, self.input_row, self.input_row]]
self.lod = [[]]
# convert from offset-based lod to length-based lod
for i in range(len(offset_lod[0]) - 1):
self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
self.output_represention = 8 # output feature size
class TestSeqProjectCase3(TestSeqProject):
def init_test_case(self):
self.input_row = 25
self.context_start = -2
self.context_length = 5
self.padding_trainable = False
self.context_stride = 1
self.input_size = [self.input_row, 25]
idx = list(range(self.input_size[0]))
del idx[0]
offset_lod = [[0] + np.sort(random.sample(idx, 8)).tolist() +
[self.input_size[0]]]
self.lod = [[]]
# convert from offset-based lod to length-based lod
for i in range(len(offset_lod[0]) - 1):
self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
self.output_represention = 8 # output feature size
class TestSeqProjectCase4(TestSeqProject):
def init_test_case(self):
self.input_row = 7835
self.input_col = 128
self.context_start = -2
self.context_length = 5
self.padding_trainable = False
self.context_stride = 1
self.input_size = [self.input_row, self.input_col]
offset_lod = [[
0, 1, 2, 3, 131, 241, 242, 263, 264, 265, 266, 267, 268, 387, 515,
516, 644, 645, 772, 794, 922, 923, 924, 944, 945, 1073, 1074, 1202,
1330, 1458, 1556, 1557, 1558, 1686, 1748, 1876, 1912, 1913, 1914,
2032, 2066, 2194, 2308, 2309, 2347, 2475, 2476, 2477, 2478, 2606,
2607, 2735, 2736, 2737, 2738, 2838, 2966, 2967, 2968, 2969, 3097,
3225, 3353, 3481, 3482, 3520, 3642, 3643, 3754, 3882, 3883, 4010,
4011, 4012, 4140, 4219, 4228, 4356, 4357, 4415, 4475, 4476, 4604,
4605, 4606, 4694, 4695, 4808, 4936, 4961, 4962, 5004, 5132, 5260,
5312, 5440, 5441, 5569, 5570, 5675, 5676, 5750, 5810, 5811, 5939,
6021, 6149, 6277, 6278, 6364, 6425, 6519, 6647, 6648, 6739, 6867,
6995, 6996, 7120, 7223, 7244, 7367, 7407, 7408, 7467, 7595, 7699,
7827, 7835
]]
self.lod = [[]]
# convert from offset-based lod to length-based lod
for i in range(len(offset_lod[0]) - 1):
self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
self.output_represention = 8 # output feature size
['Filter'], 'Out', no_grad_set=set(self.inputs_val_no_f))
def test_check_grad_input_filter(self):
if self.padding_trainable:
self.check_grad(
['X', 'Filter'], 'Out', no_grad_set=set(['PaddingData']))
def test_check_grad_padding_input(self):
if self.padding_trainable:
self.check_grad(
self.inputs_val_no_f, 'Out', no_grad_set=set(['Filter']))
def test_check_grad_padding_filter(self):
if self.padding_trainable:
self.check_grad(
self.inputs_val_no_x, 'Out', no_grad_set=set(['X']))
def init_test_case(self):
self.input_row = 7
self.input_col = 25
self.context_start = -2
self.context_length = 5
self.padding_trainable = False
self.context_stride = 1
self.input_size = [self.input_row, self.input_col]
offset_lod = [[0, 1, self.input_row]]
self.lod = [[]]
# convert from offset-based lod to length-based lod
for i in range(len(offset_lod[0]) - 1):
self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
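            # e.g. offset_lod [[0, 1, 7]] becomes length-based lod [[1, 6]]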
self.output_represention = 8 # output feature size
class TestSeqProjectCase1(TestSeqProject):
def init_test_case(self):
self.input_row = 11
self.context_start = -2
self.context_length = 5
self.padding_trainable = False
self.context_stride = 1
self.input_size = [self.input_row, 50]
offset_lod = [[0, 4, 5, 8, self.input_row]]
self.lod = [[]]
# convert from offset-based lod to length-based lod
for i in range(len(offset_lod[0]) - 1):
self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
self.output_represention = 8 # output feature size
class TestSeqProjectCase2Len0(TestSeqProject):
def init_test_case(self):
self.input_row = 11
self.context_start = -2
self.context_length = 5
self.padding_trainable = False
self.context_stride = 1
self.input_size = [self.input_row, 50]
offset_lod = [[0, 0, 4, 5, 5, 8, self.input_row, self.input_row]]
self.lod = [[]]
# convert from offset-based lod to length-based lod
for i in range(len(offset_lod[0]) - 1):
self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
self.output_represention = 8 # output feature size
class TestSeqProjectCase3(TestSeqProject):
def init_test_case(self):
self.input_row = 25
self.context_start = -2
self.context_length = 5
self.padding_trainable = False
self.context_stride = 1
self.input_size = [self.input_row, 25]
idx = list(range(self.input_size[0]))
del idx[0]
offset_lod = [[0] + np.sort(random.sample(idx, 8)).tolist() +
[self.input_size[0]]]
self.lod = [[]]
# convert from offset-based lod to length-based lod
for i in range(len(offset_lod[0]) - 1):
self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
self.output_represention = 8 # output feature size
class TestSeqProjectCase4(TestSeqProject):
def init_test_case(self):
self.input_row = 7835
self.input_col = 128
self.context_start = -2
self.context_length = 5
self.padding_trainable = False
self.context_stride = 1
self.input_size = [self.input_row, self.input_col]
offset_lod = [[
0, 1, 2, 3, 131, 241, 242, 263, 264, 265, 266, 267, 268, 387,
515, 516, 644, 645, 772, 794, 922, 923, 924, 944, 945, 1073,
1074, 1202, 1330, 1458, 1556, 1557, 1558, 1686, 1748, 1876,
1912, 1913, 1914, 2032, 2066, 2194, 2308, 2309, 2347, 2475,
2476, 2477, 2478, 2606, 2607, 2735, 2736, 2737, 2738, 2838,
2966, 2967, 2968, 2969, 3097, 3225, 3353, 3481, 3482, 3520,
3642, 3643, 3754, 3882, 3883, 4010, 4011, 4012, 4140, 4219,
4228, 4356, 4357, 4415, 4475, 4476, 4604, 4605, 4606, 4694,
4695, 4808, 4936, 4961, 4962, 5004, 5132, 5260, 5312, 5440,
5441, 5569, 5570, 5675, 5676, 5750, 5810, 5811, 5939, 6021,
6149, 6277, 6278, 6364, 6425, 6519, 6647, 6648, 6739, 6867,
6995, 6996, 7120, 7223, 7244, 7367, 7407, 7408, 7467, 7595,
7699, 7827, 7835
]]
self.lod = [[]]
# convert from offset-based lod to length-based lod
for i in range(len(offset_lod[0]) - 1):
self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
self.output_represention = 8 # output feature size
support_types = get_xpu_op_support_types('sequence_conv')
for stype in support_types:
create_test_class(globals(), XPUTestSequenceConv, stype)
class TestSeqConvApi(unittest.TestCase):
......