Unverified commit 798670bb authored by danleifeng, committed by GitHub
Parent 1149a378
......@@ -241,3 +241,6 @@ endif()
if(WITH_CUSTOM_DEVICE AND NOT WIN32)
add_definitions(-DPADDLE_WITH_CUSTOM_DEVICE)
endif()
if(WITH_GPU_GRAPH)
add_definitions(-DPADDLE_WITH_GPU_GRAPH)
endif()
......@@ -144,10 +144,8 @@ int32_t GraphBrpcService::add_graph_node(Table *table,
int idx_ = *(int *)(request.params(0).c_str());
size_t node_num = request.params(1).size() / sizeof(int64_t);
int64_t *node_data = (int64_t *)(request.params(1).c_str());
// size_t node_num = request.params(0).size() / sizeof(int64_t);
// int64_t *node_data = (int64_t *)(request.params(0).c_str());
std::vector<int64_t> node_ids(node_data, node_data + node_num);
uint64_t *node_data = (uint64_t *)(request.params(1).c_str());
std::vector<uint64_t> node_ids(node_data, node_data + node_num);
std::vector<bool> is_weighted_list;
if (request.params_size() == 3) {
size_t weight_list_size = request.params(2).size() / sizeof(bool);
......@@ -179,11 +177,9 @@ int32_t GraphBrpcService::remove_graph_node(Table *table,
return 0;
}
int idx_ = *(int *)(request.params(0).c_str());
size_t node_num = request.params(1).size() / sizeof(int64_t);
int64_t *node_data = (int64_t *)(request.params(1).c_str());
// size_t node_num = request.params(0).size() / sizeof(int64_t);
// int64_t *node_data = (int64_t *)(request.params(0).c_str());
std::vector<int64_t> node_ids(node_data, node_data + node_num);
size_t node_num = request.params(1).size() / sizeof(uint64_t);
uint64_t *node_data = (uint64_t *)(request.params(1).c_str());
std::vector<uint64_t> node_ids(node_data, node_data + node_num);
((GraphTable *)table)->remove_graph_node(idx_, node_ids);
return 0;
......@@ -217,11 +213,6 @@ int32_t GraphBrpcService::Initialize() {
&GraphBrpcService::graph_set_node_feat;
_service_handler_map[PS_GRAPH_SAMPLE_NODES_FROM_ONE_SERVER] =
&GraphBrpcService::sample_neighbors_across_multi_servers;
// _service_handler_map[PS_GRAPH_USE_NEIGHBORS_SAMPLE_CACHE] =
// &GraphBrpcService::use_neighbors_sample_cache;
// _service_handler_map[PS_GRAPH_LOAD_GRAPH_SPLIT_CONFIG] =
// &GraphBrpcService::load_graph_split_config;
// Shard initialization: the shard info of server_list can only be obtained from env after the server starts
InitializeShardInfo();
return 0;
......@@ -389,9 +380,6 @@ int32_t GraphBrpcService::pull_graph_list(Table *table,
int start = *(int *)(request.params(2).c_str());
int size = *(int *)(request.params(3).c_str());
int step = *(int *)(request.params(4).c_str());
// int start = *(int *)(request.params(0).c_str());
// int size = *(int *)(request.params(1).c_str());
// int step = *(int *)(request.params(2).c_str());
std::unique_ptr<char[]> buffer;
int actual_size;
((GraphTable *)table)
......@@ -414,14 +402,10 @@ int32_t GraphBrpcService::graph_random_sample_neighbors(
return 0;
}
int idx_ = *(int *)(request.params(0).c_str());
size_t node_num = request.params(1).size() / sizeof(int64_t);
int64_t *node_data = (int64_t *)(request.params(1).c_str());
int sample_size = *(int64_t *)(request.params(2).c_str());
size_t node_num = request.params(1).size() / sizeof(uint64_t);
uint64_t *node_data = (uint64_t *)(request.params(1).c_str());
int sample_size = *(int *)(request.params(2).c_str());
bool need_weight = *(bool *)(request.params(3).c_str());
// size_t node_num = request.params(0).size() / sizeof(int64_t);
// int64_t *node_data = (int64_t *)(request.params(0).c_str());
// int sample_size = *(int64_t *)(request.params(1).c_str());
// bool need_weight = *(bool *)(request.params(2).c_str());
std::vector<std::shared_ptr<char>> buffers(node_num);
std::vector<int> actual_sizes(node_num, 0);
((GraphTable *)table)
......@@ -443,7 +427,7 @@ int32_t GraphBrpcService::graph_random_sample_nodes(
brpc::Controller *cntl) {
int type_id = *(int *)(request.params(0).c_str());
int idx_ = *(int *)(request.params(1).c_str());
size_t size = *(int64_t *)(request.params(2).c_str());
size_t size = *(uint64_t *)(request.params(2).c_str());
// size_t size = *(int64_t *)(request.params(0).c_str());
std::unique_ptr<char[]> buffer;
int actual_size;
......@@ -470,11 +454,9 @@ int32_t GraphBrpcService::graph_get_node_feat(Table *table,
return 0;
}
int idx_ = *(int *)(request.params(0).c_str());
size_t node_num = request.params(1).size() / sizeof(int64_t);
int64_t *node_data = (int64_t *)(request.params(1).c_str());
// size_t node_num = request.params(0).size() / sizeof(int64_t);
// int64_t *node_data = (int64_t *)(request.params(0).c_str());
std::vector<int64_t> node_ids(node_data, node_data + node_num);
size_t node_num = request.params(1).size() / sizeof(uint64_t);
uint64_t *node_data = (uint64_t *)(request.params(1).c_str());
std::vector<uint64_t> node_ids(node_data, node_data + node_num);
std::vector<std::string> feature_names =
paddle::string::split_string<std::string>(request.params(2), "\t");
......@@ -511,21 +493,14 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers(
}
int idx_ = *(int *)(request.params(0).c_str());
size_t node_num = request.params(1).size() / sizeof(int64_t);
int64_t *node_data = (int64_t *)(request.params(1).c_str());
int sample_size = *(int64_t *)(request.params(2).c_str());
bool need_weight = *(int64_t *)(request.params(3).c_str());
// size_t node_num = request.params(0).size() / sizeof(int64_t),
// size_of_size_t = sizeof(size_t);
// int64_t *node_data = (int64_t *)(request.params(0).c_str());
// int sample_size = *(int64_t *)(request.params(1).c_str());
// bool need_weight = *(int64_t *)(request.params(2).c_str());
// std::vector<int64_t> res = ((GraphTable
// *)table).filter_out_non_exist_nodes(node_data, sample_size);
size_t node_num = request.params(1).size() / sizeof(uint64_t);
uint64_t *node_data = (uint64_t *)(request.params(1).c_str());
int sample_size = *(int *)(request.params(2).c_str());
bool need_weight = *(bool *)(request.params(3).c_str());
std::vector<int> request2server;
std::vector<int> server2request(server_size, -1);
std::vector<int64_t> local_id;
std::vector<uint64_t> local_id;
std::vector<int> local_query_idx;
size_t rank = GetRank();
for (size_t query_idx = 0; query_idx < node_num; ++query_idx) {
......@@ -548,7 +523,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers(
std::vector<std::shared_ptr<char>> local_buffers;
std::vector<int> local_actual_sizes;
std::vector<size_t> seq;
std::vector<std::vector<int64_t>> node_id_buckets(request_call_num);
std::vector<std::vector<uint64_t>> node_id_buckets(request_call_num);
std::vector<std::vector<int>> query_idx_buckets(request_call_num);
for (size_t query_idx = 0; query_idx < node_num; ++query_idx) {
int server_index =
......@@ -639,7 +614,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers(
closure->request(request_idx)
->add_params((char *)node_id_buckets[request_idx].data(),
sizeof(int64_t) * node_num);
sizeof(uint64_t) * node_num);
closure->request(request_idx)
->add_params((char *)&sample_size, sizeof(int));
closure->request(request_idx)
......@@ -682,11 +657,9 @@ int32_t GraphBrpcService::graph_set_node_feat(Table *table,
}
int idx_ = *(int *)(request.params(0).c_str());
// size_t node_num = request.params(0).size() / sizeof(int64_t);
// int64_t *node_data = (int64_t *)(request.params(0).c_str());
size_t node_num = request.params(1).size() / sizeof(int64_t);
int64_t *node_data = (int64_t *)(request.params(1).c_str());
std::vector<int64_t> node_ids(node_data, node_data + node_num);
size_t node_num = request.params(1).size() / sizeof(uint64_t);
uint64_t *node_data = (uint64_t *)(request.params(1).c_str());
std::vector<uint64_t> node_ids(node_data, node_data + node_num);
// std::vector<std::string> feature_names =
// paddle::string::split_string<std::string>(request.params(1), "\t");
......
......@@ -18,7 +18,7 @@ set_source_files_properties(
cc_library(
graph_node
SRCS ${graphDir}/graph_node.cc
DEPS WeightedSampler)
DEPS WeightedSampler enforce)
set_source_files_properties(
memory_dense_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(
......
......@@ -21,12 +21,17 @@
#include <set>
#include <sstream>
#include "gflags/gflags.h"
#include "paddle/fluid/distributed/common/utils.h"
#include "paddle/fluid/distributed/ps/table/graph/graph_node.h"
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/framework/io/fs.h"
#include "paddle/fluid/platform/timer.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/string_helper.h"
DECLARE_bool(graph_load_in_parallel);
namespace paddle {
namespace distributed {
......@@ -47,34 +52,125 @@ int32_t GraphTable::Load_to_ssd(const std::string &path,
return 0;
}
paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph(
int idx, std::vector<int64_t> ids) {
std::vector<std::vector<int64_t>> bags(task_pool_size_);
for (auto x : ids) {
paddle::framework::GpuPsCommGraphFea GraphTable::make_gpu_ps_graph_fea(
std::vector<uint64_t> &node_ids, int slot_num) {
std::vector<std::vector<uint64_t>> bags(task_pool_size_);
for (int i = 0; i < task_pool_size_; i++) {
auto predsize = node_ids.size() / task_pool_size_;
bags[i].reserve(predsize * 1.2);
}
for (auto x : node_ids) {
int location = x % shard_num % task_pool_size_;
bags[location].push_back(x);
}
std::vector<std::future<int>> tasks;
std::vector<int64_t> edge_array[task_pool_size_];
std::vector<paddle::framework::GpuPsGraphNode> node_array[task_pool_size_];
std::vector<uint64_t> feature_array[task_pool_size_];
std::vector<uint8_t> slot_id_array[task_pool_size_];
std::vector<uint64_t> node_id_array[task_pool_size_];
std::vector<paddle::framework::GpuPsFeaInfo>
node_fea_info_array[task_pool_size_];
for (size_t i = 0; i < bags.size(); i++) {
if (bags[i].size() > 0) {
tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int {
paddle::framework::GpuPsGraphNode x;
uint64_t node_id;
paddle::framework::GpuPsFeaInfo x;
std::vector<uint64_t> feature_ids;
for (size_t j = 0; j < bags[i].size(); j++) {
Node *v = find_node(0, idx, bags[i][j]);
x.node_id = bags[i][j];
// TODO use FEATURE_TABLE instead
Node *v = find_node(1, bags[i][j]);
node_id = bags[i][j];
if (v == NULL) {
x.neighbor_size = 0;
x.neighbor_offset = 0;
node_array[i].push_back(x);
x.feature_size = 0;
x.feature_offset = 0;
node_fea_info_array[i].push_back(x);
} else {
x.neighbor_size = v->get_neighbor_size();
x.neighbor_offset = edge_array[i].size();
node_array[i].push_back(x);
for (size_t k = 0; k < x.neighbor_size; k++) {
// x <- v
x.feature_offset = feature_array[i].size();
int total_feature_size = 0;
for (int k = 0; k < slot_num; ++k) {
v->get_feature_ids(k, &feature_ids);
total_feature_size += feature_ids.size();
if (!feature_ids.empty()) {
feature_array[i].insert(feature_array[i].end(),
feature_ids.begin(),
feature_ids.end());
slot_id_array[i].insert(
slot_id_array[i].end(), feature_ids.size(), k);
}
}
x.feature_size = total_feature_size;
node_fea_info_array[i].push_back(x);
}
node_id_array[i].push_back(node_id);
}
return 0;
}));
}
}
for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get();
paddle::framework::GpuPsCommGraphFea res;
uint64_t tot_len = 0;
for (int i = 0; i < task_pool_size_; i++) {
tot_len += feature_array[i].size();
}
VLOG(0) << "Loaded feature table on cpu, feature_list_size[" << tot_len
<< "] node_ids_size[" << node_ids.size() << "]";
res.init_on_cpu(tot_len, (unsigned int)node_ids.size(), slot_num);
unsigned int offset = 0, ind = 0;
for (int i = 0; i < task_pool_size_; i++) {
for (int j = 0; j < (int)node_id_array[i].size(); j++) {
res.node_list[ind] = node_id_array[i][j];
res.fea_info_list[ind] = node_fea_info_array[i][j];
res.fea_info_list[ind++].feature_offset += offset;
}
for (size_t j = 0; j < feature_array[i].size(); j++) {
res.feature_list[offset + j] = feature_array[i][j];
res.slot_id_list[offset + j] = slot_id_array[i][j];
}
offset += feature_array[i].size();
}
return res;
}
paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph(
int idx, std::vector<uint64_t> ids) {
std::vector<std::vector<uint64_t>> bags(task_pool_size_);
for (int i = 0; i < task_pool_size_; i++) {
auto predsize = ids.size() / task_pool_size_;
bags[i].reserve(predsize * 1.2);
}
for (auto x : ids) {
int location = x % shard_num % task_pool_size_;
bags[location].push_back(x);
}
std::vector<std::future<int>> tasks;
std::vector<uint64_t> node_array[task_pool_size_]; // node id list
std::vector<paddle::framework::GpuPsNodeInfo> info_array[task_pool_size_];
std::vector<uint64_t> edge_array[task_pool_size_]; // edge id list
for (size_t i = 0; i < bags.size(); i++) {
if (bags[i].size() > 0) {
tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int {
node_array[i].resize(bags[i].size());
info_array[i].resize(bags[i].size());
edge_array[i].reserve(bags[i].size());
for (size_t j = 0; j < bags[i].size(); j++) {
auto node_id = bags[i][j];
node_array[i][j] = node_id;
Node *v = find_node(0, idx, node_id);
if (v != nullptr) {
info_array[i][j].neighbor_offset = edge_array[i].size();
info_array[i][j].neighbor_size = v->get_neighbor_size();
for (size_t k = 0; k < v->get_neighbor_size(); k++) {
edge_array[i].push_back(v->get_neighbor_id(k));
}
} else {
info_array[i][j].neighbor_offset = 0;
info_array[i][j].neighbor_size = 0;
}
}
return 0;
......@@ -82,21 +178,20 @@ paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph(
}
}
for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get();
paddle::framework::GpuPsCommGraph res;
int64_t tot_len = 0;
for (int i = 0; i < task_pool_size_; i++) {
tot_len += edge_array[i].size();
}
// res.neighbor_size = tot_len;
// res.node_size = ids.size();
// res.neighbor_list = new int64_t[tot_len];
// res.node_list = new paddle::framework::GpuPsGraphNode[ids.size()];
paddle::framework::GpuPsCommGraph res;
res.init_on_cpu(tot_len, ids.size());
int64_t offset = 0, ind = 0;
for (int i = 0; i < task_pool_size_; i++) {
for (int j = 0; j < (int)node_array[i].size(); j++) {
res.node_list[ind] = node_array[i][j];
res.node_list[ind++].neighbor_offset += offset;
res.node_info_list[ind] = info_array[i][j];
res.node_info_list[ind++].neighbor_offset += offset;
}
for (size_t j = 0; j < edge_array[i].size(); j++) {
res.neighbor_list[offset + j] = edge_array[i][j];
......@@ -107,62 +202,41 @@ paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph(
}
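The graph assembled above is a CSR-style layout: node_info_list[k] carries (neighbor_size, neighbor_offset) and neighbor_list stores the concatenated adjacency of all nodes. Below is a minimal standalone sketch of how such a layout is read back, using simplified local types rather than the real GpuPsCommGraph:

#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative sketch only: simplified CSR container mirroring the
// neighbor_offset / neighbor_size fields built by make_gpu_ps_graph above.
struct NodeInfo {
  uint32_t neighbor_size;
  uint32_t neighbor_offset;
};

int main() {
  std::vector<uint64_t> node_list = {101, 102, 103};
  std::vector<NodeInfo> node_info_list = {{2, 0}, {0, 0}, {3, 2}};
  std::vector<uint64_t> neighbor_list = {7, 9, 4, 5, 6};  // concatenated adjacency
  for (size_t k = 0; k < node_list.size(); ++k) {
    // neighbors of node_list[k] are the slice
    // [neighbor_offset, neighbor_offset + neighbor_size) of neighbor_list
    std::cout << "node " << node_list[k] << ":";
    for (uint32_t j = 0; j < node_info_list[k].neighbor_size; ++j) {
      std::cout << " " << neighbor_list[node_info_list[k].neighbor_offset + j];
    }
    std::cout << "\n";
  }
  return 0;
}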
int32_t GraphTable::add_node_to_ssd(
int type_id, int idx, int64_t src_id, char *data, int len) {
int type_id, int idx, uint64_t src_id, char *data, int len) {
if (_db != NULL) {
char ch[sizeof(int) * 2 + sizeof(int64_t)];
char ch[sizeof(int) * 2 + sizeof(uint64_t)];
memcpy(ch, &type_id, sizeof(int));
memcpy(ch + sizeof(int), &idx, sizeof(int));
memcpy(ch + sizeof(int) * 2, &src_id, sizeof(int64_t));
memcpy(ch + sizeof(int) * 2, &src_id, sizeof(uint64_t));
std::string str;
if (_db->get(src_id % shard_num % task_pool_size_,
ch,
sizeof(int) * 2 + sizeof(int64_t),
sizeof(int) * 2 + sizeof(uint64_t),
str) == 0) {
int64_t *stored_data = ((int64_t *)str.c_str());
int n = str.size() / sizeof(int64_t);
char *new_data = new char[n * sizeof(int64_t) + len];
memcpy(new_data, stored_data, n * sizeof(int64_t));
memcpy(new_data + n * sizeof(int64_t), data, len);
uint64_t *stored_data = ((uint64_t *)str.c_str());
int n = str.size() / sizeof(uint64_t);
char *new_data = new char[n * sizeof(uint64_t) + len];
memcpy(new_data, stored_data, n * sizeof(uint64_t));
memcpy(new_data + n * sizeof(uint64_t), data, len);
_db->put(src_id % shard_num % task_pool_size_,
ch,
sizeof(int) * 2 + sizeof(int64_t),
sizeof(int) * 2 + sizeof(uint64_t),
(char *)new_data,
n * sizeof(int64_t) + len);
n * sizeof(uint64_t) + len);
delete[] new_data;
} else {
_db->put(src_id % shard_num % task_pool_size_,
ch,
sizeof(int) * 2 + sizeof(int64_t),
sizeof(int) * 2 + sizeof(uint64_t),
(char *)data,
len);
}
// _db->flush(src_id % shard_num % task_pool_size_);
// std::string x;
// if (_db->get(src_id % shard_num % task_pool_size_, ch, sizeof(int64_t) +
// 2 * sizeof(int), x) ==0){
// VLOG(0)<<"put result";
// for(int i = 0;i < x.size();i+=8){
// VLOG(0)<<"get an id "<<*((int64_t *)(x.c_str() + i));
// }
//}
// if(src_id == 429){
// str = "";
// _db->get(src_id % shard_num % task_pool_size_, ch,
// sizeof(int) * 2 + sizeof(int64_t), str);
// int64_t *stored_data = ((int64_t *)str.c_str());
// int n = str.size() / sizeof(int64_t);
// VLOG(0)<<"429 has "<<n<<"neighbors";
// for(int i =0;i< n;i++){
// VLOG(0)<<"get an id "<<*((int64_t *)(str.c_str() +
// i*sizeof(int64_t)));
// }
// }
}
return 0;
}
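add_node_to_ssd keys every adjacency list by the fixed-width tuple (type_id, idx, src_id) packed into sizeof(int) * 2 + sizeof(uint64_t) bytes, and appends new neighbor bytes to whatever is already stored under that key. A small sketch of just the key packing, kept separate from the real RocksDB wrapper:

#include <cstdint>
#include <cstring>
#include <string>

// Illustrative sketch of the SSD key layout used above:
// [ type_id (int) | idx (int) | src_id (uint64_t) ]
std::string make_ssd_key(int type_id, int idx, uint64_t src_id) {
  char ch[sizeof(int) * 2 + sizeof(uint64_t)];
  std::memcpy(ch, &type_id, sizeof(int));
  std::memcpy(ch + sizeof(int), &idx, sizeof(int));
  std::memcpy(ch + sizeof(int) * 2, &src_id, sizeof(uint64_t));
  return std::string(ch, sizeof(ch));
}

int main() {
  std::string key = make_ssd_key(/*type_id=*/0, /*idx=*/1, /*src_id=*/9008ULL);
  return key.size() == sizeof(int) * 2 + sizeof(uint64_t) ? 0 : 1;
}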
char *GraphTable::random_sample_neighbor_from_ssd(
int idx,
int64_t id,
uint64_t id,
int sample_size,
const std::shared_ptr<std::mt19937_64> rng,
int &actual_size) {
......@@ -172,18 +246,18 @@ char *GraphTable::random_sample_neighbor_from_ssd(
}
std::string str;
VLOG(2) << "sample ssd for key " << id;
char ch[sizeof(int) * 2 + sizeof(int64_t)];
char ch[sizeof(int) * 2 + sizeof(uint64_t)];
memset(ch, 0, sizeof(int));
memcpy(ch + sizeof(int), &idx, sizeof(int));
memcpy(ch + sizeof(int) * 2, &id, sizeof(int64_t));
memcpy(ch + sizeof(int) * 2, &id, sizeof(uint64_t));
if (_db->get(id % shard_num % task_pool_size_,
ch,
sizeof(int) * 2 + sizeof(int64_t),
sizeof(int) * 2 + sizeof(uint64_t),
str) == 0) {
int64_t *data = ((int64_t *)str.c_str());
int n = str.size() / sizeof(int64_t);
uint64_t *data = ((uint64_t *)str.c_str());
int n = str.size() / sizeof(uint64_t);
std::unordered_map<int, int> m;
// std::vector<int64_t> res;
// std::vector<uint64_t> res;
int sm_size = std::min(n, sample_size);
actual_size = sm_size * Node::id_size;
char *buff = new char[actual_size];
......@@ -207,7 +281,7 @@ char *GraphTable::random_sample_neighbor_from_ssd(
// res.push_back(data[pos]);
}
for (int i = 0; i < actual_size; i += 8) {
VLOG(2) << "sampled an neighbor " << *(int64_t *)&buff[i];
VLOG(2) << "sampled an neighbor " << *(uint64_t *)&buff[i];
}
return buff;
}
......@@ -216,8 +290,8 @@ char *GraphTable::random_sample_neighbor_from_ssd(
}
int64_t GraphTable::load_graph_to_memory_from_ssd(int idx,
std::vector<int64_t> &ids) {
std::vector<std::vector<int64_t>> bags(task_pool_size_);
std::vector<uint64_t> &ids) {
std::vector<std::vector<uint64_t>> bags(task_pool_size_);
for (auto x : ids) {
int location = x % shard_num % task_pool_size_;
bags[location].push_back(x);
......@@ -227,17 +301,17 @@ int64_t GraphTable::load_graph_to_memory_from_ssd(int idx,
for (size_t i = 0; i < bags.size(); i++) {
if (bags[i].size() > 0) {
tasks.push_back(_shards_task_pool[i]->enqueue([&, i, idx, this]() -> int {
char ch[sizeof(int) * 2 + sizeof(int64_t)];
char ch[sizeof(int) * 2 + sizeof(uint64_t)];
memset(ch, 0, sizeof(int));
memcpy(ch + sizeof(int), &idx, sizeof(int));
for (size_t k = 0; k < bags[i].size(); k++) {
auto v = bags[i][k];
memcpy(ch + sizeof(int) * 2, &v, sizeof(int64_t));
memcpy(ch + sizeof(int) * 2, &v, sizeof(uint64_t));
std::string str;
if (_db->get(i, ch, sizeof(int) * 2 + sizeof(int64_t), str) == 0) {
if (_db->get(i, ch, sizeof(int) * 2 + sizeof(uint64_t), str) == 0) {
count[i] += (int64_t)str.size();
for (int j = 0; j < str.size(); j += sizeof(int64_t)) {
int64_t id = *(int64_t *)(str.c_str() + j);
for (size_t j = 0; j < (int)str.size(); j += sizeof(uint64_t)) {
uint64_t id = *(uint64_t *)(str.c_str() + j);
add_comm_edge(idx, v, id);
}
}
......@@ -274,7 +348,7 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) {
std::vector<double> weight_cost(part_len, 0);
std::vector<int64_t> memory_remaining(part_len, gb_size_by_discount);
std::vector<double> score(part_len, 0);
std::unordered_map<int64_t, int> id_map;
std::unordered_map<uint64_t, int> id_map;
std::vector<rocksdb::Iterator *> iters;
for (int i = 0; i < task_pool_size_; i++) {
iters.push_back(_db->get_iterator(i));
......@@ -282,7 +356,7 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) {
}
int next = 0;
while (iters.size()) {
if (next >= iters.size()) {
if (next >= (int)iters.size()) {
next = 0;
}
if (!iters[next]->Valid()) {
......@@ -298,7 +372,7 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) {
continue;
}
std::string value = iters[next]->value().ToString();
std::int64_t i_key = *(int64_t *)(key.c_str() + sizeof(int) * 2);
std::uint64_t i_key = *(uint64_t *)(key.c_str() + sizeof(int) * 2);
for (int i = 0; i < part_len; i++) {
if (memory_remaining[i] < (int64_t)value.size()) {
score[i] = -100000.0;
......@@ -306,8 +380,8 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) {
score[i] = 0;
}
}
for (int j = 0; j < value.size(); j += sizeof(int64_t)) {
int64_t v = *((int64_t *)(value.c_str() + j));
for (size_t j = 0; j < (int)value.size(); j += sizeof(uint64_t)) {
uint64_t v = *((uint64_t *)(value.c_str() + j));
int index = -1;
if (id_map.find(v) != id_map.end()) {
index = id_map[v];
......@@ -398,7 +472,7 @@ void GraphTable::clear_graph(int idx) {
}
}
int32_t GraphTable::load_next_partition(int idx) {
if (next_partition >= partitions[idx].size()) {
if (next_partition >= (int)partitions[idx].size()) {
VLOG(0) << "partition iteration is done";
return -1;
}
......@@ -426,8 +500,6 @@ int32_t GraphTable::load_edges_to_ssd(const std::string &path,
auto paths = paddle::string::split_string<std::string>(path, ";");
int64_t count = 0;
std::string sample_type = "random";
bool is_weighted = false;
int valid_count = 0;
for (auto path : paths) {
std::ifstream file(path);
std::string line;
......@@ -438,16 +510,16 @@ int32_t GraphTable::load_edges_to_ssd(const std::string &path,
if (values.size() < 2) continue;
auto src_id = std::stoll(values[0]);
auto dist_ids = paddle::string::split_string<std::string>(values[1], ";");
std::vector<int64_t> dist_data;
std::vector<uint64_t> dist_data;
for (auto x : dist_ids) {
dist_data.push_back(std::stoll(x));
total_memory_cost += sizeof(int64_t);
total_memory_cost += sizeof(uint64_t);
}
add_node_to_ssd(0,
idx,
src_id,
(char *)dist_data.data(),
(int)(dist_data.size() * sizeof(int64_t)));
(int)(dist_data.size() * sizeof(uint64_t)));
}
}
VLOG(0) << "total memory cost = " << total_memory_cost << " bytes";
......@@ -456,9 +528,6 @@ int32_t GraphTable::load_edges_to_ssd(const std::string &path,
int32_t GraphTable::dump_edges_to_ssd(int idx) {
VLOG(2) << "calling dump edges to ssd";
const int64_t fixed_size = 10000;
// std::vector<int64_t> edge_array[task_pool_size_];
std::vector<std::unordered_map<int64_t, int>> count(task_pool_size_);
std::vector<std::future<int64_t>> tasks;
auto &shards = edge_shards[idx];
for (size_t i = 0; i < shards.size(); ++i) {
......@@ -466,18 +535,17 @@ int32_t GraphTable::dump_edges_to_ssd(int idx) {
[&, i, this]() -> int64_t {
int64_t cost = 0;
std::vector<Node *> &v = shards[i]->get_bucket();
size_t ind = i % this->task_pool_size_;
for (size_t j = 0; j < v.size(); j++) {
std::vector<int64_t> s;
for (int k = 0; k < v[j]->get_neighbor_size(); k++) {
std::vector<uint64_t> s;
for (size_t k = 0; k < (int)v[j]->get_neighbor_size(); k++) {
s.push_back(v[j]->get_neighbor_id(k));
}
cost += v[j]->get_neighbor_size() * sizeof(int64_t);
cost += v[j]->get_neighbor_size() * sizeof(uint64_t);
add_node_to_ssd(0,
idx,
v[j]->get_id(),
(char *)s.data(),
s.size() * sizeof(int64_t));
s.size() * sizeof(uint64_t));
}
return cost;
}));
......@@ -489,7 +557,7 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) {
VLOG(0) << "make_complementary_graph";
const int64_t fixed_size = byte_size / 8;
// std::vector<int64_t> edge_array[task_pool_size_];
std::vector<std::unordered_map<int64_t, int>> count(task_pool_size_);
std::vector<std::unordered_map<uint64_t, int>> count(task_pool_size_);
std::vector<std::future<int>> tasks;
auto &shards = edge_shards[idx];
for (size_t i = 0; i < shards.size(); ++i) {
......@@ -499,7 +567,7 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) {
size_t ind = i % this->task_pool_size_;
for (size_t j = 0; j < v.size(); j++) {
// size_t location = v[j]->get_id();
for (int k = 0; k < v[j]->get_neighbor_size(); k++) {
for (size_t k = 0; k < v[j]->get_neighbor_size(); k++) {
count[ind][v[j]->get_neighbor_id(k)]++;
}
}
......@@ -507,9 +575,9 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) {
}));
}
for (size_t i = 0; i < tasks.size(); i++) tasks[i].get();
std::unordered_map<int64_t, int> final_count;
std::map<int, std::vector<int64_t>> count_to_id;
std::vector<int64_t> buffer;
std::unordered_map<uint64_t, int> final_count;
std::map<int, std::vector<uint64_t>> count_to_id;
std::vector<uint64_t> buffer;
clear_graph(idx);
for (int i = 0; i < task_pool_size_; i++) {
......@@ -546,6 +614,7 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) {
bucket[i]->build_sampler(sample_type);
}
}
return 0;
}
#endif
......@@ -840,7 +909,7 @@ std::vector<Node *> GraphShard::get_batch(int start, int end, int step) {
size_t GraphShard::get_size() { return bucket.size(); }
int32_t GraphTable::add_comm_edge(int idx, int64_t src_id, int64_t dst_id) {
int32_t GraphTable::add_comm_edge(int idx, uint64_t src_id, uint64_t dst_id) {
size_t src_shard_id = src_id % shard_num;
if (src_shard_id >= shard_end || src_shard_id < shard_start) {
......@@ -852,11 +921,11 @@ int32_t GraphTable::add_comm_edge(int idx, int64_t src_id, int64_t dst_id) {
return 0;
}
int32_t GraphTable::add_graph_node(int idx,
std::vector<int64_t> &id_list,
std::vector<uint64_t> &id_list,
std::vector<bool> &is_weight_list) {
auto &shards = edge_shards[idx];
size_t node_size = id_list.size();
std::vector<std::vector<std::pair<int64_t, bool>>> batch(task_pool_size_);
std::vector<std::vector<std::pair<uint64_t, bool>>> batch(task_pool_size_);
for (size_t i = 0; i < node_size; i++) {
size_t shard_id = id_list[i] % shard_num;
if (shard_id >= shard_end || shard_id < shard_start) {
......@@ -881,9 +950,9 @@ int32_t GraphTable::add_graph_node(int idx,
return 0;
}
int32_t GraphTable::remove_graph_node(int idx, std::vector<int64_t> &id_list) {
int32_t GraphTable::remove_graph_node(int idx, std::vector<uint64_t> &id_list) {
size_t node_size = id_list.size();
std::vector<std::vector<int64_t>> batch(task_pool_size_);
std::vector<std::vector<uint64_t>> batch(task_pool_size_);
for (size_t i = 0; i < node_size; i++) {
size_t shard_id = id_list[i] % shard_num;
if (shard_id >= shard_end || shard_id < shard_start) continue;
......@@ -916,7 +985,7 @@ void GraphShard::clear() {
GraphShard::~GraphShard() { clear(); }
void GraphShard::delete_node(int64_t id) {
void GraphShard::delete_node(uint64_t id) {
auto iter = node_location.find(id);
if (iter == node_location.end()) return;
int pos = iter->second;
......@@ -928,7 +997,7 @@ void GraphShard::delete_node(int64_t id) {
node_location.erase(id);
bucket.pop_back();
}
GraphNode *GraphShard::add_graph_node(int64_t id) {
GraphNode *GraphShard::add_graph_node(uint64_t id) {
if (node_location.find(id) == node_location.end()) {
node_location[id] = bucket.size();
bucket.push_back(new GraphNode(id));
......@@ -944,19 +1013,25 @@ GraphNode *GraphShard::add_graph_node(Node *node) {
}
return (GraphNode *)bucket[node_location[id]];
}
FeatureNode *GraphShard::add_feature_node(int64_t id) {
FeatureNode *GraphShard::add_feature_node(uint64_t id, bool is_overlap) {
if (node_location.find(id) == node_location.end()) {
node_location[id] = bucket.size();
bucket.push_back(new FeatureNode(id));
return (FeatureNode *)bucket[node_location[id]];
}
if (is_overlap) {
return (FeatureNode *)bucket[node_location[id]];
}
return (FeatureNode *)bucket[node_location[id]];
return NULL;
}
void GraphShard::add_neighbor(int64_t id, int64_t dst_id, float weight) {
void GraphShard::add_neighbor(uint64_t id, uint64_t dst_id, float weight) {
find_node(id)->add_edge(dst_id, weight);
}
Node *GraphShard::find_node(int64_t id) {
Node *GraphShard::find_node(uint64_t id) {
auto iter = node_location.find(id);
return iter == node_location.end() ? nullptr : bucket[iter->second];
}
......@@ -992,15 +1067,93 @@ int32_t GraphTable::Load(const std::string &path, const std::string &param) {
return 0;
}
std::string GraphTable::get_inverse_etype(std::string &etype) {
auto etype_split = paddle::string::split_string<std::string>(etype, "2");
std::string res;
if ((int)etype_split.size() == 3) {
res = etype_split[2] + "2" + etype_split[1] + "2" + etype_split[0];
} else {
res = etype_split[1] + "2" + etype_split[0];
}
return res;
}
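get_inverse_etype simply reverses the node types around the "2" separators, so an edge type such as "user2item" becomes "item2user" and "user2click2item" becomes "item2click2user". A standalone sketch of the same string manipulation:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Illustrative sketch mirroring get_inverse_etype: split on '2' and
// reverse the node types (a middle relation name stays in place).
std::string inverse_etype(const std::string &etype) {
  std::vector<std::string> parts;
  std::stringstream ss(etype);
  std::string item;
  while (std::getline(ss, item, '2')) parts.push_back(item);
  if (parts.size() == 3) return parts[2] + "2" + parts[1] + "2" + parts[0];
  return parts[1] + "2" + parts[0];
}

int main() {
  std::cout << inverse_etype("user2item") << "\n";        // item2user
  std::cout << inverse_etype("user2click2item") << "\n";  // item2click2user
  return 0;
}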
int32_t GraphTable::load_node_and_edge_file(std::string etype,
std::string ntype,
std::string epath,
std::string npath,
int part_num,
bool reverse) {
auto etypes = paddle::string::split_string<std::string>(etype, ",");
auto ntypes = paddle::string::split_string<std::string>(ntype, ",");
VLOG(0) << "etypes size: " << etypes.size();
VLOG(0) << "whether reverse: " << reverse;
std::string delim = ";";
size_t total_len = etypes.size() + 1; // 1 is for node
std::vector<std::future<int>> tasks;
for (size_t i = 0; i < total_len; i++) {
tasks.push_back(
_shards_task_pool[i % task_pool_size_]->enqueue([&, i, this]() -> int {
if (i < etypes.size()) {
std::string etype_path = epath + "/" + etypes[i];
auto etype_path_list = paddle::framework::localfs_list(etype_path);
std::string etype_path_str;
if (part_num > 0 && part_num < (int)etype_path_list.size()) {
std::vector<std::string> sub_etype_path_list(
etype_path_list.begin(), etype_path_list.begin() + part_num);
etype_path_str =
paddle::string::join_strings(sub_etype_path_list, delim);
} else {
etype_path_str =
paddle::string::join_strings(etype_path_list, delim);
}
this->load_edges(etype_path_str, false, etypes[i]);
if (reverse) {
std::string r_etype = get_inverse_etype(etypes[i]);
this->load_edges(etype_path_str, true, r_etype);
}
} else {
auto npath_list = paddle::framework::localfs_list(npath);
std::string npath_str;
if (part_num > 0 && part_num < (int)npath_list.size()) {
std::vector<std::string> sub_npath_list(
npath_list.begin(), npath_list.begin() + part_num);
npath_str = paddle::string::join_strings(sub_npath_list, delim);
} else {
npath_str = paddle::string::join_strings(npath_list, delim);
}
if (ntypes.size() == 0) {
VLOG(0) << "node_type not specified, nothing will be loaded ";
return 0;
}
if (FLAGS_graph_load_in_parallel) {
this->load_nodes(npath_str, "");
} else {
for (size_t j = 0; j < ntypes.size(); j++) {
this->load_nodes(npath_str, ntypes[j]);
}
}
}
return 0;
}));
}
for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get();
return 0;
}
int32_t GraphTable::get_nodes_ids_by_ranges(
int type_id,
int idx,
std::vector<std::pair<int, int>> ranges,
std::vector<int64_t> &res) {
std::vector<uint64_t> &res) {
std::mutex mutex;
int start = 0, end, index = 0, total_size = 0;
res.clear();
auto &shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx];
std::vector<std::future<std::vector<int64_t>>> tasks;
std::vector<std::future<size_t>> tasks;
for (size_t i = 0; i < shards.size() && index < (int)ranges.size(); i++) {
end = total_size + shards[i]->get_size();
start = total_size;
......@@ -1016,86 +1169,173 @@ int32_t GraphTable::get_nodes_ids_by_ranges(
first -= total_size;
second -= total_size;
tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue(
[&shards, this, first, second, i]() -> std::vector<int64_t> {
return shards[i]->get_ids_by_range(first, second);
[&shards, this, first, second, i, &res, &mutex]() -> size_t {
std::vector<uint64_t> keys;
shards[i]->get_ids_by_range(first, second, &keys);
size_t num = keys.size();
mutex.lock();
res.reserve(res.size() + num);
for (auto &id : keys) {
res.push_back(id);
std::swap(res[rand() % res.size()], res[(int)res.size() - 1]);
}
mutex.unlock();
return num;
}));
}
}
total_size += shards[i]->get_size();
}
for (size_t i = 0; i < tasks.size(); i++) {
auto vec = tasks[i].get();
for (auto &id : vec) {
res.push_back(id);
std::swap(res[rand() % res.size()], res[(int)res.size() - 1]);
}
tasks[i].get();
}
return 0;
}
int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) {
auto paths = paddle::string::split_string<std::string>(path, ";");
int64_t count = 0;
int64_t valid_count = 0;
int idx = 0;
if (node_type == "") {
VLOG(0) << "node_type not specified, loading edges to " << id_to_feature[0]
<< " part";
} else {
if (feature_to_id.find(node_type) == feature_to_id.end()) {
VLOG(0) << "node_type " << node_type
<< " is not defined, nothing will be loaded";
return 0;
std::pair<uint64_t, uint64_t> GraphTable::parse_node_file(
const std::string &path, const std::string &node_type, int idx) {
std::ifstream file(path);
std::string line;
uint64_t local_count = 0;
uint64_t local_valid_count = 0;
int num = 0;
std::vector<paddle::string::str_ptr> vals;
size_t n = node_type.length();
while (std::getline(file, line)) {
if (strncmp(line.c_str(), node_type.c_str(), n) != 0) {
continue;
}
idx = feature_to_id[node_type];
}
for (auto path : paths) {
std::ifstream file(path);
std::string line;
while (std::getline(file, line)) {
auto values = paddle::string::split_string<std::string>(line, "\t");
if (values.size() < 2) continue;
auto id = std::stoull(values[1]);
vals.clear();
num = paddle::string::split_string_ptr(
line.c_str() + n + 1, line.length() - n - 1, '\t', &vals);
if (num == 0) {
continue;
}
uint64_t id = std::strtoul(vals[0].ptr, NULL, 10);
size_t shard_id = id % shard_num;
if (shard_id >= shard_end || shard_id < shard_start) {
VLOG(4) << "will not load " << id << " from " << path
<< ", please check id distribution";
continue;
}
local_count++;
size_t shard_id = id % shard_num;
if (shard_id >= shard_end || shard_id < shard_start) {
VLOG(4) << "will not load " << id << " from " << path
<< ", please check id distribution";
continue;
size_t index = shard_id - shard_start;
auto node = feature_shards[idx][index]->add_feature_node(id, false);
if (node != NULL) {
node->set_feature_size(feat_name[idx].size());
for (int i = 1; i < num; ++i) {
auto &v = vals[i];
parse_feature(idx, v.ptr, v.len, node);
}
}
local_valid_count++;
}
VLOG(2) << "node_type[" << node_type << "] loads " << local_count
<< " nodes from filepath->" << path;
return {local_count, local_valid_count};
}
if (count % 1000000 == 0) {
VLOG(0) << count << " nodes are loaded from filepath";
VLOG(0) << line;
}
count++;
std::pair<uint64_t, uint64_t> GraphTable::parse_node_file(
const std::string &path) {
std::ifstream file(path);
std::string line;
uint64_t local_count = 0;
uint64_t local_valid_count = 0;
int idx = 0;
std::string nt = values[0];
if (nt != node_type) {
continue;
}
auto path_split = paddle::string::split_string<std::string>(path, "/");
auto path_name = path_split[path_split.size() - 1];
size_t index = shard_id - shard_start;
int num = 0;
std::vector<paddle::string::str_ptr> vals;
// auto node = shards[index]->add_feature_node(id);
auto node = feature_shards[idx][index]->add_feature_node(id);
node->set_feature_size(feat_name[idx].size());
while (std::getline(file, line)) {
vals.clear();
num = paddle::string::split_string_ptr(
line.c_str(), line.length(), '\t', &vals);
if (vals.empty()) {
continue;
}
std::string parse_node_type = vals[0].to_string();
auto it = feature_to_id.find(parse_node_type);
if (it == feature_to_id.end()) {
VLOG(0) << parse_node_type << " type error, please check";
continue;
}
idx = it->second;
uint64_t id = std::strtoul(vals[1].ptr, NULL, 10);
size_t shard_id = id % shard_num;
if (shard_id >= shard_end || shard_id < shard_start) {
VLOG(4) << "will not load " << id << " from " << path
<< ", please check id distribution";
continue;
}
local_count++;
size_t index = shard_id - shard_start;
auto node = feature_shards[idx][index]->add_feature_node(id, false);
if (node != NULL) {
for (int i = 2; i < num; ++i) {
auto &v = vals[i];
parse_feature(idx, v.ptr, v.len, node);
}
}
local_valid_count++;
}
VLOG(2) << local_valid_count << "/" << local_count << " nodes from filepath->"
<< path;
return {local_count, local_valid_count};
}
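Both parse_node_file variants assume tab-separated lines of the form node_type<TAB>id<TAB>feature fields, keep only ids whose shard (id % shard_num) falls inside [shard_start, shard_end), and hand each remaining field to parse_feature. A hedged sketch of reading one such line; shard_num here is a made-up value for illustration only:

#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Illustrative sketch: split one node line "ntype\tid\tfeat1\tfeat2..." the
// way parse_node_file above does, and compute the shard the id maps to.
int main() {
  const std::string line = "user\t9008\tslot1 23 45\tslot2 0.5";
  std::vector<std::string> cols;
  std::stringstream ss(line);
  std::string col;
  while (std::getline(ss, col, '\t')) cols.push_back(col);

  const std::string node_type = cols[0];
  const uint64_t id = std::strtoull(cols[1].c_str(), nullptr, 10);
  const size_t shard_num = 1000;           // assumed value, for illustration only
  const size_t shard_id = id % shard_num;  // same placement rule as GraphTable
  std::cout << node_type << " " << id << " -> shard " << shard_id << ", "
            << cols.size() - 2 << " feature fields\n";
  return 0;
}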
for (size_t slice = 2; slice < values.size(); slice++) {
auto feat = this->parse_feature(idx, values[slice]);
if (feat.first >= 0) {
node->set_feature(feat.first, feat.second);
} else {
VLOG(4) << "Node feature: " << values[slice]
<< " not in feature_map.";
}
// TODO opt load all node_types in once reading
int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) {
auto paths = paddle::string::split_string<std::string>(path, ";");
uint64_t count = 0;
uint64_t valid_count = 0;
int idx = 0;
if (FLAGS_graph_load_in_parallel) {
if (node_type == "") {
VLOG(0) << "Begin GraphTable::load_nodes(), will load all node_type once";
}
std::vector<std::future<std::pair<uint64_t, uint64_t>>> tasks;
for (size_t i = 0; i < paths.size(); i++) {
tasks.push_back(load_node_edge_task_pool->enqueue(
[&, i, this]() -> std::pair<uint64_t, uint64_t> {
return parse_node_file(paths[i]);
}));
}
for (int i = 0; i < (int)tasks.size(); i++) {
auto res = tasks[i].get();
count += res.first;
valid_count += res.second;
}
} else {
VLOG(0) << "Begin GraphTable::load_nodes() node_type[" << node_type << "]";
if (node_type == "") {
VLOG(0) << "node_type not specified, loading edges to "
<< id_to_feature[0] << " part";
} else {
if (feature_to_id.find(node_type) == feature_to_id.end()) {
VLOG(0) << "node_type " << node_type
<< " is not defined, nothing will be loaded";
return 0;
}
valid_count++;
idx = feature_to_id[node_type];
}
for (auto path : paths) {
VLOG(2) << "Begin GraphTable::load_nodes(), path[" << path << "]";
auto res = parse_node_file(path, node_type, idx);
count += res.first;
valid_count += res.second;
}
}
VLOG(0) << valid_count << "/" << count << " nodes in type " << node_type
<< " are loaded successfully in " << path;
VLOG(0) << valid_count << "/" << count << " nodes in node_type[ " << node_type
<< "] are loaded successfully!";
return 0;
}
......@@ -1108,13 +1348,71 @@ int32_t GraphTable::build_sampler(int idx, std::string sample_type) {
}
return 0;
}
std::pair<uint64_t, uint64_t> GraphTable::parse_edge_file(
const std::string &path, int idx, bool reverse) {
std::string sample_type = "random";
bool is_weighted = false;
std::ifstream file(path);
std::string line;
uint64_t local_count = 0;
uint64_t local_valid_count = 0;
uint64_t part_num = 0;
if (FLAGS_graph_load_in_parallel) {
auto path_split = paddle::string::split_string<std::string>(path, "/");
auto part_name_split = paddle::string::split_string<std::string>(
path_split[path_split.size() - 1], "-");
part_num = std::stoull(part_name_split[part_name_split.size() - 1]);
}
while (std::getline(file, line)) {
size_t start = line.find_first_of('\t');
if (start == std::string::npos) continue;
local_count++;
uint64_t src_id = std::stoull(&line[0]);
uint64_t dst_id = std::stoull(&line[start + 1]);
if (reverse) {
std::swap(src_id, dst_id);
}
size_t src_shard_id = src_id % shard_num;
if (FLAGS_graph_load_in_parallel) {
if (src_shard_id != (part_num % shard_num)) {
continue;
}
}
float weight = 1;
size_t last = line.find_last_of('\t');
if (start != last) {
weight = std::stof(&line[last + 1]);
sample_type = "weighted";
is_weighted = true;
}
if (src_shard_id >= shard_end || src_shard_id < shard_start) {
VLOG(4) << "will not load " << src_id << " from " << path
<< ", please check id distribution";
continue;
}
size_t index = src_shard_id - shard_start;
auto node = edge_shards[idx][index]->add_graph_node(src_id);
if (node != NULL) {
node->build_edges(is_weighted);
node->add_edge(dst_id, weight);
}
local_valid_count++;
}
VLOG(2) << local_count << " edges are loaded from filepath->" << path;
return {local_count, local_valid_count};
}
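parse_edge_file reads one edge per line as src_id<TAB>dst_id with an optional trailing <TAB>weight, swaps the endpoints when loading a reverse edge type, and keeps only sources whose shard belongs to this server. A minimal sketch of the per-line parsing under those assumptions:

#include <cstdint>
#include <iostream>
#include <string>

// Illustrative sketch: extract src, dst and optional weight from one edge
// line, matching the find_first_of / find_last_of logic used above.
int main() {
  const std::string line = "1001\t2002\t0.75";
  const size_t start = line.find_first_of('\t');
  const size_t last = line.find_last_of('\t');
  const uint64_t src_id = std::stoull(line.substr(0, start));
  const uint64_t dst_id = std::stoull(line.substr(start + 1));
  float weight = 1.0f;
  if (start != last) weight = std::stof(line.substr(last + 1));
  std::cout << src_id << " -> " << dst_id << " (w=" << weight << ")\n";
  return 0;
}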
int32_t GraphTable::load_edges(const std::string &path,
bool reverse_edge,
const std::string &edge_type) {
#ifdef PADDLE_WITH_HETERPS
// if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get());
if (search_level == 2) total_memory_cost = 0;
const int64_t fixed_load_edges = 1000000;
const uint64_t fixed_load_edges = 1000000;
#endif
int idx = 0;
if (edge_type == "") {
......@@ -1130,63 +1428,34 @@ int32_t GraphTable::load_edges(const std::string &path,
}
auto paths = paddle::string::split_string<std::string>(path, ";");
int64_t count = 0;
std::string sample_type = "random";
bool is_weighted = false;
int valid_count = 0;
for (auto path : paths) {
std::ifstream file(path);
std::string line;
while (std::getline(file, line)) {
auto values = paddle::string::split_string<std::string>(line, "\t");
count++;
if (values.size() < 2) continue;
auto src_id = std::stoull(values[0]);
auto dst_id = std::stoull(values[1]);
if (reverse_edge) {
std::swap(src_id, dst_id);
}
float weight = 1;
if (values.size() == 3) {
weight = std::stof(values[2]);
sample_type = "weighted";
is_weighted = true;
}
size_t src_shard_id = src_id % shard_num;
if (src_shard_id >= shard_end || src_shard_id < shard_start) {
VLOG(4) << "will not load " << src_id << " from " << path
<< ", please check id distribution";
continue;
}
if (count % 1000000 == 0) {
VLOG(0) << count << " edges are loaded from filepath";
VLOG(0) << line;
}
size_t index = src_shard_id - shard_start;
edge_shards[idx][index]->add_graph_node(src_id)->build_edges(is_weighted);
edge_shards[idx][index]->add_neighbor(src_id, dst_id, weight);
valid_count++;
#ifdef PADDLE_WITH_HETERPS
// if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get());
if (count > fixed_load_edges && search_level == 2) {
dump_edges_to_ssd(idx);
VLOG(0) << "dumping edges to ssd, edge count is reset to 0";
clear_graph(idx);
count = 0;
}
#endif
uint64_t count = 0;
uint64_t valid_count = 0;
VLOG(0) << "Begin GraphTable::load_edges() edge_type[" << edge_type << "]";
if (FLAGS_graph_load_in_parallel) {
std::vector<std::future<std::pair<uint64_t, uint64_t>>> tasks;
for (int i = 0; i < paths.size(); i++) {
tasks.push_back(load_node_edge_task_pool->enqueue(
[&, i, idx, this]() -> std::pair<uint64_t, uint64_t> {
return parse_edge_file(paths[i], idx, reverse_edge);
}));
}
for (int j = 0; j < (int)tasks.size(); j++) {
auto res = tasks[j].get();
count += res.first;
valid_count += res.second;
}
} else {
for (auto path : paths) {
auto res = parse_edge_file(path, idx, reverse_edge);
count += res.first;
valid_count += res.second;
}
}
VLOG(0) << valid_count << "/" << count << " edges are loaded successfully in "
<< path;
VLOG(0) << valid_count << "/" << count << " edge_type[" << edge_type
<< "] edges are loaded successfully";
// Build Sampler
#ifdef PADDLE_WITH_HETERPS
// if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get());
if (search_level == 2) {
if (count > 0) {
dump_edges_to_ssd(idx);
......@@ -1197,31 +1466,65 @@ int32_t GraphTable::load_edges(const std::string &path,
return 0;
}
#endif
for (auto &shard : edge_shards[idx]) {
auto bucket = shard->get_bucket();
for (size_t i = 0; i < bucket.size(); i++) {
bucket[i]->build_sampler(sample_type);
if (!build_sampler_on_cpu) {
// To reduce memory overhead, CPU samplers won't be created in gpugraph.
// In order not to affect the sampler function of other scenario,
// this optimization is only performed in load_edges function.
VLOG(0) << "run in gpugraph mode!";
} else {
std::string sample_type = "random";
VLOG(0) << "build sampler ... ";
for (auto &shard : edge_shards[idx]) {
auto bucket = shard->get_bucket();
for (size_t i = 0; i < bucket.size(); i++) {
bucket[i]->build_sampler(sample_type);
}
}
}
return 0;
}
Node *GraphTable::find_node(int type_id, int idx, int64_t id) {
Node *GraphTable::find_node(int type_id, uint64_t id) {
size_t shard_id = id % shard_num;
if (shard_id >= shard_end || shard_id < shard_start) {
return nullptr;
}
Node *node = nullptr;
size_t index = shard_id - shard_start;
auto &search_shards = type_id == 0 ? edge_shards : feature_shards;
for (auto &search_shard : search_shards) {
PADDLE_ENFORCE_NOT_NULL(search_shard[index],
paddle::platform::errors::InvalidArgument(
"search_shard[%d] should not be null.", index));
node = search_shard[index]->find_node(id);
if (node != nullptr) {
break;
}
}
return node;
}
Node *GraphTable::find_node(int type_id, int idx, uint64_t id) {
size_t shard_id = id % shard_num;
if (shard_id >= shard_end || shard_id < shard_start) {
return nullptr;
}
size_t index = shard_id - shard_start;
auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx];
PADDLE_ENFORCE_NOT_NULL(search_shards[index],
paddle::platform::errors::InvalidArgument(
"search_shard[%d] should not be null.", index));
Node *node = search_shards[index]->find_node(id);
return node;
}
uint32_t GraphTable::get_thread_pool_index(int64_t node_id) {
uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) {
return node_id % shard_num % shard_num_per_server % task_pool_size_;
}
uint32_t GraphTable::get_thread_pool_index_by_shard_index(int64_t shard_index) {
uint32_t GraphTable::get_thread_pool_index_by_shard_index(
uint64_t shard_index) {
return shard_index % shard_num_per_server % task_pool_size_;
}
......@@ -1293,9 +1596,9 @@ int32_t GraphTable::random_sample_nodes(int type_id,
}
}
for (auto &pair : first_half) second_half.push_back(pair);
std::vector<int64_t> res;
std::vector<uint64_t> res;
get_nodes_ids_by_ranges(type_id, idx, second_half, res);
actual_size = res.size() * sizeof(int64_t);
actual_size = res.size() * sizeof(uint64_t);
buffer.reset(new char[actual_size]);
char *pointer = buffer.get();
memcpy(pointer, res.data(), actual_size);
......@@ -1303,7 +1606,7 @@ int32_t GraphTable::random_sample_nodes(int type_id,
}
int32_t GraphTable::random_sample_neighbors(
int idx,
int64_t *node_ids,
uint64_t *node_ids,
int sample_size,
std::vector<std::shared_ptr<char>> &buffers,
std::vector<int> &actual_sizes,
......@@ -1323,7 +1626,7 @@ int32_t GraphTable::random_sample_neighbors(
for (int i = 0; i < (int)seq_id.size(); i++) {
if (seq_id[i].size() == 0) continue;
tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int {
int64_t node_id;
uint64_t node_id;
std::vector<std::pair<SampleKey, SampleResult>> r;
LRUResponse response = LRUResponse::blocked;
if (use_cache) {
......@@ -1369,7 +1672,7 @@ int32_t GraphTable::random_sample_neighbors(
res.size() * (need_weight ? (Node::id_size + Node::weight_size)
: Node::id_size);
int offset = 0;
int64_t id;
uint64_t id;
float weight;
char *buffer_addr = new char[actual_size];
if (response == LRUResponse::ok) {
......@@ -1405,13 +1708,13 @@ int32_t GraphTable::random_sample_neighbors(
}
int32_t GraphTable::get_node_feat(int idx,
const std::vector<int64_t> &node_ids,
const std::vector<uint64_t> &node_ids,
const std::vector<std::string> &feature_names,
std::vector<std::vector<std::string>> &res) {
size_t node_num = node_ids.size();
std::vector<std::future<int>> tasks;
for (size_t idy = 0; idy < node_num; ++idy) {
int64_t node_id = node_ids[idy];
uint64_t node_id = node_ids[idy];
tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue(
[&, idx, idy, node_id]() -> int {
Node *node = find_node(1, idx, node_id);
......@@ -1440,13 +1743,13 @@ int32_t GraphTable::get_node_feat(int idx,
int32_t GraphTable::set_node_feat(
int idx,
const std::vector<int64_t> &node_ids,
const std::vector<uint64_t> &node_ids,
const std::vector<std::string> &feature_names,
const std::vector<std::vector<std::string>> &res) {
size_t node_num = node_ids.size();
std::vector<std::future<int>> tasks;
for (size_t idy = 0; idy < node_num; ++idy) {
int64_t node_id = node_ids[idy];
uint64_t node_id = node_ids[idy];
tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue(
[&, idx, idy, node_id]() -> int {
size_t index = node_id % this->shard_num - this->shard_start;
......@@ -1469,60 +1772,247 @@ int32_t GraphTable::set_node_feat(
return 0;
}
std::pair<int32_t, std::string> GraphTable::parse_feature(
int idx, std::string feat_str) {
void string_vector_2_string(std::vector<std::string>::iterator strs_begin,
std::vector<std::string>::iterator strs_end,
char delim,
std::string *output) {
size_t i = 0;
for (std::vector<std::string>::iterator iter = strs_begin; iter != strs_end;
++iter) {
if (i > 0) {
*output += delim;
}
*output += *iter;
++i;
}
}
void string_vector_2_string(
std::vector<paddle::string::str_ptr>::iterator strs_begin,
std::vector<paddle::string::str_ptr>::iterator strs_end,
char delim,
std::string *output) {
size_t i = 0;
for (auto iter = strs_begin; iter != strs_end; ++iter) {
if (i > 0) {
output->append(&delim, 1);
}
output->append((*iter).ptr, (*iter).len);
++i;
}
}
int GraphTable::parse_feature(int idx,
const char *feat_str,
size_t len,
FeatureNode *node) {
// Parse one feature field. If the name is registered in this->feat_name,
// write the parsed bytes into the node and return 0; otherwise return -1.
auto fields = paddle::string::split_string<std::string>(feat_str, " ");
if (feat_id_map[idx].count(fields[0])) {
// if (this->feat_id_map.count(fields[0])) {
int32_t id = this->feat_id_map[idx][fields[0]];
thread_local std::vector<paddle::string::str_ptr> fields;
fields.clear();
const char c = feature_separator_.at(0);
paddle::string::split_string_ptr(feat_str, len, c, &fields);
std::string name = fields[0].to_string();
auto it = feat_id_map[idx].find(name);
if (it != feat_id_map[idx].end()) {
int32_t id = it->second;
std::string *fea_ptr = node->mutable_feature(id);
std::string dtype = this->feat_dtype[idx][id];
std::vector<std::string> values(fields.begin() + 1, fields.end());
if (dtype == "feasign") {
return std::make_pair<int32_t, std::string>(
int32_t(id), paddle::string::join_strings(values, ' '));
// string_vector_2_string(fields.begin() + 1, fields.end(), ' ',
// fea_ptr);
FeatureNode::parse_value_to_bytes<uint64_t>(
fields.begin() + 1, fields.end(), fea_ptr);
return 0;
} else if (dtype == "string") {
return std::make_pair<int32_t, std::string>(
int32_t(id), paddle::string::join_strings(values, ' '));
string_vector_2_string(fields.begin() + 1, fields.end(), ' ', fea_ptr);
return 0;
} else if (dtype == "float32") {
return std::make_pair<int32_t, std::string>(
int32_t(id), FeatureNode::parse_value_to_bytes<float>(values));
FeatureNode::parse_value_to_bytes<float>(
fields.begin() + 1, fields.end(), fea_ptr);
return 0;
} else if (dtype == "float64") {
return std::make_pair<int32_t, std::string>(
int32_t(id), FeatureNode::parse_value_to_bytes<double>(values));
FeatureNode::parse_value_to_bytes<double>(
fields.begin() + 1, fields.end(), fea_ptr);
return 0;
} else if (dtype == "int32") {
return std::make_pair<int32_t, std::string>(
int32_t(id), FeatureNode::parse_value_to_bytes<int32_t>(values));
FeatureNode::parse_value_to_bytes<int32_t>(
fields.begin() + 1, fields.end(), fea_ptr);
return 0;
} else if (dtype == "int64") {
return std::make_pair<int32_t, std::string>(
int32_t(id), FeatureNode::parse_value_to_bytes<int64_t>(values));
FeatureNode::parse_value_to_bytes<uint64_t>(
fields.begin() + 1, fields.end(), fea_ptr);
return 0;
}
} else {
VLOG(2) << "feature_name[" << name << "] is not in feat_id_map, ntype_id["
<< idx << "] feat_id_map_size[" << feat_id_map.size() << "]";
}
return -1;
}
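parse_feature splits a field on the configured feature separator into a slot name followed by its values, looks the name up in feat_id_map, and serializes the values into the node's feature buffer according to the registered dtype (feasign and int64 both go through the uint64_t path). Below is a hedged sketch of the split-and-lookup step only; the dtype table and separator are assumptions for the example and FeatureNode's byte encoding is not reproduced:

#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

// Illustrative sketch of the lookup/dispatch in parse_feature above;
// the dtype map and separator are stand-ins, not the real configuration.
int main() {
  const char feature_separator = ' ';
  const std::map<std::string, std::string> feat_dtype = {
      {"slot1", "feasign"}, {"slot2", "float32"}};

  const std::string feat_str = "slot1 23 45 67";
  std::vector<std::string> fields;
  std::stringstream ss(feat_str);
  std::string f;
  while (std::getline(ss, f, feature_separator)) fields.push_back(f);

  auto it = feat_dtype.find(fields[0]);
  if (it == feat_dtype.end()) {
    std::cout << fields[0] << " is not a registered feature name\n";
    return -1;
  }
  std::cout << fields[0] << " (" << it->second << ") has "
            << fields.size() - 1 << " values\n";
  return 0;
}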
// thread safe shard vector merge
class MergeShardVector {
public:
MergeShardVector(std::vector<std::vector<uint64_t>> *output, int slice_num) {
_slice_num = slice_num;
_shard_keys = output;
_shard_keys->resize(slice_num);
_mutexs = new std::mutex[slice_num];
}
~MergeShardVector() {
if (_mutexs != nullptr) {
delete[] _mutexs;
_mutexs = nullptr;
}
}
// merge shard keys
void merge(const std::vector<std::vector<uint64_t>> &shard_keys) {
// add to shard
for (int shard_id = 0; shard_id < _slice_num; ++shard_id) {
auto &dest = (*_shard_keys)[shard_id];
auto &src = shard_keys[shard_id];
_mutexs[shard_id].lock();
dest.insert(dest.end(), src.begin(), src.end());
_mutexs[shard_id].unlock();
}
}
private:
int _slice_num = 0;
std::mutex *_mutexs = nullptr;
std::vector<std::vector<uint64_t>> *_shard_keys;
};
int GraphTable::get_all_id(int type_id,
int slice_num,
std::vector<std::vector<uint64_t>> *output) {
MergeShardVector shard_merge(output, slice_num);
auto &search_shards = type_id == 0 ? edge_shards : feature_shards;
std::vector<std::future<size_t>> tasks;
for (int idx = 0; idx < search_shards.size(); idx++) {
for (int j = 0; j < search_shards[idx].size(); j++) {
tasks.push_back(_shards_task_pool[j % task_pool_size_]->enqueue(
[&search_shards, idx, j, slice_num, &shard_merge]() -> size_t {
std::vector<std::vector<uint64_t>> shard_keys;
size_t num =
search_shards[idx][j]->get_all_id(&shard_keys, slice_num);
// add to shard
shard_merge.merge(shard_keys);
return num;
}));
}
}
for (size_t i = 0; i < tasks.size(); ++i) {
tasks[i].wait();
}
return 0;
}
int GraphTable::get_all_neighbor_id(
int type_id, int slice_num, std::vector<std::vector<uint64_t>> *output) {
MergeShardVector shard_merge(output, slice_num);
auto &search_shards = type_id == 0 ? edge_shards : feature_shards;
std::vector<std::future<size_t>> tasks;
for (int idx = 0; idx < search_shards.size(); idx++) {
for (int j = 0; j < search_shards[idx].size(); j++) {
tasks.push_back(_shards_task_pool[j % task_pool_size_]->enqueue(
[&search_shards, idx, j, slice_num, &shard_merge]() -> size_t {
std::vector<std::vector<uint64_t>> shard_keys;
size_t num = search_shards[idx][j]->get_all_neighbor_id(&shard_keys,
slice_num);
// add to shard
shard_merge.merge(shard_keys);
return num;
}));
}
}
return std::make_pair<int32_t, std::string>(-1, "");
for (size_t i = 0; i < tasks.size(); ++i) {
tasks[i].wait();
}
return 0;
}
std::vector<std::vector<int64_t>> GraphTable::get_all_id(int type_id,
int idx,
int slice_num) {
std::vector<std::vector<int64_t>> res(slice_num);
int GraphTable::get_all_id(int type_id,
int idx,
int slice_num,
std::vector<std::vector<uint64_t>> *output) {
MergeShardVector shard_merge(output, slice_num);
auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx];
std::vector<std::future<std::vector<int64_t>>> tasks;
std::vector<std::future<size_t>> tasks;
VLOG(3) << "begin task, task_pool_size_[" << task_pool_size_ << "]";
for (size_t i = 0; i < search_shards.size(); i++) {
tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue(
[&search_shards, i]() -> std::vector<int64_t> {
return search_shards[i]->get_all_id();
[&search_shards, i, slice_num, &shard_merge]() -> size_t {
std::vector<std::vector<uint64_t>> shard_keys;
size_t num = search_shards[i]->get_all_id(&shard_keys, slice_num);
// add to shard
shard_merge.merge(shard_keys);
return num;
}));
}
for (size_t i = 0; i < tasks.size(); ++i) {
tasks[i].wait();
}
for (size_t i = 0; i < tasks.size(); i++) {
auto ids = tasks[i].get();
for (auto &id : ids) res[(uint64_t)(id) % slice_num].push_back(id);
VLOG(3) << "end task, task_pool_size_[" << task_pool_size_ << "]";
return 0;
}
int GraphTable::get_all_neighbor_id(
int type_id,
int idx,
int slice_num,
std::vector<std::vector<uint64_t>> *output) {
MergeShardVector shard_merge(output, slice_num);
auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx];
std::vector<std::future<size_t>> tasks;
VLOG(3) << "begin task, task_pool_size_[" << task_pool_size_ << "]";
for (int i = 0; i < search_shards.size(); i++) {
tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue(
[&search_shards, i, slice_num, &shard_merge]() -> size_t {
std::vector<std::vector<uint64_t>> shard_keys;
size_t num =
search_shards[i]->get_all_neighbor_id(&shard_keys, slice_num);
// add to shard
shard_merge.merge(shard_keys);
return num;
}));
}
return res;
for (size_t i = 0; i < tasks.size(); ++i) {
tasks[i].wait();
}
VLOG(3) << "end task, task_pool_size_[" << task_pool_size_ << "]";
return 0;
}
int GraphTable::get_all_feature_ids(
int type_id,
int idx,
int slice_num,
std::vector<std::vector<uint64_t>> *output) {
MergeShardVector shard_merge(output, slice_num);
auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx];
std::vector<std::future<size_t>> tasks;
for (int i = 0; i < search_shards.size(); i++) {
tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue(
[&search_shards, i, slice_num, &shard_merge]() -> size_t {
std::vector<std::vector<uint64_t>> shard_keys;
size_t num =
search_shards[i]->get_all_feature_ids(&shard_keys, slice_num);
// add to shard
shard_merge.merge(shard_keys);
return num;
}));
}
for (size_t i = 0; i < tasks.size(); ++i) {
tasks[i].wait();
}
return 0;
}
int32_t GraphTable::pull_graph_list(int type_id,
int idx,
int start,
......@@ -1576,7 +2066,11 @@ int32_t GraphTable::pull_graph_list(int type_id,
return 0;
}
int32_t GraphTable::get_server_index_by_id(int64_t id) {
void GraphTable::set_feature_separator(const std::string &ch) {
feature_separator_ = ch;
}
int32_t GraphTable::get_server_index_by_id(uint64_t id) {
return id % shard_num / shard_num_per_server;
}
int32_t GraphTable::Initialize(const TableParameter &config,
......@@ -1617,6 +2111,7 @@ void GraphTable::load_node_weight(int type_id, int idx, std::string path) {
}
int32_t GraphTable::Initialize(const GraphParameter &graph) {
task_pool_size_ = graph.task_pool_size();
build_sampler_on_cpu = graph.build_sampler_on_cpu();
#ifdef PADDLE_WITH_HETERPS
_db = NULL;
......@@ -1651,6 +2146,8 @@ int32_t GraphTable::Initialize(const GraphParameter &graph) {
_shards_task_pool[i].reset(new ::ThreadPool(1));
_shards_task_rng_pool.push_back(paddle::framework::GetCPURandomEngine(0));
}
load_node_edge_task_pool.reset(new ::ThreadPool(load_thread_num));
auto graph_feature = graph.graph_feature();
auto node_types = graph.node_types();
auto edge_types = graph.edge_types();
......
......@@ -58,33 +58,80 @@ class GraphShard {
~GraphShard();
std::vector<Node *> &get_bucket() { return bucket; }
std::vector<Node *> get_batch(int start, int end, int step);
void get_ids_by_range(int start, int end, std::vector<uint64_t> *res) {
res->reserve(res->size() + end - start);
for (int i = start; i < end && i < (int)bucket.size(); i++) {
res->emplace_back(bucket[i]->get_id());
}
}
size_t get_all_id(std::vector<std::vector<uint64_t>> *shard_keys,
int slice_num) {
int bucket_num = bucket.size();
shard_keys->resize(slice_num);
for (int i = 0; i < slice_num; ++i) {
(*shard_keys)[i].reserve(bucket_num / slice_num);
}
for (int i = 0; i < bucket_num; i++) {
uint64_t k = bucket[i]->get_id();
(*shard_keys)[k % slice_num].emplace_back(k);
}
return bucket_num;
}
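// Flatten every neighbor list in this shard into a single key vector, then
// dedup it and slice it by key % slice_num.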
size_t get_all_neighbor_id(std::vector<std::vector<uint64_t>> *total_res,
int slice_num) {
std::vector<uint64_t> keys;
for (size_t i = 0; i < bucket.size(); i++) {
size_t neighbor_size = bucket[i]->get_neighbor_size();
size_t n = keys.size();
keys.resize(n + neighbor_size);
for (size_t j = 0; j < neighbor_size; j++) {
keys[n + j] = bucket[i]->get_neighbor_id(j);
}
}
return dedup2shard_keys(&keys, total_res, slice_num);
}
size_t get_all_feature_ids(std::vector<std::vector<uint64_t>> *total_res,
int slice_num) {
std::vector<uint64_t> keys;
for (int i = 0; i < (int)bucket.size(); i++) {
bucket[i]->get_feature_ids(&keys);
}
return dedup2shard_keys(&keys, total_res, slice_num);
}
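// Sort the collected keys, skip duplicates, and distribute the remainder into
// slice_num buckets by key % slice_num; the return value is the key count
// before deduplication.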
size_t dedup2shard_keys(std::vector<uint64_t> *keys,
std::vector<std::vector<uint64_t>> *total_res,
int slice_num) {
size_t num = keys->size();
uint64_t last_key = 0;
// sort key insert to vector
std::sort(keys->begin(), keys->end());
total_res->resize(slice_num);
for (int shard_id = 0; shard_id < slice_num; ++shard_id) {
(*total_res)[shard_id].reserve(num / slice_num);
}
for (size_t i = 0; i < num; ++i) {
const uint64_t &k = (*keys)[i];
if (i > 0 && last_key == k) {
continue;
}
last_key = k;
(*total_res)[k % slice_num].push_back(k);
}
return num;
}
GraphNode *add_graph_node(int64_t id);
GraphNode *add_graph_node(uint64_t id);
GraphNode *add_graph_node(Node *node);
FeatureNode *add_feature_node(int64_t id);
Node *find_node(int64_t id);
void delete_node(int64_t id);
FeatureNode *add_feature_node(uint64_t id, bool is_overlap = true);
Node *find_node(uint64_t id);
void delete_node(uint64_t id);
void clear();
void add_neighbor(uint64_t id, uint64_t dst_id, float weight);
std::unordered_map<uint64_t, int> &get_node_location() {
return node_location;
}
private:
std::unordered_map<uint64_t, int> node_location;
std::vector<Node *> bucket;
};
......@@ -92,11 +139,11 @@ enum LRUResponse { ok = 0, blocked = 1, err = 2 };
struct SampleKey {
int idx;
uint64_t node_key;
size_t sample_size;
bool is_weighted;
SampleKey(int _idx,
uint64_t _node_key,
size_t _sample_size,
bool _is_weighted) {
idx = _idx;
......@@ -467,7 +514,7 @@ class GraphTable : public Table {
virtual int32_t random_sample_neighbors(
int idx,
uint64_t *node_ids,
int sample_size,
std::vector<std::shared_ptr<char>> &buffers,
std::vector<int> &actual_sizes,
......@@ -483,30 +530,62 @@ class GraphTable : public Table {
int type_id,
int idx,
std::vector<std::pair<int, int>> ranges,
std::vector<uint64_t> &res);
virtual int32_t Initialize() { return 0; }
virtual int32_t Initialize(const TableParameter &config,
const FsClientParameter &fs_config);
virtual int32_t Initialize(const GraphParameter &config);
int32_t Load(const std::string &path, const std::string &param);
int32_t load_node_and_edge_file(std::string etype,
std::string ntype,
std::string epath,
std::string npath,
int part_num,
bool reverse);
std::string get_inverse_etype(std::string &etype);
int32_t load_edges(const std::string &path,
bool reverse,
const std::string &edge_type);
std::vector<std::vector<int64_t>> get_all_id(int type,
int idx,
int slice_num);
int32_t load_nodes(const std::string &path, std::string node_type);
int get_all_id(int type,
int slice_num,
std::vector<std::vector<uint64_t>> *output);
int get_all_neighbor_id(int type,
int slice_num,
std::vector<std::vector<uint64_t>> *output);
int get_all_id(int type,
int idx,
int slice_num,
std::vector<std::vector<uint64_t>> *output);
int get_all_neighbor_id(int type_id,
int id,
int slice_num,
std::vector<std::vector<uint64_t>> *output);
int get_all_feature_ids(int type,
int idx,
int slice_num,
std::vector<std::vector<uint64_t>> *output);
int32_t load_nodes(const std::string &path,
std::string node_type = std::string());
std::pair<uint64_t, uint64_t> parse_edge_file(const std::string &path,
int idx,
bool reverse);
std::pair<uint64_t, uint64_t> parse_node_file(const std::string &path,
const std::string &node_type,
int idx);
std::pair<uint64_t, uint64_t> parse_node_file(const std::string &path);
int32_t add_graph_node(int idx,
std::vector<int64_t> &id_list,
std::vector<uint64_t> &id_list,
std::vector<bool> &is_weight_list);
int32_t remove_graph_node(int idx, std::vector<int64_t> &id_list);
int32_t remove_graph_node(int idx, std::vector<uint64_t> &id_list);
int32_t get_server_index_by_id(int64_t id);
Node *find_node(int type_id, int idx, int64_t id);
int32_t get_server_index_by_id(uint64_t id);
Node *find_node(int type_id, int idx, uint64_t id);
Node *find_node(int type_id, uint64_t id);
virtual int32_t Pull(TableContext &context) { return 0; }
virtual int32_t Push(TableContext &context) { return 0; }
......@@ -531,19 +610,21 @@ class GraphTable : public Table {
this->server_num = server_num;
return 0;
}
virtual uint32_t get_thread_pool_index_by_shard_index(int64_t shard_index);
virtual uint32_t get_thread_pool_index(int64_t node_id);
virtual std::pair<int32_t, std::string> parse_feature(int idx,
std::string feat_str);
virtual uint32_t get_thread_pool_index_by_shard_index(uint64_t shard_index);
virtual uint32_t get_thread_pool_index(uint64_t node_id);
virtual int parse_feature(int idx,
const char *feat_str,
size_t len,
FeatureNode *node);
virtual int32_t get_node_feat(int idx,
const std::vector<int64_t> &node_ids,
const std::vector<uint64_t> &node_ids,
const std::vector<std::string> &feature_names,
std::vector<std::vector<std::string>> &res);
virtual int32_t set_node_feat(
int idx,
const std::vector<int64_t> &node_ids,
const std::vector<uint64_t> &node_ids,
const std::vector<std::string> &feature_names,
const std::vector<std::vector<std::string>> &res);
......@@ -578,22 +659,24 @@ class GraphTable : public Table {
virtual void export_partition_files(int idx, std::string file_path);
virtual char *random_sample_neighbor_from_ssd(
int idx,
int64_t id,
uint64_t id,
int sample_size,
const std::shared_ptr<std::mt19937_64> rng,
int &actual_size);
virtual int32_t add_node_to_ssd(
int type_id, int idx, int64_t src_id, char *data, int len);
int type_id, int idx, uint64_t src_id, char *data, int len);
virtual paddle::framework::GpuPsCommGraph make_gpu_ps_graph(
int idx, std::vector<int64_t> ids);
int idx, std::vector<uint64_t> ids);
virtual paddle::framework::GpuPsCommGraphFea make_gpu_ps_graph_fea(
std::vector<uint64_t> &node_ids, int slot_num);
int32_t Load_to_ssd(const std::string &path, const std::string &param);
int64_t load_graph_to_memory_from_ssd(int idx, std::vector<int64_t> &ids);
int64_t load_graph_to_memory_from_ssd(int idx, std::vector<uint64_t> &ids);
int32_t make_complementary_graph(int idx, int64_t byte_size);
int32_t dump_edges_to_ssd(int idx);
int32_t get_partition_num(int idx) { return partitions[idx].size(); }
std::vector<uint64_t> get_partition(int idx, int index) {
if (idx >= (int)partitions.size() || index >= (int)partitions[idx].size())
return std::vector<uint64_t>();
return partitions[idx][index];
}
int32_t load_edges_to_ssd(const std::string &path,
......@@ -603,17 +686,20 @@ class GraphTable : public Table {
void set_search_level(int search_level) { this->search_level = search_level; }
int search_level;
int64_t total_memory_cost;
std::vector<std::vector<std::vector<uint64_t>>> partitions;
int next_partition;
#endif
virtual int32_t add_comm_edge(int idx, int64_t src_id, int64_t dst_id);
virtual int32_t add_comm_edge(int idx, uint64_t src_id, uint64_t dst_id);
virtual int32_t build_sampler(int idx, std::string sample_type = "random");
void set_feature_separator(const std::string &ch);
std::vector<std::vector<GraphShard *>> edge_shards, feature_shards;
size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num;
int task_pool_size_ = 24;
int load_thread_num = 160;
const int random_sample_nodes_ranges = 3;
std::vector<std::vector<std::unordered_map<uint64_t, double>>> node_weight;
std::vector<std::vector<std::string>> feat_name;
std::vector<std::vector<std::string>> feat_dtype;
std::vector<std::vector<int32_t>> feat_shape;
......@@ -625,21 +711,24 @@ class GraphTable : public Table {
std::vector<std::shared_ptr<::ThreadPool>> _shards_task_pool;
std::vector<std::shared_ptr<std::mt19937_64>> _shards_task_rng_pool;
std::shared_ptr<::ThreadPool> load_node_edge_task_pool;
std::shared_ptr<ScaledLRU<SampleKey, SampleResult>> scaled_lru;
std::unordered_set<uint64_t> extra_nodes;
std::unordered_map<uint64_t, size_t> extra_nodes_to_thread_index;
bool use_cache, use_duplicate_nodes;
int cache_size_limit;
int cache_ttl;
mutable std::mutex mutex_;
bool build_sampler_on_cpu;
std::shared_ptr<pthread_rwlock_t> rw_lock;
#ifdef PADDLE_WITH_HETERPS
// paddle::framework::GpuPsGraphTable gpu_graph_table;
paddle::distributed::RocksDBHandler *_db;
// std::shared_ptr<::ThreadPool> graph_sample_pool;
// std::shared_ptr<GraphSampler> graph_sampler;
// REGISTER_GRAPH_FRIEND_CLASS(2, CompleteGraphSampler, BasicBfsGraphSampler)
#endif
std::string feature_separator_ = std::string(" ");
};
/*
......@@ -657,7 +746,7 @@ class CompleteGraphSampler : public GraphSampler {
protected:
GraphTable *graph_table;
std::vector<std::vector<paddle::framework::GpuPsGraphNode>> sample_nodes;
std::vector<std::vector<uint64_t>> sample_neighbors;
// std::vector<GpuPsCommGraph> sample_res;
// std::shared_ptr<std::mt19937_64> random;
int gpu_num;
......@@ -676,11 +765,11 @@ class BasicBfsGraphSampler : public GraphSampler {
GraphTable *graph_table;
// std::vector<std::vector<GpuPsGraphNode>> sample_nodes;
std::vector<std::vector<paddle::framework::GpuPsGraphNode>> sample_nodes;
std::vector<std::vector<uint64_t>> sample_neighbors;
size_t gpu_num;
int init_search_size, node_num_for_each_shard, edge_num_for_each_node;
int rounds, interval;
std::vector<std::unordered_map<uint64_t, std::vector<uint64_t>>>
sample_neighbors_map;
};
#endif
......
......@@ -16,10 +16,15 @@
#include <cstring>
#include <iostream>
#include <memory>
#include <set>
#include <sstream>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/string_helper.h"
namespace paddle {
namespace distributed {
......@@ -30,6 +35,7 @@ class Node {
virtual ~Node() {}
static int id_size, int_size, weight_size;
uint64_t get_id() { return id; }
int64_t get_py_id() { return (int64_t)id; }
void set_id(uint64_t id) { this->id = id; }
virtual void build_edges(bool is_weighted) {}
......@@ -46,7 +52,11 @@ class Node {
virtual void to_buffer(char *buffer, bool need_feature);
virtual void recover_from_buffer(char *buffer);
virtual std::string get_feature(int idx) { return std::string(""); }
virtual int get_feature_ids(std::vector<uint64_t> *res) const { return 0; }
virtual int get_feature_ids(int slot_idx, std::vector<uint64_t> *res) const {
return 0;
}
virtual void set_feature(int idx, const std::string &str) {}
virtual void set_feature_size(int size) {}
virtual int get_feature_size() { return 0; }
virtual size_t get_neighbor_size() { return 0; }
......@@ -95,7 +105,64 @@ class FeatureNode : public Node {
}
}
virtual int get_feature_ids(std::vector<uint64_t> *res) const {
PADDLE_ENFORCE_NOT_NULL(res,
paddle::platform::errors::InvalidArgument(
"get_feature_ids res should not be null"));
errno = 0;
for (auto &feature_item : feature) {
const uint64_t *feas = (const uint64_t *)(feature_item.c_str());
size_t num = feature_item.length() / sizeof(uint64_t);
CHECK((feature_item.length() % sizeof(uint64_t)) == 0)
<< "bad feature_item: [" << feature_item << "]";
size_t n = res->size();
res->resize(n + num);
for (size_t i = 0; i < num; ++i) {
(*res)[n + i] = feas[i];
}
}
PADDLE_ENFORCE_EQ(
errno,
0,
paddle::platform::errors::InvalidArgument(
"get_feature_ids get errno should be 0, but got %d.", errno));
return 0;
}
virtual int get_feature_ids(int slot_idx, std::vector<uint64_t> *res) const {
PADDLE_ENFORCE_NOT_NULL(res,
paddle::platform::errors::InvalidArgument(
"get_feature_ids res should not be null"));
res->clear();
errno = 0;
if (slot_idx < (int)this->feature.size()) {
const std::string &s = this->feature[slot_idx];
const uint64_t *feas = (const uint64_t *)(s.c_str());
size_t num = s.length() / sizeof(uint64_t);
CHECK((s.length() % sizeof(uint64_t)) == 0)
<< "bad feature_item: [" << s << "]";
res->resize(num);
for (size_t i = 0; i < num; ++i) {
(*res)[i] = feas[i];
}
}
PADDLE_ENFORCE_EQ(
errno,
0,
paddle::platform::errors::InvalidArgument(
"get_feature_ids get errno should be 0, but got %d.", errno));
return 0;
}
virtual std::string *mutable_feature(int idx) {
if (idx >= (int)this->feature.size()) {
this->feature.resize(idx + 1);
}
return &(this->feature[idx]);
}
virtual void set_feature(int idx, const std::string &str) {
if (idx >= (int)this->feature.size()) {
this->feature.resize(idx + 1);
}
......@@ -117,6 +184,23 @@ class FeatureNode : public Node {
return std::string(buffer, Tsize);
}
template <typename T>
static void parse_value_to_bytes(
std::vector<std::string>::iterator feat_str_begin,
std::vector<std::string>::iterator feat_str_end,
std::string *output) {
T v;
size_t feat_str_size = feat_str_end - feat_str_begin;
size_t Tsize = sizeof(T) * feat_str_size;
char buffer[Tsize];
for (size_t i = 0; i < feat_str_size; i++) {
std::stringstream ss(*(feat_str_begin + i));
ss >> v;
std::memcpy(buffer + sizeof(T) * i, (char *)&v, sizeof(T));
}
output->assign(buffer, Tsize);
}
template <typename T>
static std::vector<T> parse_bytes_to_array(std::string feat_str) {
T v;
......@@ -131,8 +215,28 @@ class FeatureNode : public Node {
return out;
}
template <typename T>
static void parse_value_to_bytes(
std::vector<paddle::string::str_ptr>::iterator feat_str_begin,
std::vector<paddle::string::str_ptr>::iterator feat_str_end,
std::string *output) {
size_t feat_str_size = feat_str_end - feat_str_begin;
size_t Tsize = sizeof(T) * feat_str_size;
size_t num = output->length();
output->resize(num + Tsize);
T *fea_ptrs = (T *)(&(*output)[num]);
thread_local paddle::string::str_ptr_stream ss;
for (size_t i = 0; i < feat_str_size; i++) {
ss.reset(*(feat_str_begin + i));
ss >> fea_ptrs[i];
}
}
protected:
std::vector<std::string> feature;
};
} // namespace distributed
} // namespace paddle
......@@ -41,14 +41,14 @@ namespace paddle {
namespace distributed {
int32_t MemorySparseTable::Initialize() {
auto& profiler = CostProfiler::instance();
profiler.register_profiler("pserver_sparse_update_all");
profiler.register_profiler("pserver_sparse_select_all");
InitializeValue();
_shards_task_pool.resize(_task_pool_size);
for (int i = 0; i < _shards_task_pool.size(); ++i) {
_shards_task_pool[i].reset(new ::ThreadPool(1));
}
VLOG(0) << "initalize MemorySparseTable succ";
return 0;
}
......@@ -65,9 +65,13 @@ int32_t MemorySparseTable::InitializeValue() {
_real_local_shard_num =
_real_local_shard_num < 0 ? 0 : _real_local_shard_num;
}
#ifdef PADDLE_WITH_HETERPS
_task_pool_size = _sparse_table_shard_num;
#endif
VLOG(1) << "memory sparse table _avg_local_shard_num: "
<< _avg_local_shard_num
<< " _real_local_shard_num: " << _real_local_shard_num;
<< " _real_local_shard_num: " << _real_local_shard_num
<< " _task_pool_size:" << _task_pool_size;
_local_shards.reset(new shard_type[_real_local_shard_num]);
......@@ -336,7 +340,11 @@ int32_t MemorySparseTable::Save(const std::string& dirname,
size_t file_start_idx = _avg_local_shard_num * _shard_idx;
#ifdef PADDLE_WITH_GPU_GRAPH
int thread_num = _real_local_shard_num;
#else
int thread_num = _real_local_shard_num < 20 ? _real_local_shard_num : 20;
#endif
omp_set_num_threads(thread_num);
#pragma omp parallel for schedule(dynamic)
for (int i = 0; i < _real_local_shard_num; ++i) {
......
......@@ -112,7 +112,7 @@ class MemorySparseTable : public Table {
virtual int32_t LoadPatch(const std::vector<std::string>& file_list,
int save_param);
int _task_pool_size = 24;
int _avg_local_shard_num;
int _real_local_shard_num;
int _sparse_table_shard_num;
......
......@@ -126,13 +126,20 @@ message TableParameter {
message TableAccessorParameter {
optional string accessor_class = 1;
optional uint32 fea_dim = 4 [ default = 11 ]; // field size of one value
optional uint32 embedx_dim = 5 [ default = 8 ]; // embedx feature size
optional uint32 embedx_threshold = 6 [ default = 10 ]; // embedx feature create threshold
optional CtrAccessorParameter ctr_accessor_param = 7;
repeated TableAccessorSaveParameter table_accessor_save_param = 8;
optional SparseCommonSGDRuleParameter embed_sgd_param = 10;
optional SparseCommonSGDRuleParameter embedx_sgd_param = 11;
optional GraphSGDParameter graph_sgd_param = 12;
}
message GraphSGDParameter {
optional uint32 nodeid_slot = 1 [ default = 9008 ];
optional float feature_learning_rate = 2 [ default = 0.05 ];
}
message CtrAccessorParameter {
......@@ -232,6 +239,7 @@ message GraphParameter {
optional string table_type = 9 [ default = "" ];
optional int32 shard_num = 10 [ default = 127 ];
optional int32 search_level = 11 [ default = 1 ];
optional bool build_sampler_on_cpu = 12 [ default = true ];
}
message GraphFeature {
......
......@@ -740,6 +740,19 @@ if(WITH_DISTRIBUTE)
set_source_files_properties(
heterxpu_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
elseif(WITH_PSCORE)
# cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
# dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
# heterxpu_trainer.cc heter_pipeline_trainer.cc
# data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc
# downpour_worker.cc downpour_lite_worker.cc downpour_worker_opt.cc data_feed.cu
# pull_dense_worker.cc section_worker.cc heter_section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
# device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
# index_sampler index_wrapper sampler index_dataset_proto
# lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper metrics lodtensor_printer feed_fetch_method
# graph_to_program_pass variable_helper timer monitor
# heter_service_proto fleet heter_server brpc fleet_executor
# graph_gpu_wrapper)
cc_library(
executor
SRCS executor.cc
......@@ -1001,21 +1014,41 @@ cc_library(
DEPS parallel_executor)
if(WITH_PSCORE)
get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
if(WITH_HETERPS)
cc_test(
dist_multi_trainer_test
SRCS dist_multi_trainer_test.cc
DEPS conditional_block_op executor gloo_wrapper ${RPC_DEPS}
graph_gpu_wrapper)
cc_test(
heter_pipeline_trainer_test
SRCS heter_pipeline_trainer_test.cc
DEPS conditional_block_op
scale_op
heter_listen_and_serv_op
executor
heter_server
gloo_wrapper
eigen_function
${RPC_DEPS}
graph_gpu_wrapper)
else()
cc_test(
dist_multi_trainer_test
SRCS dist_multi_trainer_test.cc
DEPS conditional_block_op executor gloo_wrapper ${RPC_DEPS})
cc_test(
heter_pipeline_trainer_test
SRCS heter_pipeline_trainer_test.cc
DEPS conditional_block_op
scale_op
heter_listen_and_serv_op
executor
heter_server
gloo_wrapper
eigen_function
${RPC_DEPS})
endif()
else()
cc_test(
dist_multi_trainer_test
......
......@@ -2108,6 +2108,9 @@ void SlotRecordInMemoryDataFeed::Init(const DataFeedDesc& data_feed_desc) {
} else {
so_parser_name_.clear();
}
#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS)
gpu_graph_data_generator_.SetConfig(data_feed_desc);
#endif
}
void SlotRecordInMemoryDataFeed::LoadIntoMemory() {
......@@ -2644,6 +2647,9 @@ bool SlotRecordInMemoryDataFeed::Start() {
#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
CHECK(paddle::platform::is_gpu_place(this->place_));
pack_ = BatchGpuPackMgr().get(this->GetPlace(), used_slots_info_);
#endif
#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS)
gpu_graph_data_generator_.AllocResource(this->place_, feed_vec_);
#endif
return true;
}
......@@ -2651,27 +2657,33 @@ bool SlotRecordInMemoryDataFeed::Start() {
int SlotRecordInMemoryDataFeed::Next() {
#ifdef _LINUX
this->CheckStart();
VLOG(3) << "enable heter next: " << offset_index_
<< " batch_offsets: " << batch_offsets_.size();
if (offset_index_ >= batch_offsets_.size()) {
VLOG(3) << "offset_index: " << offset_index_
if (!gpu_graph_mode_) {
VLOG(3) << "enable heter next: " << offset_index_
<< " batch_offsets: " << batch_offsets_.size();
return 0;
}
auto& batch = batch_offsets_[offset_index_++];
this->batch_size_ = batch.second;
VLOG(3) << "batch_size_=" << this->batch_size_
<< ", thread_id=" << thread_id_;
if (this->batch_size_ != 0) {
PutToFeedVec(&records_[batch.first], this->batch_size_);
if (offset_index_ >= batch_offsets_.size()) {
VLOG(3) << "offset_index: " << offset_index_
<< " batch_offsets: " << batch_offsets_.size();
return 0;
}
auto& batch = batch_offsets_[offset_index_++];
this->batch_size_ = batch.second;
VLOG(3) << "batch_size_=" << this->batch_size_
<< ", thread_id=" << thread_id_;
if (this->batch_size_ != 0) {
PutToFeedVec(&records_[batch.first], this->batch_size_);
} else {
VLOG(3) << "finish reading for heterps, batch size zero, thread_id="
<< thread_id_;
}
VLOG(3) << "enable heter next: " << offset_index_
<< " batch_offsets: " << batch_offsets_.size()
<< " baych_size: " << this->batch_size_;
} else {
VLOG(3) << "finish reading for heterps, batch size zero, thread_id="
<< thread_id_;
VLOG(3) << "datafeed in gpu graph mode";
#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS)
this->batch_size_ = gpu_graph_data_generator_.GenerateBatch();
#endif
}
VLOG(3) << "enable heter next: " << offset_index_
<< " batch_offsets: " << batch_offsets_.size()
<< " baych_size: " << this->batch_size_;
return this->batch_size_;
#else
......
......@@ -18,6 +18,15 @@ limitations under the License. */
#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
#include "paddle/fluid/framework/data_feed.h"
#include <thrust/device_ptr.h>
#include <thrust/random.h>
#include <thrust/shuffle.h>
#include <sstream>
#include "cub/cub.cuh"
#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h"
#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h"
DECLARE_bool(enable_opt_get_features);
namespace paddle {
namespace framework {
......@@ -182,6 +191,1012 @@ void SlotRecordInMemoryDataFeed::CopyForTensor(
cudaStreamSynchronize(stream);
}
__global__ void GraphFillCVMKernel(int64_t *tensor, int len) {
CUDA_KERNEL_LOOP(idx, len) { tensor[idx] = 1; }
}
__global__ void CopyDuplicateKeys(int64_t *dist_tensor,
uint64_t *src_tensor,
int len) {
CUDA_KERNEL_LOOP(idx, len) {
dist_tensor[idx * 2] = src_tensor[idx];
dist_tensor[idx * 2 + 1] = src_tensor[idx];
}
}
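// Advance the pair-extraction state machine: try the next window step, then
// the next central word, then the next batch of walk rows; returns the number
// of rows available, or 0 when the walk buffer is exhausted.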
int GraphDataGenerator::AcquireInstance(BufState *state) {
//
if (state->GetNextStep()) {
state->Debug();
return state->len;
} else if (state->GetNextCentrolWord()) {
state->Debug();
return state->len;
} else if (state->GetNextBatch()) {
state->Debug();
return state->len;
}
return 0;
}
// TODO opt
__global__ void GraphFillFeatureKernel(uint64_t *id_tensor,
int *fill_ins_num,
uint64_t *walk,
uint64_t *feature,
int *row,
int central_word,
int step,
int len,
int col_num,
int slot_num) {
__shared__ int32_t local_key[CUDA_NUM_THREADS * 16];
__shared__ int local_num;
__shared__ int global_num;
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (threadIdx.x == 0) {
local_num = 0;
}
__syncthreads();
if (idx < len) {
int src = row[idx] * col_num + central_word;
if (walk[src] != 0 && walk[src + step] != 0) {
size_t dst = atomicAdd(&local_num, 1);
for (int i = 0; i < slot_num; ++i) {
local_key[dst * 2 * slot_num + i * 2] = feature[src * slot_num + i];
local_key[dst * 2 * slot_num + i * 2 + 1] =
feature[(src + step) * slot_num + i];
}
}
}
__syncthreads();
if (threadIdx.x == 0) {
global_num = atomicAdd(fill_ins_num, local_num);
}
__syncthreads();
if (threadIdx.x < local_num) {
for (int i = 0; i < slot_num; ++i) {
id_tensor[(global_num * 2 + 2 * threadIdx.x) * slot_num + i] =
local_key[(2 * threadIdx.x) * slot_num + i];
id_tensor[(global_num * 2 + 2 * threadIdx.x + 1) * slot_num + i] =
local_key[(2 * threadIdx.x + 1) * slot_num + i];
}
}
}
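// Emit (central word, context word) id pairs from the walk matrix. Each block
// counts its valid pairs in shared memory, reserves a contiguous range in the
// output with atomicAdd on fill_ins_num, then writes the pairs out.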
__global__ void GraphFillIdKernel(uint64_t *id_tensor,
int *fill_ins_num,
uint64_t *walk,
int *row,
int central_word,
int step,
int len,
int col_num) {
__shared__ uint64_t local_key[CUDA_NUM_THREADS * 2];
__shared__ int local_num;
__shared__ int global_num;
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (threadIdx.x == 0) {
local_num = 0;
}
__syncthreads();
// int dst = idx * 2;
// id_tensor[dst] = walk[src];
// id_tensor[dst + 1] = walk[src + step];
if (idx < len) {
int src = row[idx] * col_num + central_word;
if (walk[src] != 0 && walk[src + step] != 0) {
size_t dst = atomicAdd(&local_num, 1);
local_key[dst * 2] = walk[src];
local_key[dst * 2 + 1] = walk[src + step];
}
}
__syncthreads();
if (threadIdx.x == 0) {
global_num = atomicAdd(fill_ins_num, local_num);
}
__syncthreads();
if (threadIdx.x < local_num) {
id_tensor[global_num * 2 + 2 * threadIdx.x] = local_key[2 * threadIdx.x];
id_tensor[global_num * 2 + 2 * threadIdx.x + 1] =
local_key[2 * threadIdx.x + 1];
}
}
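// id_tensor holds one device pointer per slot (stored as uint64_t); each
// thread copies one (instance, slot) feature value into the matching slot
// tensor. The *LodOpt variant fills each slot's lod with consecutive offsets.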
__global__ void GraphFillSlotKernel(uint64_t *id_tensor,
uint64_t *feature_buf,
int len,
int total_ins,
int slot_num) {
CUDA_KERNEL_LOOP(idx, len) {
int slot_idx = idx / total_ins;
int ins_idx = idx % total_ins;
((uint64_t *)(id_tensor[slot_idx]))[ins_idx] =
feature_buf[ins_idx * slot_num + slot_idx];
}
}
__global__ void GraphFillSlotLodKernelOpt(uint64_t *id_tensor,
int len,
int total_ins) {
CUDA_KERNEL_LOOP(idx, len) {
int slot_idx = idx / total_ins;
int ins_idx = idx % total_ins;
((uint64_t *)(id_tensor[slot_idx]))[ins_idx] = ins_idx;
}
}
__global__ void GraphFillSlotLodKernel(int64_t *id_tensor, int len) {
CUDA_KERNEL_LOOP(idx, len) { id_tensor[idx] = idx; }
}
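// Refill the pair buffer: when the current walk buffer is consumed, generate
// a new set of random walks (and their features when the non-optimized
// feature path is enabled), then extract pairs from it into d_ins_buf_;
// returns -1 once the graph has been fully iterated.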
int GraphDataGenerator::FillInsBuf() {
if (ins_buf_pair_len_ >= batch_size_) {
return batch_size_;
}
int total_instance = AcquireInstance(&buf_state_);
VLOG(2) << "total_ins: " << total_instance;
buf_state_.Debug();
if (total_instance == 0) {
int res = FillWalkBuf(d_walk_);
if (!res) {
// graph iterate complete
return -1;
} else {
total_instance = buf_state_.len;
VLOG(2) << "total_ins: " << total_instance;
buf_state_.Debug();
// if (total_instance == 0) {
// return -1;
//}
}
if (!FLAGS_enable_opt_get_features && slot_num_ > 0) {
FillFeatureBuf(d_walk_, d_feature_);
if (debug_mode_) {
int len = buf_size_ > 5000 ? 5000 : buf_size_;
uint64_t h_walk[len];
cudaMemcpy(h_walk,
d_walk_->ptr(),
len * sizeof(uint64_t),
cudaMemcpyDeviceToHost);
uint64_t h_feature[len * slot_num_];
cudaMemcpy(h_feature,
d_feature_->ptr(),
len * slot_num_ * sizeof(uint64_t),
cudaMemcpyDeviceToHost);
for (int i = 0; i < len; ++i) {
std::stringstream ss;
for (int j = 0; j < slot_num_; ++j) {
ss << h_feature[i * slot_num_ + j] << " ";
}
VLOG(2) << "aft FillFeatureBuf, gpu[" << gpuid_ << "] walk[" << i
<< "] = " << (uint64_t)h_walk[i] << " feature["
<< i * slot_num_ << ".." << (i + 1) * slot_num_
<< "] = " << ss.str();
}
}
}
}
uint64_t *walk = reinterpret_cast<uint64_t *>(d_walk_->ptr());
uint64_t *ins_buf = reinterpret_cast<uint64_t *>(d_ins_buf_->ptr());
int *random_row = reinterpret_cast<int *>(d_random_row_->ptr());
int *d_pair_num = reinterpret_cast<int *>(d_pair_num_->ptr());
cudaMemsetAsync(d_pair_num, 0, sizeof(int), stream_);
int len = buf_state_.len;
GraphFillIdKernel<<<GET_BLOCKS(len), CUDA_NUM_THREADS, 0, stream_>>>(
ins_buf + ins_buf_pair_len_ * 2,
d_pair_num,
walk,
random_row + buf_state_.cursor,
buf_state_.central_word,
window_step_[buf_state_.step],
len,
walk_len_);
int h_pair_num;
cudaMemcpyAsync(
&h_pair_num, d_pair_num, sizeof(int), cudaMemcpyDeviceToHost, stream_);
if (!FLAGS_enable_opt_get_features && slot_num_ > 0) {
uint64_t *feature_buf = reinterpret_cast<uint64_t *>(d_feature_buf_->ptr());
uint64_t *feature = reinterpret_cast<uint64_t *>(d_feature_->ptr());
cudaMemsetAsync(d_pair_num, 0, sizeof(int), stream_);
int len = buf_state_.len;
VLOG(2) << "feature_buf start[" << ins_buf_pair_len_ * 2 * slot_num_
<< "] len[" << len << "]";
GraphFillFeatureKernel<<<GET_BLOCKS(len), CUDA_NUM_THREADS, 0, stream_>>>(
feature_buf + ins_buf_pair_len_ * 2 * slot_num_,
d_pair_num,
walk,
feature,
random_row + buf_state_.cursor,
buf_state_.central_word,
window_step_[buf_state_.step],
len,
walk_len_,
slot_num_);
}
cudaStreamSynchronize(stream_);
ins_buf_pair_len_ += h_pair_num;
if (debug_mode_) {
uint64_t *h_ins_buf = new uint64_t[ins_buf_pair_len_ * 2];
cudaMemcpy(h_ins_buf,
ins_buf,
2 * ins_buf_pair_len_ * sizeof(uint64_t),
cudaMemcpyDeviceToHost);
VLOG(2) << "h_pair_num = " << h_pair_num
<< ", ins_buf_pair_len = " << ins_buf_pair_len_;
for (int xx = 0; xx < 2 * ins_buf_pair_len_; xx++) {
VLOG(2) << "h_ins_buf[" << xx << "]: " << h_ins_buf[xx];
}
delete[] h_ins_buf;
if (!FLAGS_enable_opt_get_features && slot_num_ > 0) {
uint64_t *feature_buf =
reinterpret_cast<uint64_t *>(d_feature_buf_->ptr());
uint64_t h_feature_buf[(batch_size_ * 2 * 2) * slot_num_];
cudaMemcpy(h_feature_buf,
feature_buf,
(batch_size_ * 2 * 2) * slot_num_ * sizeof(uint64_t),
cudaMemcpyDeviceToHost);
for (int xx = 0; xx < (batch_size_ * 2 * 2) * slot_num_; xx++) {
VLOG(2) << "h_feature_buf[" << xx << "]: " << h_feature_buf[xx];
}
}
}
return ins_buf_pair_len_;
}
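// Build one batch: in inference mode node ids are taken directly from the
// device key lists and duplicated into (src, dst) positions; in training mode
// pairs come from the instance buffer. Show/clk tensors are filled with 1 and,
// when slots are configured, per-slot feature tensors and lods are filled.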
int GraphDataGenerator::GenerateBatch() {
int total_instance = 0;
platform::CUDADeviceGuard guard(gpuid_);
int res = 0;
if (!gpu_graph_training_) {
while (cursor_ < h_device_keys_.size()) {
size_t device_key_size = h_device_keys_[cursor_]->size();
if (infer_node_type_start_[cursor_] >= device_key_size) {
cursor_++;
continue;
}
total_instance =
(infer_node_type_start_[cursor_] + batch_size_ <= device_key_size)
? batch_size_
: device_key_size - infer_node_type_start_[cursor_];
uint64_t *d_type_keys =
reinterpret_cast<uint64_t *>(d_device_keys_[cursor_]->ptr());
d_type_keys += infer_node_type_start_[cursor_];
infer_node_type_start_[cursor_] += total_instance;
VLOG(1) << "in graph_data generator:batch_size = " << batch_size_
<< " instance = " << total_instance;
total_instance *= 2;
id_tensor_ptr_ = feed_vec_[0]->mutable_data<int64_t>({total_instance, 1},
this->place_);
show_tensor_ptr_ =
feed_vec_[1]->mutable_data<int64_t>({total_instance}, this->place_);
clk_tensor_ptr_ =
feed_vec_[2]->mutable_data<int64_t>({total_instance}, this->place_);
CopyDuplicateKeys<<<GET_BLOCKS(total_instance / 2),
CUDA_NUM_THREADS,
0,
stream_>>>(
id_tensor_ptr_, d_type_keys, total_instance / 2);
GraphFillCVMKernel<<<GET_BLOCKS(total_instance),
CUDA_NUM_THREADS,
0,
stream_>>>(show_tensor_ptr_, total_instance);
GraphFillCVMKernel<<<GET_BLOCKS(total_instance),
CUDA_NUM_THREADS,
0,
stream_>>>(clk_tensor_ptr_, total_instance);
break;
}
if (total_instance == 0) {
return 0;
}
} else {
while (ins_buf_pair_len_ < batch_size_) {
res = FillInsBuf();
if (res == -1) {
if (ins_buf_pair_len_ == 0) {
return 0;
} else {
break;
}
}
}
total_instance =
ins_buf_pair_len_ < batch_size_ ? ins_buf_pair_len_ : batch_size_;
total_instance *= 2;
id_tensor_ptr_ =
feed_vec_[0]->mutable_data<int64_t>({total_instance, 1}, this->place_);
show_tensor_ptr_ =
feed_vec_[1]->mutable_data<int64_t>({total_instance}, this->place_);
clk_tensor_ptr_ =
feed_vec_[2]->mutable_data<int64_t>({total_instance}, this->place_);
}
int64_t *slot_tensor_ptr_[slot_num_];
int64_t *slot_lod_tensor_ptr_[slot_num_];
if (slot_num_ > 0) {
for (int i = 0; i < slot_num_; ++i) {
slot_tensor_ptr_[i] = feed_vec_[3 + 2 * i]->mutable_data<int64_t>(
{total_instance, 1}, this->place_);
slot_lod_tensor_ptr_[i] = feed_vec_[3 + 2 * i + 1]->mutable_data<int64_t>(
{total_instance + 1}, this->place_);
}
if (FLAGS_enable_opt_get_features || !gpu_graph_training_) {
cudaMemcpyAsync(d_slot_tensor_ptr_->ptr(),
slot_tensor_ptr_,
sizeof(uint64_t *) * slot_num_,
cudaMemcpyHostToDevice,
stream_);
cudaMemcpyAsync(d_slot_lod_tensor_ptr_->ptr(),
slot_lod_tensor_ptr_,
sizeof(uint64_t *) * slot_num_,
cudaMemcpyHostToDevice,
stream_);
}
}
uint64_t *ins_cursor, *ins_buf;
if (gpu_graph_training_) {
VLOG(2) << "total_instance: " << total_instance
<< ", ins_buf_pair_len = " << ins_buf_pair_len_;
// uint64_t *ins_buf = reinterpret_cast<uint64_t *>(d_ins_buf_->ptr());
// uint64_t *ins_cursor = ins_buf + ins_buf_pair_len_ * 2 - total_instance;
ins_buf = reinterpret_cast<uint64_t *>(d_ins_buf_->ptr());
ins_cursor = ins_buf + ins_buf_pair_len_ * 2 - total_instance;
cudaMemcpyAsync(id_tensor_ptr_,
ins_cursor,
sizeof(uint64_t) * total_instance,
cudaMemcpyDeviceToDevice,
stream_);
GraphFillCVMKernel<<<GET_BLOCKS(total_instance),
CUDA_NUM_THREADS,
0,
stream_>>>(show_tensor_ptr_, total_instance);
GraphFillCVMKernel<<<GET_BLOCKS(total_instance),
CUDA_NUM_THREADS,
0,
stream_>>>(clk_tensor_ptr_, total_instance);
} else {
ins_cursor = (uint64_t *)id_tensor_ptr_;
}
if (slot_num_ > 0) {
uint64_t *feature_buf = reinterpret_cast<uint64_t *>(d_feature_buf_->ptr());
if (FLAGS_enable_opt_get_features || !gpu_graph_training_) {
FillFeatureBuf(ins_cursor, feature_buf, total_instance);
// FillFeatureBuf(id_tensor_ptr_, feature_buf, total_instance);
if (debug_mode_) {
uint64_t h_walk[total_instance];
cudaMemcpy(h_walk,
ins_cursor,
total_instance * sizeof(uint64_t),
cudaMemcpyDeviceToHost);
uint64_t h_feature[total_instance * slot_num_];
cudaMemcpy(h_feature,
feature_buf,
total_instance * slot_num_ * sizeof(uint64_t),
cudaMemcpyDeviceToHost);
for (int i = 0; i < total_instance; ++i) {
std::stringstream ss;
for (int j = 0; j < slot_num_; ++j) {
ss << h_feature[i * slot_num_ + j] << " ";
}
VLOG(2) << "aft FillFeatureBuf, gpu[" << gpuid_ << "] walk[" << i
<< "] = " << (uint64_t)h_walk[i] << " feature["
<< i * slot_num_ << ".." << (i + 1) * slot_num_
<< "] = " << ss.str();
}
}
GraphFillSlotKernel<<<GET_BLOCKS(total_instance * slot_num_),
CUDA_NUM_THREADS,
0,
stream_>>>((uint64_t *)d_slot_tensor_ptr_->ptr(),
feature_buf,
total_instance * slot_num_,
total_instance,
slot_num_);
GraphFillSlotLodKernelOpt<<<GET_BLOCKS((total_instance + 1) * slot_num_),
CUDA_NUM_THREADS,
0,
stream_>>>(
(uint64_t *)d_slot_lod_tensor_ptr_->ptr(),
(total_instance + 1) * slot_num_,
total_instance + 1);
} else {
for (int i = 0; i < slot_num_; ++i) {
int feature_buf_offset =
(ins_buf_pair_len_ * 2 - total_instance) * slot_num_ + i * 2;
for (int j = 0; j < total_instance; j += 2) {
VLOG(2) << "slot_tensor[" << i << "][" << j << "] <- feature_buf["
<< feature_buf_offset + j * slot_num_ << "]";
VLOG(2) << "slot_tensor[" << i << "][" << j + 1 << "] <- feature_buf["
<< feature_buf_offset + j * slot_num_ + 1 << "]";
cudaMemcpyAsync(slot_tensor_ptr_[i] + j,
&feature_buf[feature_buf_offset + j * slot_num_],
sizeof(uint64_t) * 2,
cudaMemcpyDeviceToDevice,
stream_);
}
GraphFillSlotLodKernel<<<GET_BLOCKS(total_instance),
CUDA_NUM_THREADS,
0,
stream_>>>(slot_lod_tensor_ptr_[i],
total_instance + 1);
}
}
}
offset_.clear();
offset_.push_back(0);
offset_.push_back(total_instance);
LoD lod{offset_};
feed_vec_[0]->set_lod(lod);
if (slot_num_ > 0) {
for (int i = 0; i < slot_num_; ++i) {
feed_vec_[3 + 2 * i]->set_lod(lod);
}
}
cudaStreamSynchronize(stream_);
if (!gpu_graph_training_) return 1;
ins_buf_pair_len_ -= total_instance / 2;
if (debug_mode_) {
uint64_t h_slot_tensor[slot_num_][total_instance];
uint64_t h_slot_lod_tensor[slot_num_][total_instance + 1];
for (int i = 0; i < slot_num_; ++i) {
cudaMemcpy(h_slot_tensor[i],
slot_tensor_ptr_[i],
total_instance * sizeof(uint64_t),
cudaMemcpyDeviceToHost);
int len = total_instance > 5000 ? 5000 : total_instance;
for (int j = 0; j < len; ++j) {
VLOG(2) << "gpu[" << gpuid_ << "] slot_tensor[" << i << "][" << j
<< "] = " << h_slot_tensor[i][j];
}
cudaMemcpy(h_slot_lod_tensor[i],
slot_lod_tensor_ptr_[i],
(total_instance + 1) * sizeof(uint64_t),
cudaMemcpyDeviceToHost);
len = total_instance + 1 > 5000 ? 5000 : total_instance + 1;
for (int j = 0; j < len; ++j) {
VLOG(2) << "gpu[" << gpuid_ << "] slot_lod_tensor[" << i << "][" << j
<< "] = " << h_slot_lod_tensor[i][j];
}
}
}
return 1;
}
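// Helper kernels for the multi-hop walk: the first compacts the sampled
// neighbors into the key list for the next hop (using the prefix sum of
// actual sample sizes), the second writes each neighbor into column `step`
// of its walk row.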
__global__ void GraphFillSampleKeysKernel(uint64_t *neighbors,
uint64_t *sample_keys,
int *prefix_sum,
int *sampleidx2row,
int *tmp_sampleidx2row,
int *actual_sample_size,
int cur_degree,
int len) {
CUDA_KERNEL_LOOP(idx, len) {
for (int k = 0; k < actual_sample_size[idx]; k++) {
size_t offset = prefix_sum[idx] + k;
sample_keys[offset] = neighbors[idx * cur_degree + k];
tmp_sampleidx2row[offset] = sampleidx2row[idx] + k;
}
}
}
__global__ void GraphDoWalkKernel(uint64_t *neighbors,
uint64_t *walk,
int *d_prefix_sum,
int *actual_sample_size,
int cur_degree,
int step,
int len,
int *id_cnt,
int *sampleidx2row,
int col_size) {
CUDA_KERNEL_LOOP(i, len) {
for (int k = 0; k < actual_sample_size[i]; k++) {
// int idx = sampleidx2row[i];
size_t row = sampleidx2row[k + d_prefix_sum[i]];
// size_t row = idx * cur_degree + k;
size_t col = step;
size_t offset = (row * col_size + col);
walk[offset] = neighbors[i * cur_degree + k];
}
}
}
// Fill keys to the first column of walk
__global__ void GraphFillFirstStepKernel(int *prefix_sum,
int *sampleidx2row,
uint64_t *walk,
uint64_t *keys,
int len,
int walk_degree,
int col_size,
int *actual_sample_size,
uint64_t *neighbors,
uint64_t *sample_keys) {
CUDA_KERNEL_LOOP(idx, len) {
for (int k = 0; k < actual_sample_size[idx]; k++) {
size_t row = prefix_sum[idx] + k;
sample_keys[row] = neighbors[idx * walk_degree + k];
sampleidx2row[row] = row;
size_t offset = col_size * row;
walk[offset] = keys[idx];
walk[offset + 1] = neighbors[idx * walk_degree + k];
}
}
}
// Fill sample_res to the stepth column of walk
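// The inclusive prefix sum over actual_sample_size assigns each source key a
// contiguous range of output rows; step 1 also writes the start key itself
// into column 0.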
void GraphDataGenerator::FillOneStep(uint64_t *d_start_ids,
uint64_t *walk,
int len,
NeighborSampleResult &sample_res,
int cur_degree,
int step,
int *len_per_row) {
size_t temp_storage_bytes = 0;
int *d_actual_sample_size = sample_res.actual_sample_size;
uint64_t *d_neighbors = sample_res.val;
int *d_prefix_sum = reinterpret_cast<int *>(d_prefix_sum_->ptr());
uint64_t *d_sample_keys = reinterpret_cast<uint64_t *>(d_sample_keys_->ptr());
int *d_sampleidx2row =
reinterpret_cast<int *>(d_sampleidx2rows_[cur_sampleidx2row_]->ptr());
int *d_tmp_sampleidx2row =
reinterpret_cast<int *>(d_sampleidx2rows_[1 - cur_sampleidx2row_]->ptr());
CUDA_CHECK(cub::DeviceScan::InclusiveSum(NULL,
temp_storage_bytes,
d_actual_sample_size,
d_prefix_sum + 1,
len,
stream_));
auto d_temp_storage = memory::Alloc(place_, temp_storage_bytes);
CUDA_CHECK(cub::DeviceScan::InclusiveSum(d_temp_storage->ptr(),
temp_storage_bytes,
d_actual_sample_size,
d_prefix_sum + 1,
len,
stream_));
cudaStreamSynchronize(stream_);
if (step == 1) {
GraphFillFirstStepKernel<<<GET_BLOCKS(len), CUDA_NUM_THREADS, 0, stream_>>>(
d_prefix_sum,
d_tmp_sampleidx2row,
walk,
d_start_ids,
len,
walk_degree_,
walk_len_,
d_actual_sample_size,
d_neighbors,
d_sample_keys);
} else {
GraphFillSampleKeysKernel<<<GET_BLOCKS(len),
CUDA_NUM_THREADS,
0,
stream_>>>(d_neighbors,
d_sample_keys,
d_prefix_sum,
d_sampleidx2row,
d_tmp_sampleidx2row,
d_actual_sample_size,
cur_degree,
len);
GraphDoWalkKernel<<<GET_BLOCKS(len), CUDA_NUM_THREADS, 0, stream_>>>(
d_neighbors,
walk,
d_prefix_sum,
d_actual_sample_size,
cur_degree,
step,
len,
len_per_row,
d_tmp_sampleidx2row,
walk_len_);
}
if (debug_mode_) {
size_t once_max_sample_keynum = walk_degree_ * once_sample_startid_len_;
int *h_prefix_sum = new int[len + 1];
int *h_actual_size = new int[len];
int *h_offset2idx = new int[once_max_sample_keynum];
uint64_t *h_sample_keys = new uint64_t[once_max_sample_keynum];
cudaMemcpy(h_offset2idx,
d_tmp_sampleidx2row,
once_max_sample_keynum * sizeof(int),
cudaMemcpyDeviceToHost);
cudaMemcpy(h_prefix_sum,
d_prefix_sum,
(len + 1) * sizeof(int),
cudaMemcpyDeviceToHost);
for (int xx = 0; xx < once_max_sample_keynum; xx++) {
VLOG(2) << "h_offset2idx[" << xx << "]: " << h_offset2idx[xx];
}
for (int xx = 0; xx < len + 1; xx++) {
VLOG(2) << "h_prefix_sum[" << xx << "]: " << h_prefix_sum[xx];
}
delete[] h_prefix_sum;
delete[] h_actual_size;
delete[] h_offset2idx;
delete[] h_sample_keys;
}
cudaStreamSynchronize(stream_);
cur_sampleidx2row_ = 1 - cur_sampleidx2row_;
}
int GraphDataGenerator::FillFeatureBuf(uint64_t *d_walk,
uint64_t *d_feature,
size_t key_num) {
platform::CUDADeviceGuard guard(gpuid_);
auto gpu_graph_ptr = GraphGpuWrapper::GetInstance();
int ret = gpu_graph_ptr->get_feature_of_nodes(
gpuid_, d_walk, d_feature, key_num, slot_num_);
return ret;
}
int GraphDataGenerator::FillFeatureBuf(
std::shared_ptr<phi::Allocation> d_walk,
std::shared_ptr<phi::Allocation> d_feature) {
platform::CUDADeviceGuard guard(gpuid_);
auto gpu_graph_ptr = GraphGpuWrapper::GetInstance();
int ret = gpu_graph_ptr->get_feature_of_nodes(gpuid_,
(uint64_t *)d_walk->ptr(),
(uint64_t *)d_feature->ptr(),
buf_size_,
slot_num_);
return ret;
}
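// Generate random walks following meta_path_: take a chunk of start nodes of
// the current first-node type, sample walk_degree_ neighbors for the first
// hop and one neighbor per hop afterwards, writing each walk as one row of
// d_walk; finally shuffle the row order for pair extraction.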
int GraphDataGenerator::FillWalkBuf(std::shared_ptr<phi::Allocation> d_walk) {
platform::CUDADeviceGuard guard(gpuid_);
size_t once_max_sample_keynum = walk_degree_ * once_sample_startid_len_;
////////
uint64_t *h_walk;
uint64_t *h_sample_keys;
int *h_offset2idx;
int *h_len_per_row;
uint64_t *h_prefix_sum;
if (debug_mode_) {
h_walk = new uint64_t[buf_size_];
h_sample_keys = new uint64_t[once_max_sample_keynum];
h_offset2idx = new int[once_max_sample_keynum];
h_len_per_row = new int[once_max_sample_keynum];
h_prefix_sum = new uint64_t[once_max_sample_keynum + 1];
}
///////
auto gpu_graph_ptr = GraphGpuWrapper::GetInstance();
uint64_t *walk = reinterpret_cast<uint64_t *>(d_walk->ptr());
int *len_per_row = reinterpret_cast<int *>(d_len_per_row_->ptr());
uint64_t *d_sample_keys = reinterpret_cast<uint64_t *>(d_sample_keys_->ptr());
cudaMemsetAsync(walk, 0, buf_size_ * sizeof(uint64_t), stream_);
cudaMemsetAsync(
len_per_row, 0, once_max_sample_keynum * sizeof(int), stream_);
int i = 0;
int total_row = 0;
size_t node_type_len = first_node_type_.size();
int remain_size =
buf_size_ - walk_degree_ * once_sample_startid_len_ * walk_len_;
while (i <= remain_size) {
int cur_node_idx = cursor_ % node_type_len;
int node_type = first_node_type_[cur_node_idx];
auto &path = meta_path_[cur_node_idx];
size_t start = node_type_start_[node_type];
// auto node_query_result = gpu_graph_ptr->query_node_list(
// gpuid_, node_type, start, once_sample_startid_len_);
// int tmp_len = node_query_result.actual_sample_size;
VLOG(2) << "choose start type: " << node_type;
int type_index = type_to_index_[node_type];
size_t device_key_size = h_device_keys_[type_index]->size();
VLOG(2) << "type: " << node_type << " size: " << device_key_size
<< " start: " << start;
uint64_t *d_type_keys =
reinterpret_cast<uint64_t *>(d_device_keys_[type_index]->ptr());
int tmp_len = start + once_sample_startid_len_ > device_key_size
? device_key_size - start
: once_sample_startid_len_;
node_type_start_[node_type] = tmp_len + start;
if (tmp_len == 0) {
finish_node_type_.insert(node_type);
if (finish_node_type_.size() == node_type_start_.size()) {
break;
}
cursor_ += 1;
continue;
}
// if (tmp_len == 0) {
// break;
//}
VLOG(2) << "i = " << i << " buf_size_ = " << buf_size_
<< " tmp_len = " << tmp_len << " cursor = " << cursor_
<< " once_max_sample_keynum = " << once_max_sample_keynum;
uint64_t *cur_walk = walk + i;
NeighborSampleQuery q;
q.initialize(gpuid_,
path[0],
(uint64_t)(d_type_keys + start),
walk_degree_,
tmp_len);
auto sample_res = gpu_graph_ptr->graph_neighbor_sample_v3(q, false);
int step = 1;
VLOG(2) << "sample edge type: " << path[0] << " step: " << 1;
jump_rows_ = sample_res.total_sample_size;
FillOneStep(d_type_keys + start,
cur_walk,
tmp_len,
sample_res,
walk_degree_,
step,
len_per_row);
VLOG(2) << "jump_row: " << jump_rows_;
/////////
if (debug_mode_) {
cudaMemcpy(
h_walk, walk, buf_size_ * sizeof(uint64_t), cudaMemcpyDeviceToHost);
for (int xx = 0; xx < buf_size_; xx++) {
VLOG(2) << "h_walk[" << xx << "]: " << h_walk[xx];
}
}
/////////
step++;
size_t path_len = path.size();
for (; step < walk_len_; step++) {
if (sample_res.total_sample_size == 0) {
break;
}
auto sample_key_mem = sample_res.actual_val_mem;
uint64_t *sample_keys_ptr =
reinterpret_cast<uint64_t *>(sample_key_mem->ptr());
int edge_type_id = path[(step - 1) % path_len];
VLOG(2) << "sample edge type: " << edge_type_id << " step: " << step;
q.initialize(gpuid_,
edge_type_id,
(uint64_t)sample_keys_ptr,
1,
sample_res.total_sample_size);
sample_res = gpu_graph_ptr->graph_neighbor_sample_v3(q, false);
FillOneStep(d_type_keys + start,
cur_walk,
sample_res.total_sample_size,
sample_res,
1,
step,
len_per_row);
if (debug_mode_) {
cudaMemcpy(
h_walk, walk, buf_size_ * sizeof(uint64_t), cudaMemcpyDeviceToHost);
for (int xx = 0; xx < buf_size_; xx++) {
VLOG(2) << "h_walk[" << xx << "]: " << h_walk[xx];
}
}
}
// cursor_ += tmp_len;
i += jump_rows_ * walk_len_;
total_row += jump_rows_;
cursor_ += 1;
}
buf_state_.Reset(total_row);
int *d_random_row = reinterpret_cast<int *>(d_random_row_->ptr());
thrust::random::default_random_engine engine(shuffle_seed_);
const auto &exec_policy = thrust::cuda::par.on(stream_);
thrust::counting_iterator<int> cnt_iter(0);
thrust::shuffle_copy(exec_policy,
cnt_iter,
cnt_iter + total_row,
thrust::device_pointer_cast(d_random_row),
engine);
cudaStreamSynchronize(stream_);
shuffle_seed_ = engine();
if (debug_mode_) {
int *h_random_row = new int[total_row + 10];
cudaMemcpy(h_random_row,
d_random_row,
total_row * sizeof(int),
cudaMemcpyDeviceToHost);
for (int xx = 0; xx < total_row; xx++) {
VLOG(2) << "h_random_row[" << xx << "]: " << h_random_row[xx];
}
delete[] h_random_row;
delete[] h_walk;
delete[] h_sample_keys;
delete[] h_offset2idx;
delete[] h_len_per_row;
delete[] h_prefix_sum;
}
return total_row != 0;
}
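// Allocate the device buffers used above (walk/feature/pair buffers, prefix
// sums, sample-index maps, random row permutation) and copy the host key
// lists onto the device.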
void GraphDataGenerator::AllocResource(const paddle::platform::Place &place,
std::vector<LoDTensor *> feed_vec) {
place_ = place;
gpuid_ = place_.GetDeviceId();
VLOG(3) << "gpuid " << gpuid_;
stream_ = dynamic_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
feed_vec_ = feed_vec;
slot_num_ = (feed_vec_.size() - 3) / 2;
// d_device_keys_.resize(h_device_keys_.size());
VLOG(2) << "h_device_keys size: " << h_device_keys_.size();
infer_node_type_start_ = std::vector<int>(h_device_keys_.size(), 0);
for (size_t i = 0; i < h_device_keys_.size(); i++) {
for (size_t j = 0; j < h_device_keys_[i]->size(); j++) {
VLOG(3) << "h_device_keys_[" << i << "][" << j
<< "] = " << (*(h_device_keys_[i]))[j];
}
auto buf = memory::AllocShared(
place_, h_device_keys_[i]->size() * sizeof(uint64_t));
d_device_keys_.push_back(buf);
CUDA_CHECK(cudaMemcpyAsync(buf->ptr(),
h_device_keys_[i]->data(),
h_device_keys_[i]->size() * sizeof(uint64_t),
cudaMemcpyHostToDevice,
stream_));
}
// h_device_keys_ = h_device_keys;
// device_key_size_ = h_device_keys_->size();
// d_device_keys_ =
// memory::AllocShared(place_, device_key_size_ * sizeof(int64_t));
// CUDA_CHECK(cudaMemcpyAsync(d_device_keys_->ptr(), h_device_keys_->data(),
// device_key_size_ * sizeof(int64_t),
// cudaMemcpyHostToDevice, stream_));
size_t once_max_sample_keynum = walk_degree_ * once_sample_startid_len_;
d_prefix_sum_ =
memory::AllocShared(place_, (once_max_sample_keynum + 1) * sizeof(int));
int *d_prefix_sum_ptr = reinterpret_cast<int *>(d_prefix_sum_->ptr());
cudaMemsetAsync(
d_prefix_sum_ptr, 0, (once_max_sample_keynum + 1) * sizeof(int), stream_);
cursor_ = 0;
jump_rows_ = 0;
d_walk_ = memory::AllocShared(place_, buf_size_ * sizeof(uint64_t));
cudaMemsetAsync(d_walk_->ptr(), 0, buf_size_ * sizeof(uint64_t), stream_);
if (!FLAGS_enable_opt_get_features && slot_num_ > 0) {
d_feature_ =
memory::AllocShared(place_, buf_size_ * slot_num_ * sizeof(uint64_t));
cudaMemsetAsync(
d_feature_->ptr(), 0, buf_size_ * sizeof(uint64_t), stream_);
}
d_sample_keys_ =
memory::AllocShared(place_, once_max_sample_keynum * sizeof(uint64_t));
d_sampleidx2rows_.push_back(
memory::AllocShared(place_, once_max_sample_keynum * sizeof(int)));
d_sampleidx2rows_.push_back(
memory::AllocShared(place_, once_max_sample_keynum * sizeof(int)));
cur_sampleidx2row_ = 0;
d_len_per_row_ =
memory::AllocShared(place_, once_max_sample_keynum * sizeof(int));
for (int i = -window_; i < 0; i++) {
window_step_.push_back(i);
}
for (int i = 0; i < window_; i++) {
window_step_.push_back(i + 1);
}
buf_state_.Init(batch_size_, walk_len_, &window_step_);
d_random_row_ = memory::AllocShared(
place_,
(once_sample_startid_len_ * walk_degree_ * repeat_time_) * sizeof(int));
shuffle_seed_ = 0;
ins_buf_pair_len_ = 0;
d_ins_buf_ =
memory::AllocShared(place_, (batch_size_ * 2 * 2) * sizeof(uint64_t));
if (slot_num_ > 0) {
d_feature_buf_ = memory::AllocShared(
place_, (batch_size_ * 2 * 2) * slot_num_ * sizeof(uint64_t));
}
d_pair_num_ = memory::AllocShared(place_, sizeof(int));
if (FLAGS_enable_opt_get_features && slot_num_ > 0) {
d_slot_tensor_ptr_ =
memory::AllocShared(place_, slot_num_ * sizeof(uint64_t *));
d_slot_lod_tensor_ptr_ =
memory::AllocShared(place_, slot_num_ * sizeof(uint64_t *));
}
cudaStreamSynchronize(stream_);
}
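// Read GraphConfig out of the DataFeedDesc: walk/window/batch sizes plus the
// first_node_type and meta_path strings, which are translated to integer ids
// through GraphGpuWrapper's feature_to_id / edge_to_id maps.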
void GraphDataGenerator::SetConfig(
const paddle::framework::DataFeedDesc &data_feed_desc) {
auto graph_config = data_feed_desc.graph_config();
walk_degree_ = graph_config.walk_degree();
walk_len_ = graph_config.walk_len();
window_ = graph_config.window();
once_sample_startid_len_ = graph_config.once_sample_startid_len();
debug_mode_ = graph_config.debug_mode();
gpu_graph_training_ = graph_config.gpu_graph_training();
if (debug_mode_ || !gpu_graph_training_) {
batch_size_ = graph_config.batch_size();
} else {
batch_size_ = once_sample_startid_len_;
}
repeat_time_ = graph_config.sample_times_one_chunk();
buf_size_ =
once_sample_startid_len_ * walk_len_ * walk_degree_ * repeat_time_;
VLOG(2) << "Confirm GraphConfig, walk_degree : " << walk_degree_
<< ", walk_len : " << walk_len_ << ", window : " << window_
<< ", once_sample_startid_len : " << once_sample_startid_len_
<< ", sample_times_one_chunk : " << repeat_time_
<< ", batch_size: " << batch_size_;
std::string first_node_type = graph_config.first_node_type();
std::string meta_path = graph_config.meta_path();
auto gpu_graph_ptr = GraphGpuWrapper::GetInstance();
auto edge_to_id = gpu_graph_ptr->edge_to_id;
auto node_to_id = gpu_graph_ptr->feature_to_id;
// parse first_node_type
auto node_types =
paddle::string::split_string<std::string>(first_node_type, ";");
VLOG(2) << "node_types: " << first_node_type;
finish_node_type_.clear();
node_type_start_.clear();
for (auto &type : node_types) {
auto iter = node_to_id.find(type);
PADDLE_ENFORCE_NE(
iter,
node_to_id.end(),
platform::errors::NotFound("(%s) is not found in node_to_id.", type));
VLOG(2) << "node_to_id[" << type << "] = " << iter->second;
first_node_type_.push_back(iter->second);
node_type_start_[iter->second] = 0;
}
meta_path_.resize(first_node_type_.size());
auto meta_paths = paddle::string::split_string<std::string>(meta_path, ";");
for (size_t i = 0; i < meta_paths.size(); i++) {
auto path = meta_paths[i];
auto nodes = paddle::string::split_string<std::string>(path, "-");
for (auto &node : nodes) {
auto iter = edge_to_id.find(node);
PADDLE_ENFORCE_NE(
iter,
edge_to_id.end(),
platform::errors::NotFound("(%s) is not found in edge_to_id.", node));
VLOG(2) << "edge_to_id[" << node << "] = " << iter->second;
meta_path_[i].push_back(iter->second);
}
}
};
} // namespace framework
} // namespace paddle
#endif
......@@ -23,6 +23,7 @@ limitations under the License. */
#include <future> // NOLINT
#include <memory>
#include <mutex> // NOLINT
#include <random>
#include <sstream>
#include <string>
#include <thread> // NOLINT
......@@ -42,6 +43,7 @@ limitations under the License. */
#include "paddle/fluid/platform/timer.h"
#include "paddle/fluid/string/string_helper.h"
#if defined(PADDLE_WITH_CUDA)
#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif
......@@ -56,6 +58,8 @@ namespace framework {
class DataFeedDesc;
class Scope;
class Variable;
class NeighborSampleResult;
class NodeQueryResult;
} // namespace framework
} // namespace paddle
......@@ -420,7 +424,6 @@ struct UsedSlotGpuType {
};
#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
#define CUDA_CHECK(val) CHECK(val == gpuSuccess)
template <typename T>
struct CudaBuffer {
T* cu_buffer;
......@@ -776,6 +779,202 @@ class DLManager {
std::map<std::string, DLHandle> handle_map_;
};
struct engine_wrapper_t {
std::default_random_engine engine;
#if !defined(_WIN32)
engine_wrapper_t() {
struct timespec tp;
clock_gettime(CLOCK_REALTIME, &tp);
double cur_time = tp.tv_sec + tp.tv_nsec * 1e-9;
static std::atomic<uint64_t> x(0);
std::seed_seq sseq = {x++, x++, x++, (uint64_t)(cur_time * 1000)};
engine.seed(sseq);
}
#endif
};
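// Tracks the current window position (central word and step) and the batch
// cursor over the shuffled walk rows while pairs are extracted.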
struct BufState {
int left;
int right;
int central_word;
int step;
engine_wrapper_t random_engine_;
int len;
int cursor;
int row_num;
int batch_size;
int walk_len;
std::vector<int>* window;
BufState() {}
~BufState() {}
void Init(int graph_batch_size,
int graph_walk_len,
std::vector<int>* graph_window) {
batch_size = graph_batch_size;
walk_len = graph_walk_len;
window = graph_window;
left = 0;
right = window->size() - 1;
central_word = -1;
step = -1;
len = 0;
cursor = 0;
row_num = 0;
for (size_t i = 0; i < graph_window->size(); i++) {
VLOG(2) << "graph_window[" << i << "] = " << (*graph_window)[i];
}
}
void Reset(int total_rows) {
cursor = 0;
row_num = total_rows;
int tmp_len = cursor + batch_size > row_num ? row_num - cursor : batch_size;
len = tmp_len;
central_word = -1;
step = -1;
GetNextCentrolWord();
}
int GetNextStep() {
step++;
if (step <= right && central_word + (*window)[step] < walk_len) {
return 1;
}
return 0;
}
void Debug() {
VLOG(2) << "left: " << left << " right: " << right
<< " central_word: " << central_word << " step: " << step
<< " cursor: " << cursor << " len: " << len
<< " row_num: " << row_num;
}
int GetNextCentrolWord() {
if (++central_word >= walk_len) {
return 0;
}
int window_size = window->size() / 2;
int random_window = random_engine_.engine() % window_size + 1;
left = window_size - random_window;
right = window_size + random_window - 1;
VLOG(2) << "random window: " << random_window << " window[" << left
<< "] = " << (*window)[left] << " window[" << right
<< "] = " << (*window)[right];
for (step = left; step <= right; step++) {
if (central_word + (*window)[step] >= 0) {
return 1;
}
}
return 0;
}
int GetNextBatch() {
cursor += len;
int tmp_len = cursor + batch_size > row_num ? row_num - cursor : batch_size;
if (tmp_len == 0) {
return 0;
}
len = tmp_len;
central_word = -1;
step = -1;
GetNextCentrolWord();
return tmp_len != 0;
}
};
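// A minimal usage sketch of BufState (illustrative only; the data feed code
// may drive it differently). `window` holds the signed offsets around the
// central word of each random-walk row:
//   BufState state;
//   state.Init(batch_size, walk_len, &window);
//   state.Reset(total_rows);  // also picks the first (central_word, step)
//   do {
//     // for each of the state.len rows in the current chunk, emit the pair
//     // (central_word, central_word + (*window)[state.step])
//   } while (state.GetNextStep() || state.GetNextCentrolWord() ||
//            state.GetNextBatch());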
class GraphDataGenerator {
public:
GraphDataGenerator(){};
virtual ~GraphDataGenerator(){};
void SetConfig(const paddle::framework::DataFeedDesc& data_feed_desc);
void AllocResource(const paddle::platform::Place& place,
std::vector<LoDTensor*> feed_vec);
int AcquireInstance(BufState* state);
int GenerateBatch();
int FillWalkBuf(std::shared_ptr<phi::Allocation> d_walk);
int FillFeatureBuf(uint64_t* d_walk, uint64_t* d_feature, size_t key_num);
int FillFeatureBuf(std::shared_ptr<phi::Allocation> d_walk,
std::shared_ptr<phi::Allocation> d_feature);
void FillOneStep(uint64_t* start_ids,
uint64_t* walk,
int len,
NeighborSampleResult& sample_res,
int cur_degree,
int step,
int* len_per_row);
int FillInsBuf();
void SetDeviceKeys(std::vector<uint64_t>* device_keys, int type) {
type_to_index_[type] = h_device_keys_.size();
h_device_keys_.push_back(device_keys);
}
protected:
int walk_degree_;
int walk_len_;
int window_;
int once_sample_startid_len_;
int gpuid_;
// start ids
// int64_t* device_keys_;
// size_t device_key_size_;
std::vector<std::vector<uint64_t>*> h_device_keys_;
std::unordered_map<int, int> type_to_index_;
// point to device_keys_
size_t cursor_;
size_t jump_rows_;
int64_t* id_tensor_ptr_;
int64_t* show_tensor_ptr_;
int64_t* clk_tensor_ptr_;
cudaStream_t stream_;
paddle::platform::Place place_;
std::vector<LoDTensor*> feed_vec_;
std::vector<size_t> offset_;
std::shared_ptr<phi::Allocation> d_prefix_sum_;
std::vector<std::shared_ptr<phi::Allocation>> d_device_keys_;
std::shared_ptr<phi::Allocation> d_walk_;
std::shared_ptr<phi::Allocation> d_feature_;
std::shared_ptr<phi::Allocation> d_len_per_row_;
std::shared_ptr<phi::Allocation> d_random_row_;
//
std::vector<std::shared_ptr<phi::Allocation>> d_sampleidx2rows_;
int cur_sampleidx2row_;
// record the keys to call graph_neighbor_sample
std::shared_ptr<phi::Allocation> d_sample_keys_;
int sample_keys_len_;
std::set<int> finish_node_type_;
std::unordered_map<int, size_t> node_type_start_;
std::vector<int> infer_node_type_start_;
std::shared_ptr<phi::Allocation> d_ins_buf_;
std::shared_ptr<phi::Allocation> d_feature_buf_;
std::shared_ptr<phi::Allocation> d_pair_num_;
std::shared_ptr<phi::Allocation> d_slot_tensor_ptr_;
std::shared_ptr<phi::Allocation> d_slot_lod_tensor_ptr_;
int ins_buf_pair_len_;
// size of a d_walk buf
size_t buf_size_;
int repeat_time_;
std::vector<int> window_step_;
BufState buf_state_;
int batch_size_;
int slot_num_;
int shuffle_seed_;
int debug_mode_;
std::vector<int> first_node_type_;
std::vector<std::vector<int>> meta_path_;
bool gpu_graph_training_;
};
class DataFeed {
public:
DataFeed() {
......@@ -838,6 +1037,14 @@ class DataFeed {
virtual void SetParseLogKey(bool parse_logkey) {}
virtual void SetEnablePvMerge(bool enable_pv_merge) {}
virtual void SetCurrentPhase(int current_phase) {}
virtual void SetDeviceKeys(std::vector<uint64_t>* device_keys, int type) {
#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS)
gpu_graph_data_generator_.SetDeviceKeys(device_keys, type);
#endif
}
virtual void SetGpuGraphMode(int gpu_graph_mode) {
gpu_graph_mode_ = gpu_graph_mode;
}
virtual void SetFileListMutex(std::mutex* mutex) {
mutex_for_pick_file_ = mutex;
}
......@@ -921,6 +1128,10 @@ class DataFeed {
// The input type of pipe reader, 0 for one sample, 1 for one batch
int input_type_;
int gpu_graph_mode_ = 0;
#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS)
GraphDataGenerator gpu_graph_data_generator_;
#endif
};
// PrivateQueueDataFeed is the base virtual class for other DataFeeds.
......
......@@ -27,6 +27,19 @@ message MultiSlotDesc {
optional string uid_slot = 2;
}
message GraphConfig {
optional int32 walk_degree = 1 [ default = 1 ];
optional int32 walk_len = 2 [ default = 20 ];
optional int32 window = 3 [ default = 5 ];
optional int32 once_sample_startid_len = 4 [ default = 8000 ];
optional int32 sample_times_one_chunk = 5 [ default = 10 ];
optional int32 batch_size = 6 [ default = 1 ];
optional int32 debug_mode = 7 [ default = 0 ];
optional string first_node_type = 8;
optional string meta_path = 9;
optional bool gpu_graph_training = 10 [ default = true ];
}
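// A minimal sketch of wiring GraphConfig into a DataFeedDesc text proto; the
// values and the edge/node names ("u", "u2i-i2u") below are illustrative
// assumptions, only the field names come from the message defined above:
//   graph_config {
//     walk_degree: 1
//     walk_len: 20
//     window: 5
//     once_sample_startid_len: 8000
//     batch_size: 800
//     first_node_type: "u"
//     meta_path: "u2i-i2u"
//     gpu_graph_training: true
//   }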
message DataFeedDesc {
optional string name = 1;
optional int32 batch_size = 2 [ default = 32 ];
......@@ -37,4 +50,5 @@ message DataFeedDesc {
optional int32 pv_batch_size = 7 [ default = 32 ];
optional int32 input_type = 8 [ default = 0 ];
optional string so_parser_name = 9;
optional GraphConfig graph_config = 10;
}
......@@ -14,6 +14,7 @@
#include "paddle/fluid/framework/data_set.h"
#include "gflags/gflags.h"
#include "google/protobuf/text_format.h"
#if (defined PADDLE_WITH_DISTRIBUTE) && (defined PADDLE_WITH_PSCORE)
#include "paddle/fluid/distributed/index_dataset/index_sampler.h"
......@@ -26,6 +27,7 @@
#ifdef PADDLE_WITH_PSCORE
#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h"
#endif
#if defined _WIN32 || defined __APPLE__
......@@ -34,6 +36,8 @@
#endif
USE_INT_STAT(STAT_total_feasign_num_in_mem);
DECLARE_bool(graph_get_neighbor_id);
namespace paddle {
namespace framework {
......@@ -196,6 +200,16 @@ void DatasetImpl<T>::SetFeaEval(bool fea_eval, int record_candidate_size) {
<< " with record candidate size: " << record_candidate_size;
}
template <typename T>
void DatasetImpl<T>::SetGpuGraphMode(int is_graph_mode) {
gpu_graph_mode_ = is_graph_mode;
}
template <typename T>
int DatasetImpl<T>::GetGpuGraphMode() {
return gpu_graph_mode_;
}
template <typename T>
std::vector<paddle::framework::DataFeed*> DatasetImpl<T>::GetReaders() {
std::vector<paddle::framework::DataFeed*> ret;
......@@ -440,12 +454,91 @@ void DatasetImpl<T>::LoadIntoMemory() {
platform::Timer timeline;
timeline.Start();
std::vector<std::thread> load_threads;
for (int64_t i = 0; i < thread_num_; ++i) {
load_threads.push_back(std::thread(
&paddle::framework::DataFeed::LoadIntoMemory, readers_[i].get()));
}
for (std::thread& t : load_threads) {
t.join();
if (gpu_graph_mode_) {
VLOG(0) << "in gpu_graph_mode";
#ifdef PADDLE_WITH_HETERPS
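    // Collect every key that must be present in the HBM table before
    // training: (1) node ids for each node type (also handed to each reader
    // as its device keys), (2) feature ids for each node type, (3) node ids
    // referenced by each edge table, and optionally (4) their neighbor ids
    // when FLAGS_graph_get_neighbor_id is set.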
graph_all_type_total_keys_.clear();
auto gpu_graph_ptr = GraphGpuWrapper::GetInstance();
auto node_to_id = gpu_graph_ptr->feature_to_id;
auto edge_to_id = gpu_graph_ptr->edge_to_id;
graph_all_type_total_keys_.resize(node_to_id.size());
int cnt = 0;
for (auto& iter : node_to_id) {
int node_idx = iter.second;
std::vector<std::vector<uint64_t>> gpu_graph_device_keys;
gpu_graph_ptr->get_all_id(
1, node_idx, thread_num_, &gpu_graph_device_keys);
auto& type_total_key = graph_all_type_total_keys_[cnt];
type_total_key.resize(thread_num_);
for (size_t i = 0; i < gpu_graph_device_keys.size(); i++) {
VLOG(2) << "node type: " << node_idx << ", gpu_graph_device_keys[" << i
<< "] = " << gpu_graph_device_keys[i].size();
for (size_t j = 0; j < gpu_graph_device_keys[i].size(); j++) {
gpu_graph_total_keys_.push_back(gpu_graph_device_keys[i][j]);
type_total_key[i].push_back(gpu_graph_device_keys[i][j]);
}
}
for (size_t i = 0; i < readers_.size(); i++) {
readers_[i]->SetDeviceKeys(&type_total_key[i], node_idx);
readers_[i]->SetGpuGraphMode(gpu_graph_mode_);
}
cnt++;
}
VLOG(2) << "begin add feature_id into gpu_graph_total_keys_ size["
<< gpu_graph_total_keys_.size() << "]";
for (auto& iter : node_to_id) {
std::vector<std::vector<uint64_t>> gpu_graph_device_keys;
int node_idx = iter.second;
gpu_graph_ptr->get_all_feature_ids(
1, node_idx, thread_num_, &gpu_graph_device_keys);
for (size_t i = 0; i < gpu_graph_device_keys.size(); i++) {
VLOG(2) << "begin node type: " << node_idx << ", gpu_graph_device_keys["
<< i << "] = " << gpu_graph_device_keys[i].size();
for (size_t j = 0; j < gpu_graph_device_keys[i].size(); j++) {
gpu_graph_total_keys_.push_back(gpu_graph_device_keys[i][j]);
}
VLOG(2) << "end node type: " << node_idx << ", gpu_graph_device_keys["
<< i << "] = " << gpu_graph_device_keys[i].size();
}
}
VLOG(2) << "end add feature_id into gpu_graph_total_keys_ size["
<< gpu_graph_total_keys_.size() << "]";
    // FIXME: trick to iterate over the edge table
for (auto& iter : edge_to_id) {
int edge_idx = iter.second;
std::vector<std::vector<uint64_t>> gpu_graph_device_keys;
gpu_graph_ptr->get_all_id(
0, edge_idx, thread_num_, &gpu_graph_device_keys);
for (size_t i = 0; i < gpu_graph_device_keys.size(); i++) {
VLOG(1) << "edge type: " << edge_idx << ", gpu_graph_device_keys[" << i
<< "] = " << gpu_graph_device_keys[i].size();
for (size_t j = 0; j < gpu_graph_device_keys[i].size(); j++) {
gpu_graph_total_keys_.push_back(gpu_graph_device_keys[i][j]);
}
}
if (FLAGS_graph_get_neighbor_id) {
std::vector<std::vector<uint64_t>> gpu_graph_neighbor_keys;
gpu_graph_ptr->get_all_neighbor_id(
0, edge_idx, thread_num_, &gpu_graph_neighbor_keys);
for (size_t i = 0; i < gpu_graph_neighbor_keys.size(); i++) {
for (size_t k = 0; k < gpu_graph_neighbor_keys[i].size(); k++) {
gpu_graph_total_keys_.push_back(gpu_graph_neighbor_keys[i][k]);
}
}
}
}
#endif
} else {
for (int64_t i = 0; i < thread_num_; ++i) {
load_threads.push_back(std::thread(
&paddle::framework::DataFeed::LoadIntoMemory, readers_[i].get()));
}
for (std::thread& t : load_threads) {
t.join();
}
}
input_channel_->Close();
int64_t in_chan_size = input_channel_->Size();
......
......@@ -165,6 +165,9 @@ class Dataset {
virtual std::vector<std::string> GetSlots() = 0;
virtual void SetGpuGraphMode(int is_graph_mode) = 0;
virtual int GetGpuGraphMode() = 0;
protected:
virtual int ReceiveFromClient(int msg_type,
int client_id,
......@@ -213,6 +216,8 @@ class DatasetImpl : public Dataset {
virtual std::pair<std::string, std::string> GetHdfsConfig() {
return std::make_pair(fs_name_, fs_ugi_);
}
virtual void SetGpuGraphMode(int is_graph_mode);
virtual int GetGpuGraphMode();
virtual std::string GetDownloadCmd();
virtual const paddle::framework::DataFeedDesc& GetDataFeedDesc() {
return data_feed_desc_;
......@@ -272,7 +277,9 @@ class DatasetImpl : public Dataset {
return multi_consume_channel_;
}
}
std::vector<uint64_t>& GetGpuGraphTotalKeys() {
return gpu_graph_total_keys_;
}
Channel<T>& GetInputChannelRef() { return input_channel_; }
protected:
......@@ -333,6 +340,10 @@ class DatasetImpl : public Dataset {
std::vector<T> input_records_; // only for paddleboxdatafeed
std::vector<std::string> use_slots_;
bool enable_heterps_ = false;
int gpu_graph_mode_ = 0;
// std::vector<std::vector<int64_t>> gpu_graph_device_keys_;
std::vector<std::vector<std::vector<uint64_t>>> graph_all_type_total_keys_;
std::vector<uint64_t> gpu_graph_total_keys_;
};
// use std::vector<MultiSlotType> or Record as data type
......
......@@ -14,8 +14,8 @@ limitations under the License. */
#include "paddle/fluid/framework/device_worker.h"
#include <chrono>
#include "paddle/fluid/framework/convert_utils.h"
namespace phi {
class DenseTensor;
} // namespace phi
......@@ -32,48 +32,179 @@ void DeviceWorker::SetDataFeed(DataFeed* data_feed) {
}
template <typename T>
std::string PrintLodTensorType(Tensor* tensor, int64_t start, int64_t end) {
std::string PrintLodTensorType(Tensor* tensor,
int64_t start,
int64_t end,
char separator = ',',
bool need_leading_separator = true) {
auto count = tensor->numel();
if (start < 0 || end > count) {
VLOG(3) << "access violation";
return "access violation";
}
if (start >= end) return "";
std::ostringstream os;
if (!need_leading_separator) {
os << tensor->data<T>()[start];
start++;
}
for (int64_t i = start; i < end; i++) {
os << ":" << tensor->data<T>()[i];
// os << ":" << tensor->data<T>()[i];
os << separator << tensor->data<T>()[i];
}
return os.str();
}
template <typename T>
void PrintLodTensorType(Tensor* tensor,
int64_t start,
int64_t end,
std::string& out_val,
char separator = ',',
bool need_leading_separator = true) {
auto count = tensor->numel();
if (start < 0 || end > count) {
VLOG(3) << "access violation";
out_val += "access violation";
return;
}
if (start >= end) return;
if (!need_leading_separator) {
out_val += std::to_string(tensor->data<T>()[start]);
// os << tensor->data<T>()[start];
start++;
}
for (int64_t i = start; i < end; i++) {
// os << ":" << tensor->data<T>()[i];
// os << separator << tensor->data<T>()[i];
out_val += separator;
out_val += std::to_string(tensor->data<T>()[i]);
}
}
std::string PrintLodTensorIntType(Tensor* tensor, int64_t start, int64_t end) {
#define FLOAT_EPS 1e-8
#define MAX_FLOAT_BUFF_SIZE 40
template <>
void PrintLodTensorType<float>(Tensor* tensor,
int64_t start,
int64_t end,
std::string& out_val,
char separator,
bool need_leading_separator) {
char buf[MAX_FLOAT_BUFF_SIZE];
auto count = tensor->numel();
if (start < 0 || end > count) {
VLOG(3) << "access violation";
out_val += "access violation";
return;
}
if (start >= end) return;
for (int64_t i = start; i < end; i++) {
if (i != start || need_leading_separator) out_val += separator;
if (tensor->data<float>()[i] > -FLOAT_EPS &&
tensor->data<float>()[i] < FLOAT_EPS)
out_val += "0";
else {
sprintf(buf, "%.9f", tensor->data<float>()[i]);
out_val += buf;
}
}
}
std::string PrintLodTensorIntType(Tensor* tensor,
int64_t start,
int64_t end,
char separator = ',',
bool need_leading_separator = true) {
auto count = tensor->numel();
if (start < 0 || end > count) {
VLOG(3) << "access violation";
return "access violation";
}
if (start >= end) return "";
std::ostringstream os;
if (!need_leading_separator) {
os << static_cast<uint64_t>(tensor->data<int64_t>()[start]);
start++;
}
for (int64_t i = start; i < end; i++) {
os << ":" << static_cast<uint64_t>(tensor->data<int64_t>()[i]);
// os << ":" << static_cast<uint64_t>(tensor->data<int64_t>()[i]);
os << separator << static_cast<uint64_t>(tensor->data<int64_t>()[i]);
}
return os.str();
}
std::string PrintLodTensor(Tensor* tensor, int64_t start, int64_t end) {
void PrintLodTensorIntType(Tensor* tensor,
int64_t start,
int64_t end,
std::string& out_val,
char separator = ',',
bool need_leading_separator = true) {
auto count = tensor->numel();
if (start < 0 || end > count) {
VLOG(3) << "access violation";
out_val += "access violation";
return;
}
if (start >= end) return;
if (!need_leading_separator) {
out_val +=
std::to_string(static_cast<uint64_t>(tensor->data<int64_t>()[start]));
start++;
}
for (int64_t i = start; i < end; i++) {
// os << ":" << static_cast<uint64_t>(tensor->data<int64_t>()[i]);
// os << separator << static_cast<uint64_t>(tensor->data<int64_t>()[i]);
out_val += separator;
out_val +=
std::to_string(static_cast<uint64_t>(tensor->data<int64_t>()[i]));
}
// return os.str();
}
std::string PrintLodTensor(Tensor* tensor,
int64_t start,
int64_t end,
char separator,
bool need_leading_separator) {
std::string out_val;
if (framework::TransToProtoVarType(tensor->dtype()) == proto::VarType::FP32) {
out_val = PrintLodTensorType<float>(tensor, start, end);
out_val = PrintLodTensorType<float>(
tensor, start, end, separator, need_leading_separator);
} else if (framework::TransToProtoVarType(tensor->dtype()) ==
proto::VarType::INT64) {
out_val = PrintLodTensorIntType(tensor, start, end);
out_val = PrintLodTensorIntType(
tensor, start, end, separator, need_leading_separator);
} else if (framework::TransToProtoVarType(tensor->dtype()) ==
proto::VarType::FP64) {
out_val = PrintLodTensorType<double>(tensor, start, end);
out_val = PrintLodTensorType<double>(
tensor, start, end, separator, need_leading_separator);
} else {
out_val = "unsupported type";
}
return out_val;
}
void PrintLodTensor(Tensor* tensor,
int64_t start,
int64_t end,
std::string& out_val,
char separator,
bool need_leading_separator) {
if (framework::TransToProtoVarType(tensor->dtype()) == proto::VarType::FP32) {
PrintLodTensorType<float>(
tensor, start, end, out_val, separator, need_leading_separator);
} else if (framework::TransToProtoVarType(tensor->dtype()) ==
proto::VarType::INT64) {
PrintLodTensorIntType(
tensor, start, end, out_val, separator, need_leading_separator);
} else if (framework::TransToProtoVarType(tensor->dtype()) ==
proto::VarType::FP64) {
PrintLodTensorType<double>(
tensor, start, end, out_val, separator, need_leading_separator);
} else {
out_val += "unsupported type";
}
}
std::pair<int64_t, int64_t> GetTensorBound(LoDTensor* tensor, int index) {
auto& dims = tensor->dims();
if (tensor->lod().size() != 0) {
......@@ -122,6 +253,11 @@ void DeviceWorker::DumpParam(const Scope& scope, const int batch_id) {
}
void DeviceWorker::InitRandomDumpConfig(const TrainerDesc& desc) {
bool is_dump_in_simple_mode = desc.is_dump_in_simple_mode();
if (is_dump_in_simple_mode) {
dump_mode_ = 3;
return;
}
bool enable_random_dump = desc.enable_random_dump();
if (!enable_random_dump) {
dump_mode_ = 0;
......@@ -140,16 +276,124 @@ void DeviceWorker::DumpField(const Scope& scope,
int dump_interval) { // dump_mode: 0: no random,
// 1: random with insid hash,
// 2: random with random
// number
// 3: simple mode using multi-threads, for gpugraphps-mode
auto start1 = std::chrono::steady_clock::now();
size_t batch_size = device_reader_->GetCurBatchSize();
auto& ins_id_vec = device_reader_->GetInsIdVec();
auto& ins_content_vec = device_reader_->GetInsContentVec();
if (ins_id_vec.size() > 0) {
if (dump_mode_ == 3) {
batch_size = std::string::npos;
bool has_valid_batch = false;
for (auto& field : *dump_fields_) {
Variable* var = scope.FindVar(field);
if (var == nullptr) {
          VLOG(0) << "Note: field[" << field
                  << "] cannot be found in scope, so it was skipped.";
continue;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
if (!tensor->IsInitialized()) {
VLOG(0) << "Note: field[" << field
<< "] is not initialized, so it was skipped.";
continue;
}
auto& dims = tensor->dims();
if (dims.size() == 2 && dims[0] > 0) {
batch_size = std::min(batch_size, static_cast<size_t>(dims[0]));
// VLOG(0)<<"in dump field ---> "<<field<<" dim_size = "<<dims[0]<<"
// "<<dims[1]<<" batch_size = "<<batch_size;
has_valid_batch = true;
}
}
if (!has_valid_batch) return;
} else if (ins_id_vec.size() > 0) {
batch_size = ins_id_vec.size();
}
std::vector<std::string> ars(batch_size);
std::vector<bool> hit(batch_size, false);
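  // dump_mode_ == 3 (simple mode): batch_size is re-derived as the smallest
  // dims[0] among the valid 2-D dump fields, each field is printed row by row
  // (space separated) by up to tensor_iterator_thread_num worker threads, and
  // the per-row strings are finally concatenated per thread chunk and written.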
if (dump_mode_ == 3) {
if (dump_fields_ == NULL || (*dump_fields_).size() == 0) {
return;
}
auto set_output_str = [&, this](
size_t begin, size_t end, LoDTensor* tensor) {
std::pair<int64_t, int64_t> bound;
auto& dims = tensor->dims();
for (size_t i = begin; i < end; ++i) {
bound = {i * dims[1], (i + 1) * dims[1]};
// auto bound = GetTensorBound(tensor, i);
if (ars[i].size() > 0) ars[i] += "\t";
// ars[i] += '[';
PrintLodTensor(tensor, bound.first, bound.second, ars[i], ' ', false);
// ars[i] += ']';
// ars[i] += "<" + PrintLodTensor(tensor, bound.first, bound.second, '
// ', false) + ">";
}
};
std::vector<std::thread> threads(tensor_iterator_thread_num);
for (auto& field : *dump_fields_) {
Variable* var = scope.FindVar(field);
if (var == nullptr) {
        VLOG(0) << "Note: field[" << field
                << "] cannot be found in scope, so it was skipped.";
continue;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
if (!tensor->IsInitialized()) {
VLOG(0) << "Note: field[" << field
<< "] is not initialized, so it was skipped.";
continue;
}
framework::LoDTensor cpu_tensor;
if (platform::is_gpu_place(tensor->place())) {
TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
cpu_tensor.set_lod(tensor->lod());
tensor = &cpu_tensor;
}
auto& dims = tensor->dims();
if (dims.size() != 2 || dims[0] <= 0) {
        VLOG(0) << "Note: field[" << field
                << "] cannot pass the check, so it was skipped. "
                   "Maybe the dimension is wrong.";
VLOG(0) << dims.size() << " " << dims[0] << " * " << dims[1];
continue;
}
size_t acutal_thread_num =
std::min((size_t)batch_size, tensor_iterator_thread_num);
for (size_t i = 0; i < acutal_thread_num; i++) {
size_t average_size = batch_size / acutal_thread_num;
size_t begin =
average_size * i + std::min(batch_size % acutal_thread_num, i);
size_t end =
begin + average_size + (i < batch_size % acutal_thread_num ? 1 : 0);
threads[i] = std::thread(set_output_str, begin, end, tensor);
}
for (size_t i = 0; i < acutal_thread_num; i++) threads[i].join();
}
auto end1 = std::chrono::steady_clock::now();
auto tt =
std::chrono::duration_cast<std::chrono::microseconds>(end1 - start1);
VLOG(1) << "writing a batch takes " << tt.count() << " us";
size_t acutal_thread_num =
std::min((size_t)batch_size, tensor_iterator_thread_num);
for (size_t i = 0; i < acutal_thread_num; i++) {
size_t average_size = batch_size / acutal_thread_num;
size_t begin =
average_size * i + std::min(batch_size % acutal_thread_num, i);
size_t end =
begin + average_size + (i < batch_size % acutal_thread_num ? 1 : 0);
for (size_t j = begin + 1; j < end; j++) {
if (ars[begin].size() > 0 && ars[j].size() > 0) ars[begin] += "\n";
ars[begin] += ars[j];
}
if (ars[begin].size() > 0) writer_ << ars[begin];
}
return;
}
std::vector<bool> hit(batch_size, false);
std::default_random_engine engine(0);
std::uniform_int_distribution<size_t> dist(0U, INT_MAX);
for (size_t i = 0; i < batch_size; i++) {
......@@ -206,6 +450,7 @@ void DeviceWorker::DumpField(const Scope& scope,
ars[i] += PrintLodTensor(tensor, bound.first, bound.second);
}
}
// #pragma omp parallel for
for (size_t i = 0; i < ars.size(); i++) {
if (ars[i].length() == 0) {
......
......@@ -31,6 +31,7 @@ limitations under the License. */
#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#endif
#include <map>
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/heter_util.h"
......@@ -59,7 +60,17 @@ class Scope;
namespace paddle {
namespace framework {
std::string PrintLodTensor(Tensor* tensor, int64_t start, int64_t end);
std::string PrintLodTensor(Tensor* tensor,
int64_t start,
int64_t end,
char separator = ',',
bool need_leading_separator = false);
void PrintLodTensor(Tensor* tensor,
int64_t start,
int64_t end,
std::string& output_str,
char separator = ',',
bool need_leading_separator = false);
std::pair<int64_t, int64_t> GetTensorBound(LoDTensor* tensor, int index);
bool CheckValidOutput(LoDTensor* tensor, size_t batch_size);
......@@ -230,6 +241,7 @@ class DeviceWorker {
int dump_mode_ = 0;
int dump_interval_ = 10000;
ChannelWriter<std::string> writer_;
const size_t tensor_iterator_thread_num = 16;
platform::DeviceContext* dev_ctx_ = nullptr;
};
......@@ -772,7 +784,6 @@ class HeterSectionWorker : public DeviceWorker {
static uint64_t batch_id_;
uint64_t total_ins_num_ = 0;
platform::DeviceContext* dev_ctx_ = nullptr;
bool debug_ = false;
std::vector<double> op_total_time_;
std::vector<std::string> op_name_;
......
......@@ -29,7 +29,7 @@ TEST(LodTensor, PrintLodTensor) {
std::string res = PrintLodTensor(&tensor1, -1, 2);
ASSERT_EQ(res, "access violation");
res = PrintLodTensor(&tensor1, 0, 2);
ASSERT_EQ(res, ":0.2:0.5");
ASSERT_EQ(res, "0.2,0.5");
LoDTensor tensor2;
tensor2.Resize({2});
......@@ -39,7 +39,7 @@ TEST(LodTensor, PrintLodTensor) {
res = PrintLodTensor(&tensor2, -1, 2);
ASSERT_EQ(res, "access violation");
res = PrintLodTensor(&tensor2, 0, 2);
ASSERT_EQ(res, ":1:2");
ASSERT_EQ(res, "1,2");
LoDTensor tensor3;
tensor3.Resize({2});
......@@ -47,7 +47,40 @@ TEST(LodTensor, PrintLodTensor) {
tensor3.data<double>()[0] = 0.1;
tensor3.data<double>()[1] = 0.2;
res = PrintLodTensor(&tensor3, 0, 2);
ASSERT_EQ(res, ":0.1:0.2");
ASSERT_EQ(res, "0.1,0.2");
LoDTensor tensor4;
tensor4.Resize({2});
tensor4.mutable_data<double>(platform::CPUPlace());
tensor4.data<double>()[0] = 0.1;
tensor4.data<double>()[1] = 0.2;
res = "";
PrintLodTensor(&tensor4, 0, 2, res);
// ASSERT_EQ(res, "0.1,0.2");
LoDTensor tensor5;
tensor5.Resize({2});
tensor5.mutable_data<int64_t>(platform::CPUPlace());
tensor5.data<int64_t>()[0] = 1;
tensor5.data<int64_t>()[1] = 2;
res = "";
PrintLodTensor(&tensor5, -1, 2, res);
ASSERT_EQ(res, "access violation");
res = "";
PrintLodTensor(&tensor5, 0, 2, res);
ASSERT_EQ(res, "1,2");
LoDTensor tensor6;
tensor6.Resize({2});
tensor6.mutable_data<float>(platform::CPUPlace());
tensor6.data<float>()[0] = 0.2;
tensor6.data<float>()[1] = 0.5;
res = "";
PrintLodTensor(&tensor6, -1, 2, res);
// ASSERT_EQ(res, "access violation");
res = "";
PrintLodTensor(&tensor6, 0, 2, res);
// ASSERT_EQ(res, "0.2,0.5");
}
TEST(LodTensor, GetTensorBound) {
......
......@@ -207,6 +207,12 @@ message TableAccessorParameter {
repeated TableAccessorSaveParameter table_accessor_save_param = 8;
optional SGDParameter embed_sgd_param = 10;
optional SGDParameter embedx_sgd_param = 11;
optional GraphSGDParameter graph_sgd_param = 12;
}
message GraphSGDParameter {
optional uint32 nodeid_slot = 1 [ default = 9008 ];
optional float feature_learning_rate = 2 [ default = 0.05 ];
}
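// A minimal sketch of selecting the graph-specific SGD settings inside a
// TableAccessorParameter text proto; the values shown are simply the declared
// defaults above:
//   graph_sgd_param {
//     nodeid_slot: 9008
//     feature_learning_rate: 0.05
//   }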
message SGDParameter {
......
......@@ -51,6 +51,8 @@
}
#endif
DECLARE_bool(gpugraph_enable_hbm_table_collision_stat);
// TODO: can we do this more efficiently?
__inline__ __device__ int8_t atomicCAS(int8_t* address,
int8_t compare,
......@@ -330,8 +332,7 @@ template <typename Key,
Key unused_key,
typename Hasher = default_hash<Key>,
typename Equality = equal_to<Key>,
typename Allocator = managed_allocator<thrust::pair<Key, Element>>,
bool count_collisions = false>
typename Allocator = managed_allocator<thrust::pair<Key, Element>>>
class concurrent_unordered_map : public managed {
public:
using size_type = size_t;
......@@ -363,9 +364,12 @@ class concurrent_unordered_map : public managed {
m_allocator(a),
m_hashtbl_size(n),
m_hashtbl_capacity(n),
m_collisions(0),
m_unused_element(
unused_element) { // allocate the raw data of hash table:
m_unused_element(unused_element),
m_enable_collision_stat(false),
m_insert_times(0),
m_insert_collisions(0),
m_query_times(0),
m_query_collisions(0) { // allocate the raw data of hash table:
    // m_hashtbl_values; pre-allocate it on the current GPU when using UM.
m_hashtbl_values = m_allocator.allocate(m_hashtbl_capacity);
constexpr int block_size = 128;
......@@ -390,9 +394,9 @@ class concurrent_unordered_map : public managed {
// Initialize kernel, set all entry to unused <K,V>
init_hashtbl<<<((m_hashtbl_size - 1) / block_size) + 1, block_size>>>(
m_hashtbl_values, m_hashtbl_size, unused_key, m_unused_element);
// CUDA_RT_CALL( cudaGetLastError() );
CUDA_RT_CALL(cudaStreamSynchronize(0));
CUDA_RT_CALL(cudaGetLastError());
m_enable_collision_stat = FLAGS_gpugraph_enable_hbm_table_collision_stat;
}
~concurrent_unordered_map() {
......@@ -572,11 +576,16 @@ class concurrent_unordered_map : public managed {
// TODO: How to handle data types less than 32 bits?
if (keys_equal(unused_key, old_key) || keys_equal(insert_key, old_key)) {
update_existing_value(existing_value, x, op);
insert_success = true;
if (m_enable_collision_stat) {
atomicAdd(&m_insert_times, 1);
}
break;
}
if (m_enable_collision_stat) {
atomicAdd(&m_insert_collisions, 1);
}
current_index = (current_index + 1) % hashtbl_size;
current_hash_bucket = &(hashtbl_values[current_index]);
}
......@@ -614,9 +623,9 @@ std::numeric_limits<mapped_type>::is_integer && sizeof(unsigned long long int)
reinterpret_cast<unsigned long long
int*>(tmp_it), unused, value ); if ( old_val == unused ) { it = tmp_it;
}
else if ( count_collisions )
else if ( m_enable_collision_stat )
{
atomicAdd( &m_collisions, 1 );
atomicAdd( &m_insert_collisions, 1 );
}
} else {
const key_type old_key = atomicCAS( &(tmp_it->first), unused_key,
......@@ -625,9 +634,9 @@ x.first );
(m_hashtbl_values+hash_tbl_idx)->second = x.second;
it = tmp_it;
}
else if ( count_collisions )
else if ( m_enable_collision_stat )
{
atomicAdd( &m_collisions, 1 );
atomicAdd( &m_insert_collisions, 1 );
}
}
#else
......@@ -648,8 +657,7 @@ x.second );
}
*/
__forceinline__ __host__ __device__ const_iterator
find(const key_type& k) const {
__forceinline__ __device__ const_iterator find(const key_type& k) {
size_type key_hash = m_hf(k);
size_type hash_tbl_idx = key_hash % m_hashtbl_size;
......@@ -667,10 +675,17 @@ x.second );
begin_ptr = m_hashtbl_values + m_hashtbl_size;
break;
}
if (m_enable_collision_stat) {
atomicAdd(&m_query_collisions, 1);
}
hash_tbl_idx = (hash_tbl_idx + 1) % m_hashtbl_size;
++counter;
}
if (m_enable_collision_stat) {
atomicAdd(&m_query_times, 1);
}
return const_iterator(
m_hashtbl_values, m_hashtbl_values + m_hashtbl_size, begin_ptr);
}
......@@ -770,7 +785,7 @@ x.second );
int assign_async(const concurrent_unordered_map& other,
cudaStream_t stream = 0) {
m_collisions = other.m_collisions;
m_insert_collisions = other.m_insert_collisions;
if (other.m_hashtbl_size <= m_hashtbl_capacity) {
m_hashtbl_size = other.m_hashtbl_size;
} else {
......@@ -795,10 +810,15 @@ x.second );
0,
stream>>>(
m_hashtbl_values, m_hashtbl_size, unused_key, m_unused_element);
if (count_collisions) m_collisions = 0;
if (m_enable_collision_stat) {
m_insert_times = 0;
m_insert_collisions = 0;
m_query_times = 0;
m_query_collisions = 0;
}
}
unsigned long long get_num_collisions() const { return m_collisions; }
unsigned long long get_num_collisions() const { return m_insert_collisions; }
void print() {
for (size_type i = 0; i < 5; ++i) {
......@@ -850,6 +870,21 @@ x.second );
return it;
}
__host__ void print_collision(int id) {
if (m_enable_collision_stat) {
printf(
"collision stat for hbm table %d, insert(%lu:%lu:%.2f), "
"query(%lu:%lu:%.2f)\n",
id,
m_insert_times,
m_insert_collisions,
m_insert_collisions / (double)m_insert_times,
m_query_times,
m_query_collisions,
m_query_collisions / (double)m_query_times);
}
}
private:
const hasher m_hf;
const key_equal m_equal;
......@@ -862,7 +897,11 @@ x.second );
size_type m_hashtbl_capacity;
value_type* m_hashtbl_values;
unsigned long long m_collisions;
bool m_enable_collision_stat;
uint64_t m_insert_times;
uint64_t m_insert_collisions;
uint64_t m_query_times;
uint64_t m_query_collisions;
};
#endif // CONCURRENT_UNORDERED_MAP_CUH
......@@ -13,11 +13,16 @@ limitations under the License. */
#ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle {
namespace framework {
template <typename FVAccessor>
const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS;
#define GET_BLOCK(N) ((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS)
#define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0
template <typename GPUAccessor>
__global__ void PullCopy(float** dest,
const float* src,
const int64_t* len,
......@@ -26,7 +31,7 @@ __global__ void PullCopy(float** dest,
uint64_t** keys,
uint64_t max_val_size,
int* gpu_dim,
FVAccessor feature_value_accessor) {
GPUAccessor gpu_accessor) {
CUDA_KERNEL_LOOP(i, total_len) {
int low = 0;
int high = slot_num - 1;
......@@ -42,12 +47,62 @@ __global__ void PullCopy(float** dest,
float* feature_value_ptr =
(float*)((char*)src + uint64_t(i) * uint64_t(max_val_size));
int mf_dim = gpu_dim[x] - 3;
feature_value_accessor.Select(
gpu_accessor.Select(
dest[x] + y * (mf_dim + 3), feature_value_ptr, keys[x] + y, mf_dim);
}
}
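// PullDedupCopy scatters deduplicated pull values back into the per-slot
// output tensors: each thread handles one (key, column) pair, key2slot and
// slot_lens map key i to its slot-local row, and restore_idx selects the
// deduplicated row of src holding this key's value; key 0 is filled with zero.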
template <typename FVAccessor>
template <typename TAccess>
__global__ void PullDedupCopy(const size_t N,
const uint64_t* total_keys,
float** dest,
const float* src,
const int64_t* slot_lens,
uint64_t max_val_size,
const int* slot_dims,
const int hidden,
const int* key2slot,
const uint32_t* restore_idx,
TAccess accessor) {
CUDA_KERNEL_LOOP(idx, N) {
int i = idx / hidden;
int off = idx % hidden;
int x = key2slot[i];
int y = i - slot_lens[x];
assert(slot_dims[x] == hidden);
float* dest_ptr = dest[x] + y * hidden;
    // key 0: fill the output with zero
if (total_keys[i] == 0) {
*(dest_ptr + off) = 0;
return;
}
float* src_ptr = (float*)((char*)src + uint64_t(restore_idx[i]) *
uint64_t(max_val_size));
switch (off) {
case 0:
*(dest_ptr + off) = src_ptr[accessor.ShowIndex()];
break;
case 1:
*(dest_ptr + off) = src_ptr[accessor.ClickIndex()];
break;
case 2:
*(dest_ptr + off) = src_ptr[accessor.EmbedWIndex()];
break;
default:
if (src_ptr[accessor.MfSizeIndex()] == 0) {
*(dest_ptr + off) = 0;
} else {
*(dest_ptr + off) = src_ptr[accessor.EmbedxWIndex() + off - 3];
}
break;
}
}
}
template <typename GPUAccessor>
__global__ void PushCopyWithPool(float* dest,
float** src,
int64_t* len,
......@@ -57,7 +112,7 @@ __global__ void PushCopyWithPool(float* dest,
int* slot_vector,
int* mf_dim_vector,
size_t grad_value_size,
FVAccessor feature_value_accessor) {
GPUAccessor gpu_accessor) {
CUDA_KERNEL_LOOP(i, total_len) {
int low = 0;
int high = slot_num - 1;
......@@ -72,24 +127,167 @@ __global__ void PushCopyWithPool(float* dest,
int y = i - (x ? len[low - 1] : 0);
float* cur = (float*)((char*)dest + i * grad_value_size);
cur[feature_value_accessor.common_push_value.SlotIndex()] =
(float)slot_vector[x];
cur[gpu_accessor.common_push_value.SlotIndex()] = (float)slot_vector[x];
int mf_dim = mf_dim_vector[x];
cur[feature_value_accessor.common_push_value.MfDimIndex()] = mf_dim;
cur[gpu_accessor.common_push_value.MfDimIndex()] = mf_dim;
cur[feature_value_accessor.common_push_value.ShowIndex()] =
cur[gpu_accessor.common_push_value.ShowIndex()] =
*(src[x] + y * (mf_dim + 3));
cur[feature_value_accessor.common_push_value.ClickIndex()] =
cur[gpu_accessor.common_push_value.ClickIndex()] =
*(src[x] + y * (mf_dim + 3) + 1);
cur[feature_value_accessor.common_push_value.EmbedGIndex()] =
cur[gpu_accessor.common_push_value.EmbedGIndex()] =
*(src[x] + y * (mf_dim + 3) + 2) * -1. * bs;
for (int j = 0; j < mf_dim; j++) {
cur[feature_value_accessor.common_push_value.EmbedxGIndex() + j] =
cur[gpu_accessor.common_push_value.EmbedxGIndex() + j] =
*(src[x] + y * (mf_dim + 3) + 3 + j) * -1. * bs;
}
}
}
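// PushMergeCopyAtomic merges the gradients of duplicated keys with atomicAdd:
// each thread owns one (key, column) pair and accumulates it into the
// deduplicated gradient row selected by d_restore_idx; zero keys are skipped.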
template <typename TAccess>
__global__ void PushMergeCopyAtomic(const size_t N,
const uint64_t* total_keys,
float* dest,
float** src,
const int hidden,
const int bs,
const int* slot_vector,
const int* slot_dims,
const int64_t* slot_lens,
const int* key2slot,
const uint32_t* d_restore_idx,
size_t grad_value_size,
TAccess accessor) {
CUDA_KERNEL_LOOP(idx, N) {
int i = idx / hidden;
int off = idx % hidden;
// filter 0 keys
if (total_keys[i] == 0) {
return;
}
int x = key2slot[i];
int y = i - slot_lens[x];
const float* ptr = src[x] + y * hidden;
float* cur = (float*)((char*)dest + d_restore_idx[i] * grad_value_size);
int mf_dim = slot_dims[x] - 3;
switch (off) {
case 0:
cur[accessor.SlotIndex()] = (float)slot_vector[x];
cur[accessor.MfDimIndex()] = mf_dim;
paddle::platform::CudaAtomicAdd(&cur[accessor.ShowIndex()],
*(ptr + off));
break;
case 1:
paddle::platform::CudaAtomicAdd(&cur[accessor.ClickIndex()],
*(ptr + off));
break;
case 2:
paddle::platform::CudaAtomicAdd(&cur[accessor.EmbedGIndex()],
*(ptr + off) * -1. * bs);
break;
default:
int embedx_idx = off - 3;
if (mf_dim < embedx_idx) {
return;
}
paddle::platform::CudaAtomicAdd(
&cur[accessor.EmbedxGIndex() + embedx_idx],
*(ptr + off) * -1. * bs);
break;
}
}
}
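// SUM_GRAD_VALUE accumulates into the local variable `val` the gradient
// component at column `off` for every duplicate of the current key:
// d_sort_idx[start, start + count) lists the original positions of the key,
// and key2slot/slot_lens map each position back to its slot-local row in src.
// It expects count, start, off, y, val and the kernel arguments to be in
// scope (used by PushMergeCopy below).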
#define SUM_GRAD_VALUE \
for (uint32_t j = 0; j < count; ++j) { \
const uint32_t& pos = d_sort_idx[start + j]; \
const int& x = key2slot[pos]; \
y = pos - slot_lens[x]; \
val += *(reinterpret_cast<float*>(src[x] + y * hidden + off)); \
}
template <typename TAccess>
__global__ void PushMergeCopy(const size_t N,
const uint64_t* total_keys,
float* dest,
float** src,
const int hidden,
const int bs,
const int* slot_vector,
const int* slot_dims,
const int64_t* slot_lens,
const int* key2slot,
const uint32_t* d_sort_idx,
const uint32_t* d_sort_offset,
const uint32_t* d_sort_cnt,
size_t grad_value_size,
TAccess accessor) {
CUDA_KERNEL_LOOP(idx, N) {
int i = idx / hidden;
int off = idx % hidden;
// filter 0 keys
float* cur = (float*)((char*)dest + i * grad_value_size);
if (total_keys[i] == 0) {
switch (off) {
case 0:
cur[accessor.SlotIndex()] = 0;
cur[accessor.MfDimIndex()] = 0;
cur[accessor.ShowIndex()] = 0.0;
break;
case 1:
cur[accessor.ClickIndex()] = 0.0;
break;
case 2:
cur[accessor.EmbedGIndex()] = 0.0;
break;
default:
cur[accessor.EmbedxGIndex() + off - 3] = 0.0;
break;
}
return;
}
const uint32_t& start = d_sort_offset[i];
const uint32_t& count = d_sort_cnt[i];
const uint32_t& pos = d_sort_idx[start];
const int& x = key2slot[pos];
int y = pos - slot_lens[x];
int mf_dim = slot_dims[x] - 3;
double val = 0.0;
switch (off) {
case 0:
cur[accessor.SlotIndex()] = (float)slot_vector[x];
cur[accessor.MfDimIndex()] = mf_dim;
SUM_GRAD_VALUE
cur[accessor.ShowIndex()] = val;
break;
case 1:
SUM_GRAD_VALUE
cur[accessor.ClickIndex()] = val;
break;
case 2:
SUM_GRAD_VALUE
cur[accessor.EmbedGIndex()] = val * -1. * bs;
break;
default:
int embedx_idx = off - 3;
if (mf_dim < embedx_idx) {
cur[accessor.EmbedxGIndex() + embedx_idx] = 0.0;
return;
}
SUM_GRAD_VALUE
cur[accessor.EmbedxGIndex() + embedx_idx] = val * -1. * bs;
break;
}
}
}
template <typename GPUAccessor>
void AccessorWrapper<GPUAccessor>::CopyForPullImpl(
const paddle::platform::Place& place,
......@@ -183,6 +381,118 @@ void AccessorWrapper<GPUAccessor>::CopyForPushImpl(
cudaStreamSynchronize(stream);
}
template <typename GPUAccessor>
void AccessorWrapper<GPUAccessor>::CopyForPullDedupImpl(
const paddle::platform::Place& place,
const uint64_t* total_keys,
float** gpu_values,
const float* total_values_gpu,
const int64_t* slot_lens,
const int* key2slot,
const int hidden_size,
const int64_t total_length,
const int* slot_dims,
const uint32_t* gpu_restore_idx,
int pull_value_size) {
auto stream = dynamic_cast<paddle::platform::CUDADeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place))
->stream();
size_t N = total_length * hidden_size;
PullDedupCopy<<<CUDA_BLOCK(N), stream>>>(N,
total_keys,
gpu_values,
total_values_gpu,
slot_lens,
pull_value_size,
slot_dims,
hidden_size,
key2slot,
gpu_restore_idx,
gpu_accessor_.common_pull_value);
cudaStreamSynchronize(stream);
}
template <typename GPUAccessor>
void AccessorWrapper<GPUAccessor>::CopyForPushDedupImpl(
const paddle::platform::Place& place,
const uint64_t* total_keys,
float** grad_values,
float* total_grad_values_gpu,
const int* slots,
const int64_t* slot_lens,
const int hidden_size,
const int64_t total_length,
const int64_t dedup_length,
const int batch_size,
const int* slot_dims,
const int* key2slot,
const uint32_t* d_restore_idx,
const size_t grad_value_size) {
auto stream = dynamic_cast<paddle::platform::CUDADeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place))
->stream();
cudaMemsetAsync(
total_grad_values_gpu, 0, dedup_length * grad_value_size, stream);
size_t N = total_length * hidden_size;
PushMergeCopyAtomic<<<CUDA_BLOCK(N), stream>>>(
N,
total_keys,
total_grad_values_gpu,
grad_values,
hidden_size,
batch_size,
slots,
slot_dims,
slot_lens,
key2slot,
d_restore_idx,
grad_value_size,
gpu_accessor_.common_push_value);
cudaStreamSynchronize(stream);
}
template <typename GPUAccessor>
void AccessorWrapper<GPUAccessor>::CopyForPushDedupImpl(
const paddle::platform::Place& place,
const uint64_t* total_keys,
float** grad_values,
float* total_grad_values_gpu,
const int* slots,
const int64_t* slot_lens,
const int hidden_size,
const int64_t total_length,
const int64_t dedup_length,
const int batch_size,
const int* slot_dims,
const int* key2slot,
const uint32_t* gpu_sort_idx,
const uint32_t* gpu_sort_offset,
const uint32_t* gpu_sort_lens,
const size_t grad_value_size) {
auto stream = dynamic_cast<paddle::platform::CUDADeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place))
->stream();
  // merge all grads of the same key into one row
size_t N = dedup_length * hidden_size;
PushMergeCopy<<<CUDA_BLOCK(N), stream>>>(N,
total_keys,
total_grad_values_gpu,
grad_values,
hidden_size,
batch_size,
slots,
slot_dims,
slot_lens,
key2slot,
gpu_sort_idx,
gpu_sort_offset,
gpu_sort_lens,
grad_value_size,
gpu_accessor_.common_push_value);
cudaStreamSynchronize(stream);
}
#ifdef PADDLE_WITH_PSCORE
template class AccessorWrapper<CommonFeatureValueAccessor>;
#endif
......
......@@ -36,27 +36,10 @@ typedef uint64_t FeatureKey;
#define TYPEALIGN(ALIGNVAL, LEN) \
(((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1)))
class FeatureValueAccessor {
public:
__host__ __device__ FeatureValueAccessor() {}
__host__ __device__ ~FeatureValueAccessor() {}
__host__ __device__ virtual int Configure(
std::unordered_map<std::string, float> config) {
_config = config;
Initialize();
return 0;
}
__host__ __device__ virtual int Initialize() = 0;
protected:
std::unordered_map<std::string, float> _config;
};
// adagrad: embed_sgd_dim=1, embedx_sgd_dim=1,embedx_dim=n
// adam std: embed_sgd_dim=4, embedx_sgd_dim=n*2+2,embedx_dim=n
// adam shared: embed_sgd_dim=4, embedx_sgd_dim=4,embedx_dim=n
class CommonFeatureValueAccessor : public FeatureValueAccessor {
class CommonFeatureValueAccessor {
public:
struct CommonFeatureValue {
/*
......@@ -175,6 +158,30 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor {
int optimizer_type_;
};
struct CommonPullValue {
/*
float show;
float click;
float embed_w;
      float mf_size;
std::vector<float> embedx_w;
*/
__host__ __device__ static int Dim(int embedx_dim) {
return 4 + embedx_dim;
}
__host__ __device__ int DimSize(size_t dim) { return sizeof(float); }
__host__ __device__ int Size(int embedx_dim) {
return TYPEALIGN(8, Dim(embedx_dim) * sizeof(float));
}
__host__ __device__ int ShowIndex() { return 0; }
__host__ __device__ int ClickIndex() { return 1; }
__host__ __device__ int EmbedWIndex() { return 2; }
__host__ __device__ int MfSizeIndex() {
return 3;
    }  // the actual mf size (may be 0)
__host__ __device__ int EmbedxWIndex() { return 4; }
};
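  // Layout sketch: with embedx_dim = 8, a pull value is 12 floats
  // (Size(8) = TYPEALIGN(8, 12 * sizeof(float)) = 48 bytes):
  //   [0] show, [1] click, [2] embed_w, [3] mf_size, [4..11] embedx_w
  // where mf_size holds the real mf dim of this key (0 if no mf yet).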
struct CommonPushValue {
/*
float slot;
......@@ -229,43 +236,10 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor {
}
};
struct CommonPullValue {
/*
float show;
float click;
float embed_w;
std::vector<float> embedx_w;
*/
__host__ __device__ static int Dim(int embedx_dim) {
return 3 + embedx_dim;
}
__host__ __device__ int DimSize(size_t dim) { return sizeof(float); }
__host__ __device__ int Size(int embedx_dim) {
return TYPEALIGN(8, Dim(embedx_dim) * sizeof(float));
}
__host__ __device__ int ShowIndex() { return 0; }
__host__ __device__ int ClickIndex() { return 1; }
__host__ __device__ int EmbedWIndex() { return 2; }
__host__ __device__ int EmbedxWIndex() { return 3; }
__host__ __device__ float& Show(float* val) {
return val[CommonPullValue::ShowIndex()];
}
__host__ __device__ float& Click(float* val) {
return val[CommonPullValue::ClickIndex()];
}
__host__ __device__ float& EmbedW(float* val) {
return val[CommonPullValue::EmbedWIndex()];
}
__host__ __device__ float* EmbedxW(float* val) {
return val + CommonPullValue::EmbedxWIndex();
}
};
__host__ __device__ CommonFeatureValueAccessor() {}
__host__ __device__ ~CommonFeatureValueAccessor() {}
__host__ __device__ virtual int Initialize() {
__host__ int Initialize() {
int optimizer_type = (_config.find("optimizer_type") == _config.end())
? 1
: int(_config["optimizer_type"]);
......@@ -288,6 +262,12 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor {
return 0;
}
__host__ int Configure(std::unordered_map<std::string, float>& config) {
_config = config;
Initialize();
return 0;
}
  // copy from cpu_val into gpu_val during the build stage
__host__ void BuildFill(
float* gpu_val,
......@@ -388,7 +368,7 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor {
#endif
}
  // dy_mf_fill_dvals_kernel, dy_mf_search_kernel stage gpu kernels:
  // dy_mf_fill_dvals_kernel stage gpu kernel:
  // copy from src_val into dest_val
__host__ __device__ void FeatureValueFill(float* dest_val,
float* src_val,
......@@ -422,6 +402,32 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor {
}
}
  // dy_mf_fill_dvals_kernel, dy_mf_search_kernel stage gpu kernels:
  // copy from src_val into dest_val
__host__ __device__ void PullValueFill(float* dest_val, float* src_val) {
dest_val[common_pull_value.ShowIndex()] =
src_val[common_feature_value.ShowIndex()];
dest_val[common_pull_value.ClickIndex()] =
src_val[common_feature_value.ClickIndex()];
dest_val[common_pull_value.EmbedWIndex()] =
src_val[common_feature_value.EmbedWIndex()];
int mf_size = int(src_val[common_feature_value.MfSizeIndex()]);
if (mf_size == 0) {
dest_val[common_pull_value.MfSizeIndex()] = 0;
return;
}
    // record the real mf dim size in the pull value
int mf_dim = int(src_val[common_feature_value.MfDimIndex()]);
dest_val[common_pull_value.MfSizeIndex()] = mf_dim;
int embedx_off = common_pull_value.EmbedxWIndex();
int value_off = common_feature_value.EmbedxWIndex();
for (int k = 0; k < mf_dim; ++k) {
dest_val[embedx_off + k] = src_val[value_off + k];
}
}
  // dy_mf_fill_shard_grads_kernel, update_one stage gpu kernels:
  // copy from src_val into dest_val
__host__ __device__ void PushValueFill(float* dest_val,
......@@ -508,8 +514,9 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor {
}
} else {
for (int j = 0; j < mf_dim; j++) {
*(dest_val + common_pull_value.EmbedxWIndex() + j) =
src_val[common_feature_value.EmbedxWOffsetIndex(src_val) + j];
        // in common_pull_value there is also MfSizeIndex before EmbedxWIndex,
        // so common_pull_value.EmbedxWIndex() is not used directly here
*(dest_val + 3 + j) = src_val[common_pull_value.EmbedxWIndex() + j];
}
}
}
......@@ -554,6 +561,7 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor {
}
public:
std::unordered_map<std::string, float> _config;
CommonFeatureValue common_feature_value;
CommonPushValue common_push_value;
CommonPullValue common_pull_value;
......@@ -638,6 +646,8 @@ class VirtualAccessor {
virtual size_t GetPushValueSize(int& mf_dim) = 0;
virtual size_t GetPullValueSize(int& mf_dim) = 0;
virtual void BuildFill(void* gpu_val,
void* cpu_val,
paddle::distributed::ValueAccessor* cpu_table_accessor,
......@@ -657,6 +667,18 @@ class VirtualAccessor {
const int64_t total_length,
int* gpu_dim,
int feature_value_size) = 0;
// dedup
virtual void CopyForPull(const paddle::platform::Place& place,
const uint64_t* total_keys,
float** gpu_values,
const float* total_values_gpu,
const int64_t* slot_lens,
const int* key2slot,
const int hidden_size,
const int64_t total_length,
const int* slot_dims,
const uint32_t* gpu_restore_idx,
int pull_value_size) = 0;
virtual void CopyForPush(const paddle::platform::Place& place,
const std::vector<const float*>& grad_values,
......@@ -668,6 +690,39 @@ class VirtualAccessor {
std::vector<int>& slot_vector,
std::vector<int>& slot_mf_dim_vector) = 0;
// dedup
virtual void CopyForPush(const paddle::platform::Place& place,
const uint64_t* total_keys,
float** grad_values,
float* total_grad_values_gpu,
const int* slots,
const int64_t* slot_lens,
const int hidden_size,
const int64_t total_length,
const int64_t dedup_length,
const int batch_size,
const int* slot_dims,
const int* key2slot,
const uint32_t* d_restore_idx,
const size_t grad_value_size) = 0;
virtual void CopyForPush(const paddle::platform::Place& place,
const uint64_t* total_keys,
float** grad_values,
float* total_grad_values_gpu,
const int* slots,
const int64_t* slot_lens,
const int hidden_size,
const int64_t total_length,
const int64_t dedup_length,
const int batch_size,
const int* slot_dims,
const int* key2slot,
const uint32_t* gpu_sort_idx,
const uint32_t* gpu_sort_offset,
const uint32_t* gpu_sort_lens,
const size_t grad_value_size) = 0;
virtual std::string ParseToString(const float* v, int param_size) = 0;
};
......@@ -691,6 +746,12 @@ class AccessorWrapper : public VirtualAccessor {
return gpu_accessor_.common_push_value.Size(mf_dim);
}
virtual size_t GetPullValueSize(int& mf_dim) {
return gpu_accessor_.common_pull_value.Size(mf_dim);
}
GPUAccessor* AccessorPtr() { return &gpu_accessor_; }
virtual void BuildFill(void* gpu_val,
void* cpu_val,
paddle::distributed::ValueAccessor* cpu_table_accessor,
......@@ -727,6 +788,30 @@ class AccessorWrapper : public VirtualAccessor {
feature_value_size);
}
virtual void CopyForPull(const paddle::platform::Place& place,
const uint64_t* total_keys,
float** gpu_values,
const float* total_values_gpu,
const int64_t* slot_lens,
const int* key2slot,
const int hidden_size,
const int64_t total_length,
const int* slot_dims,
const uint32_t* gpu_restore_idx,
int pull_value_size) {
CopyForPullDedupImpl(place,
total_keys,
gpu_values,
total_values_gpu,
slot_lens,
key2slot,
hidden_size,
total_length,
slot_dims,
gpu_restore_idx,
pull_value_size);
}
virtual void CopyForPush(const paddle::platform::Place& place,
const std::vector<const float*>& grad_values,
float* total_grad_values_gpu,
......@@ -747,6 +832,70 @@ class AccessorWrapper : public VirtualAccessor {
slot_mf_dim_vector);
}
virtual void CopyForPush(const paddle::platform::Place& place,
const uint64_t* total_keys,
float** grad_values,
float* total_grad_values_gpu,
const int* slots,
const int64_t* slot_lens,
const int hidden_size,
const int64_t total_length,
const int64_t dedup_length,
const int batch_size,
const int* slot_dims,
const int* key2slot,
const uint32_t* d_restore_idx,
const size_t grad_value_size) {
CopyForPushDedupImpl(place,
total_keys,
grad_values,
total_grad_values_gpu,
slots,
slot_lens,
hidden_size,
total_length,
dedup_length,
batch_size,
slot_dims,
key2slot,
d_restore_idx,
grad_value_size);
}
virtual void CopyForPush(const paddle::platform::Place& place,
const uint64_t* total_keys,
float** grad_values,
float* total_grad_values_gpu,
const int* slots,
const int64_t* slot_lens,
const int hidden_size,
const int64_t total_length,
const int64_t dedup_length,
const int batch_size,
const int* slot_dims,
const int* key2slot,
const uint32_t* gpu_sort_idx,
const uint32_t* gpu_sort_offset,
const uint32_t* gpu_sort_lens,
const size_t grad_value_size) {
CopyForPushDedupImpl(place,
total_keys,
grad_values,
total_grad_values_gpu,
slots,
slot_lens,
hidden_size,
total_length,
dedup_length,
batch_size,
slot_dims,
key2slot,
gpu_sort_idx,
gpu_sort_offset,
gpu_sort_lens,
grad_value_size);
}
void CopyForPullImpl(const paddle::platform::Place& place,
uint64_t** gpu_keys,
const std::vector<float*>& values,
......@@ -768,6 +917,49 @@ class AccessorWrapper : public VirtualAccessor {
std::vector<int>& slot_vector,
std::vector<int>& slot_mf_dim_vector);
void CopyForPullDedupImpl(const paddle::platform::Place& place,
const uint64_t* total_keys,
float** gpu_values,
const float* total_values_gpu,
const int64_t* slot_lens,
const int* key2slot,
const int hidden_size,
const int64_t total_length,
const int* slot_dims,
const uint32_t* gpu_restore_idx,
int pull_value_size);
void CopyForPushDedupImpl(const paddle::platform::Place& place,
const uint64_t* total_keys,
float** grad_values,
float* total_grad_values_gpu,
const int* slots,
const int64_t* slot_lens,
const int hidden_size,
const int64_t total_length,
const int64_t dedup_length,
const int batch_size,
const int* slot_dims,
const int* key2slot,
const uint32_t* d_restore_idx,
const size_t grad_value_size);
void CopyForPushDedupImpl(const paddle::platform::Place& place,
const uint64_t* total_keys,
float** grad_values,
float* total_grad_values_gpu,
const int* slots,
const int64_t* slot_lens,
const int hidden_size,
const int64_t total_length,
const int64_t dedup_length,
const int batch_size,
const int* slot_dims,
const int* key2slot,
const uint32_t* gpu_sort_idx,
const uint32_t* gpu_sort_offset,
const uint32_t* gpu_sort_lens,
const size_t grad_value_size);
virtual std::string ParseToString(const float* v, int param_size) {
return gpu_accessor_.ParseToString(v, param_size);
}
......@@ -775,10 +967,10 @@ class AccessorWrapper : public VirtualAccessor {
GPUAccessor gpu_accessor_;
};
class GlobalAccessorTransfor {
class GlobalAccessorFactory {
public:
static GlobalAccessorTransfor& GetInstance() {
static GlobalAccessorTransfor ins;
static GlobalAccessorFactory& GetInstance() {
static GlobalAccessorFactory ins;
return ins;
}
void Init(std::string accessor_type) {
......@@ -788,7 +980,7 @@ class GlobalAccessorTransfor {
if (accessor_type == "CtrDymfAccessor") {
accessor_wrapper_ptr_ = new AccessorWrapper<CommonFeatureValueAccessor>();
} else {
VLOG(0) << "GlobalAccessorTransfor Init not support accessor_type:"
VLOG(0) << "GlobalAccessorFactory Init not support accessor_type:"
<< accessor_type;
accessor_wrapper_ptr_ = new AccessorWrapper<CommonFeatureValueAccessor>();
}
......
......@@ -21,56 +21,75 @@
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/phi/core/enforce.h"
DECLARE_bool(gpugraph_load_node_list_into_hbm);
namespace paddle {
namespace framework {
struct GpuPsGraphNode {
int64_t node_id;
int64_t neighbor_size, neighbor_offset;
struct GpuPsNodeInfo {
uint32_t neighbor_size, neighbor_offset;
GpuPsNodeInfo() : neighbor_size(0), neighbor_offset(0) {}
  // this node's neighbors are stored in [neighbor_offset, neighbor_offset +
  // neighbor_size) of uint64_t *neighbor_list;
};
struct GpuPsCommGraph {
int64_t *neighbor_list;
GpuPsGraphNode *node_list;
int64_t neighbor_size, node_size;
// the size of neighbor array and graph_node_list array
uint64_t *node_list;
  // when FLAGS_gpugraph_load_node_list_into_hbm is true, it resides on both
  // the host and the device side; otherwise it resides only on the host side
int64_t node_size; // the size of node_list
GpuPsNodeInfo *node_info_list; // only locate on host side
uint64_t *neighbor_list; // locate on both side
int64_t neighbor_size; // the size of neighbor_list
GpuPsCommGraph()
: neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {}
GpuPsCommGraph(int64_t *neighbor_list_,
GpuPsGraphNode *node_list_,
int64_t neighbor_size_,
int64_t node_size_)
: neighbor_list(neighbor_list_),
node_list(node_list_),
neighbor_size(neighbor_size_),
node_size(node_size_) {}
void init_on_cpu(int64_t neighbor_size, int64_t node_size) {
this->neighbor_size = neighbor_size;
this->node_size = node_size;
this->neighbor_list = new int64_t[neighbor_size];
this->node_list = new paddle::framework::GpuPsGraphNode[node_size];
: node_list(nullptr),
node_size(0),
node_info_list(nullptr),
neighbor_list(nullptr),
neighbor_size(0) {}
GpuPsCommGraph(uint64_t *node_list_,
int64_t node_size_,
GpuPsNodeInfo *node_info_list_,
uint64_t *neighbor_list_,
int64_t neighbor_size_)
: node_list(node_list_),
node_size(node_size_),
node_info_list(node_info_list_),
neighbor_list(neighbor_list_),
neighbor_size(neighbor_size_) {}
void init_on_cpu(int64_t neighbor_size_, int64_t node_size_) {
if (node_size_ > 0) {
this->node_size = node_size_;
this->node_list = new uint64_t[node_size_];
this->node_info_list = new paddle::framework::GpuPsNodeInfo[node_size_];
}
if (neighbor_size_) {
this->neighbor_size = neighbor_size_;
this->neighbor_list = new uint64_t[neighbor_size_];
}
}
void release_on_cpu() {
delete[] neighbor_list;
delete[] node_list;
#define DEL_PTR_ARRAY(p) \
if (p != nullptr) { \
delete[] p; \
p = nullptr; \
}
DEL_PTR_ARRAY(node_list);
DEL_PTR_ARRAY(neighbor_list);
DEL_PTR_ARRAY(node_info_list);
node_size = 0;
neighbor_size = 0;
}
void display_on_cpu() {
void display_on_cpu() const {
VLOG(0) << "neighbor_size = " << neighbor_size;
VLOG(0) << "node_size = " << node_size;
for (size_t i = 0; i < neighbor_size; i++) {
for (int64_t i = 0; i < neighbor_size; i++) {
VLOG(0) << "neighbor " << i << " " << neighbor_list[i];
}
for (size_t i = 0; i < node_size; i++) {
VLOG(0) << "node i " << node_list[i].node_id
<< " neighbor_size = " << node_list[i].neighbor_size;
std::string str;
int offset = node_list[i].neighbor_offset;
for (size_t j = 0; j < node_list[i].neighbor_size; j++) {
if (j > 0) str += ",";
str += std::to_string(neighbor_list[j + offset]);
}
VLOG(0) << str;
for (int64_t i = 0; i < node_size; i++) {
auto id = node_list[i];
auto val = node_info_list[i];
VLOG(0) << "node id " << id << "," << val.neighbor_offset << ":"
<< val.neighbor_size;
}
}
};
......@@ -110,37 +129,33 @@ node 9:[14,14]
node 17:[15,15]
...
by the above information,
we generate a node_list:GpuPsGraphNode *graph_node_list in GpuPsCommGraph
of size 9,
where node_list[i].id = u_id[i]
then we have:
node_list[0]-> node_id:0, neighbor_size:2, neighbor_offset:0
node_list[1]-> node_id:5, neighbor_size:2, neighbor_offset:2
node_list[2]-> node_id:1, neighbor_size:1, neighbor_offset:4
node_list[3]-> node_id:2, neighbor_size:1, neighbor_offset:5
node_list[4]-> node_id:7, neighbor_size:3, neighbor_offset:6
node_list[5]-> node_id:3, neighbor_size:4, neighbor_offset:9
node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13
node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14
node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15
we generate a node_list and node_info_list in GpuPsCommGraph,
node_list: [0,5,1,2,7,3,8,9,17]
node_info_list: [(2,0),(2,2),(1,4),(1,5),(3,6),(4,9),(1,13),(1,14),(1,15)]
 Here, we lay out the data in this format so that it adapts better to the GPU
 and avoids converting it again.
*/
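/*
 A host-side sketch (illustrative only) that builds the 9-node example above:

   GpuPsCommGraph g;
   g.init_on_cpu(16, 9);  // 16 neighbor entries, 9 nodes
   uint64_t ids[9] = {0, 5, 1, 2, 7, 3, 8, 9, 17};
   uint32_t sizes[9] = {2, 2, 1, 1, 3, 4, 1, 1, 1};
   uint32_t offset = 0;
   for (int i = 0; i < 9; i++) {
     g.node_list[i] = ids[i];
     g.node_info_list[i].neighbor_size = sizes[i];
     g.node_info_list[i].neighbor_offset = offset;
     offset += sizes[i];
   }
   // g.neighbor_list[0..15] then holds the flattened adjacency lists in the
   // same order, e.g. the neighbors of node 0 occupy slots [0, 2).
   g.display_on_cpu();
   g.release_on_cpu();
*/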
struct NeighborSampleQuery {
int gpu_id;
int64_t *key;
int sample_size;
int table_idx;
uint64_t *src_nodes;
int len;
void initialize(int gpu_id, int64_t key, int sample_size, int len) {
int sample_size;
void initialize(
int gpu_id, int table_idx, uint64_t src_nodes, int sample_size, int len) {
this->table_idx = table_idx;
this->gpu_id = gpu_id;
this->key = (int64_t *)key;
this->src_nodes = (uint64_t *)src_nodes;
this->sample_size = sample_size;
this->len = len;
}
void display() {
int64_t *sample_keys = new int64_t[len];
uint64_t *sample_keys = new uint64_t[len];
VLOG(0) << "device_id " << gpu_id << " sample_size = " << sample_size;
VLOG(0) << "there are " << len << " keys ";
VLOG(0) << "there are " << len << " keys to sample for graph " << table_idx;
std::string key_str;
cudaMemcpy(sample_keys, key, len * sizeof(int64_t), cudaMemcpyDeviceToHost);
cudaMemcpy(
sample_keys, src_nodes, len * sizeof(uint64_t), cudaMemcpyDeviceToHost);
for (int i = 0; i < len; i++) {
if (key_str.size() > 0) key_str += ";";
......@@ -151,14 +166,14 @@ struct NeighborSampleQuery {
}
};
struct NeighborSampleResult {
int64_t *val;
int64_t *actual_val;
uint64_t *val;
uint64_t *actual_val;
int *actual_sample_size, sample_size, key_size;
int total_sample_size;
std::shared_ptr<memory::Allocation> val_mem, actual_sample_size_mem;
std::shared_ptr<memory::Allocation> actual_val_mem;
int64_t *get_val() { return val; }
int64_t get_actual_val() { return (int64_t)actual_val; }
uint64_t *get_val() { return val; }
uint64_t get_actual_val() { return (uint64_t)actual_val; }
int *get_actual_sample_size() { return actual_sample_size; }
int get_sample_size() { return sample_size; }
int get_key_size() { return key_size; }
......@@ -170,8 +185,8 @@ struct NeighborSampleResult {
platform::CUDADeviceGuard guard(dev_id);
platform::CUDAPlace place = platform::CUDAPlace(dev_id);
val_mem =
memory::AllocShared(place, _sample_size * _key_size * sizeof(int64_t));
val = (int64_t *)val_mem->ptr();
memory::AllocShared(place, _sample_size * _key_size * sizeof(uint64_t));
val = (uint64_t *)val_mem->ptr();
actual_sample_size_mem =
memory::AllocShared(place, _key_size * sizeof(int));
actual_sample_size = (int *)actual_sample_size_mem->ptr();
......@@ -217,13 +232,15 @@ struct NeighborSampleResult {
delete[] ac_size;
VLOG(0) << " ------------------";
}
std::vector<int64_t> get_sampled_graph(NeighborSampleQuery q) {
std::vector<int64_t> graph;
std::vector<uint64_t> get_sampled_graph(NeighborSampleQuery q) {
std::vector<uint64_t> graph;
int64_t *sample_keys = new int64_t[q.len];
std::string key_str;
cudaMemcpy(
sample_keys, q.key, q.len * sizeof(int64_t), cudaMemcpyDeviceToHost);
int64_t *res = new int64_t[sample_size * key_size];
cudaMemcpy(sample_keys,
q.src_nodes,
q.len * sizeof(uint64_t),
cudaMemcpyDeviceToHost);
uint64_t *res = new uint64_t[sample_size * key_size];
cudaMemcpy(res,
val,
sample_size * key_size * sizeof(int64_t),
......@@ -263,25 +280,25 @@ struct NeighborSampleResult {
};
struct NodeQueryResult {
int64_t *val;
uint64_t *val;
int actual_sample_size;
int64_t get_val() { return (int64_t)val; }
uint64_t get_val() { return (uint64_t)val; }
int get_len() { return actual_sample_size; }
std::shared_ptr<memory::Allocation> val_mem;
void initialize(int query_size, int dev_id) {
platform::CUDADeviceGuard guard(dev_id);
platform::CUDAPlace place = platform::CUDAPlace(dev_id);
val_mem = memory::AllocShared(place, query_size * sizeof(int64_t));
val = (int64_t *)val_mem->ptr();
// cudaMalloc((void **)&val, query_size * sizeof(int64_t));
val_mem = memory::AllocShared(place, query_size * sizeof(uint64_t));
val = (uint64_t *)val_mem->ptr();
actual_sample_size = 0;
}
void display() {
VLOG(0) << "in node query result display ------------------";
int64_t *res = new int64_t[actual_sample_size];
cudaMemcpy(
res, val, actual_sample_size * sizeof(int64_t), cudaMemcpyDeviceToHost);
uint64_t *res = new uint64_t[actual_sample_size];
cudaMemcpy(res,
val,
actual_sample_size * sizeof(uint64_t),
cudaMemcpyDeviceToHost);
VLOG(0) << "actual_sample_size =" << actual_sample_size;
std::string str;
......@@ -298,7 +315,91 @@ struct NodeQueryResult {
actual_sample_size = 0;
};
~NodeQueryResult() {}
}; // end of struct NodeQueryResult
struct GpuPsFeaInfo {
uint32_t feature_size, feature_offset;
  // this node's features are stored in [feature_offset, feature_offset +
  // feature_size) of uint64_t *feature_list;
};
} // namespace framework
}; // namespace paddle
struct GpuPsCommGraphFea {
uint64_t *node_list; // only locate on host side, the list of node id
uint64_t *feature_list; // locate on both side
uint8_t *slot_id_list; // locate on both side
GpuPsFeaInfo
*fea_info_list; // only locate on host side, the list of fea_info
uint64_t feature_size, node_size;
// the size of feature array and graph_node_list array
GpuPsCommGraphFea()
: node_list(NULL),
feature_list(NULL),
slot_id_list(NULL),
fea_info_list(NULL),
feature_size(0),
node_size(0) {}
GpuPsCommGraphFea(uint64_t *node_list_,
uint64_t *feature_list_,
uint8_t *slot_id_list_,
GpuPsFeaInfo *fea_info_list_,
uint64_t feature_size_,
uint64_t node_size_)
: node_list(node_list_),
feature_list(feature_list_),
slot_id_list(slot_id_list_),
fea_info_list(fea_info_list_),
feature_size(feature_size_),
node_size(node_size_) {}
void init_on_cpu(uint64_t feature_size,
uint64_t node_size,
uint32_t slot_num) {
PADDLE_ENFORCE_LE(
slot_num,
255,
platform::errors::InvalidArgument(
"The number of slot_num should not be greater than 255 "
", but the slot_num is %d ",
slot_num));
this->feature_size = feature_size;
this->node_size = node_size;
this->node_list = new uint64_t[node_size];
this->feature_list = new uint64_t[feature_size];
this->slot_id_list = new uint8_t[feature_size];
this->fea_info_list = new GpuPsFeaInfo[node_size];
}
void release_on_cpu() {
#define DEL_PTR_ARRAY(p) \
if (p != nullptr) { \
delete[] p; \
p = nullptr; \
}
DEL_PTR_ARRAY(node_list);
DEL_PTR_ARRAY(feature_list);
DEL_PTR_ARRAY(slot_id_list);
DEL_PTR_ARRAY(fea_info_list);
}
void display_on_cpu() const {
VLOG(1) << "feature_size = " << feature_size;
VLOG(1) << "node_size = " << node_size;
for (uint64_t i = 0; i < feature_size; i++) {
VLOG(1) << "feature_list[" << i << "] = " << feature_list[i];
}
for (uint64_t i = 0; i < node_size; i++) {
VLOG(1) << "node_id[" << node_list[i]
<< "] feature_size = " << fea_info_list[i].feature_size;
std::string str;
uint32_t offset = fea_info_list[i].feature_offset;
for (uint64_t j = 0; j < fea_info_list[i].feature_size; j++) {
if (j > 0) str += ",";
str += std::to_string(slot_id_list[j + offset]);
str += ":";
str += std::to_string(feature_list[j + offset]);
}
VLOG(1) << str;
}
}
}; // end of struct GpuPsCommGraphFea
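/*
  A host-side sketch, for illustration only, of how the feature layout described
  above can be filled: every node's (slot_id, feature) pairs are appended to
  slot_id_list / feature_list at the same positions, and fea_info_list records,
  per node, where its slice starts and how long it is. The local FeaInfo /
  NodeFeatures structs and build_feature_layout are hypothetical mirrors of the
  fields in GpuPsFeaInfo and GpuPsCommGraphFea, not code from the sources.
*/
#include <cstdint>
#include <utility>
#include <vector>

struct FeaInfo {
  uint32_t feature_size = 0;    // number of features of this node
  uint32_t feature_offset = 0;  // where they start in feature_list
};

struct NodeFeatures {
  uint64_t node_id;
  std::vector<std::pair<uint8_t, uint64_t>> slot_and_feature;  // sorted by slot
};

void build_feature_layout(const std::vector<NodeFeatures>& nodes,
                          std::vector<uint64_t>* node_list,
                          std::vector<uint64_t>* feature_list,
                          std::vector<uint8_t>* slot_id_list,
                          std::vector<FeaInfo>* fea_info_list) {
  uint32_t offset = 0;
  for (const auto& node : nodes) {
    node_list->push_back(node.node_id);
    FeaInfo info;
    info.feature_offset = offset;
    info.feature_size = static_cast<uint32_t>(node.slot_and_feature.size());
    for (const auto& sf : node.slot_and_feature) {
      slot_id_list->push_back(sf.first);   // slot id, must fit in uint8_t (<= 255)
      feature_list->push_back(sf.second);  // feature value for that slot
    }
    offset += info.feature_size;
    fea_info_list->push_back(info);
  }
}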
} // end of namespace framework
} // end of namespace paddle
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
#define CUDA_CHECK(cmd) \
do { \
cudaError_t e = cmd; \
CHECK(e == cudaSuccess) << "Cuda failure " << __FILE__ << ":" << __LINE__ \
<< " " << cudaGetErrorString(e) << std::endl; \
} while (0)
class CudaDeviceRestorer {
public:
CudaDeviceRestorer() { cudaGetDevice(&dev_); }
~CudaDeviceRestorer() { cudaSetDevice(dev_); }
private:
int dev_;
};
inline void debug_gpu_memory_info(int gpu_id, const char* desc) {
CudaDeviceRestorer r;
size_t avail{0};
size_t total{0};
cudaSetDevice(gpu_id);
auto err = cudaMemGetInfo(&avail, &total);
PADDLE_ENFORCE_EQ(
err,
cudaSuccess,
platform::errors::InvalidArgument("cudaMemGetInfo failed!"));
VLOG(0) << "updatex gpu memory on device " << gpu_id << ", "
<< "avail=" << avail / 1024.0 / 1024.0 / 1024.0 << "g, "
<< "total=" << total / 1024.0 / 1024.0 / 1024.0 << "g, "
<< "use_rate=" << (total - avail) / double(total) << "%, "
<< "desc=" << desc;
}
inline void debug_gpu_memory_info(const char* desc) {
CudaDeviceRestorer r;
int device_num = 0;
auto err = cudaGetDeviceCount(&device_num);
PADDLE_ENFORCE_EQ(
err,
cudaSuccess,
platform::errors::InvalidArgument("cudaGetDeviceCount failed!"));
size_t avail{0};
size_t total{0};
for (int i = 0; i < device_num; ++i) {
cudaSetDevice(i);
auto err = cudaMemGetInfo(&avail, &total);
PADDLE_ENFORCE_EQ(
err,
cudaSuccess,
platform::errors::InvalidArgument("cudaMemGetInfo failed!"));
VLOG(0) << "update gpu memory on device " << i << ", "
<< "avail=" << avail / 1024.0 / 1024.0 / 1024.0 << "g, "
<< "total=" << total / 1024.0 / 1024.0 / 1024.0 << "g, "
<< "use_rate=" << (total - avail) / double(total) << "%, "
<< "desc=" << desc;
}
}
}; // namespace framework
}; // namespace paddle
......@@ -23,23 +23,48 @@
#include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_HETERPS
DECLARE_double(gpugraph_hbm_table_load_factor);
namespace paddle {
namespace framework {
enum GraphTableType { EDGE_TABLE, FEATURE_TABLE };
class GpuPsGraphTable
: public HeterComm<uint64_t, int64_t, int, CommonFeatureValueAccessor> {
: public HeterComm<uint64_t, uint64_t, int, CommonFeatureValueAccessor> {
public:
GpuPsGraphTable(std::shared_ptr<HeterPsResource> resource, int topo_aware)
: HeterComm<uint64_t, int64_t, int, CommonFeatureValueAccessor>(
int get_table_offset(int gpu_id, GraphTableType type, int idx) const {
int type_id = type;
return gpu_id * (graph_table_num_ + feature_table_num_) +
type_id * graph_table_num_ + idx;
}
GpuPsGraphTable(std::shared_ptr<HeterPsResource> resource,
int topo_aware,
int graph_table_num)
: HeterComm<uint64_t, uint64_t, int, CommonFeatureValueAccessor>(
1, resource) {
load_factor_ = 0.25;
load_factor_ = FLAGS_gpugraph_hbm_table_load_factor;
VLOG(0) << "load_factor = " << load_factor_;
rw_lock.reset(new pthread_rwlock_t());
this->graph_table_num_ = graph_table_num;
this->feature_table_num_ = 1;
gpu_num = resource_->total_device();
memset(global_device_map, -1, sizeof(global_device_map));
for (auto &table : tables_) {
delete table;
table = NULL;
}
int feature_table_num = 1;
tables_ = std::vector<Table *>(
gpu_num * (graph_table_num + feature_table_num), NULL);
for (int i = 0; i < gpu_num; i++) {
gpu_graph_list.push_back(GpuPsCommGraph());
global_device_map[resource_->dev_id(i)] = i;
sample_status.push_back(NULL);
tables_.push_back(NULL);
for (int j = 0; j < graph_table_num; j++) {
gpu_graph_list_.push_back(GpuPsCommGraph());
}
for (int j = 0; j < feature_table_num; j++) {
gpu_graph_fea_list_.push_back(GpuPsCommGraphFea());
}
}
cpu_table_status = -1;
if (topo_aware) {
......@@ -88,46 +113,56 @@ class GpuPsGraphTable
}
}
}
~GpuPsGraphTable() {
// if (cpu_table_status != -1) {
// end_graph_sampling();
// }
}
void build_graph_on_single_gpu(GpuPsCommGraph &g, int gpu_id);
void clear_graph_info(int gpu_id);
void build_graph_from_cpu(std::vector<GpuPsCommGraph> &cpu_node_list);
~GpuPsGraphTable() {}
void build_graph_on_single_gpu(const GpuPsCommGraph &g, int gpu_id, int idx);
void build_graph_fea_on_single_gpu(const GpuPsCommGraphFea &g, int gpu_id);
void clear_graph_info(int gpu_id, int index);
void clear_graph_info(int index);
void clear_feature_info(int gpu_id, int index);
void clear_feature_info(int index);
void build_graph_from_cpu(const std::vector<GpuPsCommGraph> &cpu_node_list,
int idx);
void build_graph_fea_from_cpu(
const std::vector<GpuPsCommGraphFea> &cpu_node_list, int idx);
NodeQueryResult graph_node_sample(int gpu_id, int sample_size);
NeighborSampleResult graph_neighbor_sample_v3(NeighborSampleQuery q,
bool cpu_switch);
NeighborSampleResult graph_neighbor_sample(int gpu_id,
int64_t *key,
uint64_t *key,
int sample_size,
int len);
NeighborSampleResult graph_neighbor_sample_v2(int gpu_id,
int64_t *key,
int idx,
uint64_t *key,
int sample_size,
int len,
bool cpu_query_switch);
void init_sample_status();
void free_sample_status();
NodeQueryResult query_node_list(int gpu_id, int start, int query_size);
void clear_graph_info();
int get_feature_of_nodes(
int gpu_id, uint64_t *d_walk, uint64_t *d_offset, int size, int slot_num);
NodeQueryResult query_node_list(int gpu_id,
int idx,
int start,
int query_size);
void display_sample_res(void *key, void *val, int len, int sample_len);
void move_neighbor_sample_result_to_source_gpu(int gpu_id,
int gpu_num,
int sample_size,
int *h_left,
int *h_right,
int64_t *src_sample_res,
int *actual_sample_size);
void move_result_to_source_gpu(int gpu_id,
int gpu_num,
int sample_size,
int *h_left,
int *h_right,
uint64_t *src_sample_res,
int *actual_sample_size);
int init_cpu_table(const paddle::distributed::GraphParameter &graph);
int gpu_num;
std::vector<GpuPsCommGraph> gpu_graph_list;
int graph_table_num_, feature_table_num_;
std::vector<GpuPsCommGraph> gpu_graph_list_;
std::vector<GpuPsCommGraphFea> gpu_graph_fea_list_;
int global_device_map[32];
std::vector<int *> sample_status;
const int parallel_sample_size = 1;
const int dim_y = 256;
std::shared_ptr<paddle::distributed::GraphTable> cpu_graph_table;
std::shared_ptr<paddle::distributed::GraphTable> cpu_graph_table_;
std::shared_ptr<pthread_rwlock_t> rw_lock;
mutable std::mutex mutex_;
std::condition_variable cv_;
......
......@@ -19,6 +19,7 @@
#include <functional>
#pragma once
#ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h"
#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h"
namespace paddle {
namespace framework {
......@@ -33,9 +34,9 @@ sample_result is to save the neighbor sampling result, its size is len *
sample_size;
*/
__global__ void get_cpu_id_index(int64_t* key,
__global__ void get_cpu_id_index(uint64_t* key,
int* actual_sample_size,
int64_t* cpu_key,
uint64_t* cpu_key,
int* sum,
int* index,
int len) {
......@@ -50,13 +51,13 @@ __global__ void get_cpu_id_index(int64_t* key,
}
__global__ void get_actual_gpu_ac(int* gpu_ac, int number_on_cpu) {
CUDA_KERNEL_LOOP(i, number_on_cpu) { gpu_ac[i] /= sizeof(int64_t); }
CUDA_KERNEL_LOOP(i, number_on_cpu) { gpu_ac[i] /= sizeof(uint64_t); }
}
template <int WARP_SIZE, int BLOCK_WARPS, int TILE_SIZE>
__global__ void copy_buffer_ac_to_final_place(int64_t* gpu_buffer,
__global__ void copy_buffer_ac_to_final_place(uint64_t* gpu_buffer,
int* gpu_ac,
int64_t* val,
uint64_t* val,
int* actual_sample_size,
int* index,
int* cumsum_gpu_ac,
......@@ -77,14 +78,51 @@ __global__ void copy_buffer_ac_to_final_place(int64_t* gpu_buffer,
}
}
__global__ void get_features_kernel(GpuPsCommGraphFea graph,
GpuPsFeaInfo* fea_info_array,
int* actual_size,
uint64_t* feature,
int slot_num,
int n) {
int idx = blockIdx.x * blockDim.y + threadIdx.y;
if (idx < n) {
int feature_size = fea_info_array[idx].feature_size;
int offset = idx * slot_num;
if (feature_size == 0) {
for (int k = 0; k < slot_num; ++k) {
feature[offset + k] = 0;
}
actual_size[idx] = slot_num;
return;
}
uint64_t* feature_start =
&(graph.feature_list[fea_info_array[idx].feature_offset]);
uint8_t* slot_id_start =
&(graph.slot_id_list[fea_info_array[idx].feature_offset]);
int m = 0;
for (int k = 0; k < slot_num; ++k) {
if (m >= fea_info_array[idx].feature_size || k < slot_id_start[m]) {
feature[offset + k] = 0;
} else if (k == slot_id_start[m]) {
feature[offset + k] = feature_start[m];
++m;
} else {
assert(0);
}
}
actual_size[idx] = slot_num;
}
}
template <int WARP_SIZE, int BLOCK_WARPS, int TILE_SIZE>
__global__ void neighbor_sample_example_v2(GpuPsCommGraph graph,
int64_t* node_index,
int* actual_size,
int64_t* res,
int sample_len,
int n,
int default_value) {
__global__ void neighbor_sample_kernel(GpuPsCommGraph graph,
GpuPsNodeInfo* node_info_list,
int* actual_size,
uint64_t* res,
int sample_len,
int n,
int default_value) {
assert(blockDim.x == WARP_SIZE);
assert(blockDim.y == BLOCK_WARPS);
......@@ -92,17 +130,16 @@ __global__ void neighbor_sample_example_v2(GpuPsCommGraph graph,
const int last_idx = min(static_cast<int>(blockIdx.x + 1) * TILE_SIZE, n);
curandState rng;
curand_init(blockIdx.x, threadIdx.y * WARP_SIZE + threadIdx.x, 0, &rng);
while (i < last_idx) {
if (node_index[i] == -1) {
if (node_info_list[i].neighbor_size == 0) {
actual_size[i] = default_value;
i += BLOCK_WARPS;
continue;
}
int neighbor_len = (int)graph.node_list[node_index[i]].neighbor_size;
int64_t data_offset = graph.node_list[node_index[i]].neighbor_offset;
int neighbor_len = (int)node_info_list[i].neighbor_size;
uint32_t data_offset = node_info_list[i].neighbor_offset;
int offset = i * sample_len;
int64_t* data = graph.neighbor_list;
uint64_t* data = graph.neighbor_list;
if (neighbor_len <= sample_len) {
for (int j = threadIdx.x; j < neighbor_len; j += WARP_SIZE) {
res[offset + j] = data[data_offset + j];
......@@ -131,89 +168,10 @@ __global__ void neighbor_sample_example_v2(GpuPsCommGraph graph,
}
}
__global__ void neighbor_sample_example(GpuPsCommGraph graph,
int64_t* node_index,
int* actual_size,
int64_t* res,
int sample_len,
int* sample_status,
int n,
int from) {
int id = blockIdx.x * blockDim.y + threadIdx.y;
if (id < n) {
if (node_index[id] == -1) {
actual_size[id] = 0;
return;
}
curandState rng;
curand_init(blockIdx.x, threadIdx.x, threadIdx.y, &rng);
int64_t index = threadIdx.x;
int64_t offset = id * sample_len;
int64_t* data = graph.neighbor_list;
int64_t data_offset = graph.node_list[node_index[id]].neighbor_offset;
int64_t neighbor_len = graph.node_list[node_index[id]].neighbor_size;
int ac_len;
if (sample_len > neighbor_len)
ac_len = neighbor_len;
else {
ac_len = sample_len;
}
if (4 * ac_len >= 3 * neighbor_len) {
if (index == 0) {
res[offset] = curand(&rng) % (neighbor_len - ac_len + 1);
}
__syncwarp();
int start = res[offset];
while (index < ac_len) {
res[offset + index] = data[data_offset + start + index];
index += blockDim.x;
}
actual_size[id] = ac_len;
} else {
while (index < ac_len) {
int num = curand(&rng) % neighbor_len;
int* addr = sample_status + data_offset + num;
int expected = *addr;
if (!(expected & (1 << from))) {
int old = atomicCAS(addr, expected, expected | (1 << from));
if (old == expected) {
res[offset + index] = num;
index += blockDim.x;
}
}
}
__syncwarp();
index = threadIdx.x;
while (index < ac_len) {
int* addr = sample_status + data_offset + res[offset + index];
int expected, old = *addr;
do {
expected = old;
old = atomicCAS(addr, expected, expected & (~(1 << from)));
} while (old != expected);
res[offset + index] = data[data_offset + res[offset + index]];
index += blockDim.x;
}
actual_size[id] = ac_len;
}
}
// const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
// if (i < n) {
// auto node_index = index[i];
// actual_size[i] = graph.node_list[node_index].neighbor_size < sample_size
// ? graph.node_list[node_index].neighbor_size
// : sample_size;
// int offset = graph.node_list[node_index].neighbor_offset;
// for (int j = 0; j < actual_size[i]; j++) {
// sample_result[sample_size * i + j] = graph.neighbor_list[offset + j];
// }
// }
}
int GpuPsGraphTable::init_cpu_table(
const paddle::distributed::GraphParameter& graph) {
cpu_graph_table.reset(new paddle::distributed::GraphTable);
cpu_table_status = cpu_graph_table->Initialize(graph);
cpu_graph_table_.reset(new paddle::distributed::GraphTable);
cpu_table_status = cpu_graph_table_->Initialize(graph);
// if (cpu_table_status != 0) return cpu_table_status;
// std::function<void(std::vector<GpuPsCommGraph>&)> callback =
// [this](std::vector<GpuPsCommGraph>& res) {
......@@ -227,17 +185,6 @@ int GpuPsGraphTable::init_cpu_table(
return cpu_table_status;
}
// int GpuPsGraphTable::load(const std::string& path, const std::string& param)
// {
// int status = cpu_graph_table->load(path, param);
// if (status != 0) {
// return status;
// }
// std::unique_lock<std::mutex> lock(mutex_);
// cpu_graph_table->start_graph_sampling();
// cv_.wait(lock);
// return 0;
// }
/*
comment 1
gpu i triggers a neighbor_sample task,
......@@ -263,36 +210,37 @@ void GpuPsGraphTable::display_sample_res(void* key,
void* val,
int len,
int sample_len) {
char key_buffer[len * sizeof(int64_t)];
char key_buffer[len * sizeof(uint64_t)];
char val_buffer[sample_len * sizeof(int64_t) * len +
(len + len % 2) * sizeof(int) + len * sizeof(int64_t)];
cudaMemcpy(key_buffer, key, sizeof(int64_t) * len, cudaMemcpyDeviceToHost);
(len + len % 2) * sizeof(int) + len * sizeof(uint64_t)];
cudaMemcpy(key_buffer, key, sizeof(uint64_t) * len, cudaMemcpyDeviceToHost);
cudaMemcpy(val_buffer,
val,
sample_len * sizeof(int64_t) * len +
(len + len % 2) * sizeof(int) + len * sizeof(int64_t),
(len + len % 2) * sizeof(int) + len * sizeof(uint64_t),
cudaMemcpyDeviceToHost);
int64_t* sample_val = (int64_t*)(val_buffer + (len + len % 2) * sizeof(int) +
len * sizeof(int64_t));
uint64_t* sample_val =
(uint64_t*)(val_buffer + (len + len % 2) * sizeof(int) +
len * sizeof(int64_t));
for (int i = 0; i < len; i++) {
printf("key %lld\n", *(int64_t*)(key_buffer + i * sizeof(int64_t)));
printf("index %lld\n", *(int64_t*)(val_buffer + i * sizeof(int64_t)));
printf("key %llu\n", *(int64_t*)(key_buffer + i * sizeof(uint64_t)));
printf("index %llu\n", *(int64_t*)(val_buffer + i * sizeof(uint64_t)));
int ac_size = *(int*)(val_buffer + i * sizeof(int) + len * sizeof(int64_t));
printf("sampled %d neigbhors\n", ac_size);
for (int j = 0; j < ac_size; j++) {
printf("%lld ", sample_val[i * sample_len + j]);
printf("%llu ", sample_val[i * sample_len + j]);
}
printf("\n");
}
}
void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu(
int start_index,
int gpu_num,
int sample_size,
int* h_left,
int* h_right,
int64_t* src_sample_res,
int* actual_sample_size) {
void GpuPsGraphTable::move_result_to_source_gpu(int start_index,
int gpu_num,
int sample_size,
int* h_left,
int* h_right,
uint64_t* src_sample_res,
int* actual_sample_size) {
int shard_len[gpu_num];
for (int i = 0; i < gpu_num; i++) {
if (h_left[i] == -1 || h_right[i] == -1) {
......@@ -301,144 +249,44 @@ void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu(
shard_len[i] = h_right[i] - h_left[i] + 1;
int cur_step = (int)path_[start_index][i].nodes_.size() - 1;
for (int j = cur_step; j > 0; j--) {
cudaMemcpyAsync(path_[start_index][i].nodes_[j - 1].val_storage,
path_[start_index][i].nodes_[j].val_storage,
path_[start_index][i].nodes_[j - 1].val_bytes_len,
cudaMemcpyDefault,
path_[start_index][i].nodes_[j - 1].out_stream);
CUDA_CHECK(
cudaMemcpyAsync(path_[start_index][i].nodes_[j - 1].val_storage,
path_[start_index][i].nodes_[j].val_storage,
path_[start_index][i].nodes_[j - 1].val_bytes_len,
cudaMemcpyDefault,
path_[start_index][i].nodes_[j - 1].out_stream));
}
auto& node = path_[start_index][i].nodes_.front();
cudaMemcpyAsync(
CUDA_CHECK(cudaMemcpyAsync(
reinterpret_cast<char*>(src_sample_res + h_left[i] * sample_size),
node.val_storage + sizeof(int64_t) * shard_len[i] +
sizeof(int) * (shard_len[i] + shard_len[i] % 2),
sizeof(int64_t) * shard_len[i] * sample_size,
sizeof(uint64_t) * shard_len[i] * sample_size,
cudaMemcpyDefault,
node.out_stream);
cudaMemcpyAsync(reinterpret_cast<char*>(actual_sample_size + h_left[i]),
node.val_storage + sizeof(int64_t) * shard_len[i],
sizeof(int) * shard_len[i],
cudaMemcpyDefault,
node.out_stream);
node.out_stream));
CUDA_CHECK(
cudaMemcpyAsync(reinterpret_cast<char*>(actual_sample_size + h_left[i]),
node.val_storage + sizeof(int64_t) * shard_len[i],
sizeof(int) * shard_len[i],
cudaMemcpyDefault,
node.out_stream));
}
for (int i = 0; i < gpu_num; ++i) {
if (h_left[i] == -1 || h_right[i] == -1) {
continue;
}
auto& node = path_[start_index][i].nodes_.front();
cudaStreamSynchronize(node.out_stream);
CUDA_CHECK(cudaStreamSynchronize(node.out_stream));
// cudaStreamSynchronize(resource_->remote_stream(i, start_index));
}
/*
std::queue<CopyTask> que;
// auto& node = path_[gpu_id][i].nodes_.front();
// cudaMemcpyAsync(
// reinterpret_cast<char*>(src_sample_res + h_left[i] * sample_size),
// node.val_storage + sizeof(int64_t) * shard_len,
// node.val_bytes_len - sizeof(int64_t) * shard_len, cudaMemcpyDefault,
// node.out_stream);
// cudaMemcpyAsync(reinterpret_cast<char*>(actual_sample_size + h_left[i]),
// node.val_storage + sizeof(int) * shard_len,
// sizeof(int) * shard_len, cudaMemcpyDefault,
// node.out_stream);
int cur_step = path_[start_index][i].nodes_.size() - 1;
auto& node = path_[start_index][i].nodes_[cur_step];
if (cur_step == 0) {
// cudaMemcpyAsync(reinterpret_cast<char*>(src_val + h_left[i]),
// node.val_storage, node.val_bytes_len,
// cudaMemcpyDefault,
// node.out_stream);
// VLOG(0)<<"copy "<<node.gpu_num<<" to "<<start_index;
cudaMemcpyAsync(
reinterpret_cast<char*>(src_sample_res + h_left[i] * sample_size),
node.val_storage + sizeof(int64_t) * shard_len[i],
node.val_bytes_len - sizeof(int64_t) * shard_len[i],
cudaMemcpyDefault,
node.out_stream);
//resource_->remote_stream(i, start_index));
cudaMemcpyAsync(reinterpret_cast<char*>(actual_sample_size + h_left[i]),
node.val_storage + sizeof(int) * shard_len[i],
sizeof(int) * shard_len[i], cudaMemcpyDefault,
node.out_stream);
//resource_->remote_stream(i, start_index));
} else {
CopyTask t(&path_[start_index][i], cur_step - 1);
que.push(t);
// VLOG(0)<<"copy "<<node.gpu_num<<" to
"<<path_[start_index][i].nodes_[cur_step - 1].gpu_num;
cudaMemcpyAsync(path_[start_index][i].nodes_[cur_step - 1].val_storage,
node.val_storage,
path_[start_index][i].nodes_[cur_step - 1].val_bytes_len,
cudaMemcpyDefault,
path_[start_index][i].nodes_[cur_step - 1].out_stream);
//resource_->remote_stream(i, start_index));
}
}
while (!que.empty()) {
CopyTask& cur_task = que.front();
que.pop();
int cur_step = cur_task.step;
if (cur_task.path->nodes_[cur_step].sync) {
cudaStreamSynchronize(cur_task.path->nodes_[cur_step].out_stream);
//cudaStreamSynchronize(resource_->remote_stream(cur_task.path->nodes_.back().gpu_num,
start_index));
}
if (cur_step > 0) {
CopyTask c(cur_task.path, cur_step - 1);
que.push(c);
cudaMemcpyAsync(cur_task.path->nodes_[cur_step - 1].val_storage,
cur_task.path->nodes_[cur_step].val_storage,
cur_task.path->nodes_[cur_step - 1].val_bytes_len,
cudaMemcpyDefault,
cur_task.path->nodes_[cur_step - 1].out_stream);
//resource_->remote_stream(cur_task.path->nodes_.back().gpu_num,
start_index));
} else if (cur_step == 0) {
int end_index = cur_task.path->nodes_.back().gpu_num;
// cudaMemcpyAsync(reinterpret_cast<char*>(src_val + h_left[end_index]),
// cur_task.path->nodes_[cur_step].val_storage,
// cur_task.path->nodes_[cur_step].val_bytes_len,
// cudaMemcpyDefault,
// cur_task.path->nodes_[cur_step].out_stream);
//VLOG(0)<<"copy "<<cur_task.path->nodes_[cur_step].gpu_num<< " to
"<<start_index;
cudaMemcpyAsync(reinterpret_cast<char*>(src_sample_res +
h_left[end_index] * sample_size),
cur_task.path->nodes_[cur_step].val_storage +
sizeof(int64_t) * shard_len[end_index],
cur_task.path->nodes_[cur_step].val_bytes_len -
sizeof(int64_t) * shard_len[end_index],
cudaMemcpyDefault,
cur_task.path->nodes_[cur_step].out_stream);
//resource_->remote_stream(cur_task.path->nodes_.back().gpu_num,
start_index));
cudaMemcpyAsync(
reinterpret_cast<char*>(actual_sample_size + h_left[end_index]),
cur_task.path->nodes_[cur_step].val_storage +
sizeof(int) * shard_len[end_index],
sizeof(int) * shard_len[end_index], cudaMemcpyDefault,
cur_task.path->nodes_[cur_step].out_stream);
//resource_->remote_stream(cur_task.path->nodes_.back().gpu_num,
start_index));
}
}
for (int i = 0; i < gpu_num; ++i) {
if (h_left[i] == -1 || h_right[i] == -1) {
continue;
}
auto& node = path_[start_index][i].nodes_.front();
cudaStreamSynchronize(node.out_stream);
//cudaStreamSynchronize(resource_->remote_stream(i, start_index));
}
*/
}
/*
TODO:
how to optimize it to eliminate the for loop
*/
__global__ void fill_dvalues(int64_t* d_shard_vals,
int64_t* d_vals,
__global__ void fill_dvalues(uint64_t* d_shard_vals,
uint64_t* d_vals,
int* d_shard_actual_sample_size,
int* d_actual_sample_size,
int* idx,
......@@ -453,8 +301,22 @@ __global__ void fill_dvalues(int64_t* d_shard_vals,
}
}
__global__ void fill_actual_vals(int64_t* vals,
int64_t* actual_vals,
__global__ void fill_dvalues(uint64_t* d_shard_vals,
uint64_t* d_vals,
int* d_shard_actual_sample_size,
int* idx,
int sample_size,
int len) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < len) {
for (int j = 0; j < sample_size; j++) {
d_vals[idx[i] * sample_size + j] = d_shard_vals[i * sample_size + j];
}
}
}
__global__ void fill_actual_vals(uint64_t* vals,
uint64_t* actual_vals,
int* actual_sample_size,
int* cumsum_actual_sample_size,
int sample_size,
......@@ -470,40 +332,141 @@ __global__ void fill_actual_vals(int64_t* vals,
__global__ void node_query_example(GpuPsCommGraph graph,
int start,
int size,
int64_t* res) {
uint64_t* res) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < size) {
res[i] = graph.node_list[start + i].node_id;
res[i] = graph.node_list[start + i];
}
}
void GpuPsGraphTable::clear_feature_info(int gpu_id) {
int idx = 0;
if (idx >= feature_table_num_) return;
int offset = get_table_offset(gpu_id, GraphTableType::FEATURE_TABLE, idx);
if (offset < tables_.size()) {
delete tables_[offset];
tables_[offset] = NULL;
}
int graph_fea_idx = gpu_id * feature_table_num_ + idx;
if (graph_fea_idx >= gpu_graph_fea_list_.size()) {
return;
}
auto& graph = gpu_graph_fea_list_[graph_fea_idx];
if (graph.feature_list != NULL) {
cudaFree(graph.feature_list);
graph.feature_list = NULL;
}
if (graph.slot_id_list != NULL) {
cudaFree(graph.slot_id_list);
graph.slot_id_list = NULL;
}
}
void GpuPsGraphTable::clear_graph_info(int gpu_id) {
if (tables_.size() && tables_[gpu_id] != NULL) {
delete tables_[gpu_id];
void GpuPsGraphTable::clear_graph_info(int gpu_id, int idx) {
if (idx >= graph_table_num_) return;
int offset = get_table_offset(gpu_id, GraphTableType::EDGE_TABLE, idx);
if (offset < tables_.size()) {
delete tables_[offset];
tables_[offset] = NULL;
}
auto& graph = gpu_graph_list[gpu_id];
auto& graph = gpu_graph_list_[gpu_id * graph_table_num_ + idx];
if (graph.neighbor_list != NULL) {
cudaFree(graph.neighbor_list);
graph.neighbor_list = nullptr;
}
if (graph.node_list != NULL) {
cudaFree(graph.node_list);
graph.node_list = nullptr;
}
}
void GpuPsGraphTable::clear_graph_info() {
if (tables_.size()) {
for (auto table : tables_) delete table;
void GpuPsGraphTable::clear_graph_info(int idx) {
for (int i = 0; i < gpu_num; i++) clear_graph_info(i, idx);
}
/*
the parameter std::vector<GpuPsCommGraph> cpu_graph_list is generated on the cpu.
it holds the graph that each gpu is going to store.
for the ith GpuPsCommGraph, every node's key satisfies key % gpu_number == i
In this function, memory is allocated on each gpu to save the graphs,
gpu i saves the ith graph from cpu_graph_list
*/
void GpuPsGraphTable::build_graph_fea_on_single_gpu(const GpuPsCommGraphFea& g,
int gpu_id) {
clear_feature_info(gpu_id);
int ntype_id = 0;
platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id));
int offset = gpu_id * feature_table_num_ + ntype_id;
gpu_graph_fea_list_[offset] = GpuPsCommGraphFea();
int table_offset =
get_table_offset(gpu_id, GraphTableType::FEATURE_TABLE, ntype_id);
size_t capacity = std::max((uint64_t)1, g.node_size) / load_factor_;
tables_[table_offset] = new Table(capacity);
if (g.node_size > 0) {
build_ps(gpu_id,
g.node_list,
(uint64_t*)g.fea_info_list,
g.node_size,
1024,
8,
table_offset);
gpu_graph_fea_list_[offset].node_list = NULL;
gpu_graph_fea_list_[offset].node_size = g.node_size;
} else {
build_ps(gpu_id, NULL, NULL, 0, 1024, 8, table_offset);
gpu_graph_fea_list_[offset].node_list = NULL;
gpu_graph_fea_list_[offset].node_size = 0;
}
tables_.clear();
for (auto graph : gpu_graph_list) {
if (graph.neighbor_list != NULL) {
cudaFree(graph.neighbor_list);
}
if (graph.node_list != NULL) {
cudaFree(graph.node_list);
}
if (g.feature_size) {
// TODO
cudaError_t cudaStatus =
cudaMalloc((void**)&gpu_graph_fea_list_[offset].feature_list,
g.feature_size * sizeof(uint64_t));
PADDLE_ENFORCE_EQ(
cudaStatus,
cudaSuccess,
platform::errors::InvalidArgument(
"ailed to allocate memory for graph-feature on gpu "));
VLOG(0) << "sucessfully allocate " << g.feature_size * sizeof(uint64_t)
<< " bytes of memory for graph-feature on gpu "
<< resource_->dev_id(gpu_id);
CUDA_CHECK(cudaMemcpy(gpu_graph_fea_list_[offset].feature_list,
g.feature_list,
g.feature_size * sizeof(uint64_t),
cudaMemcpyHostToDevice));
// TODO
cudaStatus = cudaMalloc((void**)&gpu_graph_fea_list_[offset].slot_id_list,
g.feature_size * sizeof(uint8_t));
PADDLE_ENFORCE_EQ(
cudaStatus,
cudaSuccess,
platform::errors::InvalidArgument(
"ailed to allocate memory for graph-feature on gpu "));
VLOG(0) << "sucessfully allocate " << g.feature_size * sizeof(uint8_t)
<< " bytes of memory for graph-feature on gpu "
<< resource_->dev_id(gpu_id);
cudaMemcpy(gpu_graph_fea_list_[offset].slot_id_list,
g.slot_id_list,
g.feature_size * sizeof(uint8_t),
cudaMemcpyHostToDevice);
gpu_graph_fea_list_[offset].feature_size = g.feature_size;
} else {
gpu_graph_fea_list_[offset].feature_list = NULL;
gpu_graph_fea_list_[offset].slot_id_list = NULL;
gpu_graph_fea_list_[offset].feature_size = 0;
}
gpu_graph_list.clear();
VLOG(0) << "gpu node_feature info card :" << gpu_id << " ,node_size is "
<< gpu_graph_fea_list_[offset].node_size << ", feature_size is "
<< gpu_graph_fea_list_[offset].feature_size;
}
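/*
  A small host-side sketch of the partitioning rule stated in the comments above:
  node keys are bucketed so that bucket i only holds keys with
  key % gpu_number == i, which is what makes cpu_graph_list[i] (or
  cpu_graph_fea_list[i]) suitable for gpu i. shard_keys_by_gpu is an
  illustrative helper, not part of the sources.
*/
#include <cstdint>
#include <vector>

std::vector<std::vector<uint64_t>> shard_keys_by_gpu(
    const std::vector<uint64_t>& keys, int gpu_number) {
  std::vector<std::vector<uint64_t>> buckets(gpu_number);
  for (uint64_t key : keys) {
    // gpu i is responsible for every key with key % gpu_number == i
    buckets[key % gpu_number].push_back(key);
  }
  return buckets;
}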
/*
the parameter std::vector<GpuPsCommGraph> cpu_graph_list is generated on the cpu.
it holds the graph that each gpu is going to store.
......@@ -512,78 +475,131 @@ for the ith GpuPsCommGraph, any the node's key satisfies that key % gpu_number
In this function, memory is allocated on each gpu to save the graphs,
gpu i saves the ith graph from cpu_graph_list
*/
void GpuPsGraphTable::build_graph_on_single_gpu(GpuPsCommGraph& g, int i) {
clear_graph_info(i);
void GpuPsGraphTable::build_graph_on_single_gpu(const GpuPsCommGraph& g,
int i,
int idx) {
clear_graph_info(i, idx);
platform::CUDADeviceGuard guard(resource_->dev_id(i));
// platform::CUDADeviceGuard guard(i);
gpu_graph_list[i] = GpuPsCommGraph();
sample_status[i] = NULL;
tables_[i] = new Table(std::max((int64_t)1, g.node_size) / load_factor_);
int offset = i * graph_table_num_ + idx;
gpu_graph_list_[offset] = GpuPsCommGraph();
int table_offset = get_table_offset(i, GraphTableType::EDGE_TABLE, idx);
size_t capacity = std::max((uint64_t)1, (uint64_t)g.node_size) / load_factor_;
tables_[table_offset] = new Table(capacity);
if (g.node_size > 0) {
std::vector<int64_t> keys;
std::vector<int64_t> offset;
cudaMalloc((void**)&gpu_graph_list[i].node_list,
g.node_size * sizeof(GpuPsGraphNode));
cudaMemcpy(gpu_graph_list[i].node_list,
g.node_list,
g.node_size * sizeof(GpuPsGraphNode),
cudaMemcpyHostToDevice);
for (int64_t j = 0; j < g.node_size; j++) {
keys.push_back(g.node_list[j].node_id);
offset.push_back(j);
if (FLAGS_gpugraph_load_node_list_into_hbm) {
CUDA_CHECK(cudaMalloc((void**)&gpu_graph_list_[offset].node_list,
g.node_size * sizeof(uint64_t)));
CUDA_CHECK(cudaMemcpy(gpu_graph_list_[offset].node_list,
g.node_list,
g.node_size * sizeof(uint64_t),
cudaMemcpyHostToDevice));
}
build_ps(i, (uint64_t*)keys.data(), offset.data(), keys.size(), 1024, 8);
gpu_graph_list[i].node_size = g.node_size;
build_ps(i,
g.node_list,
(uint64_t*)(g.node_info_list),
g.node_size,
1024,
8,
table_offset);
gpu_graph_list_[offset].node_size = g.node_size;
} else {
build_ps(i, NULL, NULL, 0, 1024, 8);
gpu_graph_list[i].node_list = NULL;
gpu_graph_list[i].node_size = 0;
build_ps(i, NULL, NULL, 0, 1024, 8, table_offset);
gpu_graph_list_[offset].node_list = NULL;
gpu_graph_list_[offset].node_size = 0;
}
if (g.neighbor_size) {
cudaError_t cudaStatus =
cudaMalloc((void**)&gpu_graph_list[i].neighbor_list,
g.neighbor_size * sizeof(int64_t));
cudaMalloc((void**)&gpu_graph_list_[offset].neighbor_list,
g.neighbor_size * sizeof(uint64_t));
PADDLE_ENFORCE_EQ(cudaStatus,
cudaSuccess,
platform::errors::InvalidArgument(
"ailed to allocate memory for graph on gpu "));
VLOG(0) << "sucessfully allocate " << g.neighbor_size * sizeof(int64_t)
VLOG(0) << "sucessfully allocate " << g.neighbor_size * sizeof(uint64_t)
<< " bytes of memory for graph-edges on gpu "
<< resource_->dev_id(i);
cudaMemcpy(gpu_graph_list[i].neighbor_list,
g.neighbor_list,
g.neighbor_size * sizeof(int64_t),
cudaMemcpyHostToDevice);
gpu_graph_list[i].neighbor_size = g.neighbor_size;
CUDA_CHECK(cudaMemcpy(gpu_graph_list_[offset].neighbor_list,
g.neighbor_list,
g.neighbor_size * sizeof(uint64_t),
cudaMemcpyHostToDevice));
gpu_graph_list_[offset].neighbor_size = g.neighbor_size;
} else {
gpu_graph_list[i].neighbor_list = NULL;
gpu_graph_list[i].neighbor_size = 0;
gpu_graph_list_[offset].neighbor_list = NULL;
gpu_graph_list_[offset].neighbor_size = 0;
}
VLOG(0) << " gpu node_neighbor info card: " << i << " ,node_size is "
<< gpu_graph_list_[offset].node_size << ", neighbor_size is "
<< gpu_graph_list_[offset].neighbor_size;
}
void GpuPsGraphTable::init_sample_status() {
for (int i = 0; i < gpu_num; i++) {
if (gpu_graph_list[i].neighbor_size) {
platform::CUDADeviceGuard guard(resource_->dev_id(i));
int* addr;
cudaMalloc((void**)&addr, gpu_graph_list[i].neighbor_size * sizeof(int));
cudaMemset(addr, 0, gpu_graph_list[i].neighbor_size * sizeof(int));
sample_status[i] = addr;
void GpuPsGraphTable::build_graph_fea_from_cpu(
const std::vector<GpuPsCommGraphFea>& cpu_graph_fea_list, int ntype_id) {
PADDLE_ENFORCE_EQ(
cpu_graph_fea_list.size(),
resource_->total_device(),
platform::errors::InvalidArgument("the cpu node list size doesn't match "
"the number of gpu on your machine."));
clear_feature_info(ntype_id);
for (int i = 0; i < cpu_graph_fea_list.size(); i++) {
int table_offset =
get_table_offset(i, GraphTableType::FEATURE_TABLE, ntype_id);
int offset = i * feature_table_num_ + ntype_id;
platform::CUDADeviceGuard guard(resource_->dev_id(i));
gpu_graph_fea_list_[offset] = GpuPsCommGraphFea();
tables_[table_offset] = new Table(
std::max((uint64_t)1, (uint64_t)cpu_graph_fea_list[i].node_size) /
load_factor_);
if (cpu_graph_fea_list[i].node_size > 0) {
build_ps(i,
cpu_graph_fea_list[i].node_list,
(uint64_t*)cpu_graph_fea_list[i].fea_info_list,
cpu_graph_fea_list[i].node_size,
1024,
8,
table_offset);
gpu_graph_fea_list_[offset].node_size = cpu_graph_fea_list[i].node_size;
} else {
build_ps(i, NULL, NULL, 0, 1024, 8, table_offset);
gpu_graph_fea_list_[offset].node_list = NULL;
gpu_graph_fea_list_[offset].node_size = 0;
}
}
}
void GpuPsGraphTable::free_sample_status() {
for (int i = 0; i < gpu_num; i++) {
if (sample_status[i] != NULL) {
platform::CUDADeviceGuard guard(resource_->dev_id(i));
cudaFree(sample_status[i]);
if (cpu_graph_fea_list[i].feature_size) {
// TODO
CUDA_CHECK(
cudaMalloc((void**)&gpu_graph_fea_list_[offset].feature_list,
cpu_graph_fea_list[i].feature_size * sizeof(uint64_t)));
CUDA_CHECK(
cudaMemcpy(gpu_graph_fea_list_[offset].feature_list,
cpu_graph_fea_list[i].feature_list,
cpu_graph_fea_list[i].feature_size * sizeof(uint64_t),
cudaMemcpyHostToDevice));
// TODO
CUDA_CHECK(
cudaMalloc((void**)&gpu_graph_fea_list_[offset].slot_id_list,
cpu_graph_fea_list[i].feature_size * sizeof(uint8_t)));
CUDA_CHECK(
cudaMemcpy(gpu_graph_fea_list_[offset].slot_id_list,
cpu_graph_fea_list[i].slot_id_list,
cpu_graph_fea_list[i].feature_size * sizeof(uint8_t),
cudaMemcpyHostToDevice));
gpu_graph_fea_list_[offset].feature_size =
cpu_graph_fea_list[i].feature_size;
} else {
gpu_graph_fea_list_[offset].feature_list = NULL;
gpu_graph_fea_list_[offset].slot_id_list = NULL;
gpu_graph_fea_list_[offset].feature_size = 0;
}
}
cudaDeviceSynchronize();
}
void GpuPsGraphTable::build_graph_from_cpu(
std::vector<GpuPsCommGraph>& cpu_graph_list) {
const std::vector<GpuPsCommGraph>& cpu_graph_list, int idx) {
VLOG(0) << "in build_graph_from_cpu cpu_graph_list size = "
<< cpu_graph_list.size();
PADDLE_ENFORCE_EQ(
......@@ -591,240 +607,77 @@ void GpuPsGraphTable::build_graph_from_cpu(
resource_->total_device(),
platform::errors::InvalidArgument("the cpu node list size doesn't match "
"the number of gpu on your machine."));
clear_graph_info();
clear_graph_info(idx);
for (int i = 0; i < cpu_graph_list.size(); i++) {
int table_offset = get_table_offset(i, GraphTableType::EDGE_TABLE, idx);
int offset = i * graph_table_num_ + idx;
platform::CUDADeviceGuard guard(resource_->dev_id(i));
gpu_graph_list[i] = GpuPsCommGraph();
sample_status[i] = NULL;
tables_[i] = new Table(std::max((int64_t)1, cpu_graph_list[i].node_size) /
load_factor_);
gpu_graph_list_[offset] = GpuPsCommGraph();
tables_[table_offset] =
new Table(std::max((uint64_t)1, (uint64_t)cpu_graph_list[i].node_size) /
load_factor_);
if (cpu_graph_list[i].node_size > 0) {
std::vector<int64_t> keys;
std::vector<int64_t> offset;
cudaMalloc((void**)&gpu_graph_list[i].node_list,
cpu_graph_list[i].node_size * sizeof(GpuPsGraphNode));
cudaMemcpy(gpu_graph_list[i].node_list,
cpu_graph_list[i].node_list,
cpu_graph_list[i].node_size * sizeof(GpuPsGraphNode),
cudaMemcpyHostToDevice);
for (int64_t j = 0; j < cpu_graph_list[i].node_size; j++) {
keys.push_back(cpu_graph_list[i].node_list[j].node_id);
offset.push_back(j);
}
build_ps(
i, (uint64_t*)(keys.data()), offset.data(), keys.size(), 1024, 8);
gpu_graph_list[i].node_size = cpu_graph_list[i].node_size;
CUDA_CHECK(cudaMalloc((void**)&gpu_graph_list_[offset].node_list,
cpu_graph_list[i].node_size * sizeof(uint64_t)));
CUDA_CHECK(cudaMemcpy(gpu_graph_list_[offset].node_list,
cpu_graph_list[i].node_list,
cpu_graph_list[i].node_size * sizeof(uint64_t),
cudaMemcpyHostToDevice));
build_ps(i,
cpu_graph_list[i].node_list,
(uint64_t*)(cpu_graph_list[i].node_info_list),
cpu_graph_list[i].node_size,
1024,
8,
table_offset);
gpu_graph_list_[offset].node_size = cpu_graph_list[i].node_size;
} else {
build_ps(i, NULL, NULL, 0, 1024, 8);
gpu_graph_list[i].node_list = NULL;
gpu_graph_list[i].node_size = 0;
build_ps(i, NULL, NULL, 0, 1024, 8, table_offset);
gpu_graph_list_[offset].node_list = NULL;
gpu_graph_list_[offset].node_size = 0;
}
if (cpu_graph_list[i].neighbor_size) {
cudaMalloc((void**)&gpu_graph_list[i].neighbor_list,
cpu_graph_list[i].neighbor_size * sizeof(int64_t));
cudaMemcpy(gpu_graph_list[i].neighbor_list,
cpu_graph_list[i].neighbor_list,
cpu_graph_list[i].neighbor_size * sizeof(int64_t),
cudaMemcpyHostToDevice);
gpu_graph_list[i].neighbor_size = cpu_graph_list[i].neighbor_size;
CUDA_CHECK(
cudaMalloc((void**)&gpu_graph_list_[offset].neighbor_list,
cpu_graph_list[i].neighbor_size * sizeof(uint64_t)));
CUDA_CHECK(cudaMemcpy(gpu_graph_list_[offset].neighbor_list,
cpu_graph_list[i].neighbor_list,
cpu_graph_list[i].neighbor_size * sizeof(uint64_t),
cudaMemcpyHostToDevice));
gpu_graph_list_[offset].neighbor_size = cpu_graph_list[i].neighbor_size;
} else {
gpu_graph_list[i].neighbor_list = NULL;
gpu_graph_list[i].neighbor_size = 0;
gpu_graph_list_[offset].neighbor_list = NULL;
gpu_graph_list_[offset].neighbor_size = 0;
}
}
cudaDeviceSynchronize();
CUDA_CHECK(cudaDeviceSynchronize());
}
NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v3(
NeighborSampleQuery q, bool cpu_switch) {
return graph_neighbor_sample_v2(
global_device_map[q.gpu_id], q.key, q.sample_size, q.len, cpu_switch);
return graph_neighbor_sample_v2(global_device_map[q.gpu_id],
q.table_idx,
q.src_nodes,
q.sample_size,
q.len,
cpu_switch);
}
NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
int64_t* key,
uint64_t* key,
int sample_size,
int len) {
/*
comment 2
this function shares some kernels with heter_comm_inl.h
arguments definitions:
gpu_id:the id of gpu.
len:how many keys are used,(the length of array key)
sample_size:how many neighbors should be sampled for each node in key.
  the code below shuffles the key array to make the keys
that belong to a gpu-card stay together,
the shuffled result is saved on d_shard_keys,
if ith element in d_shard_keys_ptr is
from jth element in the original key array, then idx[i] = j,
idx could be used to recover the original array.
if keys in range [a,b] belong to ith-gpu, then h_left[i] = a, h_right[i] =
b,
if no keys are allocated for ith-gpu, then h_left[i] == h_right[i] == -1
for example, suppose key = [0,1,2,3,4,5,6,7,8], gpu_num = 2
when we run this neighbor_sample function,
the key is shuffled to [0,2,4,6,8,1,3,5,7]
  the first part (0,2,4,6,8) % 2 == 0, thus it should be handled by gpu 0,
  the remaining part should be handled by gpu 1, because (1,3,5,7) % 2 == 1,
h_left = [0,5],h_right = [4,8]
*/
NeighborSampleResult result;
result.initialize(sample_size, len, resource_->dev_id(gpu_id));
if (len == 0) {
return result;
}
platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id));
platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id));
int* actual_sample_size = result.actual_sample_size;
int64_t* val = result.val;
int total_gpu = resource_->total_device();
auto stream = resource_->local_stream(gpu_id, 0);
int grid_size = (len - 1) / block_size_ + 1;
int h_left[total_gpu]; // NOLINT
int h_right[total_gpu]; // NOLINT
auto d_left = memory::Alloc(place, total_gpu * sizeof(int));
auto d_right = memory::Alloc(place, total_gpu * sizeof(int));
int* d_left_ptr = reinterpret_cast<int*>(d_left->ptr());
int* d_right_ptr = reinterpret_cast<int*>(d_right->ptr());
cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream);
cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream);
//
auto d_idx = memory::Alloc(place, len * sizeof(int));
int* d_idx_ptr = reinterpret_cast<int*>(d_idx->ptr());
auto d_shard_keys = memory::Alloc(place, len * sizeof(int64_t));
int64_t* d_shard_keys_ptr = reinterpret_cast<int64_t*>(d_shard_keys->ptr());
auto d_shard_vals = memory::Alloc(place, sample_size * len * sizeof(int64_t));
int64_t* d_shard_vals_ptr = reinterpret_cast<int64_t*>(d_shard_vals->ptr());
auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int));
int* d_shard_actual_sample_size_ptr =
reinterpret_cast<int*>(d_shard_actual_sample_size->ptr());
split_input_to_shard(
(uint64_t*)(key), d_idx_ptr, len, d_left_ptr, d_right_ptr, gpu_id);
heter_comm_kernel_->fill_shard_key(
d_shard_keys_ptr, key, d_idx_ptr, len, stream);
cudaStreamSynchronize(stream);
cudaMemcpy(
h_left, d_left_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(
h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost);
// auto start1 = std::chrono::steady_clock::now();
for (int i = 0; i < total_gpu; ++i) {
int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
if (shard_len == 0) {
continue;
}
/*
comment 3
shard_len denotes the size of keys on i-th gpu here,
when we sample on i-th gpu, we allocate shard_len * (1 + sample_size)
int64_t units
  of memory; we use alloc_mem_i to denote it. the range [0,shard_len) is reserved
  for the respective nodes' indexes
  and actual sample sizes.
  with the node indexes we can locate the nodes to sample.
  since the size of int64_t is 8 bytes, while the size of int is 4,
  the range [0,shard_len) contains shard_len * 2 int units;
The values of the first half of this range will be updated by
the k-v map on i-th-gpu.
The second half of this range is saved for actual sample size of each node.
For node x,
its sampling result is saved on the range
[shard_len + sample_size * x,shard_len + sample_size * x +
actual_sample_size_of_x)
of alloc_mem_i, actual_sample_size_of_x equals ((int
*)alloc_mem_i)[shard_len + x]
*/
create_storage(gpu_id,
i,
shard_len * sizeof(int64_t),
shard_len * (1 + sample_size) * sizeof(int64_t) +
sizeof(int) * (shard_len + shard_len % 2));
// auto& node = path_[gpu_id][i].nodes_[0];
}
walk_to_dest(
gpu_id, total_gpu, h_left, h_right, (uint64_t*)(d_shard_keys_ptr), NULL);
for (int i = 0; i < total_gpu; ++i) {
if (h_left[i] == -1) {
continue;
}
int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
auto& node = path_[gpu_id][i].nodes_.back();
cudaMemsetAsync(
node.val_storage, -1, shard_len * sizeof(int64_t), node.in_stream);
cudaStreamSynchronize(node.in_stream);
platform::CUDADeviceGuard guard(resource_->dev_id(i));
tables_[i]->get(reinterpret_cast<uint64_t*>(node.key_storage),
reinterpret_cast<int64_t*>(node.val_storage),
h_right[i] - h_left[i] + 1,
resource_->remote_stream(i, gpu_id));
// node.in_stream);
auto graph = gpu_graph_list[i];
int64_t* id_array = reinterpret_cast<int64_t*>(node.val_storage);
int* actual_size_array = (int*)(id_array + shard_len);
int64_t* sample_array =
(int64_t*)(actual_size_array + shard_len + shard_len % 2);
int sample_grid_size = (shard_len - 1) / dim_y + 1;
dim3 block(parallel_sample_size, dim_y);
dim3 grid(sample_grid_size);
neighbor_sample_example<<<grid,
block,
0,
resource_->remote_stream(i, gpu_id)>>>(
graph,
id_array,
actual_size_array,
sample_array,
sample_size,
sample_status[i],
shard_len,
gpu_id);
}
for (int i = 0; i < total_gpu; ++i) {
if (h_left[i] == -1) {
continue;
}
cudaStreamSynchronize(resource_->remote_stream(i, gpu_id));
}
move_neighbor_sample_result_to_source_gpu(gpu_id,
total_gpu,
sample_size,
h_left,
h_right,
d_shard_vals_ptr,
d_shard_actual_sample_size_ptr);
fill_dvalues<<<grid_size, block_size_, 0, stream>>>(
d_shard_vals_ptr,
val,
d_shard_actual_sample_size_ptr,
actual_sample_size,
d_idx_ptr,
sample_size,
len);
for (int i = 0; i < total_gpu; ++i) {
int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
if (shard_len == 0) {
continue;
}
destroy_storage(gpu_id, i);
}
cudaStreamSynchronize(stream);
return result;
return graph_neighbor_sample_v2(gpu_id, 0, key, sample_size, len, false);
}
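/*
  A host-side sketch, for illustration only, of the bookkeeping described in
  comment 2: keys are grouped by destination gpu (key % gpu_num), idx remembers
  each shuffled key's original position so results can be scattered back, and
  h_left / h_right record the contiguous range each gpu owns in the shuffled
  array (-1 if a gpu receives no keys). ShardPlan and make_shard_plan are
  hypothetical names; the real path does this on the device via
  split_input_to_shard and fill_shard_key.
*/
#include <cstdint>
#include <vector>

struct ShardPlan {
  std::vector<uint64_t> shuffled_keys;  // e.g. [0,2,4,6,8,1,3,5,7]
  std::vector<int> idx;                 // idx[i] = original position of shuffled key i
  std::vector<int> h_left, h_right;     // per-gpu range in shuffled_keys
};

ShardPlan make_shard_plan(const std::vector<uint64_t>& keys, int gpu_num) {
  ShardPlan p;
  p.h_left.assign(gpu_num, -1);
  p.h_right.assign(gpu_num, -1);
  for (int g = 0; g < gpu_num; ++g) {
    for (size_t i = 0; i < keys.size(); ++i) {
      if (static_cast<int>(keys[i] % gpu_num) != g) continue;
      int pos = static_cast<int>(p.shuffled_keys.size());
      if (p.h_left[g] == -1) p.h_left[g] = pos;
      p.h_right[g] = pos;
      p.shuffled_keys.push_back(keys[i]);
      p.idx.push_back(static_cast<int>(i));
    }
  }
  // keys = [0..8], gpu_num = 2  ->  h_left = [0,5], h_right = [4,8]
  return p;
}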
NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
int gpu_id, int64_t* key, int sample_size, int len, bool cpu_query_switch) {
int gpu_id,
int idx,
uint64_t* key,
int sample_size,
int len,
bool cpu_query_switch) {
NeighborSampleResult result;
result.initialize(sample_size, len, resource_->dev_id(gpu_id));
......@@ -834,8 +687,9 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id));
platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id));
int* actual_sample_size = result.actual_sample_size;
int64_t* val = result.val;
uint64_t* val = result.val;
int total_gpu = resource_->total_device();
auto stream = resource_->local_stream(gpu_id, 0);
......@@ -853,16 +707,17 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
default_value = -1;
}
cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream);
cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream);
CUDA_CHECK(cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream));
CUDA_CHECK(cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream));
//
auto d_idx = memory::Alloc(place, len * sizeof(int));
int* d_idx_ptr = reinterpret_cast<int*>(d_idx->ptr());
auto d_shard_keys = memory::Alloc(place, len * sizeof(int64_t));
int64_t* d_shard_keys_ptr = reinterpret_cast<int64_t*>(d_shard_keys->ptr());
auto d_shard_vals = memory::Alloc(place, sample_size * len * sizeof(int64_t));
int64_t* d_shard_vals_ptr = reinterpret_cast<int64_t*>(d_shard_vals->ptr());
auto d_shard_keys = memory::Alloc(place, len * sizeof(uint64_t));
uint64_t* d_shard_keys_ptr = reinterpret_cast<uint64_t*>(d_shard_keys->ptr());
auto d_shard_vals =
memory::Alloc(place, sample_size * len * sizeof(uint64_t));
uint64_t* d_shard_vals_ptr = reinterpret_cast<uint64_t*>(d_shard_vals->ptr());
auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int));
int* d_shard_actual_sample_size_ptr =
reinterpret_cast<int*>(d_shard_actual_sample_size->ptr());
......@@ -873,12 +728,12 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
heter_comm_kernel_->fill_shard_key(
d_shard_keys_ptr, key, d_idx_ptr, len, stream);
cudaStreamSynchronize(stream);
CUDA_CHECK(cudaStreamSynchronize(stream));
cudaMemcpy(
h_left, d_left_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(
h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost);
CUDA_CHECK(cudaMemcpy(
h_left, d_left_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(
h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost));
for (int i = 0; i < total_gpu; ++i) {
int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
if (shard_len == 0) {
......@@ -886,8 +741,9 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
}
create_storage(gpu_id,
i,
shard_len * sizeof(int64_t),
shard_len * (1 + sample_size) * sizeof(int64_t) +
shard_len * sizeof(uint64_t),
shard_len * sample_size * sizeof(uint64_t) +
shard_len * sizeof(uint64_t) +
sizeof(int) * (shard_len + shard_len % 2));
}
walk_to_dest(
......@@ -899,30 +755,35 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
}
int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
auto& node = path_[gpu_id][i].nodes_.back();
cudaMemsetAsync(
node.val_storage, -1, shard_len * sizeof(int64_t), node.in_stream);
cudaStreamSynchronize(node.in_stream);
CUDA_CHECK(cudaMemsetAsync(
node.val_storage, 0, shard_len * sizeof(int64_t), node.in_stream));
CUDA_CHECK(cudaStreamSynchronize(node.in_stream));
platform::CUDADeviceGuard guard(resource_->dev_id(i));
// If not found, val is -1.
tables_[i]->get(reinterpret_cast<uint64_t*>(node.key_storage),
reinterpret_cast<int64_t*>(node.val_storage),
h_right[i] - h_left[i] + 1,
resource_->remote_stream(i, gpu_id));
auto graph = gpu_graph_list[i];
int64_t* id_array = reinterpret_cast<int64_t*>(node.val_storage);
int* actual_size_array = (int*)(id_array + shard_len);
int64_t* sample_array =
(int64_t*)(actual_size_array + shard_len + shard_len % 2);
int table_offset = get_table_offset(i, GraphTableType::EDGE_TABLE, idx);
int offset = i * graph_table_num_ + idx;
tables_[table_offset]->get(reinterpret_cast<uint64_t*>(node.key_storage),
reinterpret_cast<uint64_t*>(node.val_storage),
(size_t)(h_right[i] - h_left[i] + 1),
resource_->remote_stream(i, gpu_id));
auto graph = gpu_graph_list_[offset];
GpuPsNodeInfo* node_info_list =
reinterpret_cast<GpuPsNodeInfo*>(node.val_storage);
int* actual_size_array = (int*)(node_info_list + shard_len);
uint64_t* sample_array =
(uint64_t*)(actual_size_array + shard_len + shard_len % 2);
constexpr int WARP_SIZE = 32;
constexpr int BLOCK_WARPS = 128 / WARP_SIZE;
constexpr int TILE_SIZE = BLOCK_WARPS * 16;
const dim3 block(WARP_SIZE, BLOCK_WARPS);
const dim3 grid((shard_len + TILE_SIZE - 1) / TILE_SIZE);
neighbor_sample_example_v2<WARP_SIZE, BLOCK_WARPS, TILE_SIZE>
neighbor_sample_kernel<WARP_SIZE, BLOCK_WARPS, TILE_SIZE>
<<<grid, block, 0, resource_->remote_stream(i, gpu_id)>>>(
graph,
id_array,
node_info_list,
actual_size_array,
sample_array,
sample_size,
......@@ -934,16 +795,15 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
if (h_left[i] == -1) {
continue;
}
cudaStreamSynchronize(resource_->remote_stream(i, gpu_id));
CUDA_CHECK(cudaStreamSynchronize(resource_->remote_stream(i, gpu_id)));
}
move_neighbor_sample_result_to_source_gpu(gpu_id,
total_gpu,
sample_size,
h_left,
h_right,
d_shard_vals_ptr,
d_shard_actual_sample_size_ptr);
move_result_to_source_gpu(gpu_id,
total_gpu,
sample_size,
h_left,
h_right,
d_shard_vals_ptr,
d_shard_actual_sample_size_ptr);
fill_dvalues<<<grid_size, block_size_, 0, stream>>>(
d_shard_vals_ptr,
val,
......@@ -953,11 +813,11 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
sample_size,
len);
cudaStreamSynchronize(stream);
CUDA_CHECK(cudaStreamSynchronize(stream));
if (cpu_query_switch) {
// Get cpu keys and corresponding position.
thrust::device_vector<int64_t> t_cpu_keys(len);
thrust::device_vector<uint64_t> t_cpu_keys(len);
thrust::device_vector<int> t_index(len + 1, 0);
get_cpu_id_index<<<grid_size, block_size_, 0, stream>>>(
key,
......@@ -967,52 +827,52 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
thrust::raw_pointer_cast(t_index.data()) + 1,
len);
cudaStreamSynchronize(stream);
CUDA_CHECK(cudaStreamSynchronize(stream));
int number_on_cpu = 0;
cudaMemcpy(&number_on_cpu,
thrust::raw_pointer_cast(t_index.data()),
sizeof(int),
cudaMemcpyDeviceToHost);
CUDA_CHECK(cudaMemcpy(&number_on_cpu,
thrust::raw_pointer_cast(t_index.data()),
sizeof(int),
cudaMemcpyDeviceToHost));
if (number_on_cpu > 0) {
int64_t* cpu_keys = new int64_t[number_on_cpu];
cudaMemcpy(cpu_keys,
thrust::raw_pointer_cast(t_cpu_keys.data()),
number_on_cpu * sizeof(int64_t),
cudaMemcpyDeviceToHost);
uint64_t* cpu_keys = new uint64_t[number_on_cpu];
CUDA_CHECK(cudaMemcpy(cpu_keys,
thrust::raw_pointer_cast(t_cpu_keys.data()),
number_on_cpu * sizeof(uint64_t),
cudaMemcpyDeviceToHost));
std::vector<std::shared_ptr<char>> buffers(number_on_cpu);
std::vector<int> ac(number_on_cpu);
auto status = cpu_graph_table->random_sample_neighbors(
0, cpu_keys, sample_size, buffers, ac, false);
auto status = cpu_graph_table_->random_sample_neighbors(
idx, cpu_keys, sample_size, buffers, ac, false);
int total_cpu_sample_size = std::accumulate(ac.begin(), ac.end(), 0);
total_cpu_sample_size /= sizeof(int64_t);
total_cpu_sample_size /= sizeof(uint64_t);
// Merge buffers into one int64_t vector.
int64_t* merge_buffers = new int64_t[total_cpu_sample_size];
// Merge buffers into one uint64_t vector.
uint64_t* merge_buffers = new uint64_t[total_cpu_sample_size];
int start = 0;
for (int j = 0; j < number_on_cpu; j++) {
memcpy(merge_buffers + start, (int64_t*)(buffers[j].get()), ac[j]);
start += ac[j] / sizeof(int64_t);
memcpy(merge_buffers + start, (uint64_t*)(buffers[j].get()), ac[j]);
start += ac[j] / sizeof(uint64_t);
}
// Copy merge_buffers to gpu.
thrust::device_vector<int64_t> gpu_buffers(total_cpu_sample_size);
thrust::device_vector<uint64_t> gpu_buffers(total_cpu_sample_size);
thrust::device_vector<int> gpu_ac(number_on_cpu);
int64_t* gpu_buffers_ptr = thrust::raw_pointer_cast(gpu_buffers.data());
uint64_t* gpu_buffers_ptr = thrust::raw_pointer_cast(gpu_buffers.data());
int* gpu_ac_ptr = thrust::raw_pointer_cast(gpu_ac.data());
cudaMemcpyAsync(gpu_buffers_ptr,
merge_buffers,
total_cpu_sample_size * sizeof(int64_t),
cudaMemcpyHostToDevice,
stream);
cudaMemcpyAsync(gpu_ac_ptr,
ac.data(),
number_on_cpu * sizeof(int),
cudaMemcpyHostToDevice,
stream);
CUDA_CHECK(cudaMemcpyAsync(gpu_buffers_ptr,
merge_buffers,
total_cpu_sample_size * sizeof(uint64_t),
cudaMemcpyHostToDevice,
stream));
CUDA_CHECK(cudaMemcpyAsync(gpu_ac_ptr,
ac.data(),
number_on_cpu * sizeof(int),
cudaMemcpyHostToDevice,
stream));
      // The kernels below convert gpu_ac from byte counts to element counts
      // and then scatter the CPU-sampled neighbors in gpu_buffers into the
      // final result.
......@@ -1020,7 +880,7 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
get_actual_gpu_ac<<<grid_size2, block_size_, 0, stream>>>(gpu_ac_ptr,
number_on_cpu);
cudaStreamSynchronize(stream);
CUDA_CHECK(cudaStreamSynchronize(stream));
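      // The exclusive scan turns the per-key element counts into write
      // offsets used when scattering the CPU samples into the final result.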
thrust::device_vector<int> cumsum_gpu_ac(number_on_cpu);
thrust::exclusive_scan(
......@@ -1048,7 +908,7 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
}
{
cudaStreamSynchronize(stream);
CUDA_CHECK(cudaStreamSynchronize(stream));
platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id));
platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id));
......@@ -1060,11 +920,10 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
t_actual_sample_size.end());
result.actual_val_mem =
memory::AllocShared(place, total_sample_size * sizeof(int64_t));
result.actual_val = (int64_t*)(result.actual_val_mem)->ptr();
memory::AllocShared(place, total_sample_size * sizeof(uint64_t));
result.actual_val = (uint64_t*)(result.actual_val_mem)->ptr();
result.set_total_sample_size(total_sample_size);
thrust::device_vector<int> cumsum_actual_sample_size(len);
thrust::exclusive_scan(t_actual_sample_size.begin(),
t_actual_sample_size.end(),
......@@ -1085,7 +944,6 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
}
destroy_storage(gpu_id, i);
}
cudaStreamSynchronize(stream);
return result;
}
......@@ -1096,32 +954,13 @@ NodeQueryResult GpuPsGraphTable::graph_node_sample(int gpu_id,
}
NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id,
int idx,
int start,
int query_size) {
NodeQueryResult result;
result.actual_sample_size = 0;
if (query_size <= 0) return result;
int& actual_size = result.actual_sample_size;
actual_size = 0;
// int dev_id = resource_->dev_id(gpu_id);
// platform::CUDADeviceGuard guard(dev_id);
std::vector<int> idx, gpu_begin_pos, local_begin_pos;
int sample_size;
/*
if idx[i] = a, gpu_begin_pos[i] = p1,
gpu_local_begin_pos[i] = p2;
sample_size[i] = s;
then on gpu a, the nodes of positions [p1,p1 + s) should be returned
and saved from the p2 position on the sample_result array
for example:
suppose
gpu 0 saves [0,2,4,6,8], gpu1 saves [1,3,5,7]
start = 3, query_size = 5
we know [6,8,1,3,5] should be returned;
idx = [0,1]
gpu_begin_pos = [3,0]
local_begin_pos = [0,3]
sample_size = [2,3]
*/
std::vector<int> gpu_begin_pos, local_begin_pos;
std::function<int(int, int, int, int, int&, int&)> range_check =
[](int x, int y, int x1, int y1, int& x2, int& y2) {
if (y <= x1 || x >= y1) return 0;
......@@ -1129,7 +968,9 @@ NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id,
x2 = max(x1, x);
return y2 - x2;
};
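  // range_check intersects the query window [x, y) with a card's node range
  // [x1, y1) and returns the overlap length, writing the overlap bounds to
  // [x2, y2). With the example above (start = 3, query_size = 5, gpu 0
  // holding 5 nodes), range_check(3, 8, 0, 5, x2, y2) yields x2 = 3, y2 = 5,
  // i.e. two nodes are taken from gpu 0.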
auto graph = gpu_graph_list[gpu_id];
int offset = gpu_id * graph_table_num_ + idx;
const auto& graph = gpu_graph_list_[offset];
if (graph.node_size == 0) {
return result;
}
......@@ -1139,69 +980,159 @@ NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id,
if (len == 0) {
return result;
}
int64_t* val;
sample_size = len;
result.initialize(len, resource_->dev_id(gpu_id));
actual_size = len;
val = result.val;
result.actual_sample_size = len;
uint64_t* val = result.val;
int dev_id_i = resource_->dev_id(gpu_id);
platform::CUDADeviceGuard guard(dev_id_i);
// platform::CUDADeviceGuard guard(i);
int grid_size = (len - 1) / block_size_ + 1;
node_query_example<<<grid_size,
block_size_,
0,
resource_->remote_stream(gpu_id, gpu_id)>>>(
gpu_graph_list[gpu_id], x2, len, (int64_t*)val);
cudaStreamSynchronize(resource_->remote_stream(gpu_id, gpu_id));
graph, x2, len, (uint64_t*)val);
CUDA_CHECK(cudaStreamSynchronize(resource_->remote_stream(gpu_id, gpu_id)));
return result;
/*
for (int i = 0; i < gpu_graph_list.size() && query_size != 0; i++) {
auto graph = gpu_graph_list[i];
if (graph.node_size == 0) {
}
int GpuPsGraphTable::get_feature_of_nodes(int gpu_id,
uint64_t* d_nodes,
uint64_t* d_feature,
int node_num,
int slot_num) {
if (node_num == 0) {
return -1;
}
platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id));
platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id));
int total_gpu = resource_->total_device();
auto stream = resource_->local_stream(gpu_id, 0);
auto d_left = memory::Alloc(place, total_gpu * sizeof(int));
auto d_right = memory::Alloc(place, total_gpu * sizeof(int));
int* d_left_ptr = reinterpret_cast<int*>(d_left->ptr());
int* d_right_ptr = reinterpret_cast<int*>(d_right->ptr());
CUDA_CHECK(cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream));
CUDA_CHECK(cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream));
//
auto d_idx = memory::Alloc(place, node_num * sizeof(int));
int* d_idx_ptr = reinterpret_cast<int*>(d_idx->ptr());
auto d_shard_keys = memory::Alloc(place, node_num * sizeof(uint64_t));
uint64_t* d_shard_keys_ptr = reinterpret_cast<uint64_t*>(d_shard_keys->ptr());
auto d_shard_vals =
memory::Alloc(place, slot_num * node_num * sizeof(uint64_t));
uint64_t* d_shard_vals_ptr = reinterpret_cast<uint64_t*>(d_shard_vals->ptr());
auto d_shard_actual_size = memory::Alloc(place, node_num * sizeof(int));
int* d_shard_actual_size_ptr =
reinterpret_cast<int*>(d_shard_actual_size->ptr());
split_input_to_shard(
d_nodes, d_idx_ptr, node_num, d_left_ptr, d_right_ptr, gpu_id);
heter_comm_kernel_->fill_shard_key(
d_shard_keys_ptr, d_nodes, d_idx_ptr, node_num, stream);
CUDA_CHECK(cudaStreamSynchronize(stream));
int h_left[total_gpu]; // NOLINT
CUDA_CHECK(cudaMemcpy(
h_left, d_left_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost));
int h_right[total_gpu]; // NOLINT
CUDA_CHECK(cudaMemcpy(
h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost));
for (int i = 0; i < total_gpu; ++i) {
int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
if (shard_len == 0) {
continue;
}
int x2, y2;
int len = range_check(start, start + query_size, size,
size + graph.node_size, x2, y2);
if (len > 0) {
idx.push_back(i);
gpu_begin_pos.emplace_back(x2 - size);
local_begin_pos.emplace_back(actual_size);
sample_size.push_back(len);
actual_size += len;
create_storage(gpu_id, i, 1, len * sizeof(int64_t));
}
size += graph.node_size;
}
for (int i = 0; i < idx.size(); i++) {
int dev_id_i = resource_->dev_id(idx[i]);
platform::CUDADeviceGuard guard(dev_id_i);
// platform::CUDADeviceGuard guard(i);
auto& node = path_[gpu_id][idx[i]].nodes_.front();
int grid_size = (sample_size[i] - 1) / block_size_ + 1;
node_query_example<<<grid_size, block_size_, 0,
resource_->remote_stream(idx[i], gpu_id)>>>(
gpu_graph_list[idx[i]], gpu_begin_pos[i], sample_size[i],
(int64_t*)node.val_storage);
create_storage(gpu_id,
i,
shard_len * sizeof(uint64_t),
shard_len * slot_num * sizeof(uint64_t) +
shard_len * sizeof(uint64_t) +
sizeof(int) * (shard_len + shard_len % 2));
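    // val_storage allocated above is laid out as: shard_len GpuPsFeaInfo
    // entries, then shard_len ints with the actual feature count per node
    // (padded to an even number of ints for 8-byte alignment), then up to
    // shard_len * slot_num feature ids.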
}
for (int i = 0; i < idx.size(); i++) {
cudaStreamSynchronize(resource_->remote_stream(idx[i], gpu_id));
auto& node = path_[gpu_id][idx[i]].nodes_.front();
cudaMemcpyAsync(reinterpret_cast<char*>(val + local_begin_pos[i]),
node.val_storage, node.val_bytes_len, cudaMemcpyDefault,
node.out_stream);
walk_to_dest(
gpu_id, total_gpu, h_left, h_right, (uint64_t*)(d_shard_keys_ptr), NULL);
for (int i = 0; i < total_gpu; ++i) {
if (h_left[i] == -1) {
continue;
}
int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
auto& node = path_[gpu_id][i].nodes_.back();
CUDA_CHECK(cudaMemsetAsync(
node.val_storage, 0, shard_len * sizeof(uint64_t), node.in_stream));
CUDA_CHECK(cudaStreamSynchronize(node.in_stream));
platform::CUDADeviceGuard guard(resource_->dev_id(i));
// If not found, val is -1.
int table_offset = get_table_offset(i, GraphTableType::FEATURE_TABLE, 0);
tables_[table_offset]->get(reinterpret_cast<uint64_t*>(node.key_storage),
reinterpret_cast<uint64_t*>(node.val_storage),
(size_t)(h_right[i] - h_left[i] + 1),
resource_->remote_stream(i, gpu_id));
int offset = i * feature_table_num_;
auto graph = gpu_graph_fea_list_[offset];
GpuPsFeaInfo* val_array = reinterpret_cast<GpuPsFeaInfo*>(node.val_storage);
int* actual_size_array = (int*)(val_array + shard_len);
uint64_t* feature_array =
(uint64_t*)(actual_size_array + shard_len + shard_len % 2);
dim3 grid((shard_len - 1) / dim_y + 1);
dim3 block(1, dim_y);
get_features_kernel<<<grid,
block,
0,
resource_->remote_stream(i, gpu_id)>>>(
graph,
val_array,
actual_size_array,
feature_array,
slot_num,
shard_len);
}
for (int i = 0; i < idx.size(); i++) {
auto& node = path_[gpu_id][idx[i]].nodes_.front();
cudaStreamSynchronize(node.out_stream);
for (int i = 0; i < total_gpu; ++i) {
if (h_left[i] == -1) {
continue;
}
CUDA_CHECK(cudaStreamSynchronize(resource_->remote_stream(i, gpu_id)));
}
for (auto x : idx) {
destroy_storage(gpu_id, x);
move_result_to_source_gpu(gpu_id,
total_gpu,
slot_num,
h_left,
h_right,
d_shard_vals_ptr,
d_shard_actual_size_ptr);
int grid_size = (node_num - 1) / block_size_ + 1;
fill_dvalues<<<grid_size, block_size_, 0, stream>>>(d_shard_vals_ptr,
d_feature,
d_shard_actual_size_ptr,
d_idx_ptr,
slot_num,
node_num);
for (int i = 0; i < total_gpu; ++i) {
int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
if (shard_len == 0) {
continue;
}
destroy_storage(gpu_id, i);
}
return result;
*/
CUDA_CHECK(cudaStreamSynchronize(stream));
return 0;
}
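// A minimal usage sketch (not part of the original change): pull the slot
// features of a batch of device-resident node ids and copy them back to the
// host for inspection. The helper name is an assumption for illustration;
// d_nodes must hold node_num ids on the device selected by gpu_id, and
// gpu_id is assumed here to equal the CUDA device id.
std::vector<uint64_t> example_pull_node_features(GpuPsGraphTable* table,
                                                 int gpu_id,
                                                 uint64_t* d_nodes,
                                                 int node_num,
                                                 int slot_num) {
  platform::CUDAPlace place(gpu_id);
  size_t feature_len = static_cast<size_t>(node_num) * slot_num;
  // get_feature_of_nodes writes node_num * slot_num feature ids into the
  // output buffer, one group of slot_num entries per queried node.
  auto d_feature = memory::AllocShared(place, feature_len * sizeof(uint64_t));
  table->get_feature_of_nodes(gpu_id,
                              d_nodes,
                              reinterpret_cast<uint64_t*>(d_feature->ptr()),
                              node_num,
                              slot_num);
  std::vector<uint64_t> h_feature(feature_len);
  CUDA_CHECK(cudaMemcpy(h_feature.data(),
                        d_feature->ptr(),
                        feature_len * sizeof(uint64_t),
                        cudaMemcpyDeviceToHost));
  return h_feature;
}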
} // namespace framework
}; // namespace paddle
......
......@@ -13,6 +13,8 @@
// limitations under the License.
#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h"
#include <sstream>
#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h"
#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h"
#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h"
namespace paddle {
......@@ -25,12 +27,46 @@ void GraphGpuWrapper::set_device(std::vector<int> ids) {
device_id_mapping.push_back(device_id);
}
}
std::vector<std::vector<int64_t>> GraphGpuWrapper::get_all_id(int type,
int idx,
int slice_num) {
int GraphGpuWrapper::get_all_id(int type,
int slice_num,
std::vector<std::vector<uint64_t>> *output) {
return ((GpuPsGraphTable *)graph_table)
->cpu_graph_table_->get_all_id(type, slice_num, output);
}
int GraphGpuWrapper::get_all_neighbor_id(
int type, int slice_num, std::vector<std::vector<uint64_t>> *output) {
return ((GpuPsGraphTable *)graph_table)
->cpu_graph_table_->get_all_neighbor_id(type, slice_num, output);
}
int GraphGpuWrapper::get_all_id(int type,
int idx,
int slice_num,
std::vector<std::vector<uint64_t>> *output) {
return ((GpuPsGraphTable *)graph_table)
->cpu_graph_table_->get_all_id(type, idx, slice_num, output);
}
int GraphGpuWrapper::get_all_neighbor_id(
int type,
int idx,
int slice_num,
std::vector<std::vector<uint64_t>> *output) {
return ((GpuPsGraphTable *)graph_table)
->cpu_graph_table->get_all_id(type, idx, slice_num);
->cpu_graph_table_->get_all_neighbor_id(type, idx, slice_num, output);
}
int GraphGpuWrapper::get_all_feature_ids(
int type,
int idx,
int slice_num,
std::vector<std::vector<uint64_t>> *output) {
return ((GpuPsGraphTable *)graph_table)
->cpu_graph_table_->get_all_feature_ids(type, idx, slice_num, output);
}
void GraphGpuWrapper::set_up_types(std::vector<std::string> &edge_types,
std::vector<std::string> &node_types) {
id_to_edge = edge_types;
......@@ -49,32 +85,40 @@ void GraphGpuWrapper::set_up_types(std::vector<std::string> &edge_types,
this->table_feat_conf_feat_shape.resize(node_types.size());
}
void GraphGpuWrapper::set_feature_separator(std::string ch) {
feature_separator_ = ch;
if (graph_table != nullptr) {
((GpuPsGraphTable *)graph_table)
->cpu_graph_table_->set_feature_separator(feature_separator_);
}
}
void GraphGpuWrapper::make_partitions(int idx,
int64_t byte_size,
int device_len) {
((GpuPsGraphTable *)graph_table)
->cpu_graph_table->make_partitions(idx, byte_size, device_len);
->cpu_graph_table_->make_partitions(idx, byte_size, device_len);
}
int32_t GraphGpuWrapper::load_next_partition(int idx) {
return ((GpuPsGraphTable *)graph_table)
->cpu_graph_table->load_next_partition(idx);
->cpu_graph_table_->load_next_partition(idx);
}
void GraphGpuWrapper::set_search_level(int level) {
((GpuPsGraphTable *)graph_table)->cpu_graph_table->set_search_level(level);
((GpuPsGraphTable *)graph_table)->cpu_graph_table_->set_search_level(level);
}
std::vector<int64_t> GraphGpuWrapper::get_partition(int idx, int num) {
std::vector<uint64_t> GraphGpuWrapper::get_partition(int idx, int num) {
return ((GpuPsGraphTable *)graph_table)
->cpu_graph_table->get_partition(idx, num);
->cpu_graph_table_->get_partition(idx, num);
}
int32_t GraphGpuWrapper::get_partition_num(int idx) {
return ((GpuPsGraphTable *)graph_table)
->cpu_graph_table->get_partition_num(idx);
->cpu_graph_table_->get_partition_num(idx);
}
void GraphGpuWrapper::make_complementary_graph(int idx, int64_t byte_size) {
((GpuPsGraphTable *)graph_table)
->cpu_graph_table->make_complementary_graph(idx, byte_size);
->cpu_graph_table_->make_complementary_graph(idx, byte_size);
}
void GraphGpuWrapper::load_edge_file(std::string name,
std::string filepath,
......@@ -90,7 +134,7 @@ void GraphGpuWrapper::load_edge_file(std::string name,
}
if (edge_to_id.find(name) != edge_to_id.end()) {
((GpuPsGraphTable *)graph_table)
->cpu_graph_table->Load(std::string(filepath), params);
->cpu_graph_table_->Load(std::string(filepath), params);
}
}
......@@ -101,10 +145,21 @@ void GraphGpuWrapper::load_node_file(std::string name, std::string filepath) {
if (feature_to_id.find(name) != feature_to_id.end()) {
((GpuPsGraphTable *)graph_table)
->cpu_graph_table->Load(std::string(filepath), params);
->cpu_graph_table_->Load(std::string(filepath), params);
}
}
void GraphGpuWrapper::load_node_and_edge(std::string etype,
std::string ntype,
std::string epath,
std::string npath,
int part_num,
bool reverse) {
((GpuPsGraphTable *)graph_table)
->cpu_graph_table_->load_node_and_edge_file(
etype, ntype, epath, npath, part_num, reverse);
}
void GraphGpuWrapper::add_table_feat_conf(std::string table_name,
std::string feat_name,
std::string feat_dtype,
......@@ -137,8 +192,10 @@ void GraphGpuWrapper::init_search_level(int level) { search_level = level; }
void GraphGpuWrapper::init_service() {
table_proto.set_task_pool_size(24);
table_proto.set_shard_num(1000);
table_proto.set_build_sampler_on_cpu(false);
table_proto.set_search_level(search_level);
table_proto.set_table_name("cpu_graph_table");
table_proto.set_table_name("cpu_graph_table_");
table_proto.set_use_cache(false);
for (int i = 0; i < id_to_edge.size(); i++)
table_proto.add_edge_types(id_to_edge[i]);
......@@ -155,76 +212,122 @@ void GraphGpuWrapper::init_service() {
std::shared_ptr<HeterPsResource> resource =
std::make_shared<HeterPsResource>(device_id_mapping);
resource->enable_p2p();
GpuPsGraphTable *g = new GpuPsGraphTable(resource, 1);
GpuPsGraphTable *g = new GpuPsGraphTable(resource, 1, id_to_edge.size());
g->init_cpu_table(table_proto);
g->cpu_graph_table_->set_feature_separator(feature_separator_);
graph_table = (char *)g;
upload_task_pool.reset(new ::ThreadPool(upload_num));
}
void GraphGpuWrapper::finalize() {
((GpuPsGraphTable *)graph_table)->show_table_collisions();
}
void GraphGpuWrapper::upload_batch(int idx,
std::vector<std::vector<int64_t>> &ids) {
void GraphGpuWrapper::upload_batch(int type,
int idx,
int slice_num,
const std::string &edge_type) {
VLOG(0) << "begin upload edge, type[" << edge_type << "]";
std::vector<std::vector<uint64_t>> ids;
((GpuPsGraphTable *)graph_table)
->cpu_graph_table_->get_all_id(type, idx, slice_num, &ids);
debug_gpu_memory_info("upload_batch node start");
GpuPsGraphTable *g = (GpuPsGraphTable *)graph_table;
// std::vector<paddle::framework::GpuPsCommGraph> vec;
std::vector<std::future<int>> tasks;
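  // Build one edge sub-graph per GPU in parallel: task i slices ids[i] for
  // card i, materializes a GpuPsCommGraph on the host, uploads it to that
  // card's HBM, and then releases the host copy.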
for (int i = 0; i < ids.size(); i++) {
// vec.push_back(g->cpu_graph_table->make_gpu_ps_graph(idx, ids[i]));
GpuPsCommGraph sub_graph =
g->cpu_graph_table->make_gpu_ps_graph(idx, ids[i]);
g->build_graph_on_single_gpu(sub_graph, i);
sub_graph.release_on_cpu();
VLOG(0) << "sub graph on gpu " << i << " is built";
tasks.push_back(upload_task_pool->enqueue([&, i, idx, this]() -> int {
VLOG(0) << "begin make_gpu_ps_graph, node_id[" << i << "]_size["
<< ids[i].size() << "]";
GpuPsCommGraph sub_graph =
g->cpu_graph_table_->make_gpu_ps_graph(idx, ids[i]);
g->build_graph_on_single_gpu(sub_graph, i, idx);
sub_graph.release_on_cpu();
VLOG(0) << "sub graph on gpu " << i << " is built";
return 0;
}));
}
for (size_t i = 0; i < tasks.size(); i++) tasks[i].get();
debug_gpu_memory_info("upload_batch node end");
}
// feature table
void GraphGpuWrapper::upload_batch(int type, int slice_num, int slot_num) {
std::vector<std::vector<uint64_t>> node_ids;
((GpuPsGraphTable *)graph_table)
->cpu_graph_table_->get_all_id(type, slice_num, &node_ids);
debug_gpu_memory_info("upload_batch feature start");
GpuPsGraphTable *g = (GpuPsGraphTable *)graph_table;
std::vector<std::future<int>> tasks;
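  // Same pattern as the edge upload: each task builds the feature sub-graph
  // (slot features of the node ids routed to card i) on the host and copies
  // it into that card's feature table before freeing the host copy.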
for (int i = 0; i < node_ids.size(); i++) {
tasks.push_back(upload_task_pool->enqueue([&, i, this]() -> int {
VLOG(0) << "begin make_gpu_ps_graph_fea, node_ids[" << i << "]_size["
<< node_ids[i].size() << "]";
GpuPsCommGraphFea sub_graph =
g->cpu_graph_table_->make_gpu_ps_graph_fea(node_ids[i], slot_num);
// sub_graph.display_on_cpu();
VLOG(0) << "begin build_graph_fea_on_single_gpu, node_ids[" << i
<< "]_size[" << node_ids[i].size() << "]";
g->build_graph_fea_on_single_gpu(sub_graph, i);
sub_graph.release_on_cpu();
VLOG(0) << "sub graph fea on gpu " << i << " is built";
return 0;
}));
}
for (size_t i = 0; i < tasks.size(); i++) tasks[i].get();
// g->build_graph_from_cpu(vec);
debug_gpu_memory_info("upload_batch feature end");
}
// void GraphGpuWrapper::test() {
// int64_t cpu_key[3] = {0, 1, 2};
// void *key;
// platform::CUDADeviceGuard guard(0);
// cudaMalloc((void **)&key, 3 * sizeof(int64_t));
// cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice);
// auto neighbor_sample_res =
// ((GpuPsGraphTable *)graph_table)
// ->graph_neighbor_sample(0, (int64_t *)key, 2, 3);
// int64_t *res = new int64_t[7];
// cudaMemcpy(res, neighbor_sample_res.val, 3 * 2 * sizeof(int64_t),
// cudaMemcpyDeviceToHost);
// int *actual_sample_size = new int[3];
// cudaMemcpy(actual_sample_size, neighbor_sample_res.actual_sample_size,
// 3 * sizeof(int),
// cudaMemcpyDeviceToHost); // 3, 1, 3
// //{0,9} or {9,0} is expected for key 0
// //{0,2} or {2,0} is expected for key 1
// //{1,3} or {3,1} is expected for key 2
// for (int i = 0; i < 3; i++) {
// VLOG(0) << "actual sample size for " << i << " is "
// << actual_sample_size[i];
// for (int j = 0; j < actual_sample_size[i]; j++) {
// VLOG(0) << "sampled an neighbor for node" << i << " : " << res[i * 2 +
// j];
// }
// }
// }
NeighborSampleResult GraphGpuWrapper::graph_neighbor_sample_v3(
NeighborSampleQuery q, bool cpu_switch) {
return ((GpuPsGraphTable *)graph_table)
->graph_neighbor_sample_v3(q, cpu_switch);
}
int GraphGpuWrapper::get_feature_of_nodes(int gpu_id,
uint64_t *d_walk,
uint64_t *d_offset,
uint32_t size,
int slot_num) {
platform::CUDADeviceGuard guard(gpu_id);
PADDLE_ENFORCE_NOT_NULL(graph_table,
paddle::platform::errors::InvalidArgument(
"graph_table should not be null"));
return ((GpuPsGraphTable *)graph_table)
->get_feature_of_nodes(gpu_id, d_walk, d_offset, size, slot_num);
}
NeighborSampleResult GraphGpuWrapper::graph_neighbor_sample(
int gpu_id, uint64_t *device_keys, int walk_degree, int len) {
platform::CUDADeviceGuard guard(gpu_id);
auto neighbor_sample_res =
((GpuPsGraphTable *)graph_table)
->graph_neighbor_sample(gpu_id, device_keys, walk_degree, len);
return neighbor_sample_res;
}
// this function is contributed by Liwb5
std::vector<int64_t> GraphGpuWrapper::graph_neighbor_sample(
int gpu_id, std::vector<int64_t> &key, int sample_size) {
int64_t *cuda_key;
std::vector<uint64_t> GraphGpuWrapper::graph_neighbor_sample(
int gpu_id, int idx, std::vector<uint64_t> &key, int sample_size) {
std::vector<uint64_t> res;
if (key.size() == 0) {
return res;
}
uint64_t *cuda_key;
platform::CUDADeviceGuard guard(gpu_id);
cudaMalloc(&cuda_key, key.size() * sizeof(int64_t));
cudaMalloc(&cuda_key, key.size() * sizeof(uint64_t));
cudaMemcpy(cuda_key,
key.data(),
key.size() * sizeof(int64_t),
key.size() * sizeof(uint64_t),
cudaMemcpyHostToDevice);
VLOG(0) << "key_size: " << key.size();
auto neighbor_sample_res =
((GpuPsGraphTable *)graph_table)
->graph_neighbor_sample(gpu_id, cuda_key, sample_size, key.size());
->graph_neighbor_sample_v2(
gpu_id, idx, cuda_key, sample_size, key.size(), false);
int *actual_sample_size = new int[key.size()];
cudaMemcpy(actual_sample_size,
neighbor_sample_res.actual_sample_size,
......@@ -235,12 +338,12 @@ std::vector<int64_t> GraphGpuWrapper::graph_neighbor_sample(
cumsum += actual_sample_size[i];
}
std::vector<int64_t> cpu_key, res;
std::vector<uint64_t> cpu_key;
cpu_key.resize(key.size() * sample_size);
cudaMemcpy(cpu_key.data(),
neighbor_sample_res.val,
key.size() * sample_size * sizeof(int64_t),
key.size() * sample_size * sizeof(uint64_t),
cudaMemcpyDeviceToHost);
for (int i = 0; i < key.size(); i++) {
for (int j = 0; j < actual_sample_size[i]; j++) {
......@@ -256,27 +359,26 @@ std::vector<int64_t> GraphGpuWrapper::graph_neighbor_sample(
return res;
}
void GraphGpuWrapper::init_sample_status() {
((GpuPsGraphTable *)graph_table)->init_sample_status();
}
void GraphGpuWrapper::free_sample_status() {
((GpuPsGraphTable *)graph_table)->free_sample_status();
}
NodeQueryResult GraphGpuWrapper::query_node_list(int gpu_id,
int idx,
int start,
int query_size) {
PADDLE_ENFORCE_EQ(FLAGS_gpugraph_load_node_list_into_hbm,
true,
paddle::platform::errors::PreconditionNotMet(
"when use query_node_list should set "
"gpugraph_load_node_list_into_hbm true"));
return ((GpuPsGraphTable *)graph_table)
->query_node_list(gpu_id, start, query_size);
->query_node_list(gpu_id, idx, start, query_size);
}
void GraphGpuWrapper::load_node_weight(int type_id, int idx, std::string path) {
return ((GpuPsGraphTable *)graph_table)
->cpu_graph_table->load_node_weight(type_id, idx, path);
->cpu_graph_table_->load_node_weight(type_id, idx, path);
}
void GraphGpuWrapper::export_partition_files(int idx, std::string file_path) {
return ((GpuPsGraphTable *)graph_table)
->cpu_graph_table->export_partition_files(idx, file_path);
->cpu_graph_table_->export_partition_files(idx, file_path);
}
#endif
} // namespace framework
......
......@@ -32,39 +32,76 @@ class GraphGpuWrapper {
}
static std::shared_ptr<GraphGpuWrapper> s_instance_;
void initialize();
void test();
void finalize();
void set_device(std::vector<int> ids);
void init_service();
void set_up_types(std::vector<std::string>& edge_type,
std::vector<std::string>& node_type);
void upload_batch(int idx, std::vector<std::vector<int64_t>>& ids);
void upload_batch(int type,
int idx,
int slice_num,
const std::string& edge_type);
void upload_batch(int type, int slice_num, int slot_num);
void add_table_feat_conf(std::string table_name,
std::string feat_name,
std::string feat_dtype,
int feat_shape);
void load_edge_file(std::string name, std::string filepath, bool reverse);
void load_node_file(std::string name, std::string filepath);
void load_node_and_edge(std::string etype,
std::string ntype,
std::string epath,
std::string npath,
int part_num,
bool reverse);
int32_t load_next_partition(int idx);
int32_t get_partition_num(int idx);
void load_node_weight(int type_id, int idx, std::string path);
void export_partition_files(int idx, std::string file_path);
std::vector<int64_t> get_partition(int idx, int num);
std::vector<uint64_t> get_partition(int idx, int num);
void make_partitions(int idx, int64_t byte_size, int device_len);
void make_complementary_graph(int idx, int64_t byte_size);
void set_search_level(int level);
void init_search_level(int level);
std::vector<std::vector<int64_t>> get_all_id(int type,
int idx,
int slice_num);
NodeQueryResult query_node_list(int gpu_id, int start, int query_size);
int get_all_id(int type,
int slice_num,
std::vector<std::vector<uint64_t>>* output);
int get_all_neighbor_id(int type,
int slice_num,
std::vector<std::vector<uint64_t>>* output);
int get_all_id(int type,
int idx,
int slice_num,
std::vector<std::vector<uint64_t>>* output);
int get_all_neighbor_id(int type,
int idx,
int slice_num,
std::vector<std::vector<uint64_t>>* output);
int get_all_feature_ids(int type,
int idx,
int slice_num,
std::vector<std::vector<uint64_t>>* output);
NodeQueryResult query_node_list(int gpu_id,
int idx,
int start,
int query_size);
NeighborSampleResult graph_neighbor_sample_v3(NeighborSampleQuery q,
bool cpu_switch);
std::vector<int64_t> graph_neighbor_sample(int gpu_id,
std::vector<int64_t>& key,
int sample_size);
NeighborSampleResult graph_neighbor_sample(int gpu_id,
uint64_t* device_keys,
int walk_degree,
int len);
std::vector<uint64_t> graph_neighbor_sample(int gpu_id,
int idx,
std::vector<uint64_t>& key,
int sample_size);
void set_feature_separator(std::string ch);
int get_feature_of_nodes(int gpu_id,
uint64_t* d_walk,
uint64_t* d_offset,
uint32_t size,
int slot_num);
void init_sample_status();
void free_sample_status();
std::unordered_map<std::string, int> edge_to_id, feature_to_id;
std::vector<std::string> id_to_feature, id_to_edge;
std::vector<std::unordered_map<std::string, int>> table_feat_mapping;
......@@ -75,6 +112,9 @@ class GraphGpuWrapper {
std::vector<int> device_id_mapping;
int search_level = 1;
void* graph_table;
int upload_num = 8;
std::shared_ptr<::ThreadPool> upload_task_pool;
std::string feature_separator_ = std::string(" ");
};
#endif
} // namespace framework
......
......@@ -83,10 +83,10 @@ class CommonGraphSampler : public GraphSampler {
virtual void init(GpuPsGraphTable *g, std::vector<std::string> args);
GpuPsGraphTable *gpu_table;
paddle::distributed::GraphTable *table;
std::vector<int64_t> gpu_edges_count;
int64_t cpu_edges_count;
int64_t gpu_edges_limit, cpu_edges_limit, gpu_edges_each_limit;
std::vector<std::unordered_set<int64_t>> gpu_set;
std::vector<uint64_t> gpu_edges_count;
uint64_t cpu_edges_count;
uint64_t gpu_edges_limit, cpu_edges_limit, gpu_edges_each_limit;
std::vector<std::unordered_set<uint64_t>> gpu_set;
int gpu_num;
};
......@@ -102,8 +102,9 @@ class AllInGpuGraphSampler : public GraphSampler {
protected:
paddle::distributed::GraphTable *graph_table;
GpuPsGraphTable *gpu_table;
std::vector<std::vector<paddle::framework::GpuPsGraphNode>> sample_nodes;
std::vector<std::vector<int64_t>> sample_neighbors;
std::vector<std::vector<uint64_t>> sample_node_ids;
std::vector<std::vector<paddle::framework::GpuPsNodeInfo>> sample_node_infos;
std::vector<std::vector<uint64_t>> sample_neighbors;
std::vector<GpuPsCommGraph> sample_res;
// std::shared_ptr<std::mt19937_64> random;
int gpu_num;
......
......@@ -24,7 +24,7 @@ int CommonGraphSampler::load_from_ssd(std::string path) {
std::cout << values.size();
if (values.size() < 2) continue;
auto neighbors = paddle::string::split_string<std::string>(values[1], ";");
std::vector<int64_t> neighbor_data;
std::vector<uint64_t> neighbor_data;
for (auto x : neighbors) {
      neighbor_data.push_back(std::stoull(x));
}
......@@ -33,7 +33,7 @@ int CommonGraphSampler::load_from_ssd(std::string path) {
(char *)&src_id,
sizeof(uint64_t),
(char *)neighbor_data.data(),
sizeof(int64_t) * neighbor_data.size());
sizeof(uint64_t) * neighbor_data.size());
int gpu_shard = src_id % gpu_num;
if (gpu_edges_count[gpu_shard] + neighbor_data.size() <=
gpu_edges_each_limit) {
......@@ -52,7 +52,7 @@ int CommonGraphSampler::load_from_ssd(std::string path) {
}
std::vector<paddle::framework::GpuPsCommGraph> graph_list;
for (int i = 0; i < gpu_num; i++) {
std::vector<int64_t> ids(gpu_set[i].begin(), gpu_set[i].end());
std::vector<uint64_t> ids(gpu_set[i].begin(), gpu_set[i].end());
graph_list.push_back(table->make_gpu_ps_graph(ids));
}
gpu_table->build_graph_from_cpu(graph_list);
......@@ -72,26 +72,29 @@ void CommonGraphSampler::init(GpuPsGraphTable *g,
gpu_edges_each_limit = gpu_edges_limit / gpu_num;
if (gpu_edges_each_limit > INT_MAX) gpu_edges_each_limit = INT_MAX;
table = g->cpu_graph_table.get();
gpu_edges_count = std::vector<int64_t>(gpu_num, 0);
gpu_edges_count = std::vector<uint64_t>(gpu_num, 0);
cpu_edges_count = 0;
gpu_set = std::vector<std::unordered_set<int64_t>>(gpu_num);
gpu_set = std::vector<std::unordered_set<uint64_t>>(gpu_num);
}
int AllInGpuGraphSampler::run_graph_sampling() { return 0; }
int AllInGpuGraphSampler::load_from_ssd(std::string path) {
graph_table->load_edges(path, false);
sample_nodes.clear();
sample_neighbors.clear();
  sample_node_ids.clear();
  sample_node_infos.clear();
  sample_neighbors.clear();
sample_res.clear();
sample_nodes.resize(gpu_num);
sample_node_ids.resize(gpu_num);
sample_node_infos.resize(gpu_num);
sample_neighbors.resize(gpu_num);
sample_res.resize(gpu_num);
std::vector<std::vector<std::vector<paddle::framework::GpuPsGraphNode>>>
sample_nodes_ex(graph_table->task_pool_size_);
std::vector<std::vector<std::vector<int64_t>>> sample_neighbors_ex(
std::vector<std::vector<std::vector<uint64_t>>> sample_node_ids_ex(
graph_table->task_pool_size_);
std::vector<std::vector<std::vector<paddle::framework::GpuPsNodeInfo>>>
sample_node_infos_ex(graph_table->task_pool_size_);
std::vector<std::vector<std::vector<uint64_t>>> sample_neighbors_ex(
graph_table->task_pool_size_);
for (int i = 0; i < graph_table->task_pool_size_; i++) {
sample_nodes_ex[i].resize(gpu_num);
sample_node_ids_ex[i].resize(gpu_num);
sample_node_infos_ex[i].resize(gpu_num);
sample_neighbors_ex[i].resize(gpu_num);
}
std::vector<std::future<int>> tasks;
......@@ -100,17 +103,16 @@ int AllInGpuGraphSampler::load_from_ssd(std::string path) {
graph_table->_shards_task_pool[i % graph_table->task_pool_size_]
->enqueue([&, i, this]() -> int {
if (this->status == GraphSamplerStatus::terminating) return 0;
paddle::framework::GpuPsGraphNode node;
paddle::framework::GpuPsNodeInfo info;
std::vector<paddle::distributed::Node *> &v =
this->graph_table->shards[i]->get_bucket();
size_t ind = i % this->graph_table->task_pool_size_;
for (size_t j = 0; j < v.size(); j++) {
size_t location = v[j]->get_id() % this->gpu_num;
node.node_id = v[j]->get_id();
node.neighbor_size = v[j]->get_neighbor_size();
node.neighbor_offset =
(int)sample_neighbors_ex[ind][location].size();
sample_nodes_ex[ind][location].emplace_back(node);
info.neighbor_size = v[j]->get_neighbor_size();
info.neighbor_offset =
sample_neighbors_ex[ind][location].size();
sample_node_infos_ex[ind][location].emplace_back(info);
sample_node_ids_ex[ind][location].emplace_back(v[j]->get_id());
                  for (int k = 0; k < info.neighbor_size; k++)
sample_neighbors_ex[ind][location].push_back(
v[j]->get_neighbor_id(k));
......@@ -128,9 +130,11 @@ int AllInGpuGraphSampler::load_from_ssd(std::string path) {
int total_offset = 0;
size_t ind = i;
for (int j = 0; j < this->graph_table->task_pool_size_; j++) {
for (size_t k = 0; k < sample_nodes_ex[j][ind].size(); k++) {
sample_nodes[ind].push_back(sample_nodes_ex[j][ind][k]);
sample_nodes[ind].back().neighbor_offset += total_offset;
for (size_t k = 0; k < sample_node_ids_ex[j][ind].size(); k++) {
sample_node_ids[ind].push_back(sample_node_ids_ex[j][ind][k]);
sample_node_infos[ind].push_back(
sample_node_infos_ex[j][ind][k]);
sample_node_infos[ind].back().neighbor_offset += total_offset;
}
size_t neighbor_size = sample_neighbors_ex[j][ind].size();
total_offset += neighbor_size;
......@@ -144,9 +148,10 @@ int AllInGpuGraphSampler::load_from_ssd(std::string path) {
}
for (size_t i = 0; i < tasks.size(); i++) tasks[i].get();
for (size_t i = 0; i < gpu_num; i++) {
sample_res[i].node_list = sample_nodes[i].data();
sample_res[i].node_list = sample_node_ids[i].data();
sample_res[i].node_info_list = sample_node_infos[i].data();
sample_res[i].neighbor_list = sample_neighbors[i].data();
sample_res[i].node_size = sample_nodes[i].size();
sample_res[i].node_size = sample_node_ids[i].size();
sample_res[i].neighbor_size = sample_neighbors[i].size();
}
......
......@@ -76,6 +76,7 @@ class XPUCacheArray {
}
void print() {}
void print_collision(int i) {}
#if defined(__xpu__)
__device__ ValType* find(const KeyType& key) {
......@@ -137,12 +138,12 @@ class HashTable {
size_t len,
StreamType stream);
template <typename StreamType, typename FVAccessor>
template <typename StreamType, typename GPUAccessor>
void get(const KeyType* d_keys,
char* d_vals,
size_t len,
StreamType stream,
FVAccessor& fv_accessor);
GPUAccessor& fv_accessor);
void show();
......@@ -193,6 +194,8 @@ class HashTable {
<< " push value size: " << push_grad_value_size_;
}
void show_collision(int id) { return container_->print_collision(id); }
std::unique_ptr<phi::RWLock> rwlock_{nullptr};
private:
......
......@@ -83,25 +83,22 @@ __global__ void search_kernel(Table* table,
}
}
template <typename Table, typename FVAccessor>
template <typename Table, typename GPUAccessor>
__global__ void dy_mf_search_kernel(Table* table,
const typename Table::key_type* const keys,
char* vals,
size_t len,
size_t pull_feature_value_size,
FVAccessor feature_value_accessor) {
GPUAccessor gpu_accessor) {
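  // One thread per key: look up the float* stored in the table and let the
  // accessor expand it into this key's fixed-size slot of the packed pull
  // buffer (pull_feature_value_size bytes per key).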
const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
// return;
if (i < len) {
auto it = table->find(keys[i]);
if (it != table->end()) {
uint64_t offset = i * pull_feature_value_size;
float* cur = (float*)(vals + offset);
float* input = it->second;
int mf_dim =
int(input[feature_value_accessor.common_feature_value.MfDimIndex()]);
feature_value_accessor.FeatureValueFill(cur, input, mf_dim);
gpu_accessor.PullValueFill(cur, input);
}
}
}
......@@ -137,9 +134,7 @@ __global__ void dy_mf_update_kernel(Table* table,
float* cur = (float*)(grads + i * grad_value_size);
sgd.dy_mf_update_value(optimizer_config, (it.getter())->second, cur);
} else {
if (keys[i] != 0) {
printf("warning::push miss key: %llu", keys[i]);
}
printf("warning: push miss key: %lu", keys[i]);
}
}
}
......@@ -147,11 +142,12 @@ __global__ void dy_mf_update_kernel(Table* table,
template <typename KeyType, typename ValType>
HashTable<KeyType, ValType>::HashTable(size_t capacity) {
container_ = new TableContainer<KeyType, ValType>(capacity);
cudaMalloc((void**)&device_optimizer_config_, sizeof(OptimizerConfig));
cudaMemcpy((void*)device_optimizer_config_,
&host_optimizer_config_,
sizeof(OptimizerConfig),
cudaMemcpyHostToDevice);
CUDA_RT_CALL(
cudaMalloc((void**)&device_optimizer_config_, sizeof(OptimizerConfig)));
CUDA_RT_CALL(cudaMemcpy((void*)device_optimizer_config_,
&host_optimizer_config_,
sizeof(OptimizerConfig),
cudaMemcpyHostToDevice));
rwlock_.reset(new phi::RWLock);
}
......@@ -201,12 +197,12 @@ void HashTable<KeyType, ValType>::get(const KeyType* d_keys,
}
template <typename KeyType, typename ValType>
template <typename StreamType, typename FVAccessor>
template <typename StreamType, typename GPUAccessor>
void HashTable<KeyType, ValType>::get(const KeyType* d_keys,
char* d_vals,
size_t len,
StreamType stream,
FVAccessor& fv_accessor) {
GPUAccessor& fv_accessor) {
if (len == 0) {
return;
}
......@@ -345,6 +341,7 @@ template class HashTable<unsigned long, float*>;
template class HashTable<long, int>;
template class HashTable<unsigned long, int>;
template class HashTable<unsigned long, unsigned long>;
template class HashTable<unsigned long, unsigned long*>;
template class HashTable<unsigned long, long>;
template class HashTable<unsigned long, long*>;
template class HashTable<long, long>;
......@@ -377,7 +374,8 @@ template void HashTable<unsigned long, unsigned long>::get<cudaStream_t>(
unsigned long* d_vals,
size_t len,
cudaStream_t stream);
template void HashTable<unsigned long, long>::get<cudaStream_t>(
const unsigned long* d_keys, long* d_vals, size_t len, cudaStream_t stream);
template void HashTable<long, unsigned long>::get<cudaStream_t>(
const long* d_keys, unsigned long* d_vals, size_t len, cudaStream_t stream);
template void HashTable<long, long>::get<cudaStream_t>(const long* d_keys,
......@@ -386,8 +384,6 @@ template void HashTable<long, long>::get<cudaStream_t>(const long* d_keys,
cudaStream_t stream);
template void HashTable<long, unsigned int>::get<cudaStream_t>(
const long* d_keys, unsigned int* d_vals, size_t len, cudaStream_t stream);
template void HashTable<unsigned long, long>::get<cudaStream_t>(
const unsigned long* d_keys, long* d_vals, size_t len, cudaStream_t stream);
// template void
// HashTable<unsigned long, paddle::framework::FeatureValue>::get<cudaStream_t>(
// const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t
......@@ -421,6 +417,13 @@ template void HashTable<unsigned long, int>::insert<cudaStream_t>(
const int* d_vals,
size_t len,
cudaStream_t stream);
template void HashTable<unsigned long, long>::insert<cudaStream_t>(
const unsigned long* d_keys,
const long* d_vals,
size_t len,
cudaStream_t stream);
template void HashTable<long, unsigned long>::insert<cudaStream_t>(
const long* d_keys,
const unsigned long* d_vals,
......@@ -433,12 +436,6 @@ template void HashTable<long, unsigned int>::insert<cudaStream_t>(
size_t len,
cudaStream_t stream);
template void HashTable<unsigned long, long>::insert<cudaStream_t>(
const unsigned long* d_keys,
const long* d_vals,
size_t len,
cudaStream_t stream);
template void HashTable<unsigned long, unsigned long>::insert<cudaStream_t>(
const unsigned long* d_keys,
const unsigned long* d_vals,
......@@ -448,26 +445,26 @@ template void HashTable<unsigned long, unsigned long>::insert<cudaStream_t>(
template void HashTable<unsigned long, float*>::dump_to_cpu<cudaStream_t>(
int devid, cudaStream_t stream);
template void
HashTable<unsigned long, float*>::update<SparseAdagradOptimizer, cudaStream_t>(
const unsigned long* d_keys,
const char* d_grads,
size_t len,
SparseAdagradOptimizer sgd,
cudaStream_t stream);
template void
HashTable<unsigned long, float*>::update<SparseAdamOptimizer, cudaStream_t>(
const unsigned long* d_keys,
const char* d_grads,
size_t len,
SparseAdamOptimizer sgd,
cudaStream_t stream);
template void HashTable<unsigned long, float*>::update<
SparseAdamSharedOptimizer,
SparseAdagradOptimizer<CommonFeatureValueAccessor>,
cudaStream_t>(const unsigned long* d_keys,
const char* d_grads,
size_t len,
SparseAdagradOptimizer<CommonFeatureValueAccessor> sgd,
cudaStream_t stream);
template void HashTable<unsigned long, float*>::update<
SparseAdamOptimizer<CommonFeatureValueAccessor>,
cudaStream_t>(const unsigned long* d_keys,
const char* d_grads,
size_t len,
SparseAdamOptimizer<CommonFeatureValueAccessor> sgd,
cudaStream_t stream);
template void HashTable<unsigned long, float*>::update<
SparseAdamSharedOptimizer<CommonFeatureValueAccessor>,
cudaStream_t>(const unsigned long* d_keys,
const char* d_grads,
size_t len,
SparseAdamSharedOptimizer sgd,
SparseAdamSharedOptimizer<CommonFeatureValueAccessor> sgd,
cudaStream_t stream);
// template void HashTable<unsigned long,
......
......@@ -25,7 +25,6 @@ limitations under the License. */
#include "paddle/fluid/platform/timer.h"
#include "thrust/pair.h"
#elif defined(PADDLE_WITH_XPU_KP)
// #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h"
#include <xpu/runtime.h>
#include "paddle/fluid/platform/device/xpu/enforce_xpu.h"
......@@ -49,14 +48,46 @@ namespace framework {
template <typename KeyType,
typename ValType,
typename GradType,
typename FVAccessor>
typename GPUAccessor>
class HeterComm {
public:
HeterComm(size_t capacity, std::shared_ptr<HeterPsResource> resource);
HeterComm(size_t capacity,
std::shared_ptr<HeterPsResource> resource,
GPUAccessor& gpu_accessor);
virtual ~HeterComm();
HeterComm(const HeterComm&) = delete;
HeterComm& operator=(const HeterComm&) = delete;
void merge_keys(int gpu_num,
const KeyType* d_keys,
size_t len,
KeyType* d_sorted_keys,
KeyType* d_merged_keys,
uint32_t* d_restore_idx,
size_t& uniq_len);
void dynamic_merge_grad(int gpu_num,
KeyType* d_keys,
float* d_grads,
size_t len,
int& uniq_len,
size_t& segment_len,
bool enable_segment_merge_grad);
void segment_merge_grad(int gpu_num,
KeyType* d_keys,
float* d_grads,
const uint32_t* d_index,
size_t len,
const uint32_t* d_fea_num_info,
size_t uniq_len,
size_t& segment_len);
void build_ps(int num,
KeyType* h_keys,
ValType* h_vals,
size_t len,
size_t chunk_size,
int stream_num,
int offset = -1);
void split_input_to_shard(KeyType* d_keys,
int* d_idx_ptr,
size_t len,
......@@ -71,12 +102,6 @@ class HeterComm {
void dynamic_merge_grad(
int gpu_num, KeyType* d_keys, float* d_grads, size_t len, int& uniq_len);
void pull_sparse(int num, KeyType* d_keys, float* d_vals, size_t len);
void build_ps(int num,
KeyType* h_keys,
ValType* h_vals,
size_t len,
size_t chunk_size,
int stream_num);
void build_ps(int num,
KeyType* h_keys,
char* pool,
......@@ -86,6 +111,7 @@ class HeterComm {
int stream_num);
void dump();
void show_one_table(int gpu_num);
void show_table_collisions();
int get_index_by_devid(int devid);
#if defined(PADDLE_WITH_CUDA)
......@@ -150,12 +176,6 @@ class HeterComm {
max_mf_dim_ = max_mf_dim;
}
void set_accessor(FVAccessor& accessor) {
feature_value_accessor_ = accessor;
// for (auto& ptr_table: ptr_tables_) {
// ptr_table->set_accessor(feature_value_accessor_);
// }
}
#endif
bool need_transfer(int send_id, int receive_id) {
......@@ -167,6 +187,19 @@ class HeterComm {
int get_transfer_devid(int send_id) { return (send_id + 4) % 8; }
void end_pass();
#if defined(PADDLE_WITH_CUDA)
// dedup
int dedup_keys_and_fillidx(const int gpu_id,
const int total_fea_num,
const KeyType* d_keys, // input
KeyType* d_merged_keys, // output
KeyType* d_sorted_keys,
uint32_t* d_restore_idx,
uint32_t* d_sorted_idx,
uint32_t* d_offset,
uint32_t* d_merged_cnts,
bool filter_zero);
#endif
struct Node {
ppStream in_stream;
......@@ -262,7 +295,10 @@ class HeterComm {
#endif
}
void create_storage(int start_index, int end_index, int keylen, int vallen);
void create_storage(int start_index,
int end_index,
size_t keylen,
size_t vallen);
void destroy_storage(int start_index, int end_index);
void walk_to_dest(int start_index,
int gpu_num,
......@@ -289,9 +325,10 @@ class HeterComm {
char* src_val,
size_t val_size);
FVAccessor feature_value_accessor_;
protected:
void pull_merge_sparse(int num, KeyType* d_keys, float* d_vals, size_t len);
void pull_normal_sparse(int num, KeyType* d_keys, float* d_vals, size_t len);
using Table = HashTable<KeyType, ValType>;
using PtrTable = HashTable<KeyType, float*>;
std::vector<Table*> tables_;
......@@ -302,6 +339,8 @@ class HeterComm {
int block_size_{256};
std::unique_ptr<HeterCommKernel> heter_comm_kernel_;
GPUAccessor gpu_accessor_;
private:
int topo_aware_{0};
std::vector<LocalStorage> storage_;
......
......@@ -16,25 +16,34 @@ limitations under the License. */
#include <queue>
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h"
#include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h"
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_XPU_KP
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#endif
DECLARE_double(gpugraph_hbm_table_load_factor);
DECLARE_bool(gpugraph_enable_gpu_direct_access);
DECLARE_bool(gpugraph_enable_segment_merge_grads);
DECLARE_uint64(gpugraph_merge_grads_segment_size);
DECLARE_int32(gpugraph_dedup_pull_push_mode);
namespace paddle {
namespace framework {
template <typename KeyType,
typename ValType,
typename GradType,
typename FVAccessor>
HeterComm<KeyType, ValType, GradType, FVAccessor>::HeterComm(
typename GPUAccessor>
HeterComm<KeyType, ValType, GradType, GPUAccessor>::HeterComm(
size_t capacity, std::shared_ptr<HeterPsResource> resource) {
VLOG(1) << "Construct new HeterComm";
resource_ = resource;
storage_.resize(resource_->total_device());
multi_mf_dim_ = resource->multi_mf();
load_factor_ = FLAGS_gpugraph_hbm_table_load_factor;
VLOG(0) << "load_factor = " << load_factor_;
for (int i = 0; i < resource_->total_device(); ++i) {
#if defined(PADDLE_WITH_CUDA)
platform::CUDADeviceGuard guard(resource_->dev_id(i));
......@@ -47,15 +56,19 @@ HeterComm<KeyType, ValType, GradType, FVAccessor>::HeterComm(
} else {
max_mf_dim_ = resource_->max_mf_dim();
auto accessor_wrapper_ptr =
GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
size_t val_type_size =
accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_);
size_t grad_type_size =
accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
size_t pull_type_size =
accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_);
VLOG(0) << " HeterComm init, max feature_value_size:" << val_type_size
<< ", feature_value_push_size:" << grad_type_size;
<< ", feature_value_push_size:" << grad_type_size
<< ", feature_pull_type_size:" << pull_type_size;
auto ptr_table = new PtrTable(capacity / load_factor_);
ptr_table->set_feature_value_size(val_type_size, grad_type_size);
ptr_table->set_feature_value_size(pull_type_size, grad_type_size);
ptr_tables_.push_back(ptr_table);
}
if (multi_node_) {
......@@ -69,8 +82,58 @@ HeterComm<KeyType, ValType, GradType, FVAccessor>::HeterComm(
template <typename KeyType,
typename ValType,
typename GradType,
typename FVAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::init_path() {
typename GPUAccessor>
HeterComm<KeyType, ValType, GradType, GPUAccessor>::HeterComm(
size_t capacity,
std::shared_ptr<HeterPsResource> resource,
GPUAccessor& gpu_accessor) {
VLOG(1) << "Construct new HeterComm";
resource_ = resource;
storage_.resize(resource_->total_device());
multi_mf_dim_ = resource->multi_mf();
gpu_accessor_ = gpu_accessor;
load_factor_ = FLAGS_gpugraph_hbm_table_load_factor;
VLOG(0) << "load_factor = " << load_factor_;
for (int i = 0; i < resource_->total_device(); ++i) {
#if defined(PADDLE_WITH_CUDA)
platform::CUDADeviceGuard guard(resource_->dev_id(i));
allocators_.push_back(std::make_shared<cub::CachingDeviceAllocator>(
8, 1, (unsigned int)-1, (size_t)-1, false, false)); // NOLINT
#endif
if (!multi_mf_dim_) {
auto table = new Table(capacity / load_factor_);
tables_.push_back(table);
} else {
max_mf_dim_ = resource_->max_mf_dim();
auto accessor_wrapper_ptr =
GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
size_t val_type_size =
accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_);
size_t grad_type_size =
accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
size_t pull_type_size =
accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_);
VLOG(0) << " HeterComm init, max feature_value_size:" << val_type_size
<< ", feature_value_push_size:" << grad_type_size
<< ", feature_pull_type_size:" << pull_type_size;
auto ptr_table = new PtrTable(capacity / load_factor_);
ptr_table->set_feature_value_size(pull_type_size, grad_type_size);
ptr_tables_.push_back(ptr_table);
}
if (multi_node_) {
storage_[i].init(feanum_, resource_->dev_id(i));
}
}
heter_comm_kernel_ = std::make_unique<HeterCommKernel>(block_size_);
init_path();
}
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::init_path() {
int total_device = resource_->total_device();
path_.resize(total_device);
if (!topo_aware_) {
......@@ -125,9 +188,9 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::init_path() {
template <typename KeyType,
typename ValType,
typename GradType,
typename FVAccessor>
typename GPUAccessor>
template <typename DstPlace, typename SrcPlace, typename StreamType>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::memory_copy(
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::memory_copy(
DstPlace dst_place,
void* dst,
SrcPlace src_place,
......@@ -135,9 +198,9 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::memory_copy(
size_t count,
StreamType stream) {
#if defined(PADDLE_WITH_CUDA)
cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream);
CUDA_CHECK(cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream));
if (stream == 0) {
cudaStreamSynchronize(0);
CUDA_CHECK(cudaStreamSynchronize(0));
}
#elif defined(PADDLE_WITH_XPU_KP)
memory::Copy(dst_place, dst, src_place, src, count);
......@@ -147,24 +210,24 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::memory_copy(
template <typename KeyType,
typename ValType,
typename GradType,
typename FVAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::create_storage(
int start_index, int end_index, int keylen, int vallen) {
typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::create_storage(
int start_index, int end_index, size_t keylen, size_t vallen) {
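  // keylen / vallen are byte counts for the staging buffers allocated along
  // the transfer path; they are size_t so that very large shards (e.g. graph
  // feature pulls of several GB) do not overflow a 32-bit int.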
#if defined(PADDLE_WITH_CUDA)
auto& allocator = allocators_[start_index];
auto& nodes = path_[start_index][end_index].nodes_;
for (size_t i = 0; i < nodes.size(); ++i) {
platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].dev_num));
allocator->DeviceAllocate(
PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceAllocate(
resource_->dev_id(nodes[i].dev_num),
(void**)&(nodes[i].key_storage), // NOLINT
keylen,
resource_->remote_stream(nodes[i].dev_num, start_index));
allocator->DeviceAllocate(
resource_->remote_stream(nodes[i].dev_num, start_index)));
PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceAllocate(
resource_->dev_id(nodes[i].dev_num),
(void**)&(nodes[i].val_storage), // NOLINT
vallen,
resource_->remote_stream(nodes[i].dev_num, start_index));
resource_->remote_stream(nodes[i].dev_num, start_index)));
nodes[i].key_bytes_len = keylen;
nodes[i].val_bytes_len = vallen;
}
......@@ -186,8 +249,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::create_storage(
template <typename KeyType,
typename ValType,
typename GradType,
typename FVAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::destroy_storage(
typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::destroy_storage(
int start_index, int end_index) {
#if defined(PADDLE_WITH_CUDA)
auto& allocator = allocators_[start_index];
......@@ -195,10 +258,10 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::destroy_storage(
for (size_t i = 0; i < nodes.size(); ++i) {
platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].dev_num));
allocator->DeviceFree(resource_->dev_id(nodes[i].dev_num),
nodes[i].key_storage);
allocator->DeviceFree(resource_->dev_id(nodes[i].dev_num),
nodes[i].val_storage);
PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceFree(
resource_->dev_id(nodes[i].dev_num), nodes[i].key_storage));
PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceFree(
resource_->dev_id(nodes[i].dev_num), nodes[i].val_storage));
}
#endif
}
......@@ -206,8 +269,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::destroy_storage(
template <typename KeyType,
typename ValType,
typename GradType,
typename FVAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_dest(
typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::walk_to_dest(
int start_index,
int num,
int* h_left,
......@@ -293,8 +356,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_dest(
template <typename KeyType,
typename ValType,
typename GradType,
typename FVAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_dest(
typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::walk_to_dest(
int start_index,
int gpu_num,
int* h_left,
......@@ -315,40 +378,44 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_dest(
auto& node = path_[start_index][i].nodes_[0];
CopyTask t(&path_[start_index][i], 0);
que.push(t);
cudaMemcpyAsync(node.key_storage,
reinterpret_cast<char*>(src_key + h_left[i]),
node.key_bytes_len,
cudaMemcpyDefault,
node.in_stream);
CUDA_CHECK(cudaMemcpyAsync(node.key_storage,
reinterpret_cast<char*>(src_key + h_left[i]),
node.key_bytes_len,
cudaMemcpyDefault,
node.in_stream));
if (need_copy_val) {
cudaMemcpyAsync(node.val_storage,
src_val + uint64_t(h_left[i]) * uint64_t(val_size),
node.val_bytes_len,
cudaMemcpyDefault,
node.in_stream);
CUDA_CHECK(
cudaMemcpyAsync(node.val_storage,
src_val + uint64_t(h_left[i]) * uint64_t(val_size),
node.val_bytes_len,
cudaMemcpyDefault,
node.in_stream));
}
}
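  // Drain the copy pipeline: each CopyTask forwards its staged keys/vals one
  // hop further along the transfer path, re-enqueuing itself until the last
  // node on the path (the destination card) has received the data.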
while (!que.empty()) {
CopyTask& cur_task = que.front();
que.pop();
if (cur_task.path->nodes_[cur_task.step].sync) {
cudaStreamSynchronize(cur_task.path->nodes_[cur_task.step].in_stream);
CUDA_CHECK(cudaStreamSynchronize(
cur_task.path->nodes_[cur_task.step].in_stream));
}
if (cur_task.step != cur_task.path->nodes_.size() - 1) {
int cur_step = cur_task.step;
CopyTask c(cur_task.path, cur_step + 1);
que.push(c);
cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].key_storage,
cur_task.path->nodes_[cur_step].key_storage,
cur_task.path->nodes_[cur_step + 1].key_bytes_len,
cudaMemcpyDefault,
cur_task.path->nodes_[cur_step + 1].in_stream);
CUDA_CHECK(
cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].key_storage,
cur_task.path->nodes_[cur_step].key_storage,
cur_task.path->nodes_[cur_step + 1].key_bytes_len,
cudaMemcpyDefault,
cur_task.path->nodes_[cur_step + 1].in_stream));
if (need_copy_val) {
cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].val_storage,
cur_task.path->nodes_[cur_step].val_storage,
cur_task.path->nodes_[cur_step + 1].val_bytes_len,
cudaMemcpyDefault,
cur_task.path->nodes_[cur_step + 1].in_stream);
CUDA_CHECK(
cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].val_storage,
cur_task.path->nodes_[cur_step].val_storage,
cur_task.path->nodes_[cur_step + 1].val_bytes_len,
cudaMemcpyDefault,
cur_task.path->nodes_[cur_step + 1].in_stream));
}
}
}
......@@ -357,8 +424,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_dest(
template <typename KeyType,
typename ValType,
typename GradType,
typename FVAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_src(
typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::walk_to_src(
int start_index,
int gpu_num,
int* h_left,
......@@ -373,19 +440,20 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_src(
int cur_step = path_[start_index][i].nodes_.size() - 1;
auto& node = path_[start_index][i].nodes_[cur_step];
if (cur_step == 0) {
cudaMemcpyAsync(src_val + uint64_t(h_left[i]) * val_size,
node.val_storage,
node.val_bytes_len,
cudaMemcpyDefault,
node.out_stream);
CUDA_CHECK(cudaMemcpyAsync(src_val + uint64_t(h_left[i]) * val_size,
node.val_storage,
node.val_bytes_len,
cudaMemcpyDefault,
node.out_stream));
} else {
CopyTask t(&path_[start_index][i], cur_step - 1);
que.push(t);
cudaMemcpyAsync(path_[start_index][i].nodes_[cur_step - 1].val_storage,
node.val_storage,
path_[start_index][i].nodes_[cur_step - 1].val_bytes_len,
cudaMemcpyDefault,
path_[start_index][i].nodes_[cur_step - 1].out_stream);
CUDA_CHECK(cudaMemcpyAsync(
path_[start_index][i].nodes_[cur_step - 1].val_storage,
node.val_storage,
path_[start_index][i].nodes_[cur_step - 1].val_bytes_len,
cudaMemcpyDefault,
path_[start_index][i].nodes_[cur_step - 1].out_stream));
}
}
while (!que.empty()) {
......@@ -398,18 +466,20 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_src(
if (cur_step > 0) {
CopyTask c(cur_task.path, cur_step - 1);
que.push(c);
cudaMemcpyAsync(cur_task.path->nodes_[cur_step - 1].val_storage,
cur_task.path->nodes_[cur_step].val_storage,
cur_task.path->nodes_[cur_step - 1].val_bytes_len,
cudaMemcpyDefault,
cur_task.path->nodes_[cur_step - 1].out_stream);
CUDA_CHECK(
cudaMemcpyAsync(cur_task.path->nodes_[cur_step - 1].val_storage,
cur_task.path->nodes_[cur_step].val_storage,
cur_task.path->nodes_[cur_step - 1].val_bytes_len,
cudaMemcpyDefault,
cur_task.path->nodes_[cur_step - 1].out_stream));
} else if (cur_step == 0) {
int end_index = cur_task.path->nodes_.back().dev_num;
cudaMemcpyAsync(src_val + uint64_t(h_left[end_index]) * val_size,
cur_task.path->nodes_[cur_step].val_storage,
cur_task.path->nodes_[cur_step].val_bytes_len,
cudaMemcpyDefault,
cur_task.path->nodes_[cur_step].out_stream);
CUDA_CHECK(
cudaMemcpyAsync(src_val + uint64_t(h_left[end_index]) * val_size,
cur_task.path->nodes_[cur_step].val_storage,
cur_task.path->nodes_[cur_step].val_bytes_len,
cudaMemcpyDefault,
cur_task.path->nodes_[cur_step].out_stream));
}
}
}
......@@ -417,8 +487,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_src(
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
HeterComm<KeyType, ValType, GradType, GPUAccessor>::~HeterComm() {
if (!multi_mf_dim_) {
for (auto& table : tables_) {
delete table;
......@@ -439,8 +509,8 @@ HeterComm<KeyType, ValType, GradType, FVAccessor>::~HeterComm() {
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::show_one_table(
int gpu_num) {
if (!multi_mf_dim_) {
tables_[gpu_num]->show();
......@@ -450,8 +520,28 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::show_one_table(
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::
show_table_collisions() {
size_t idx = 0;
for (auto& table : tables_) {
if (table != nullptr) {
table->show_collision(idx++);
}
}
idx = 0;
for (auto& table : ptr_tables_) {
if (table != nullptr) {
table->show_collision(idx++);
}
}
}
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
int HeterComm<KeyType, ValType, GradType, GPUAccessor>::log2i(int x) {
unsigned res = 0;
while (x >>= 1) {
++res;
......@@ -462,8 +552,8 @@ int HeterComm<KeyType, ValType, GradType, FVAccessor>::log2i(int x) {
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
int HeterComm<KeyType, ValType, GradType, GPUAccessor>::get_index_by_devid(
int devid) {
return resource_->get_index_by_devid(devid);
}
......@@ -471,8 +561,8 @@ int HeterComm<KeyType, ValType, GradType, FVAccessor>::get_index_by_devid(
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::set_sparse_sgd(
const OptimizerConfig& optimizer_config) {
for (int i = 0; i < resource_->total_device(); ++i) {
AnyDeviceGuard guard(resource_->dev_id(i));
......@@ -487,8 +577,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::set_sparse_sgd(
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::set_embedx_sgd(
const OptimizerConfig& optimizer_config) {
for (int i = 0; i < resource_->total_device(); ++i) {
AnyDeviceGuard guard(resource_->dev_id(i));
......@@ -503,14 +593,15 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::set_embedx_sgd(
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::build_ps(
int dev_num,
KeyType* h_keys,
ValType* h_vals,
size_t len,
size_t chunk_size,
int stream_num,
int offset) {
if (len <= 0) {
return;
}
......@@ -557,11 +648,11 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::build_ps(
h_vals + cur_len,
sizeof(ValType) * tmp_len,
cur_use_stream);
if (offset == -1) offset = dev_num;
tables_[offset]->insert(
reinterpret_cast<KeyType*>(d_key_bufs[cur_stream]->ptr()),
reinterpret_cast<ValType*>(d_val_bufs[cur_stream]->ptr()),
(size_t)tmp_len,
cur_use_stream);
cur_stream += 1;
......@@ -576,8 +667,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::build_ps(
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::build_ps(
int num,
KeyType* h_keys,
char* pool,
......@@ -642,8 +733,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::build_ps(
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::merge_grad(
int dev_num,
KeyType* d_keys,
GradType* d_grads,
......@@ -719,34 +810,36 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::merge_grad(
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::dynamic_merge_grad(
int gpu_num,
KeyType* d_keys,
float* d_grads,
size_t len,
int& uniq_len,
size_t& segment_len,
bool enable_segment_merge_grad) {
int dev_id = resource_->dev_id(gpu_num);
platform::CUDAPlace place = platform::CUDAPlace(dev_id);
platform::CUDADeviceGuard guard(dev_id);
auto stream = resource_->local_stream(gpu_num, 0);
size_t temp_storage_bytes;
size_t grad_dim = max_mf_dim_;
auto accessor_wrapper_ptr =
GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType));
KeyType* d_merge_keys_ptr = reinterpret_cast<KeyType*>(d_merge_keys->ptr());
auto d_fea_num_info = memory::Alloc(place, sizeof(uint32_t) * (len * 3 + 1));
uint32_t* d_fea_num_info_ptr =
reinterpret_cast<uint32_t*>(d_fea_num_info->ptr());
uint32_t* d_index = (uint32_t*)&d_fea_num_info_ptr[len];
uint32_t* d_idx = (uint32_t*)&d_index[len];
int* d_merged_size = (int*)&d_idx[len];
int grid_size = (len - 1) / block_size_ + 1;
heter_comm_kernel_->fill_idx(d_idx, len, stream);
PADDLE_ENFORCE_GPU_SUCCESS(
cub::DeviceRadixSort::SortPairs(NULL,
temp_storage_bytes,
......@@ -758,7 +851,6 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::dynamic_merge_grad(
0,
8 * sizeof(KeyType),
stream));
auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
PADDLE_ENFORCE_GPU_SUCCESS(
cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(),
......@@ -772,6 +864,7 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::dynamic_merge_grad(
8 * sizeof(KeyType),
stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
temp_storage_bytes = 0;
PADDLE_ENFORCE_GPU_SUCCESS(
cub::DeviceRunLengthEncode::Encode(NULL,
......@@ -824,20 +917,194 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::dynamic_merge_grad(
uniq_len,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
if (enable_segment_merge_grad) {
segment_merge_grad(gpu_num,
d_merge_keys_ptr,
d_grads,
d_index,
len,
d_fea_num_info_ptr,
uniq_len,
segment_len);
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_keys,
d_merge_keys_ptr,
sizeof(KeyType) * segment_len,
cudaMemcpyDeviceToDevice,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
} else {
auto d_merge_grads = memory::Alloc(place, len * grad_value_size);
float* d_merge_grads_ptr = reinterpret_cast<float*>(d_merge_grads->ptr());
heter_comm_kernel_->merge_gradient(d_keys,
d_offset,
d_fea_num_info_ptr,
d_index,
(char*)d_grads,
(char*)d_merge_grads_ptr,
uniq_len,
grad_dim,
grad_value_size,
merger_,
stream,
gpu_accessor_);
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads,
d_merge_grads_ptr,
grad_value_size * uniq_len,
cudaMemcpyDeviceToDevice,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
}
}
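  // NOTE: with enable_segment_merge_grad the merge is two-staged: keys whose
  // gradient count exceeds FLAGS_gpugraph_merge_grads_segment_size are first
  // reduced per segment by segment_merge_grad(), and the caller is expected to
  // run dynamic_merge_grad() once more on the segment output (with the flag
  // off) to obtain a single gradient per unique key, as push_sparse does below.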
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::segment_merge_grad(
int gpu_num, // the device number
KeyType*
d_keys, // the sorted keys list, which will be modified after merged
float* d_grads, // the raw grads list, which will be modified after merged
const uint32_t*
d_index, // the storage position of d_keys, its length is len.
size_t len, // the number of raw input keys
const uint32_t*
d_fea_num_info, // prefix sum array, its length is uniq_len+1
size_t uniq_len, // the number of unique keys
size_t& segments_num) { // the number of segment merged keys
int dev_id = resource_->dev_id(gpu_num);
platform::CUDAPlace place = platform::CUDAPlace(dev_id);
platform::CUDADeviceGuard guard(dev_id);
auto stream = resource_->local_stream(gpu_num, 0);
auto grad_dim = max_mf_dim_;
auto accessor_wrapper_ptr =
GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
auto d_buffer1 = memory::Alloc(place, sizeof(uint32_t) * len);
auto d_segments = reinterpret_cast<uint32_t*>(d_buffer1->ptr());
auto d_buffer2 = memory::Alloc(place, sizeof(uint32_t) * len);
auto d_segments_offset = reinterpret_cast<uint32_t*>(d_buffer2->ptr());
auto d_buffer3 = memory::Alloc(place, sizeof(uint32_t) * len);
auto d_segments_fea_num_info = reinterpret_cast<uint32_t*>(d_buffer3->ptr());
auto d_buffer4 = memory::Alloc(place, sizeof(uint32_t) * len);
auto d_segments_fea_num_offset =
reinterpret_cast<uint32_t*>(d_buffer4->ptr());
auto d_buffer5 = memory::Alloc(place, sizeof(uint32_t));
auto d_segments_num = reinterpret_cast<uint32_t*>(d_buffer5->ptr());
CUDA_CHECK(cudaMemsetAsync(d_segments_num, 0, sizeof(uint32_t), stream));
uint32_t segment_size = FLAGS_gpugraph_merge_grads_segment_size;
heter_comm_kernel_->split_segments(d_fea_num_info,
uniq_len,
d_segments,
d_segments_num,
segment_size,
stream);
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
size_t temp_storage_bytes = 0;
PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::Sum(
NULL, temp_storage_bytes, d_segments, d_segments_num, uniq_len, stream));
auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::Sum(d_temp_storage->ptr(),
temp_storage_bytes,
d_segments,
d_segments_num,
uniq_len,
stream));
CUDA_CHECK(cudaMemcpyAsync(&segments_num,
d_segments_num,
sizeof(uint32_t),
cudaMemcpyDeviceToHost,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
temp_storage_bytes = 0;
PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum(NULL,
temp_storage_bytes,
d_segments,
d_segments_offset,
uniq_len,
stream));
if (d_temp_storage->size() < temp_storage_bytes) {
d_temp_storage = NULL;
d_temp_storage = memory::Alloc(place, temp_storage_bytes);
}
PADDLE_ENFORCE_GPU_SUCCESS(
cub::DeviceScan::ExclusiveSum(d_temp_storage->ptr(),
temp_storage_bytes,
d_segments,
d_segments_offset,
uniq_len,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
heter_comm_kernel_->expand_segments(d_fea_num_info,
d_segments_offset,
uniq_len,
d_segments_fea_num_info,
segment_size,
stream);
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
PADDLE_ENFORCE_GPU_SUCCESS(
cub::DeviceScan::ExclusiveSum(NULL,
temp_storage_bytes,
d_segments_fea_num_info,
d_segments_fea_num_offset,
segments_num,
stream));
if (d_temp_storage->size() < temp_storage_bytes) {
d_temp_storage = NULL;
d_temp_storage = memory::Alloc(place, temp_storage_bytes);
}
PADDLE_ENFORCE_GPU_SUCCESS(
cub::DeviceScan::ExclusiveSum(d_temp_storage->ptr(),
temp_storage_bytes,
d_segments_fea_num_info,
d_segments_fea_num_offset,
segments_num,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
auto d_segments_keys = memory::Alloc(place, sizeof(KeyType) * segments_num);
auto d_segments_keys_ptr = reinterpret_cast<KeyType*>(d_segments_keys->ptr());
heter_comm_kernel_->shrink_keys(d_keys,
d_segments_fea_num_offset,
d_segments_keys_ptr,
segments_num,
stream);
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
auto d_segment_grads = memory::Alloc(place, segments_num * grad_value_size);
auto d_segment_grads_ptr = reinterpret_cast<float*>(d_segment_grads->ptr());
heter_comm_kernel_->merge_gradient(d_segments_keys_ptr,
d_segments_fea_num_offset,
d_segments_fea_num_info,
d_index,
(char*)d_grads,
(char*)d_segment_grads_ptr,
segments_num,
grad_dim,
grad_value_size,
merger_,
stream,
gpu_accessor_);
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_keys,
d_segments_keys_ptr,
sizeof(KeyType) * segments_num,
cudaMemcpyDeviceToDevice,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads,
d_segment_grads_ptr,
grad_value_size * segments_num,
cudaMemcpyDeviceToDevice,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
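  // Illustrative example (values chosen for clarity): with
  // FLAGS_gpugraph_merge_grads_segment_size = 2 and per-key counts
  // d_fea_num_info = [5, 1, 3], split_segments gives d_segments = [3, 1, 2]
  // (ceil(count / segment_size)), so segments_num = 6; expand_segments then
  // yields per-segment counts [2, 2, 1, 1, 2, 1], shrink_keys repeats each key
  // once per segment, and merge_gradient reduces every segment independently.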
......@@ -846,8 +1113,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::dynamic_merge_grad(
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::split_input_to_shard(
KeyType* d_keys,
int* d_idx_ptr,
size_t len,
......@@ -869,15 +1136,12 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::split_input_to_shard(
auto d_shard_index_tmp = memory::Alloc(place, len * sizeof(int));
int* d_shard_index_tmp_ptr = reinterpret_cast<int*>(d_shard_index_tmp->ptr());
heter_comm_kernel_->fill_idx(d_idx_tmp_ptr, len, stream);
heter_comm_kernel_->calc_shard_index(
d_keys, len, d_shard_index_tmp_ptr, total_device, stream);
size_t temp_storage_bytes;
const int num_bits = 1 + log2i(total_device);
heter_comm_kernel_->sort_pairs(NULL,
temp_storage_bytes,
d_shard_index_tmp_ptr,
......@@ -890,7 +1154,6 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::split_input_to_shard(
stream);
auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
heter_comm_kernel_->sort_pairs(d_temp_storage->ptr(),
temp_storage_bytes,
d_shard_index_tmp_ptr,
......@@ -910,13 +1173,309 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::split_input_to_shard(
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::merge_keys(
int gpu_num,
const KeyType* d_keys,
size_t len, // input
KeyType* d_sorted_keys, // output
KeyType* d_merged_keys, // output
uint32_t* d_restore_idx, // output
size_t& uniq_len) { // output
int dev_id = resource_->dev_id(gpu_num);
platform::CUDAPlace place = platform::CUDAPlace(dev_id);
platform::CUDADeviceGuard guard(dev_id);
auto stream = resource_->local_stream(gpu_num, 0);
size_t grad_dim = max_mf_dim_;
auto accessor_wrapper_ptr =
GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
auto d_fea_num_info = memory::Alloc(place, sizeof(uint32_t) * (len * 4 + 1));
uint32_t* d_fea_num_info_ptr =
reinterpret_cast<uint32_t*>(d_fea_num_info->ptr());
uint32_t* d_idx = (uint32_t*)&d_fea_num_info_ptr[len];
uint32_t* d_index = (uint32_t*)&d_idx[len];
uint32_t* d_offset = (uint32_t*)&d_index[len];
uint32_t* d_merged_size = (uint32_t*)&d_offset[len];
heter_comm_kernel_->fill_idx(d_idx, len, stream);
size_t temp_storage_bytes;
PADDLE_ENFORCE_GPU_SUCCESS(
cub::DeviceRadixSort::SortPairs(NULL,
temp_storage_bytes,
d_keys,
d_sorted_keys,
d_idx,
d_index,
len,
0,
8 * sizeof(KeyType),
stream));
auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
PADDLE_ENFORCE_GPU_SUCCESS(
cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(),
temp_storage_bytes,
d_keys,
d_sorted_keys,
d_idx,
d_index,
len,
0,
8 * sizeof(KeyType),
stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
temp_storage_bytes = 0;
PADDLE_ENFORCE_GPU_SUCCESS(
cub::DeviceRunLengthEncode::Encode(NULL,
temp_storage_bytes,
d_sorted_keys,
d_merged_keys,
d_fea_num_info_ptr,
d_merged_size,
len,
stream));
if (d_temp_storage->size() < temp_storage_bytes) {
d_temp_storage = NULL;
d_temp_storage = memory::Alloc(place, temp_storage_bytes);
}
PADDLE_ENFORCE_GPU_SUCCESS(
cub::DeviceRunLengthEncode::Encode(d_temp_storage->ptr(),
temp_storage_bytes,
d_sorted_keys,
d_merged_keys,
d_fea_num_info_ptr,
d_merged_size,
len,
stream));
cudaMemcpyAsync((void*)&uniq_len,
d_merged_size,
sizeof(int),
cudaMemcpyDeviceToHost,
stream);
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
temp_storage_bytes = 0;
PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum(NULL,
temp_storage_bytes,
d_fea_num_info_ptr,
d_offset,
uniq_len,
stream));
if (d_temp_storage->size() < temp_storage_bytes) {
d_temp_storage = NULL;
d_temp_storage = memory::Alloc(place, temp_storage_bytes);
}
PADDLE_ENFORCE_GPU_SUCCESS(
cub::DeviceScan::ExclusiveSum(d_temp_storage->ptr(),
temp_storage_bytes,
d_fea_num_info_ptr,
d_offset,
uniq_len,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
heter_comm_kernel_->fill_restore_idx(true,
len,
uniq_len,
d_merged_keys,
d_index,
d_offset,
d_fea_num_info_ptr,
d_restore_idx,
stream);
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
}
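  // Illustrative example: for d_keys = [5, 3, 5, 9], SortPairs produces
  // d_sorted_keys = [3, 5, 5, 9] with d_index = [1, 0, 2, 3] (original
  // positions), Encode produces d_merged_keys = [3, 5, 9] with counts
  // [1, 2, 1], and fill_restore_idx writes d_restore_idx = [1, 0, 1, 2], i.e.
  // the row of d_merged_keys that each original key position maps back to.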
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::pull_merge_sparse(
int num, KeyType* d_keys, float* d_vals, size_t len) {
int total_device = resource_->total_device();
int dev_id = resource_->dev_id(num);
DevPlace place = DevPlace(dev_id);
AnyDeviceGuard guard(dev_id);
auto stream = resource_->local_stream(num, 0);
int h_left[total_device]; // NOLINT
int h_right[total_device]; // NOLINT
auto d_left = memory::Alloc(place, total_device * sizeof(int));
auto d_right = memory::Alloc(place, total_device * sizeof(int));
int* d_left_ptr = reinterpret_cast<int*>(d_left->ptr());
int* d_right_ptr = reinterpret_cast<int*>(d_right->ptr());
#if defined(PADDLE_WITH_CUDA)
cudaMemsetAsync(d_left_ptr, -1, total_device * sizeof(int), stream);
cudaMemsetAsync(d_right_ptr, -1, total_device * sizeof(int), stream);
#elif defined(PADDLE_WITH_XPU_KP)
// get XPUDeviceContext according to xpu place
paddle::platform::XPUDeviceContext xpu_dev_ctx(place);
auto xpu_context = xpu_dev_ctx.x_context();
int r = xpu::constant<int>(xpu_context, d_left_ptr, total_device, -1);
PADDLE_ENFORCE_EQ(r,
XPU_SUCCESS,
platform::errors::External(
"XPU constant kernel return wrong value[%d %s]",
r,
XPUAPIErrorMsg[r]));
int r2 = xpu::constant<int>(xpu_context, d_right_ptr, total_device, -1);
PADDLE_ENFORCE_EQ(r2,
XPU_SUCCESS,
platform::errors::External(
"XPU constant kernel return wrong value[%d %s]",
r2,
XPUAPIErrorMsg[r2]));
#endif
auto accessor_wrapper_ptr =
GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
size_t val_type_size = accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_);
VLOG(3) << "pull_sparse len:" << len << " val_type_size: " << val_type_size;
auto d_sorted_keys = memory::Alloc(place, len * sizeof(KeyType));
auto d_sorted_keys_ptr = reinterpret_cast<KeyType*>(d_sorted_keys->ptr());
auto d_merged_keys = memory::Alloc(place, len * sizeof(KeyType));
auto d_merged_keys_ptr = reinterpret_cast<KeyType*>(d_merged_keys->ptr());
auto d_restore_idx = memory::Alloc(place, len * sizeof(uint32_t));
auto d_restore_idx_ptr = reinterpret_cast<uint32_t*>(d_restore_idx->ptr());
auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType));
auto d_shard_keys_ptr = reinterpret_cast<KeyType*>(d_shard_keys->ptr());
auto d_shard_vals = memory::Alloc(place, len * val_type_size);
auto d_shard_vals_ptr = reinterpret_cast<float*>(d_shard_vals->ptr());
size_t uniq_len = 0;
merge_keys(num,
d_keys,
len,
d_sorted_keys_ptr,
d_merged_keys_ptr,
d_restore_idx_ptr,
uniq_len);
sync_stream(stream);
auto d_idx = memory::Alloc(place, uniq_len * sizeof(int));
auto d_idx_ptr = reinterpret_cast<int*>(d_idx->ptr());
split_input_to_shard(
d_merged_keys_ptr, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, num);
heter_comm_kernel_->fill_shard_key(
d_shard_keys_ptr, d_merged_keys_ptr, d_idx_ptr, uniq_len, stream);
sync_stream(stream);
auto dst_place = platform::CPUPlace();
auto src_place = place;
memory_copy(dst_place,
h_left,
src_place,
d_left_ptr,
total_device * sizeof(int),
stream);
memory_copy(dst_place,
h_right,
src_place,
d_right_ptr,
total_device * sizeof(int),
stream);
if (!FLAGS_gpugraph_enable_gpu_direct_access) {
for (int i = 0; i < total_device; ++i) {
int shard_len = h_right[i] - h_left[i] + 1;
if (h_left[i] == -1 || h_right[i] == -1) {
continue;
}
create_storage(
num, i, shard_len * sizeof(KeyType), shard_len * val_type_size);
}
walk_to_dest(num, total_device, h_left, h_right, d_shard_keys_ptr, NULL);
}
for (int i = 0; i < total_device; ++i) {
if (h_left[i] == -1) {
continue;
}
auto& node = path_[num][i].nodes_.back();
if (!FLAGS_gpugraph_enable_gpu_direct_access) {
sync_stream(node.in_stream);
}
AnyDeviceGuard guard(resource_->dev_id(i));
ptr_tables_[i]->rwlock_->RDLock();
if (!FLAGS_gpugraph_enable_gpu_direct_access) {
ptr_tables_[i]->get(reinterpret_cast<KeyType*>(node.key_storage),
node.val_storage,
h_right[i] - h_left[i] + 1,
resource_->remote_stream(i, num),
gpu_accessor_);
} else {
ptr_tables_[i]->get(
d_shard_keys_ptr + h_left[i],
reinterpret_cast<char*>(d_shard_vals_ptr) + h_left[i] * val_type_size,
h_right[i] - h_left[i] + 1,
resource_->remote_stream(i, num),
gpu_accessor_);
}
}
for (int i = 0; i < total_device; ++i) {
sync_stream(resource_->remote_stream(i, num));
if (h_left[i] == -1) {
continue;
}
ptr_tables_[i]->rwlock_->UNLock();
}
if (!FLAGS_gpugraph_enable_gpu_direct_access) {
walk_to_src(num,
total_device,
h_left,
h_right,
reinterpret_cast<char*>(d_shard_vals_ptr),
val_type_size);
for (int i = 0; i < total_device; ++i) {
auto& node = path_[num][i].nodes_.front();
sync_stream(node.out_stream);
}
}
auto d_merged_vals = memory::Alloc(place, uniq_len * val_type_size);
auto d_merged_vals_ptr = reinterpret_cast<float*>(d_merged_vals->ptr());
heter_comm_kernel_->dy_mf_fill_dvals(d_shard_vals_ptr,
d_merged_vals_ptr,
d_idx_ptr,
uniq_len,
val_type_size,
stream);
sync_stream(stream);
heter_comm_kernel_->unpack_merged_vals(len,
d_keys,
d_merged_vals_ptr,
d_restore_idx_ptr,
d_vals,
val_type_size,
stream);
sync_stream(stream);
if (!FLAGS_gpugraph_enable_gpu_direct_access) {
for (int i = 0; i < total_device; ++i) {
if (h_left[i] == -1 || h_right[i] == -1) {
continue;
}
destroy_storage(num, i);
}
}
}
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::pull_normal_sparse(
int num, KeyType* d_keys, float* d_vals, size_t len) {
int total_device = resource_->total_device();
int dev_id = resource_->dev_id(num);
DevPlace place = DevPlace(dev_id);
......@@ -960,8 +1519,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::pull_sparse(
int* d_idx_ptr = reinterpret_cast<int*>(d_idx->ptr());
auto accessor_wrapper_ptr =
GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
size_t val_type_size = accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_);
VLOG(3) << "pull_sparse len:" << len << " val_type_size: " << val_type_size;
auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType));
KeyType* d_shard_keys_ptr = reinterpret_cast<KeyType*>(d_shard_keys->ptr());
......@@ -991,29 +1550,41 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::pull_sparse(
total_device * sizeof(int),
stream);
if (!FLAGS_gpugraph_enable_gpu_direct_access) {
for (int i = 0; i < total_device; ++i) {
int shard_len = h_right[i] - h_left[i] + 1;
if (h_left[i] == -1 || h_right[i] == -1) {
continue;
}
create_storage(
num, i, shard_len * sizeof(KeyType), shard_len * val_type_size);
}
walk_to_dest(num, total_device, h_left, h_right, d_shard_keys_ptr, NULL);
}
for (int i = 0; i < total_device; ++i) {
if (h_left[i] == -1) {
continue;
}
auto& node = path_[num][i].nodes_.back();
if (!FLAGS_gpugraph_enable_gpu_direct_access) {
sync_stream(node.in_stream);
}
AnyDeviceGuard guard(resource_->dev_id(i));
ptr_tables_[i]->rwlock_->RDLock();
if (!FLAGS_gpugraph_enable_gpu_direct_access) {
ptr_tables_[i]->get(reinterpret_cast<KeyType*>(node.key_storage),
node.val_storage,
h_right[i] - h_left[i] + 1,
resource_->remote_stream(i, num),
gpu_accessor_);
} else {
ptr_tables_[i]->get(
d_shard_keys_ptr + h_left[i],
reinterpret_cast<char*>(d_shard_vals_ptr) + h_left[i] * val_type_size,
h_right[i] - h_left[i] + 1,
resource_->remote_stream(i, num),
gpu_accessor_);
}
}
for (int i = 0; i < total_device; ++i) {
......@@ -1023,31 +1594,46 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::pull_sparse(
}
ptr_tables_[i]->rwlock_->UNLock();
}
if (!FLAGS_gpugraph_enable_gpu_direct_access) {
walk_to_src(num,
total_device,
h_left,
h_right,
reinterpret_cast<char*>(d_shard_vals_ptr),
val_type_size);
for (int i = 0; i < total_device; ++i) {
auto& node = path_[num][i].nodes_.front();
sync_stream(node.out_stream);
}
}
heter_comm_kernel_->dy_mf_fill_dvals(
d_shard_vals_ptr, d_vals, d_idx_ptr, len, val_type_size, stream);
sync_stream(stream);
if (!FLAGS_gpugraph_enable_gpu_direct_access) {
for (int i = 0; i < total_device; ++i) {
if (h_left[i] == -1 || h_right[i] == -1) {
continue;
}
destroy_storage(num, i);
}
}
}
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::pull_sparse(
int num, KeyType* d_keys, float* d_vals, size_t len) {
if (len == 0) {
return;
}
if (!FLAGS_gpugraph_dedup_pull_push_mode) {
pull_merge_sparse(num, d_keys, d_vals, len);
} else {
pull_normal_sparse(num, d_keys, d_vals, len);
}
}
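// NOTE: pull_merge_sparse deduplicates the keys inside HeterComm before
// sharding them across devices, while pull_normal_sparse skips that step; the
// dedup path is taken when FLAGS_gpugraph_dedup_pull_push_mode is off,
// presumably because the dedup mode already removes duplicates earlier in the
// pull/push pipeline.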
......@@ -1055,9 +1641,9 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::pull_sparse(
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
template <typename Sgd>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::push_sparse(
int dev_num,
KeyType* d_keys,
float* d_grads,
......@@ -1071,7 +1657,7 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse(
int dev_id = resource_->dev_id(dev_num);
auto accessor_wrapper_ptr =
GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
DevPlace place = DevPlace(dev_id);
AnyDeviceGuard guard(dev_id);
......@@ -1116,13 +1702,30 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse(
auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType));
KeyType* d_shard_keys_ptr = reinterpret_cast<KeyType*>(d_shard_keys->ptr());
float* d_shard_grads_ptr;
auto d_shard_grads = memory::Alloc(place, len * grad_value_size);
d_shard_grads_ptr = reinterpret_cast<float*>(d_shard_grads->ptr());
int uniq_len = len;
if (!FLAGS_gpugraph_dedup_pull_push_mode) {
size_t segment_len = 0;
if (FLAGS_gpugraph_enable_segment_merge_grads) {
// do two gradient merge
// 1st. do segmented gradient merge
// 2nd. do global gradient merge
dynamic_merge_grad(
dev_num, d_keys, d_grads, len, uniq_len, segment_len, true);
len = segment_len;
uniq_len = 0;
segment_len = 0;
dynamic_merge_grad(
dev_num, d_keys, d_grads, len, uniq_len, segment_len, false);
} else {
// Perform gradient merge only once
dynamic_merge_grad(
dev_num, d_keys, d_grads, len, uniq_len, segment_len, false);
}
}
split_input_to_shard(
d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num);
......@@ -1135,7 +1738,7 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse(
uniq_len,
grad_value_size,
stream,
gpu_accessor_);
sync_stream(stream);
......@@ -1154,37 +1757,50 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse(
total_device * sizeof(int),
stream);
if (!FLAGS_gpugraph_enable_gpu_direct_access) {
for (int i = 0; i < total_device; ++i) {
int shard_len = h_right[i] - h_left[i] + 1;
if (h_left[i] == -1 || h_right[i] == -1) {
continue;
}
create_storage(
dev_num, i, shard_len * sizeof(KeyType), shard_len * grad_value_size);
}
walk_to_dest(dev_num,
total_device,
h_left,
h_right,
d_shard_keys_ptr,
reinterpret_cast<char*>(d_shard_grads_ptr),
grad_value_size);
}
for (int i = 0; i < total_device; ++i) {
if (h_left[i] == -1 || h_right[i] == -1) {
continue;
}
auto& node = path_[dev_num][i].nodes_.back();
if (!FLAGS_gpugraph_enable_gpu_direct_access) {
sync_stream(node.in_stream);
}
AnyDeviceGuard guard(resource_->dev_id(i));
ptr_tables_[i]->rwlock_->WRLock();
if (!FLAGS_gpugraph_enable_gpu_direct_access) {
ptr_tables_[i]->update(reinterpret_cast<KeyType*>(node.key_storage),
node.val_storage,
h_right[i] - h_left[i] + 1,
sgd,
resource_->remote_stream(i, dev_num));
} else {
ptr_tables_[i]->update(d_shard_keys_ptr + h_left[i],
reinterpret_cast<char*>(d_shard_grads_ptr) +
grad_value_size * h_left[i],
h_right[i] - h_left[i] + 1,
sgd,
resource_->remote_stream(i, dev_num));
}
}
for (int i = 0; i < total_device; ++i) {
......@@ -1198,11 +1814,13 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse(
}
}
if (!FLAGS_gpugraph_enable_gpu_direct_access) {
for (int i = 0; i < total_device; ++i) {
if (h_left[i] == -1 || h_right[i] == -1) {
continue;
}
destroy_storage(dev_num, i);
}
destroy_storage(dev_num, i);
}
}
......@@ -1210,8 +1828,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse(
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::push_sparse(
int dev_num, KeyType* d_keys, GradType* d_grads, size_t len) {
if (len == 0) {
return;
......@@ -1269,8 +1887,6 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse(
int uniq_len = len;
merge_grad(dev_num, d_keys, d_grads, len, uniq_len);
split_input_to_shard(
d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num);
......@@ -1351,9 +1967,9 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse(
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
template <typename Sgd>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::update_one_table(
int gpu_num,
KeyType* d_keys,
GradType* d_grads,
......@@ -1375,9 +1991,9 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::update_one_table(
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
template <typename Sgd>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::push_sparse_multi_node(
int gpu_num,
KeyType* d_keys,
GradType* d_grads,
......@@ -1407,8 +2023,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse_multi_node(
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
int HeterComm<KeyType, ValType, GradType, GPUAccessor>::gather_one_node_grad(
int gpu_num, KeyType* d_keys, GradType* d_grads, int len) {
int total_gpu = resource_->total_device();
int dev_id = resource_->dev_id(gpu_num);
......@@ -1493,7 +2109,6 @@ int HeterComm<KeyType, ValType, GradType, FVAccessor>::gather_one_node_grad(
cudaMemcpy(
h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost);
heter_comm_kernel_->fill_shard_grads(storage.local_keys + merge_num,
storage.all_keys + index,
storage.local_grads + merge_num,
......@@ -1512,8 +2127,8 @@ int HeterComm<KeyType, ValType, GradType, FVAccessor>::gather_one_node_grad(
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
int HeterComm<KeyType, ValType, GradType, GPUAccessor>::gather_multi_node_grad(
int gpu_num, KeyType* d_keys, GradType* d_grads, int len) {
int dev_id = resource_->dev_id(gpu_num);
auto& storage = storage_[gpu_num];
......@@ -1586,8 +2201,8 @@ int HeterComm<KeyType, ValType, GradType, FVAccessor>::gather_multi_node_grad(
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::end_pass() {
int total_device = resource_->total_device();
std::vector<std::thread> threads;
......@@ -1608,10 +2223,127 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::end_pass() {
}
}
#if defined(PADDLE_WITH_CUDA)
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
int HeterComm<KeyType, ValType, GradType, GPUAccessor>::dedup_keys_and_fillidx(
const int gpu_id,
const int total_fea_num,
const KeyType* d_keys, // input
KeyType* d_merged_keys, // output
KeyType* d_sorted_keys,
uint32_t* d_restore_idx,
uint32_t* d_sorted_idx,
uint32_t* d_offset,
uint32_t* d_merged_cnts,
bool filter_zero) {
int dev_id = resource_->dev_id(gpu_id);
platform::CUDAPlace place = platform::CUDAPlace(dev_id);
platform::CUDADeviceGuard guard(dev_id);
auto stream = resource_->local_stream(gpu_id, 0);
assert(total_fea_num > 0);
int merged_size = 0;
size_t byte_size = sizeof(uint32_t) * (total_fea_num + 1);
auto d_index_ptr = memory::Alloc(place, byte_size);
uint32_t* d_index_in = reinterpret_cast<uint32_t*>(d_index_ptr->ptr());
int* d_merged_size = reinterpret_cast<int*>(&d_index_in[total_fea_num]);
heter_comm_kernel_->fill_idx(d_index_in, total_fea_num, stream);
void* d_buf = NULL;
size_t temp_storage_bytes = 0;
PADDLE_ENFORCE_GPU_SUCCESS(
cub::DeviceRadixSort::SortPairs(NULL,
temp_storage_bytes,
d_keys,
d_sorted_keys,
d_index_in,
d_sorted_idx,
total_fea_num,
0,
8 * sizeof(KeyType),
stream,
false));
auto d_cache_ptr = memory::Alloc(place, temp_storage_bytes);
d_buf = reinterpret_cast<int*>(d_cache_ptr->ptr());
PADDLE_ENFORCE_GPU_SUCCESS(
cub::DeviceRadixSort::SortPairs(d_buf,
temp_storage_bytes,
d_keys,
d_sorted_keys,
d_index_in,
d_sorted_idx,
total_fea_num,
0,
8 * sizeof(KeyType),
stream,
false));
PADDLE_ENFORCE_GPU_SUCCESS(
cub::DeviceRunLengthEncode::Encode(NULL,
temp_storage_bytes,
d_sorted_keys,
d_merged_keys,
d_merged_cnts,
d_merged_size,
total_fea_num,
stream));
if (d_cache_ptr->size() < temp_storage_bytes) {
d_cache_ptr = NULL;
d_cache_ptr = memory::Alloc(place, temp_storage_bytes);
}
d_buf = reinterpret_cast<int*>(d_cache_ptr->ptr());
PADDLE_ENFORCE_GPU_SUCCESS(
cub::DeviceRunLengthEncode::Encode(d_buf,
temp_storage_bytes,
d_sorted_keys,
d_merged_keys,
d_merged_cnts,
d_merged_size,
total_fea_num,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync((void*)&merged_size,
(void*)d_merged_size,
sizeof(int),
cudaMemcpyDeviceToHost,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum(
NULL, temp_storage_bytes, d_merged_cnts, d_offset, merged_size, stream));
if (d_cache_ptr->size() < temp_storage_bytes) {
d_cache_ptr = NULL;
d_cache_ptr = memory::Alloc(place, temp_storage_bytes);
}
d_buf = reinterpret_cast<int*>(d_cache_ptr->ptr());
PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum(
d_buf, temp_storage_bytes, d_merged_cnts, d_offset, merged_size, stream));
if (filter_zero) {
cudaMemsetAsync(d_restore_idx, 0, total_fea_num * sizeof(uint32_t), stream);
}
// fill restore idx [1,3,5,2,4,6] = [1,2,1,3,2,1]
heter_comm_kernel_->fill_restore_idx(filter_zero,
total_fea_num,
merged_size,
d_merged_keys,
d_sorted_idx,
d_offset,
d_merged_cnts,
d_restore_idx,
stream);
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
return merged_size;
}
#endif
// template <typename KeyType, typename ValType, typename GradType>
// void HeterComm<KeyType, ValType, GradType>::dump_to_cpu(int index) {
// auto stream = resource_->local_stream(index, 0);
// int dev_id = resource_->dev_id(index);
// platform::CUDADeviceGuard guard(dev_id);
......
......@@ -128,69 +128,177 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals,
}
}
template <typename KeyType, typename GPUAccessor>
__global__ void merge_gradients_basic_kernel(const KeyType* d_keys,
const uint32_t* offset,
const uint32_t* fea_num,
const uint32_t* index,
const char* input,
char* output,
int n,
size_t grad_value_size,
DynamicGradMerger& merger,
GPUAccessor& gpu_accessor) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) {
uint32_t start = offset[i];
uint32_t num = fea_num[i];
int ori_index = index[start];
float* out = (float*)(output + i * grad_value_size);
float* in = (float*)(input + size_t(ori_index) * grad_value_size);
merger.update_basic(out, in, gpu_accessor);
KeyType key = d_keys[i];
if (key != 0) {
for (int j = 1; j < num; ++j) {
ori_index = index[start + j];
in = (float*)(input + size_t(ori_index) * grad_value_size);
merger.merge_basic(out, in, gpu_accessor);
}
}
}
}
template <typename KeyType, typename GPUAccessor>
__global__ void merge_gradients_embedx_kernel(const KeyType* d_keys,
const uint32_t* offset,
const uint32_t* fea_num,
const uint32_t* index,
const char* input,
char* output,
int n,
size_t grad_dim,
size_t grad_value_size,
DynamicGradMerger& merger,
GPUAccessor& gpu_accessor) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) {
size_t value_idx = i / grad_dim;
size_t field_idx = i % grad_dim;
uint32_t start = offset[value_idx];
uint32_t num = fea_num[value_idx];
int ori_index = index[start];
float* in = (float*)(input + size_t(ori_index) * grad_value_size);
float* out = (float*)(output + value_idx * grad_value_size);
merger.update_embedx(out, in, field_idx, gpu_accessor);
KeyType key = d_keys[value_idx];
if (key != 0) {
for (int j = 1; j < num; ++j) {
int ori_index = index[start + j];
float* in = (float*)(input + size_t(ori_index) * grad_value_size);
merger.merge_embedx(out, in, field_idx, gpu_accessor);
}
}
}
}
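// Illustrative mapping: the kernel is launched with n = merged_key_num *
// grad_dim threads; thread i handles embedding slot field_idx = i % grad_dim
// of key value_idx = i / grad_dim (e.g. with grad_dim = 8, thread 19 works on
// dimension 3 of key 2). Keys equal to 0 only take the first gradient via
// update_embedx and skip the accumulation loop.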
__global__ void split_segments_kernel(const uint32_t* d_fea_num_info,
size_t n,
uint32_t* d_segments,
uint32_t* d_segments_num,
uint32_t segment_size) {
const size_t tx = blockIdx.x * blockDim.x + threadIdx.x;
if (tx >= n) {
return;
}
auto fea_num = d_fea_num_info[tx];
auto seg_num = (uint32_t)((fea_num - 1) / segment_size + 1);
d_segments[tx] = seg_num;
}
__global__ void expand_segments_kernel(const uint32_t* d_fea_num_info,
const uint32_t* d_segments_offset,
size_t n,
uint32_t* d_segments_fea_num_info,
uint32_t segment_size) {
const size_t tx = blockIdx.x * blockDim.x + threadIdx.x;
if (tx >= n) {
return;
}
auto fea_num = d_fea_num_info[tx];
auto seg_num = (uint32_t)((fea_num - 1) / segment_size + 1);
auto start_pos = d_segments_offset[tx];
auto remains = fea_num;
int cur_seg_size = 0;
for (size_t i = 0; i < seg_num; ++i) {
if (remains >= segment_size) {
cur_seg_size = segment_size;
} else {
cur_seg_size = remains;
}
d_segments_fea_num_info[start_pos + i] = cur_seg_size;
remains -= cur_seg_size;
}
}
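// Illustrative example: with segment_size = 2, a key that occurs 5 times is
// split by split_segments_kernel into ceil(5 / 2) = 3 segments, and
// expand_segments_kernel writes their sizes [2, 2, 1] starting at that key's
// position in d_segments_offset (the exclusive prefix sum of the segment
// counts).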
template <typename KeyType>
__global__ void shrink_keys_kernel(const KeyType* d_keys,
const uint32_t* d_segments_offset,
KeyType* d_segments_keys,
size_t n) {
const size_t tx = blockIdx.x * blockDim.x + threadIdx.x;
if (tx >= n) {
return;
}
d_segments_keys[tx] = d_keys[d_segments_offset[tx]];
}
template <typename KeyType>
__global__ void unpack_merged_vals_kernel(const KeyType* d_keys,
const float* d_merged_vals,
const uint32_t* d_restored_idx,
float* d_out,
size_t val_size,
const size_t n) {
const size_t tx = blockIdx.x * blockDim.x + threadIdx.x;
if (tx >= n) {
return;
}
size_t src_val_idx = 0;
const KeyType& key = d_keys[tx];
if (key != 0) {
src_val_idx = d_restored_idx[tx];
}
uint64_t dst_offset = uint64_t(tx) * val_size;
float* dst = (float*)((char*)d_out + dst_offset);
float* src_val =
(float*)((char*)d_merged_vals + uint64_t(src_val_idx) * val_size);
size_t n_float = val_size / sizeof(float);
for (size_t k = 0; k < n_float; ++k) {
dst[k] = src_val[k];
}
}
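// NOTE: d_restored_idx maps every original key slot back to its row in the
// deduplicated value buffer; slots whose key is 0 simply read row 0. The copy
// is done float by float, so val_size is assumed to be a multiple of
// sizeof(float).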
template <typename TUnit, typename T>
__global__ void scatter_dvals_by_unit_kernel(TUnit* d_dest_vals,
const TUnit* d_src_vals,
T* idx,
size_t len,
size_t val_size_unit) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < len) {
size_t pos = idx[i / val_size_unit] * val_size_unit + (i % val_size_unit);
d_dest_vals[i] = d_src_vals[pos];
}
}
template <typename TUnit, typename T>
__global__ void gather_dvals_by_unit_kernel(TUnit* d_dest_vals,
const TUnit* d_src_vals,
T* idx,
size_t len,
const size_t val_size_unit) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < len) {
size_t pos = idx[i / val_size_unit] * val_size_unit + (i % val_size_unit);
d_dest_vals[pos] = d_src_vals[i];
}
}
......@@ -325,43 +433,47 @@ void HeterCommKernel::reduce_by_key(void* d_temp_storage,
template <typename KeyType,
typename T,
typename StreamType,
typename GPUAccessor>
void HeterCommKernel::dy_mf_fill_shard_grads(KeyType* d_shard_keys,
KeyType* d_keys,
float* d_shard_grads,
float* d_grads,
T* idx,
long long len,
size_t grad_value_size,
const StreamType& stream,
GPUAccessor& gpu_accessor) {
int grid_size = (len - 1) / block_size_ + 1;
size_t c_len = (size_t)len;
const size_t grad_value_size_float = grad_value_size / sizeof(float);
// d_keys to d_shard_keys
fill_shard_key_kernel<<<grid_size, block_size_, 0, stream>>>(
d_shard_keys, d_keys, idx, c_len);
CHECK((grad_value_size % sizeof(float)) == 0);
size_t N = len * grad_value_size_float;
grid_size = (N - 1) / block_size_ + 1;
scatter_dvals_by_unit_kernel<<<grid_size, block_size_, 0, stream>>>(
d_shard_grads, d_grads, idx, N, grad_value_size_float);
}
template <typename KeyType, typename StreamType, typename GPUAccessor>
void HeterCommKernel::merge_gradient(const KeyType* d_keys,
const uint32_t* offset,
const uint32_t* fea_num,
const uint32_t* index,
const char* input,
char* output,
int n,
size_t grad_dim,
size_t grad_value_size,
DynamicGradMerger& merger,
const StreamType& stream,
GPUAccessor& gpu_accessor) {
int grid_size1 = (n - 1) / block_size_ + 1;
merge_gradients_basic_kernel<<<grid_size1, block_size_, 0, stream>>>(
d_keys,
offset,
fea_num,
index,
......@@ -369,22 +481,189 @@ void HeterCommKernel::merge_gradient(const uint32_t* offset,
output,
n,
grad_value_size,
merger_,
feature_value_accessor);
merger,
gpu_accessor);
if (grad_dim > 0) {
int grid_size2 = (n * grad_dim - 1) / block_size_ + 1;
merge_gradients_embedx_kernel<<<grid_size2, block_size_, 0, stream>>>(
d_keys,
offset,
fea_num,
index,
input,
output,
n * grad_dim,
grad_dim,
grad_value_size,
merger,
gpu_accessor);
}
}
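  // NOTE: merge_gradient splits the work across two launches:
  // merge_gradients_basic_kernel uses one thread per merged key for the
  // non-embedx part of the push value, while merge_gradients_embedx_kernel
  // uses one thread per (key, embedx dim) pair (n * grad_dim threads), so the
  // wide embedx gradients are reduced in parallel instead of serially inside a
  // single thread.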
template <typename T, typename StreamType>
void HeterCommKernel::dy_mf_fill_dvals(float* d_shard_vals,
float* d_vals,
T* idx,
long long len,
size_t val_size,
const StreamType& stream) {
const size_t val_size_float = val_size / sizeof(float);
CHECK((val_size % sizeof(float)) == 0);
size_t N = len * val_size_float;
const int grid_size = (N - 1) / block_size_ + 1;
// fill by float, d_shard_vals to d_vals
gather_dvals_by_unit_kernel<<<grid_size, block_size_, 0, stream>>>(
d_vals, d_shard_vals, idx, N, val_size_float);
}
template <typename StreamType>
void HeterCommKernel::split_segments(const uint32_t* d_fea_num_info,
size_t n,
uint32_t* d_segments,
uint32_t* d_segments_num,
size_t segment_size,
const StreamType& stream) {
int grid_size = (n - 1) / block_size_ + 1;
split_segments_kernel<<<grid_size, block_size_, 0, stream>>>(
d_fea_num_info, n, d_segments, d_segments_num, segment_size);
}
template <typename StreamType>
void HeterCommKernel::expand_segments(const uint32_t* d_fea_num_info,
const uint32_t* d_segments_offset,
size_t n,
uint32_t* d_segments_fea_num_info,
uint32_t segment_size,
const StreamType& stream) {
int grid_size = (n - 1) / block_size_ + 1;
expand_segments_kernel<<<grid_size, block_size_, 0, stream>>>(
d_fea_num_info,
d_segments_offset,
n,
d_segments_fea_num_info,
segment_size);
}
template <typename KeyType, typename StreamType>
void HeterCommKernel::shrink_keys(const KeyType* d_keys,
const uint32_t* d_segments_offset,
KeyType* d_segments_keys,
size_t n,
const StreamType& stream) {
int grid_size = (n - 1) / block_size_ + 1;
shrink_keys_kernel<<<grid_size, block_size_, 0, stream>>>(
d_keys, d_segments_offset, d_segments_keys, n);
}
template <typename T>
__global__ void kernel_fill_restore_idx(const size_t N,
const T* d_sorted_idx,
const T* d_offset,
const T* d_merged_cnts,
T* d_restore_idx) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N) {
const T& off = d_offset[i];
const T& num = d_merged_cnts[i];
for (size_t k = 0; k < num; ++k) {
d_restore_idx[d_sorted_idx[off + k]] = i;
}
}
}
template <typename KeyType, typename T>
__global__ void kernel_fill_restore_idx_filter_zero(const size_t N,
const KeyType* d_keys,
const T* d_sorted_idx,
const T* d_offset,
const T* d_merged_cnts,
T* d_restore_idx) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N) {
if (d_keys[i] == 0) {
return;
}
const T& off = d_offset[i];
const T& num = d_merged_cnts[i];
for (size_t k = 0; k < num; ++k) {
d_restore_idx[d_sorted_idx[off + k]] = i;
}
}
}
template <typename T>
__global__ void kernel_fill_restore_idx_by_search(const size_t N,
const T* d_sorted_idx,
const size_t merge_num,
const T* d_offset,
T* d_restore_idx) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N) {
if (i < d_offset[1]) {
d_restore_idx[d_sorted_idx[i]] = 0;
return;
}
int high = merge_num - 1;
int low = 1;
while (low < high) {
int mid = (low + high) / 2;
if (i < d_offset[mid + 1]) {
high = mid;
} else {
low = mid + 1;
}
}
d_restore_idx[d_sorted_idx[i]] = low;
}
}
template <typename KeyType, typename StreamType>
void HeterCommKernel::fill_restore_idx(bool filter_zero,
const size_t total_num,
const size_t merge_size,
const KeyType* d_keys,
const uint32_t* d_sorted_idx,
const uint32_t* d_offset,
const uint32_t* d_merged_cnts,
uint32_t* d_restore_idx,
const StreamType& stream) {
// fill restore idx [1,3,5,2,4,6] = [1,2,1,3,2,1]
if (merge_size * 3 > total_num) {
// repetition rate is not very high
size_t grid_size = (merge_size - 1) / block_size_ + 1;
if (filter_zero) {
kernel_fill_restore_idx_filter_zero<<<grid_size,
block_size_,
0,
stream>>>(merge_size,
d_keys,
d_sorted_idx,
d_offset,
d_merged_cnts,
d_restore_idx);
} else {
kernel_fill_restore_idx<<<grid_size, block_size_, 0, stream>>>(
merge_size, d_sorted_idx, d_offset, d_merged_cnts, d_restore_idx);
}
} else {
size_t grid_size = (total_num - 1) / block_size_ + 1;
// mid search
kernel_fill_restore_idx_by_search<<<grid_size, block_size_, 0, stream>>>(
total_num, d_sorted_idx, merge_size, d_offset, d_restore_idx);
}
}
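// NOTE: fill_restore_idx picks the cheaper kernel: when merge_size * 3 >
// total_num the duplication rate is low, so one thread per merged key scatters
// its few original positions directly; otherwise one thread per original
// element binary-searches d_offset (kernel_fill_restore_idx_by_search) to find
// the merged row it belongs to.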
template <typename KeyType, typename StreamType>
void HeterCommKernel::unpack_merged_vals(size_t n,
const KeyType* d_keys,
const void* d_merged_vals,
const uint32_t* d_restore_idx,
void* d_vals,
size_t val_size,
const StreamType& stream) {
int grid_size = (n - 1) / block_size_ + 1;
unpack_merged_vals_kernel<<<grid_size, block_size_, 0, stream>>>(
d_keys,
(const float*)d_merged_vals,
d_restore_idx,
(float*)d_vals,
val_size,
n);
}
template void HeterCommKernel::fill_idx<int, cudaStream_t>(
......@@ -491,43 +770,127 @@ template void HeterCommKernel::reduce_by_key<
cudaStream_t stream,
bool debug_synchronous);
template void HeterCommKernel::dy_mf_fill_shard_grads<
unsigned long,
int,
cudaStream_t,
CommonFeatureValueAccessor>(unsigned long* d_shard_keys,
unsigned long* d_keys,
float* d_shard_grads,
float* d_grads,
int* idx,
long long len,
size_t grad_value_size,
const cudaStream_t& stream,
CommonFeatureValueAccessor& gpu_accessor);
template void HeterCommKernel::
merge_gradient<uint32_t, cudaStream_t, CommonFeatureValueAccessor>(
const uint32_t* d_keys,
const uint32_t* offset,
const uint32_t* fea_num,
const uint32_t* index,
const char* input,
char* output,
int n,
size_t grad_dim,
size_t grad_value_size,
DynamicGradMerger& merger_,
const cudaStream_t& stream,
CommonFeatureValueAccessor& gpu_accessor);
template void HeterCommKernel::
merge_gradient<uint64_t, cudaStream_t, CommonFeatureValueAccessor>(
const uint64_t* d_keys,
const uint32_t* offset,
const uint32_t* fea_num,
const uint32_t* index,
const char* input,
char* output,
int n,
size_t grad_dim,
size_t grad_value_size,
DynamicGradMerger& merger_,
const cudaStream_t& stream,
CommonFeatureValueAccessor& gpu_accessor);
template void HeterCommKernel::dy_mf_fill_dvals<int, cudaStream_t>(
float* d_shard_vals,
float* d_vals,
int* idx,
long long len,
size_t val_size,
const cudaStream_t& stream);
template void HeterCommKernel::split_segments<cudaStream_t>(
const uint32_t* d_fea_num_info,
size_t n,
uint32_t* d_segment,
uint32_t* d_segments_num,
size_t segment_size,
const cudaStream_t& stream);
template void HeterCommKernel::expand_segments<cudaStream_t>(
const uint32_t* d_fea_num_info,
const uint32_t* d_segments_offset,
size_t n,
uint32_t* d_segments_fea_num_info,
uint32_t segment_size,
const cudaStream_t& stream);
template void HeterCommKernel::shrink_keys<uint32_t, cudaStream_t>(
const uint32_t* d_keys,
const uint32_t* d_segments_offset,
uint32_t* d_segments_keys,
size_t segment_num,
const cudaStream_t& stream);
template void HeterCommKernel::shrink_keys<uint64_t, cudaStream_t>(
const uint64_t* d_keys,
const uint32_t* d_segments,
uint64_t* d_segments_keys,
size_t total_segment_num,
const cudaStream_t& stream);
template void HeterCommKernel::fill_restore_idx<uint64_t, cudaStream_t>(
bool filter_zero,
const size_t total_num,
const size_t merge_size,
const uint64_t* d_keys,
const uint32_t* d_sorted_idx,
const uint32_t* d_offset,
const uint32_t* d_merged_cnts,
uint32_t* d_restore_idx,
const cudaStream_t& stream);
template void HeterCommKernel::fill_restore_idx<uint32_t, cudaStream_t>(
bool filter_zero,
const size_t total_num,
const size_t merge_size,
const uint32_t* d_keys,
const uint32_t* d_sorted_idx,
const uint32_t* d_offset,
const uint32_t* d_merged_cnts,
uint32_t* d_restore_idx,
const cudaStream_t& stream);
template void HeterCommKernel::unpack_merged_vals<uint64_t, cudaStream_t>(
size_t n,
const uint64_t* d_keys,
const void* d_merged_vals,
const uint32_t* d_restore_idx,
void* d_vals,
size_t val_size,
const cudaStream_t& stream);
template void HeterCommKernel::unpack_merged_vals<uint32_t, cudaStream_t>(
size_t n,
const uint32_t* d_keys,
const void* d_merged_vals,
const uint32_t* d_restore_idx,
void* d_vals,
size_t val_size,
const cudaStream_t& stream);
#endif
} // namespace framework
......
......@@ -41,16 +41,54 @@ struct DynamicGradMerger {
return out;
}
template <typename FVAccessor>
__device__ __forceinline__ void update_one(
float* output, const float* input, FVAccessor& feature_value_accessor) {
feature_value_accessor.PushValueFill(output, input);
template <typename GPUAccessor>
__device__ __forceinline__ void update_one(float* output,
const float* input,
GPUAccessor& gpu_accessor) {
gpu_accessor.PushValueFill(output, input);
}
template <typename FVAccessor>
__device__ __forceinline__ void merge_one(
float* output, const float* input, FVAccessor& feature_value_accessor) {
feature_value_accessor.MergePushValue(output, input);
template <typename GPUAccessor>
__device__ __forceinline__ void merge_one(float* output,
const float* input,
GPUAccessor& gpu_accessor) {
gpu_accessor.MergePushValue(output, input);
}
template <typename GPUAccessor>
__device__ __forceinline__ void update_basic(float* output,
const float* input,
GPUAccessor& fv_accessor) {
fv_accessor.PushValueFillBasic(output, input);
}
template <typename GPUAccessor>
__device__ __forceinline__ void merge_basic(float* output,
const float* input,
GPUAccessor& fv_accessor) {
fv_accessor.MergePushValueBasic(output, input);
}
template <typename GPUAccessor>
__device__ __forceinline__ void update_embedx(float* output,
const float* input,
size_t embedx_idx,
GPUAccessor& fv_accessor) {
if (embedx_idx < output[fv_accessor.common_push_value.MfDimIndex()]) {
output[fv_accessor.common_push_value.EmbedxGIndex() + embedx_idx] =
input[fv_accessor.common_push_value.EmbedxGIndex() + embedx_idx];
}
}
template <typename GPUAccessor>
__device__ __forceinline__ void merge_embedx(float* output,
const float* input,
size_t embedx_idx,
GPUAccessor& fv_accessor) {
if (embedx_idx < output[fv_accessor.common_push_value.MfDimIndex()]) {
output[fv_accessor.common_push_value.EmbedxGIndex() + embedx_idx] +=
input[fv_accessor.common_push_value.EmbedxGIndex() + embedx_idx];
}
}
};
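The new update_embedx/merge_embedx helpers guard every access with the row's MfDimIndex, so a single merge kernel can handle rows whose embedding widths differ. A minimal CPU-side sketch of that guard (not part of this diff; the index constants are placeholders for the accessor's real offsets):

#include <cstdio>
#include <vector>

// Hypothetical push-value layout: [0] = mf_dim, [1..] = embedx gradient slots.
constexpr int kMfDimIndex = 0;
constexpr int kEmbedxGIndex = 1;

// Mirrors DynamicGradMerger::merge_embedx: slots past this row's mf_dim are skipped.
void merge_embedx_cpu(std::vector<float>& out, const std::vector<float>& in,
                      size_t embedx_idx) {
  if (embedx_idx < static_cast<size_t>(out[kMfDimIndex])) {
    out[kEmbedxGIndex + embedx_idx] += in[kEmbedxGIndex + embedx_idx];
  }
}

int main() {
  std::vector<float> out = {8.f, 0, 0, 0, 0, 0, 0, 0, 0};  // mf_dim = 8
  std::vector<float> in = {8.f, 1, 1, 1, 1, 1, 1, 1, 1};
  for (size_t i = 0; i < 16; ++i) merge_embedx_cpu(out, in, i);  // idx 8..15 ignored
  printf("out[1] = %.1f\n", out[1]);  // 1.0
  return 0;
}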
......@@ -139,7 +177,7 @@ class HeterCommKernel {
template <typename KeyType,
typename T,
typename StreamType,
typename FVAccessor>
typename GPUAccessor>
void dy_mf_fill_shard_grads(KeyType* d_shard_keys,
KeyType* d_keys,
float* d_shard_grads,
......@@ -148,28 +186,72 @@ class HeterCommKernel {
long long len,
size_t grad_value_size,
const StreamType& stream,
FVAccessor& feature_value_accessor);
GPUAccessor& gpu_accessor);
template <typename StreamType, typename FVAccessor>
void merge_gradient(const uint32_t* offset,
template <typename KeyType, typename StreamType, typename GPUAccessor>
void merge_gradient(const KeyType* d_shard_keys,
const uint32_t* offset,
const uint32_t* fea_num,
const uint32_t* index,
const char* input,
char* output,
int n,
size_t grad_dim,
size_t grad_value_size,
DynamicGradMerger& merger_,
DynamicGradMerger& merger,
const StreamType& stream,
FVAccessor& feature_value_accessor);
GPUAccessor& gpu_accessor);
template <typename T, typename StreamType, typename FVAccessor>
template <typename T, typename StreamType>
void dy_mf_fill_dvals(float* d_shard_vals,
float* d_vals,
T* idx,
long long len,
size_t val_size,
const StreamType& stream,
FVAccessor& feature_value_accessor);
const StreamType& stream);
template <typename StreamType>
void split_segments(const uint32_t* d_fea_num_info,
size_t len,
uint32_t* d_segments,
uint32_t* d_segments_num,
size_t segment_size,
const StreamType& stream);
template <typename StreamType>
void expand_segments(const uint32_t* d_fea_num_info,
const uint32_t* d_segments_offset,
size_t segments_num,
uint32_t* d_segments_fea_num_info,
uint32_t segment_size,
const StreamType& stream);
template <typename KeyType, typename StreamType>
void shrink_keys(const KeyType* d_keys,
const uint32_t* d_segments_offset,
KeyType* d_segments_keys,
size_t segments_num,
const StreamType& stream);
template <typename KeyType, typename StreamType>
void fill_restore_idx(bool filter_zero,
const size_t total_num,
const size_t merge_size,
const KeyType* d_keys,
const uint32_t* d_sorted_idx,
const uint32_t* d_offset,
const uint32_t* d_merged_cnts,
uint32_t* d_restore_idx,
const StreamType& stream);
template <typename KeyType, typename StreamType>
void unpack_merged_vals(size_t n,
const KeyType* d_keys,
const void* d_merged_vals,
const uint32_t* d_restore_idx,
void* d_vals,
size_t val_size,
const StreamType& stream);
private:
int block_size_{256};
......
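For reference, merge_gradient consumes the run-length layout produced by the key-dedup stage: offset and fea_num describe each unique key's run inside the sorted index array, and the merger folds all duplicated pushes into one output row. A small CPU sketch of that segment merge, with plain floats standing in for the accessor-encoded push values (names are illustrative, not from this diff):

#include <cstdint>
#include <cstdio>
#include <vector>

// For each unique key k: output[k] = sum over input[index[offset[k] .. offset[k]+fea_num[k])].
void merge_gradient_cpu(const std::vector<uint32_t>& offset,
                        const std::vector<uint32_t>& fea_num,
                        const std::vector<uint32_t>& index,
                        const std::vector<float>& input,
                        std::vector<float>& output) {
  for (size_t k = 0; k < offset.size(); ++k) {
    float acc = 0.f;
    for (uint32_t j = 0; j < fea_num[k]; ++j) {
      acc += input[index[offset[k] + j]];  // gather the duplicated pushes
    }
    output[k] = acc;
  }
}

int main() {
  // Two unique keys; the first appears three times, the second once.
  std::vector<uint32_t> offset = {0, 3}, fea_num = {3, 1}, index = {0, 2, 3, 1};
  std::vector<float> input = {1.f, 10.f, 2.f, 3.f}, output(2, 0.f);
  merge_gradient_cpu(offset, fea_num, index, input, output);
  printf("%.1f %.1f\n", output[0], output[1]);  // 6.0 10.0
  return 0;
}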
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h"
#include <vector>
#ifdef PADDLE_WITH_HETERPS
......@@ -27,58 +26,83 @@ HeterPsBase* HeterPsBase::get_instance(
std::unordered_map<std::string, float> fleet_config,
std::string accessor_type,
int optimizer_type) {
if (accessor_type == "CtrDymfAccessor" &&
(optimizer_type == 1 || optimizer_type == 3 || optimizer_type == 4)) {
return new HeterPs<CommonFeatureValueAccessor>(
capacity, resource, accessor_type, fleet_config, optimizer_type);
if (accessor_type == "CtrDymfAccessor") {
auto* accessor_wrapper_ptr =
GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
CommonFeatureValueAccessor* gpu_accessor =
((AccessorWrapper<CommonFeatureValueAccessor>*)accessor_wrapper_ptr)
->AccessorPtr();
if (optimizer_type == 1) {
return new HeterPs<CommonFeatureValueAccessor, SparseAdagradOptimizer>(
capacity, resource, *gpu_accessor);
} else if (optimizer_type == 3) {
return new HeterPs<CommonFeatureValueAccessor, SparseAdamOptimizer>(
capacity, resource, *gpu_accessor);
} else if (optimizer_type == 4) {
return new HeterPs<CommonFeatureValueAccessor, SparseAdamSharedOptimizer>(
capacity, resource, *gpu_accessor);
}
} else {
VLOG(0) << " HeterPsBase get_instance Warning: now only support "
"CtrDymfAccessor, but get "
<< accessor_type_;
return new HeterPs<CommonFeatureValueAccessor>(
capacity, resource, accessor_type, fleet_config, optimizer_type);
<< accessor_type;
return new HeterPs<CommonFeatureValueAccessor, SparseAdagradOptimizer>(
capacity, resource, fleet_config, accessor_type, optimizer_type);
}
}
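The optimizer is now baked into the HeterPs type through a template-template parameter and chosen once in this factory, instead of being switched on optimizer_type inside every push_sparse call. A self-contained toy sketch of that dispatch pattern (type names here are illustrative, not Paddle APIs):

#include <cstdio>
#include <memory>

struct ToyAccessor {
  float lr = 0.05f;
};

template <typename Accessor>
struct ToyAdagrad {
  ToyAdagrad() = default;
  explicit ToyAdagrad(const Accessor& a) : acc(a) {}
  void update(float* w, float g) { *w -= acc.lr * g; }
  Accessor acc;
};

struct ToyBase {
  virtual ~ToyBase() = default;
  virtual void push(float* w, float g) = 0;
};

// The optimizer is fixed at construction time, like HeterPs<GPUAccessor, GPUOptimizer>.
template <typename Accessor, template <typename> class Optimizer>
struct Toy : ToyBase {
  explicit Toy(const Accessor& a) : opt(a) {}
  void push(float* w, float g) override { opt.update(w, g); }
  Optimizer<Accessor> opt;
};

// Factory mirroring get_instance: pick the optimizer template once.
ToyBase* make(int optimizer_type, const ToyAccessor& a) {
  if (optimizer_type == 1) return new Toy<ToyAccessor, ToyAdagrad>(a);
  return new Toy<ToyAccessor, ToyAdagrad>(a);  // other optimizer types elided
}

int main() {
  ToyAccessor accessor;
  std::unique_ptr<ToyBase> ps(make(1, accessor));
  float w = 1.0f;
  ps->push(&w, 2.0f);
  printf("w = %.2f\n", w);  // 0.90
  return 0;
}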
HeterPs::HeterPs(size_t capacity,
                 std::shared_ptr<HeterPsResource> resource,
                 std::unordered_map<std::string, float> fleet_config,
                 std::string accessor_type,
                 int optimizer_type) {
  comm_ = std::make_shared<HeterComm<FeatureKey, float*, float*, FVAccessor>>(
      capacity, resource);
  optimizer_type_ = optimizer_type;
}
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
HeterPs<GPUAccessor, GPUOptimizer>::HeterPs(
    size_t capacity,
    std::shared_ptr<HeterPsResource> resource,
    GPUAccessor& gpu_accessor) {
  comm_ = std::make_shared<HeterComm<FeatureKey, float*, float*, GPUAccessor>>(
      capacity, resource);
  opt_ = GPUOptimizer<GPUAccessor>(gpu_accessor);
}
HeterPs::~HeterPs() {}
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
HeterPs<GPUAccessor, GPUOptimizer>::~HeterPs() {}
void HeterPs::pull_sparse(int num,
FeatureKey* d_keys,
float* d_vals,
size_t len) {
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<GPUAccessor, GPUOptimizer>::pull_sparse(int num,
FeatureKey* d_keys,
float* d_vals,
size_t len) {
comm_->pull_sparse(num, d_keys, d_vals, len);
}
int HeterPs::get_index_by_devid(int devid) {
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
int HeterPs<GPUAccessor, GPUOptimizer>::get_index_by_devid(int devid) {
return comm_->get_index_by_devid(devid);
}
void HeterPs::set_sparse_sgd(const OptimizerConfig& optimizer_config) {
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<GPUAccessor, GPUOptimizer>::set_sparse_sgd(
const OptimizerConfig& optimizer_config) {
comm_->set_sparse_sgd(optimizer_config);
}
void HeterPs::set_embedx_sgd(const OptimizerConfig& optimizer_config) {
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<GPUAccessor, GPUOptimizer>::set_embedx_sgd(
const OptimizerConfig& optimizer_config) {
comm_->set_embedx_sgd(optimizer_config);
}
void HeterPs::end_pass() { comm_->end_pass(); }
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<GPUAccessor, GPUOptimizer>::end_pass() {
comm_->end_pass();
}
void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); }
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<GPUAccessor, GPUOptimizer>::show_one_table(int gpu_num) {
comm_->show_one_table(gpu_num);
}
void HeterPs::push_sparse(int num,
FeatureKey* d_keys,
float* d_grads,
size_t len) {
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<GPUAccessor, GPUOptimizer>::push_sparse(int num,
FeatureKey* d_keys,
float* d_grads,
size_t len) {
comm_->push_sparse(num, d_keys, d_grads, len);
// comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_);
}
......
......@@ -27,132 +27,138 @@ HeterPsBase* HeterPsBase::get_instance(
std::unordered_map<std::string, float> fleet_config,
std::string accessor_type,
int optimizer_type) {
if (accessor_type == "CtrDymfAccessor" &&
(optimizer_type == 1 || optimizer_type == 3 || optimizer_type == 4)) {
return new HeterPs<CommonFeatureValueAccessor>(
capacity, resource, fleet_config, accessor_type, optimizer_type);
if (accessor_type == "CtrDymfAccessor") {
auto* accessor_wrapper_ptr =
GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
CommonFeatureValueAccessor* gpu_accessor =
((AccessorWrapper<CommonFeatureValueAccessor>*)accessor_wrapper_ptr)
->AccessorPtr();
if (optimizer_type == 1) {
return new HeterPs<CommonFeatureValueAccessor, SparseAdagradOptimizer>(
capacity, resource, *gpu_accessor);
} else if (optimizer_type == 3) {
return new HeterPs<CommonFeatureValueAccessor, SparseAdamOptimizer>(
capacity, resource, *gpu_accessor);
} else if (optimizer_type == 4) {
return new HeterPs<CommonFeatureValueAccessor, SparseAdamSharedOptimizer>(
capacity, resource, *gpu_accessor);
}
} else {
VLOG(0) << " HeterPsBase get_instance Warning: now only support "
"CtrDymfAccessor, but get "
<< accessor_type;
return new HeterPs<CommonFeatureValueAccessor>(
capacity, resource, fleet_config, accessor_type, optimizer_type);
}
}
template <typename FVAccessor>
HeterPs<FVAccessor>::HeterPs(
    size_t capacity,
    std::shared_ptr<HeterPsResource> resource,
    std::unordered_map<std::string, float> fleet_config,
    std::string accessor_type,
    int optimizer_type) {
  comm_ = std::make_shared<HeterComm<FeatureKey, float*, float*, FVAccessor>>(
      capacity, resource);
  feature_value_accessor_.Configure(fleet_config);
  set_accessor(feature_value_accessor_);
  accessor_type_ = accessor_type;
  optimizer_type_ = optimizer_type;
}
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
HeterPs<GPUAccessor, GPUOptimizer>::HeterPs(
    size_t capacity,
    std::shared_ptr<HeterPsResource> resource,
    GPUAccessor& gpu_accessor) {
  comm_ = std::make_shared<HeterComm<FeatureKey, float*, float*, GPUAccessor>>(
      capacity, resource, gpu_accessor);
  opt_ = GPUOptimizer<GPUAccessor>(gpu_accessor);
}
template <typename FVAccessor>
HeterPs<FVAccessor>::~HeterPs() {}
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
HeterPs<GPUAccessor, GPUOptimizer>::~HeterPs() {}
template <typename FVAccessor>
void HeterPs<FVAccessor>::pull_sparse(int num,
FeatureKey* d_keys,
float* d_vals,
size_t len) {
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<GPUAccessor, GPUOptimizer>::pull_sparse(int num,
FeatureKey* d_keys,
float* d_vals,
size_t len) {
comm_->pull_sparse(num, d_keys, d_vals, len);
}
template <typename FVAccessor>
void HeterPs<FVAccessor>::build_ps(int num,
FeatureKey* h_keys,
char* pool,
size_t len,
size_t feature_value_size,
size_t chunk_size,
int stream_num) {
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<GPUAccessor, GPUOptimizer>::build_ps(int num,
FeatureKey* h_keys,
char* pool,
size_t len,
size_t feature_value_size,
size_t chunk_size,
int stream_num) {
comm_->build_ps(
num, h_keys, pool, len, feature_value_size, chunk_size, stream_num);
}
template <typename FVAccessor>
int HeterPs<FVAccessor>::get_index_by_devid(int devid) {
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
int HeterPs<GPUAccessor, GPUOptimizer>::get_index_by_devid(int devid) {
return comm_->get_index_by_devid(devid);
}
template <typename FVAccessor>
void HeterPs<FVAccessor>::set_sparse_sgd(
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<GPUAccessor, GPUOptimizer>::set_sparse_sgd(
const OptimizerConfig& optimizer_config) {
comm_->set_sparse_sgd(optimizer_config);
}
template <typename FVAccessor>
void HeterPs<FVAccessor>::set_embedx_sgd(
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<GPUAccessor, GPUOptimizer>::set_embedx_sgd(
const OptimizerConfig& optimizer_config) {
comm_->set_embedx_sgd(optimizer_config);
}
template <typename FVAccessor>
void HeterPs<FVAccessor>::end_pass() {
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<GPUAccessor, GPUOptimizer>::end_pass() {
comm_->end_pass();
}
template <typename FVAccessor>
void HeterPs<FVAccessor>::show_one_table(int gpu_num) {
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<GPUAccessor, GPUOptimizer>::show_one_table(int gpu_num) {
comm_->show_one_table(gpu_num);
}
template <typename FVAccessor>
void HeterPs<FVAccessor>::push_sparse(int num,
FeatureKey* d_keys,
float* d_grads,
size_t len) {
if (accessor_type_ == "CtrDymfAccessor") {
if (optimizer_type_ == 3) { // adam
auto optimizer = SparseAdamOptimizer(feature_value_accessor_);
VLOG(5) << "INTO push_sparse SparseAdamOptimizer, EmbedDim():"
<< optimizer.EmbedDim();
comm_->push_sparse(num, d_keys, d_grads, len, optimizer);
} else if (optimizer_type_ == 4) { // shared_adam
auto optimizer = SparseAdamSharedOptimizer(feature_value_accessor_);
VLOG(5) << "INTO push_sparse SparseAdamSharedOptimizer, EmbedDim():"
<< optimizer.EmbedDim();
comm_->push_sparse(num, d_keys, d_grads, len, optimizer);
} else if (optimizer_type_ == 1) { // adagrad {
auto optimizer = SparseAdagradOptimizer(feature_value_accessor_);
VLOG(5) << "INTO push_sparse SparseAdagradOptimizer, EmbedDim():"
<< optimizer.EmbedDim();
comm_->push_sparse(num, d_keys, d_grads, len, optimizer);
} else {
VLOG(0) << " push sparse Error: CtrDymfAccessor only support adagrad(1),"
"adam(3) or shared_adam(4), bug get optimizer type:"
<< optimizer_type_;
}
} else {
VLOG(0) << " push sparse Error: now only support CtrDymfAccessor, but get "
<< accessor_type_;
}
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<GPUAccessor, GPUOptimizer>::push_sparse(int num,
FeatureKey* d_keys,
float* d_grads,
size_t len) {
comm_->push_sparse(num, d_keys, d_grads, len, opt_);
}
template <typename FVAccessor>
void HeterPs<FVAccessor>::set_nccl_comm_and_size(
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<GPUAccessor, GPUOptimizer>::set_nccl_comm_and_size(
const std::vector<ncclComm_t>& inner_comms,
const std::vector<ncclComm_t>& inter_comms,
int comm_size) {
comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size);
}
template <typename FVAccessor>
void HeterPs<FVAccessor>::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) {
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<GPUAccessor, GPUOptimizer>::set_multi_mf_dim(int multi_mf_dim,
int max_mf_dim) {
comm_->set_multi_mf_dim(multi_mf_dim, max_mf_dim);
}
template <typename FVAccessor>
void HeterPs<FVAccessor>::set_accessor(FVAccessor& accessor) {
comm_->set_accessor(accessor);
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<GPUAccessor, GPUOptimizer>::show_table_collisions() {
comm_->show_table_collisions();
}
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
int HeterPs<GPUAccessor, GPUOptimizer>::dedup_keys_and_fillidx(
const int gpu_id,
const int total_fea_num,
const FeatureKey* d_keys, // input
FeatureKey* d_merged_keys, // output
FeatureKey* d_sorted_keys,
uint32_t* d_restore_idx,
uint32_t* d_sorted_idx,
uint32_t* d_offset,
uint32_t* d_merged_cnts,
bool filter_zero) {
return comm_->dedup_keys_and_fillidx(gpu_id,
total_fea_num,
d_keys, // input
d_merged_keys, // output
d_sorted_keys,
d_restore_idx,
d_sorted_idx,
d_offset,
d_merged_cnts,
filter_zero);
}
} // end namespace framework
......
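dedup_keys_and_fillidx forwards to HeterComm, which sorts the incoming keys, collapses duplicates, and records for every original position the slot of its merged key so pulled values can be scattered back. A CPU-side sketch of that bookkeeping (illustrative only, not the CUDA implementation):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

// Returns the number of unique keys; fills the merged key list and, for every
// original position, the index of its merged slot (restore_idx).
size_t dedup_and_fillidx(const std::vector<uint64_t>& keys,
                         std::vector<uint64_t>* merged,
                         std::vector<uint32_t>* restore_idx) {
  std::vector<uint32_t> order(keys.size());
  std::iota(order.begin(), order.end(), 0u);
  std::sort(order.begin(), order.end(),
            [&](uint32_t a, uint32_t b) { return keys[a] < keys[b]; });
  merged->clear();
  restore_idx->assign(keys.size(), 0u);
  for (uint32_t pos : order) {
    if (merged->empty() || merged->back() != keys[pos]) merged->push_back(keys[pos]);
    (*restore_idx)[pos] = static_cast<uint32_t>(merged->size() - 1);
  }
  return merged->size();
}

int main() {
  std::vector<uint64_t> keys = {7, 3, 7, 9, 3};
  std::vector<uint64_t> merged;
  std::vector<uint32_t> restore;
  size_t n = dedup_and_fillidx(keys, &merged, &restore);
  printf("unique=%zu restore[2]=%u\n", n, restore[2]);  // unique=3, both 7s share slot 1
  return 0;
}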
......@@ -26,15 +26,13 @@ limitations under the License. */
namespace paddle {
namespace framework {
template <typename FVAccessor>
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
class HeterPs : public HeterPsBase {
public:
HeterPs() {}
HeterPs(size_t capacity,
std::shared_ptr<HeterPsResource> resource,
std::unordered_map<std::string, float> fleet_config,
std::string accessor_type,
int optimizer_type);
GPUAccessor& gpu_accessor);
virtual ~HeterPs();
HeterPs(const HeterPs&) = delete;
HeterPs& operator=(const HeterPs&) = delete;
......@@ -43,6 +41,8 @@ class HeterPs : public HeterPsBase {
FeatureKey* d_keys,
float* d_vals,
size_t len) override;
// void build_ps(int num, FeatureKey* h_keys, float* h_vals, size_t len,
// size_t chunk_size, int stream_num) override;
void build_ps(int num,
FeatureKey* h_keys,
char* pool,
......@@ -56,7 +56,6 @@ class HeterPs : public HeterPsBase {
int comm_size) override;
void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) override;
void set_accessor(FVAccessor& accessor);
#endif
void set_sparse_sgd(const OptimizerConfig& optimizer_config) override;
......@@ -65,17 +64,25 @@ class HeterPs : public HeterPsBase {
void end_pass() override;
int get_index_by_devid(int devid) override;
void show_one_table(int gpu_num) override;
void push_sparse(int num,
FeatureKey* d_keys,
float* d_grads,
size_t len) override;
void push_sparse(int num, FeatureKey* d_keys, float* d_grads, size_t len);
void show_table_collisions() override;
#if defined(PADDLE_WITH_CUDA)
// dedup
int dedup_keys_and_fillidx(const int gpu_id,
const int total_fea_num,
const FeatureKey* d_keys, // input
FeatureKey* d_merged_keys, // output
FeatureKey* d_sorted_keys,
uint32_t* d_restore_idx,
uint32_t* d_sorted_idx,
uint32_t* d_offset,
uint32_t* d_merged_cnts,
bool filter_zero);
#endif
private:
std::shared_ptr<HeterComm<FeatureKey, float*, float*, FVAccessor>> comm_;
std::shared_ptr<HeterComm<FeatureKey, float*, float*, GPUAccessor>> comm_;
#if defined(PADDLE_WITH_CUDA)
FVAccessor feature_value_accessor_;
std::string accessor_type_;
int optimizer_type_;
GPUOptimizer<GPUAccessor> opt_;
#endif
};
......
......@@ -54,6 +54,7 @@ class HeterPsBase {
#endif
virtual void end_pass() = 0;
virtual void show_one_table(int gpu_num) = 0;
virtual void show_table_collisions() = 0;
virtual void push_sparse(int num,
FeatureKey* d_keys,
float* d_grads,
......@@ -65,10 +66,22 @@ class HeterPsBase {
static HeterPsBase* get_instance(
size_t capacity,
std::shared_ptr<HeterPsResource> resource,
// CommonFeatureValueAccessor feature_value_accessor,
std::unordered_map<std::string, float> fleet_config,
std::string accessor_type,
int optimizer_type);
#if defined(PADDLE_WITH_CUDA)
// dedup
virtual int dedup_keys_and_fillidx(const int gpu_id,
const int total_fea_num,
const FeatureKey* d_keys, // input
FeatureKey* d_merged_keys, // output
FeatureKey* d_sorted_keys,
uint32_t* d_restore_idx,
uint32_t* d_sorted_idx,
uint32_t* d_offset,
uint32_t* d_merged_cnts,
bool filter_zero) = 0;
#endif
};
} // end namespace framework
......
......@@ -20,6 +20,7 @@ limitations under the License. */
#include <iostream>
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/framework/fleet/heter_ps/cudf/managed.cuh"
#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h"
namespace paddle {
namespace framework {
......@@ -60,9 +61,9 @@ class HBMMemoryPool : public managed {
block_size_ = mem_pool->block_size();
VLOG(3) << "hbm memory pool with capacity" << capacity_
<< " bs: " << block_size_;
cudaMalloc(&mem_, block_size_ * capacity_);
cudaMemcpy(
mem_, mem_pool->mem(), mem_pool->byte_size(), cudaMemcpyHostToDevice);
CUDA_CHECK(cudaMalloc(&mem_, block_size_ * capacity_));
CUDA_CHECK(cudaMemcpy(
mem_, mem_pool->mem(), mem_pool->byte_size(), cudaMemcpyHostToDevice));
}
~HBMMemoryPool() {
......@@ -78,8 +79,8 @@ class HBMMemoryPool : public managed {
cudaFree(mem_);
mem_ = NULL;
capacity_ = capacity;
cudaMalloc(&mem_, (block_size_ * capacity / 8 + 1) * 8);
cudaMemset(mem_, 0, block_size_ * capacity);
CUDA_CHECK(cudaMalloc(&mem_, (block_size_ * capacity / 8 + 1) * 8));
CUDA_CHECK(cudaMemset(mem_, 0, block_size_ * capacity));
}
char* mem() { return mem_; }
......
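The raw cudaMalloc/cudaMemcpy calls in HBMMemoryPool are now wrapped in CUDA_CHECK (brought in through gpu_graph_utils.h), so a failed allocation or copy aborts loudly instead of silently corrupting the pool. The real macro lives in gpu_graph_utils.h; a typical shape, shown here only as an assumption, is:

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Illustrative only -- not the Paddle macro.
#define CUDA_CHECK_DEMO(call)                                                \
  do {                                                                       \
    cudaError_t err = (call);                                                \
    if (err != cudaSuccess) {                                                \
      fprintf(stderr, "CUDA error %s at %s:%d\n", cudaGetErrorString(err),   \
              __FILE__, __LINE__);                                           \
      abort();                                                               \
    }                                                                        \
  } while (0)

int main() {
  char* mem = nullptr;
  CUDA_CHECK_DEMO(cudaMalloc(&mem, 1024));
  CUDA_CHECK_DEMO(cudaMemset(mem, 0, 1024));
  CUDA_CHECK_DEMO(cudaFree(mem));
  return 0;
}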
......@@ -19,7 +19,6 @@ limitations under the License. */
#include <curand_kernel.h>
#endif
#include <vector>
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h"
......@@ -28,50 +27,35 @@ namespace framework {
#if defined(PADDLE_WITH_CUDA)
class Optimizer {
public:
__host__ Optimizer(CommonFeatureValueAccessor feature_value_accessor) {
feature_value_accessor_ = feature_value_accessor;
}
__host__ ~Optimizer() {}
__device__ void update_value(const OptimizerConfig& optimizer_config,
float& val, // NOLINT
const float& grad) {
printf(
"Warning: update_value will not used. Please use dy_mf_update_value\n");
}
__device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config,
float* ptr,
const float* grad) {}
CommonFeatureValueAccessor feature_value_accessor_;
size_t _embedding_dim;
size_t _lr_embedding_dim;
};
class SparseAdagradOptimizer : public Optimizer {
template <typename GPUAccessor>
class SparseAdagradOptimizer {
public:
__host__ SparseAdagradOptimizer(
CommonFeatureValueAccessor feature_value_accessor)
: Optimizer(feature_value_accessor) {
SparseAdagradOptimizer() {}
SparseAdagradOptimizer(GPUAccessor gpu_accessor) {
gpu_accessor_ = gpu_accessor;
_lr_embedding_dim = 1;
_embedding_dim = feature_value_accessor_.common_feature_value.EmbedWDim();
_embedding_dim = gpu_accessor_.common_feature_value.EmbedWDim();
}
~SparseAdagradOptimizer() {}
__device__ void update_value_work(const OptimizerConfig& optimizer_config,
int n,
float* w,
float* sgd, // NOLINT
const float* g,
float scale) {
float scale,
float slot) {
float& g2sum = sgd[G2SumIndex()];
double add_g2sum = 0;
double ratio = optimizer_config.mf_learning_rate *
sqrt(optimizer_config.mf_initial_g2sum /
(optimizer_config.mf_initial_g2sum + g2sum));
float learning_rate = optimizer_config.mf_learning_rate;
if (slot != optimizer_config.nodeid_slot) {
learning_rate = optimizer_config.feature_learning_rate;
}
double ratio =
learning_rate * sqrt(optimizer_config.mf_initial_g2sum /
(optimizer_config.mf_initial_g2sum + g2sum));
for (int i = 0; i < n; ++i) {
double scaled_grad = g[i] / scale;
......@@ -96,47 +80,43 @@ class SparseAdagradOptimizer : public Optimizer {
__device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config,
float* ptr,
const float* grad) {
float g_show = grad[feature_value_accessor_.common_push_value.ShowIndex()];
float g_click =
grad[feature_value_accessor_.common_push_value.ClickIndex()];
ptr[feature_value_accessor_.common_feature_value.SlotIndex()] =
grad[feature_value_accessor_.common_push_value.SlotIndex()];
ptr[feature_value_accessor_.common_feature_value.ShowIndex()] += g_show;
ptr[feature_value_accessor_.common_feature_value.ClickIndex()] += g_click;
ptr[feature_value_accessor_.common_feature_value.DeltaScoreIndex()] +=
float g_show = grad[gpu_accessor_.common_push_value.ShowIndex()];
float g_click = grad[gpu_accessor_.common_push_value.ClickIndex()];
ptr[gpu_accessor_.common_feature_value.SlotIndex()] =
grad[gpu_accessor_.common_push_value.SlotIndex()];
ptr[gpu_accessor_.common_feature_value.ShowIndex()] += g_show;
ptr[gpu_accessor_.common_feature_value.ClickIndex()] += g_click;
ptr[gpu_accessor_.common_feature_value.DeltaScoreIndex()] +=
optimizer_config.nonclk_coeff * (g_show - g_click) +
optimizer_config.clk_coeff * g_click;
float slot = ptr[gpu_accessor_.common_feature_value.SlotIndex()];
update_value_work(
optimizer_config,
1,
ptr + feature_value_accessor_.common_feature_value.EmbedWIndex(),
ptr + feature_value_accessor_.common_feature_value.EmbedG2SumIndex(),
grad + feature_value_accessor_.common_push_value.EmbedGIndex(),
g_show);
int mf_dim =
int(ptr[feature_value_accessor_.common_feature_value.MfDimIndex()]);
if (ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] == 0) {
ptr + gpu_accessor_.common_feature_value.EmbedWIndex(),
ptr + gpu_accessor_.common_feature_value.EmbedG2SumIndex(),
grad + gpu_accessor_.common_push_value.EmbedGIndex(),
g_show,
slot);
int mf_dim = int(ptr[gpu_accessor_.common_feature_value.MfDimIndex()]);
if (ptr[gpu_accessor_.common_feature_value.MfSizeIndex()] == 0) {
if (optimizer_config.mf_create_thresholds <=
optimizer_config.nonclk_coeff *
(ptr[feature_value_accessor_.common_feature_value
.ShowIndex()] -
ptr[feature_value_accessor_.common_feature_value
.ClickIndex()]) +
(ptr[gpu_accessor_.common_feature_value.ShowIndex()] -
ptr[gpu_accessor_.common_feature_value.ClickIndex()]) +
optimizer_config.clk_coeff *
ptr[feature_value_accessor_.common_feature_value
.ClickIndex()]) {
ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] =
feature_value_accessor_.common_feature_value.MFSize(mf_dim) /
sizeof(float);
ptr[gpu_accessor_.common_feature_value.ClickIndex()]) {
ptr[gpu_accessor_.common_feature_value.MfSizeIndex()] =
gpu_accessor_.common_feature_value.MFSize(mf_dim) / sizeof(float);
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
curandState state;
curand_init(clock64(), tid_x, 0, &state);
for (int i = 0; i < mf_dim; ++i) {
ptr[feature_value_accessor_.common_feature_value.EmbedxWIndex() + i] =
ptr[gpu_accessor_.common_feature_value.EmbedxWIndex() + i] =
(curand_uniform(&state)) * optimizer_config.mf_initial_range;
}
}
......@@ -144,10 +124,11 @@ class SparseAdagradOptimizer : public Optimizer {
update_value_work(
optimizer_config,
mf_dim,
ptr + feature_value_accessor_.common_feature_value.EmbedxWIndex(),
ptr + feature_value_accessor_.common_feature_value.EmbedxG2SumIndex(),
grad + feature_value_accessor_.common_push_value.EmbedxGIndex(),
g_show);
ptr + gpu_accessor_.common_feature_value.EmbedxWIndex(),
ptr + gpu_accessor_.common_feature_value.EmbedxG2SumIndex(),
grad + gpu_accessor_.common_push_value.EmbedxGIndex(),
g_show,
slot);
}
}
......@@ -156,17 +137,25 @@ class SparseAdagradOptimizer : public Optimizer {
__host__ __device__ size_t EmbedxDim() { return _embedding_dim; }
__host__ __device__ size_t G2SumIndex() { return 0; }
__host__ __device__ size_t EmbedxG2SumIndex() { return 0; }
private:
GPUAccessor gpu_accessor_;
size_t _embedding_dim;
size_t _lr_embedding_dim;
};
class SparseAdamOptimizer : public Optimizer {
template <typename GPUAccessor>
class SparseAdamOptimizer {
public:
__host__ SparseAdamOptimizer(
CommonFeatureValueAccessor feature_value_accessor)
: Optimizer(feature_value_accessor) {
SparseAdamOptimizer() {}
SparseAdamOptimizer(GPUAccessor gpu_accessor) {
gpu_accessor_ = gpu_accessor;
_lr_embedding_dim = 1;
_embedding_dim = feature_value_accessor_.common_feature_value.EmbedWDim();
_embedding_dim = gpu_accessor_.common_feature_value.EmbedWDim();
}
~SparseAdamOptimizer() {}
__device__ void update_lr(const OptimizerConfig& optimizer_config,
int n,
float* w,
......@@ -256,65 +245,57 @@ class SparseAdamOptimizer : public Optimizer {
__device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config,
float* ptr,
const float* grad) {
float g_show = grad[feature_value_accessor_.common_push_value.ShowIndex()];
float g_click =
grad[feature_value_accessor_.common_push_value.ClickIndex()];
ptr[feature_value_accessor_.common_feature_value.SlotIndex()] =
grad[feature_value_accessor_.common_push_value.SlotIndex()];
ptr[feature_value_accessor_.common_feature_value.ShowIndex()] += g_show;
ptr[feature_value_accessor_.common_feature_value.ClickIndex()] += g_click;
ptr[feature_value_accessor_.common_feature_value.DeltaScoreIndex()] +=
float g_show = grad[gpu_accessor_.common_push_value.ShowIndex()];
float g_click = grad[gpu_accessor_.common_push_value.ClickIndex()];
ptr[gpu_accessor_.common_feature_value.SlotIndex()] =
grad[gpu_accessor_.common_push_value.SlotIndex()];
ptr[gpu_accessor_.common_feature_value.ShowIndex()] += g_show;
ptr[gpu_accessor_.common_feature_value.ClickIndex()] += g_click;
ptr[gpu_accessor_.common_feature_value.DeltaScoreIndex()] +=
optimizer_config.nonclk_coeff * (g_show - g_click) +
optimizer_config.clk_coeff * g_click;
update_lr(
optimizer_config,
1,
ptr + feature_value_accessor_.common_feature_value.EmbedWIndex(),
ptr + feature_value_accessor_.common_feature_value.EmbedG2SumIndex(),
grad + feature_value_accessor_.common_push_value.EmbedGIndex(),
g_show);
int mf_dim =
int(ptr[feature_value_accessor_.common_feature_value.MfDimIndex()]);
if (ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] == 0) {
update_lr(optimizer_config,
1,
ptr + gpu_accessor_.common_feature_value.EmbedWIndex(),
ptr + gpu_accessor_.common_feature_value.EmbedG2SumIndex(),
grad + gpu_accessor_.common_push_value.EmbedGIndex(),
g_show);
int mf_dim = int(ptr[gpu_accessor_.common_feature_value.MfDimIndex()]);
if (ptr[gpu_accessor_.common_feature_value.MfSizeIndex()] == 0) {
if (optimizer_config.mf_create_thresholds <=
optimizer_config.nonclk_coeff *
(ptr[feature_value_accessor_.common_feature_value
.ShowIndex()] -
ptr[feature_value_accessor_.common_feature_value
.ClickIndex()]) +
(ptr[gpu_accessor_.common_feature_value.ShowIndex()] -
ptr[gpu_accessor_.common_feature_value.ClickIndex()]) +
optimizer_config.clk_coeff *
ptr[feature_value_accessor_.common_feature_value
.ClickIndex()]) {
ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] =
feature_value_accessor_.common_feature_value.MFSize(mf_dim) /
sizeof(float);
ptr[gpu_accessor_.common_feature_value.ClickIndex()]) {
ptr[gpu_accessor_.common_feature_value.MfSizeIndex()] =
gpu_accessor_.common_feature_value.MFSize(mf_dim) / sizeof(float);
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
curandState state;
curand_init(clock64(), tid_x, 0, &state);
for (int i = 0; i < mf_dim; ++i) {
ptr[feature_value_accessor_.common_feature_value.EmbedxWIndex() + i] =
ptr[gpu_accessor_.common_feature_value.EmbedxWIndex() + i] =
(curand_uniform(&state)) * optimizer_config.mf_initial_range;
}
ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() +
ptr[gpu_accessor_.common_feature_value.EmbedxG2SumIndex() +
EmbedxBeta1PowIndex()] = optimizer_config.beta1_decay_rate;
ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() +
ptr[gpu_accessor_.common_feature_value.EmbedxG2SumIndex() +
EmbedxBeta2PowIndex()] = optimizer_config.beta2_decay_rate;
}
} else {
update_mf(
optimizer_config,
mf_dim,
ptr + feature_value_accessor_.common_feature_value.EmbedxWIndex(),
ptr + feature_value_accessor_.common_feature_value.EmbedxG2SumIndex(),
grad + feature_value_accessor_.common_push_value.EmbedxGIndex(),
g_show);
update_mf(optimizer_config,
mf_dim,
ptr + gpu_accessor_.common_feature_value.EmbedxWIndex(),
ptr + gpu_accessor_.common_feature_value.EmbedxG2SumIndex(),
grad + gpu_accessor_.common_push_value.EmbedxGIndex(),
g_show);
}
// printf("EmbedxGIndex: %f, mf_gsum: %f, ",
// feature_value_accessor_.common_push_value.EmbedxGIndex(),
// ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex()]);
// gpu_accessor_.common_push_value.EmbedxGIndex(),
// ptr[gpu_accessor_.common_feature_value.EmbedxG2SumIndex()]);
}
__host__ __device__ size_t Dim() { return EmbedDim() + EmbedxDim(); }
......@@ -338,17 +319,25 @@ class SparseAdamOptimizer : public Optimizer {
__host__ __device__ size_t EmbedxBeta2PowIndex() {
return EmbedxBeta1PowIndex() + 1;
}
private:
GPUAccessor gpu_accessor_;
size_t _embedding_dim;
size_t _lr_embedding_dim;
};
class SparseAdamSharedOptimizer : public Optimizer {
template <typename GPUAccessor>
class SparseAdamSharedOptimizer {
public:
__host__ SparseAdamSharedOptimizer(
CommonFeatureValueAccessor feature_value_accessor)
: Optimizer(feature_value_accessor) {
SparseAdamSharedOptimizer() {}
SparseAdamSharedOptimizer(GPUAccessor gpu_accessor) {
gpu_accessor_ = gpu_accessor;
_lr_embedding_dim = 1;
_embedding_dim = feature_value_accessor_.common_feature_value.EmbedWDim();
_embedding_dim = gpu_accessor_.common_feature_value.EmbedWDim();
}
~SparseAdamSharedOptimizer() {}
__device__ void update_value_work(const OptimizerConfig& optimizer_config,
int n,
float* w,
......@@ -406,60 +395,54 @@ class SparseAdamSharedOptimizer : public Optimizer {
__device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config,
float* ptr,
const float* grad) {
float g_show = grad[feature_value_accessor_.common_push_value.ShowIndex()];
float g_click =
grad[feature_value_accessor_.common_push_value.ClickIndex()];
ptr[feature_value_accessor_.common_feature_value.SlotIndex()] =
grad[feature_value_accessor_.common_push_value.SlotIndex()];
ptr[feature_value_accessor_.common_feature_value.ShowIndex()] += g_show;
ptr[feature_value_accessor_.common_feature_value.ClickIndex()] += g_click;
ptr[feature_value_accessor_.common_feature_value.DeltaScoreIndex()] +=
float g_show = grad[gpu_accessor_.common_push_value.ShowIndex()];
float g_click = grad[gpu_accessor_.common_push_value.ClickIndex()];
ptr[gpu_accessor_.common_feature_value.SlotIndex()] =
grad[gpu_accessor_.common_push_value.SlotIndex()];
ptr[gpu_accessor_.common_feature_value.ShowIndex()] += g_show;
ptr[gpu_accessor_.common_feature_value.ClickIndex()] += g_click;
ptr[gpu_accessor_.common_feature_value.DeltaScoreIndex()] +=
optimizer_config.nonclk_coeff * (g_show - g_click) +
optimizer_config.clk_coeff * g_click;
update_value_work(
optimizer_config,
1,
ptr + feature_value_accessor_.common_feature_value.EmbedWIndex(),
ptr + feature_value_accessor_.common_feature_value.EmbedG2SumIndex(),
grad + feature_value_accessor_.common_push_value.EmbedGIndex(),
ptr + gpu_accessor_.common_feature_value.EmbedWIndex(),
ptr + gpu_accessor_.common_feature_value.EmbedG2SumIndex(),
grad + gpu_accessor_.common_push_value.EmbedGIndex(),
g_show);
int mf_dim =
int(ptr[feature_value_accessor_.common_feature_value.MfDimIndex()]);
if (ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] == 0) {
int mf_dim = int(ptr[gpu_accessor_.common_feature_value.MfDimIndex()]);
if (ptr[gpu_accessor_.common_feature_value.MfSizeIndex()] == 0) {
if (optimizer_config.mf_create_thresholds <=
optimizer_config.nonclk_coeff *
(ptr[feature_value_accessor_.common_feature_value
.ShowIndex()] -
ptr[feature_value_accessor_.common_feature_value
.ClickIndex()]) +
(ptr[gpu_accessor_.common_feature_value.ShowIndex()] -
ptr[gpu_accessor_.common_feature_value.ClickIndex()]) +
optimizer_config.clk_coeff *
ptr[feature_value_accessor_.common_feature_value
.ClickIndex()]) {
ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] =
feature_value_accessor_.common_feature_value.MFSize(mf_dim) /
sizeof(float);
ptr[gpu_accessor_.common_feature_value.ClickIndex()]) {
ptr[gpu_accessor_.common_feature_value.MfSizeIndex()] =
gpu_accessor_.common_feature_value.MFSize(mf_dim) / sizeof(float);
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
curandState state;
curand_init(clock64(), tid_x, 0, &state);
for (int i = 0; i < mf_dim; ++i) {
ptr[feature_value_accessor_.common_feature_value.EmbedxWIndex() + i] =
ptr[gpu_accessor_.common_feature_value.EmbedxWIndex() + i] =
(curand_uniform(&state)) * optimizer_config.mf_initial_range;
}
ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() +
ptr[gpu_accessor_.common_feature_value.EmbedxG2SumIndex() +
EmbedxBeta1PowIndex()] = optimizer_config.beta1_decay_rate;
ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() +
ptr[gpu_accessor_.common_feature_value.EmbedxG2SumIndex() +
EmbedxBeta2PowIndex()] = optimizer_config.beta2_decay_rate;
}
} else {
update_value_work(
optimizer_config,
mf_dim,
ptr + feature_value_accessor_.common_feature_value.EmbedxWIndex(),
ptr + feature_value_accessor_.common_feature_value.EmbedxG2SumIndex(),
grad + feature_value_accessor_.common_push_value.EmbedxGIndex(),
ptr + gpu_accessor_.common_feature_value.EmbedxWIndex(),
ptr + gpu_accessor_.common_feature_value.EmbedxG2SumIndex(),
grad + gpu_accessor_.common_push_value.EmbedxGIndex(),
g_show);
}
}
......@@ -481,6 +464,11 @@ class SparseAdamSharedOptimizer : public Optimizer {
__host__ __device__ size_t EmbedxBeta2PowIndex() {
return EmbedxBeta1PowIndex() + 1;
}
private:
GPUAccessor gpu_accessor_;
size_t _embedding_dim;
size_t _lr_embedding_dim;
};
#endif
......
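A behavioural note on the Adagrad path above: the embedding learning rate is now chosen per slot, keeping mf_learning_rate for features from nodeid_slot and switching to the new feature_learning_rate for every other slot. A small host-side sketch of that selection combined with the usual Adagrad ratio (config values are placeholders):

#include <cmath>
#include <cstdio>

struct Config {  // subset of OptimizerConfig used here
  float mf_learning_rate = 0.05f;
  float feature_learning_rate = 0.05f;
  float mf_initial_g2sum = 3.0f;
  float nodeid_slot = 9008.0f;
};

// Mirrors the rate selection in SparseAdagradOptimizer::update_value_work.
double adagrad_ratio(const Config& cfg, float slot, float g2sum) {
  float lr = (slot != cfg.nodeid_slot) ? cfg.feature_learning_rate
                                       : cfg.mf_learning_rate;
  return lr * std::sqrt(cfg.mf_initial_g2sum / (cfg.mf_initial_g2sum + g2sum));
}

int main() {
  Config cfg;
  cfg.feature_learning_rate = 0.01f;
  printf("nodeid slot: %.4f, feature slot: %.4f\n",
         adagrad_ratio(cfg, 9008.0f, 0.0f), adagrad_ratio(cfg, 17.0f, 0.0f));
  return 0;
}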
......@@ -41,6 +41,9 @@ class OptimizerConfig {
float mf_max_bound = 10;
float mf_ada_epsilon = 1e-8;
float nodeid_slot = 9008;
float feature_learning_rate = 0.05;
void set_sparse_sgd(float nonclk_coeff,
float clk_coeff,
float min_bound,
......@@ -84,7 +87,9 @@ class OptimizerConfig {
float mf_max_bound,
float mf_beta1_decay_rate,
float mf_beta2_decay_rate,
float mf_ada_epsilon) {
float mf_ada_epsilon,
float nodeid_slot,
float feature_learning_rate) {
this->mf_create_thresholds = mf_create_thresholds;
this->mf_learning_rate = mf_learning_rate;
this->mf_initial_g2sum = mf_initial_g2sum;
......@@ -94,6 +99,9 @@ class OptimizerConfig {
this->mf_beta1_decay_rate = mf_beta1_decay_rate;
this->mf_beta2_decay_rate = mf_beta2_decay_rate;
this->mf_ada_epsilon = mf_ada_epsilon;
this->nodeid_slot = nodeid_slot;
this->feature_learning_rate = feature_learning_rate;
}
void set_embedx_sgd(const OptimizerConfig& optimizer_config) {
......@@ -106,6 +114,9 @@ class OptimizerConfig {
this->mf_beta1_decay_rate = optimizer_config.mf_beta1_decay_rate;
this->mf_beta2_decay_rate = optimizer_config.mf_beta2_decay_rate;
this->mf_ada_epsilon = optimizer_config.mf_ada_epsilon;
    this->nodeid_slot = optimizer_config.nodeid_slot;
    this->feature_learning_rate = optimizer_config.feature_learning_rate;
}
};
......
......@@ -27,9 +27,6 @@
using namespace paddle::framework;
namespace platform = paddle::platform;
// paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph
// paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph(
// std::vector<int64_t> ids)
std::string edges[] = {
std::string("0\t1"),
......@@ -121,13 +118,13 @@ TEST(TEST_FLEET, test_cpu_cache) {
std::make_shared<HeterPsResource>(device_id_mapping);
resource->enable_p2p();
int use_nv = 1;
GpuPsGraphTable g(resource, use_nv);
GpuPsGraphTable g(resource, 1, 2);
g.init_cpu_table(table_proto);
g.cpu_graph_table->Load(node_file_name, "nuser");
g.cpu_graph_table->Load(node_file_name, "nitem");
g.cpu_graph_table_->Load(node_file_name, "nuser");
g.cpu_graph_table_->Load(node_file_name, "nitem");
std::remove(node_file_name);
std::vector<paddle::framework::GpuPsCommGraph> vec;
std::vector<int64_t> node_ids;
std::vector<uint64_t> node_ids;
node_ids.push_back(37);
node_ids.push_back(96);
std::vector<std::vector<std::string>> node_feat(2,
......@@ -135,38 +132,29 @@ TEST(TEST_FLEET, test_cpu_cache) {
std::vector<std::string> feature_names;
feature_names.push_back(std::string("c"));
feature_names.push_back(std::string("d"));
g.cpu_graph_table->get_node_feat(0, node_ids, feature_names, node_feat);
g.cpu_graph_table_->get_node_feat(0, node_ids, feature_names, node_feat);
VLOG(0) << "get_node_feat: " << node_feat[0][0];
VLOG(0) << "get_node_feat: " << node_feat[0][1];
VLOG(0) << "get_node_feat: " << node_feat[1][0];
VLOG(0) << "get_node_feat: " << node_feat[1][1];
int n = 10;
std::vector<int64_t> ids0, ids1;
std::vector<uint64_t> ids0, ids1;
for (int i = 0; i < n; i++) {
g.cpu_graph_table->add_comm_edge(0, i, (i + 1) % n);
g.cpu_graph_table->add_comm_edge(0, i, (i - 1 + n) % n);
g.cpu_graph_table_->add_comm_edge(0, i, (i + 1) % n);
g.cpu_graph_table_->add_comm_edge(0, i, (i - 1 + n) % n);
if (i % 2 == 0) ids0.push_back(i);
}
g.cpu_graph_table->build_sampler(0);
g.cpu_graph_table_->build_sampler(0);
ids1.push_back(5);
ids1.push_back(7);
vec.push_back(g.cpu_graph_table->make_gpu_ps_graph(0, ids0));
vec.push_back(g.cpu_graph_table->make_gpu_ps_graph(0, ids1));
vec.push_back(g.cpu_graph_table_->make_gpu_ps_graph(0, ids0));
vec.push_back(g.cpu_graph_table_->make_gpu_ps_graph(0, ids1));
vec[0].display_on_cpu();
vec[1].display_on_cpu();
// g.build_graph_from_cpu(vec);
g.build_graph_on_single_gpu(vec[0], 0);
g.build_graph_on_single_gpu(vec[1], 1);
int64_t cpu_key[3] = {0, 1, 2};
/*
std::vector<std::shared_ptr<char>> buffers(3);
std::vector<int> actual_sizes(3,0);
g.cpu_graph_table->random_sample_neighbors(cpu_key,2,buffers,actual_sizes,false);
for(int i = 0;i < 3;i++){
VLOG(0)<<"sample from cpu key->"<<cpu_key[i]<<" actual sample size =
"<<actual_sizes[i]/sizeof(int64_t);
}
*/
g.build_graph_on_single_gpu(vec[0], 0, 0);
g.build_graph_on_single_gpu(vec[1], 1, 0);
uint64_t cpu_key[3] = {0, 1, 2};
void *key;
int device_len = 2;
for (int i = 0; i < 2; i++) {
......@@ -178,7 +166,7 @@ TEST(TEST_FLEET, test_cpu_cache) {
int step = 2;
int cur = 0;
while (true) {
auto node_query_res = g.query_node_list(i, cur, step);
auto node_query_res = g.query_node_list(i, 0, cur, step);
node_query_res.display();
if (node_query_res.get_len() == 0) {
VLOG(0) << "no more ids,break";
......@@ -187,19 +175,20 @@ TEST(TEST_FLEET, test_cpu_cache) {
cur += node_query_res.get_len();
NeighborSampleQuery query;
query.initialize(
i, node_query_res.get_val(), 1, node_query_res.get_len());
i, 0, node_query_res.get_val(), 1, node_query_res.get_len());
query.display();
auto c = g.graph_neighbor_sample_v3(query, false);
c.display();
}
}
g.cpu_graph_table->set_search_level(2);
// g.cpu_graph_table->Load_to_ssd(edge_file_name,"e>u2u");
g.cpu_graph_table->Load(edge_file_name, "e>u2u");
g.cpu_graph_table->make_partitions(0, 64, 2);
g.cpu_graph_table_->clear_graph(0);
g.cpu_graph_table_->set_search_level(2);
g.cpu_graph_table_->Load(edge_file_name, "e>u2u");
g.cpu_graph_table_->make_partitions(0, 64, 2);
int index = 0;
while (g.cpu_graph_table->load_next_partition(0) != -1) {
auto all_ids = g.cpu_graph_table->get_all_id(0, 0, device_len);
/*
while (g.cpu_graph_table_->load_next_partition(0) != -1) {
auto all_ids = g.cpu_graph_table_->get_all_id(0, 0, device_len);
for (auto x : all_ids) {
for (auto y : x) {
VLOG(0) << "part " << index << " " << y;
......@@ -207,19 +196,19 @@ TEST(TEST_FLEET, test_cpu_cache) {
}
for (int i = 0; i < all_ids.size(); i++) {
GpuPsCommGraph sub_graph =
g.cpu_graph_table->make_gpu_ps_graph(0, all_ids[i]);
g.build_graph_on_single_gpu(sub_graph, i);
g.cpu_graph_table_->make_gpu_ps_graph(0, all_ids[i]);
g.build_graph_on_single_gpu(sub_graph, i, 0);
VLOG(2) << "sub graph on gpu " << i << " is built";
}
VLOG(0) << "start to iterate gpu graph node";
g.cpu_graph_table->make_complementary_graph(0, 64);
g.cpu_graph_table_->make_complementary_graph(0, 64);
for (int i = 0; i < 2; i++) {
// platform::CUDADeviceGuard guard(i);
LOG(0) << "query on card " << i;
int step = 2;
int cur = 0;
while (true) {
auto node_query_res = g.query_node_list(i, cur, step);
auto node_query_res = g.query_node_list(i, 0, cur, step);
node_query_res.display();
if (node_query_res.get_len() == 0) {
VLOG(0) << "no more ids,break";
......@@ -227,23 +216,23 @@ TEST(TEST_FLEET, test_cpu_cache) {
}
cur += node_query_res.get_len();
NeighborSampleQuery query, q1;
query.initialize(
i, node_query_res.get_val(), 4, node_query_res.get_len());
query.initialize(i, 0, node_query_res.get_val(), 4,
node_query_res.get_len());
query.display();
auto c = g.graph_neighbor_sample_v3(query, true);
c.display();
platform::CUDADeviceGuard guard(i);
int64_t *key;
uint64_t *key;
VLOG(0) << "sample key 1 globally";
g.cpu_graph_table->set_search_level(2);
cudaMalloc((void **)&key, sizeof(int64_t));
int64_t t_key = 1;
cudaMemcpy(key, &t_key, sizeof(int64_t), cudaMemcpyHostToDevice);
q1.initialize(i, (int64_t)key, 2, 1);
g.cpu_graph_table_->set_search_level(2);
cudaMalloc((void **)&key, sizeof(uint64_t));
uint64_t t_key = 1;
cudaMemcpy(key, &t_key, sizeof(uint64_t), cudaMemcpyHostToDevice);
q1.initialize(i, 0, (uint64_t)key, 2, 1);
auto d = g.graph_neighbor_sample_v3(q1, true);
d.display();
cudaFree(key);
g.cpu_graph_table->set_search_level(1);
g.cpu_graph_table_->set_search_level(1);
}
}
index++;
......@@ -253,4 +242,5 @@ TEST(TEST_FLEET, test_cpu_cache) {
device.push_back(0);
device.push_back(1);
iter->set_device(device);
*/
}
......@@ -50,15 +50,16 @@ TEST(TEST_FLEET, graph_comm) {
}
std::vector<int> neighbor_offset(gpu_count, 0), node_index(gpu_count, 0);
for (int i = 0; i < graph_list.size(); i++) {
graph_list[i].node_list = new GpuPsGraphNode[graph_list[i].node_size];
graph_list[i].node_list = new uint64_t[graph_list[i].node_size];
graph_list[i].node_info_list = new GpuPsNodeInfo[graph_list[i].node_size];
graph_list[i].neighbor_list = new int64_t[graph_list[i].neighbor_size];
}
for (int i = 0; i < node_count; i++) {
ind = i % gpu_count;
graph_list[ind].node_list[node_index[ind]].node_id = i;
graph_list[ind].node_list[node_index[ind]].neighbor_offset =
graph_list[ind].node_list[node_index[ind]] = i;
graph_list[ind].node_info_list[node_index[ind]].neighbor_offset =
neighbor_offset[ind];
graph_list[ind].node_list[node_index[ind]].neighbor_size =
graph_list[ind].node_info_list[node_index[ind]].neighbor_size =
neighbors[i].size();
for (auto x : neighbors[i]) {
graph_list[ind].neighbor_list[neighbor_offset[ind]++] = x;
......
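This test change tracks the new graph storage split: node_list now holds bare uint64_t node ids, while each node's neighbor offset and count live in a parallel node_info_list, with neighbors packed CSR-style in neighbor_list. A small sketch of that layout (the struct below is a stand-in, not the exact GpuPsNodeInfo definition):

#include <cstdint>
#include <cstdio>
#include <vector>

struct NodeInfo {            // stand-in for GpuPsNodeInfo
  uint32_t neighbor_offset;  // start index into neighbor_list
  uint32_t neighbor_size;    // number of neighbors
};

int main() {
  // Two nodes: 7 -> {1, 2}, 9 -> {3}
  std::vector<uint64_t> node_list = {7, 9};
  std::vector<NodeInfo> node_info_list = {{0, 2}, {2, 1}};
  std::vector<uint64_t> neighbor_list = {1, 2, 3};

  for (size_t i = 0; i < node_list.size(); ++i) {
    printf("node %llu:", (unsigned long long)node_list[i]);
    for (uint32_t j = 0; j < node_info_list[i].neighbor_size; ++j) {
      printf(" %llu", (unsigned long long)
                          neighbor_list[node_info_list[i].neighbor_offset + j]);
    }
    printf("\n");
  }
  return 0;
}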
......@@ -25,7 +25,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
......@@ -34,11 +33,14 @@ limitations under the License. */
#include <deque>
#include "paddle/fluid/framework/data_set.h"
#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h"
#include "paddle/fluid/platform/timer.h"
#if defined(PADDLE_WITH_PSCORE)
#include "paddle/fluid/distributed/ps/table/depends/feature_value.h"
#endif
DECLARE_int32(gpugraph_dedup_pull_push_mode);
namespace paddle {
namespace framework {
......@@ -117,7 +119,6 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) {
gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_);
std::vector<std::thread> threads;
// data should be in input channel
thread_dim_keys_.resize(thread_keys_thread_num_);
......@@ -135,94 +136,161 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) {
std::string data_set_name = std::string(typeid(*dataset_).name());
if (data_set_name.find("SlotRecordDataset") != std::string::npos) {
SlotRecordDataset* dataset = (SlotRecordDataset*)(dataset_);
auto input_channel = dataset->GetInputChannel();
VLOG(0) << "psgpu wrapperinputslotchannle size: " << input_channel->Size();
const std::deque<SlotRecord>& vec_data = input_channel->GetData();
total_len = vec_data.size();
len_per_thread = total_len / thread_keys_thread_num_;
remain = total_len % thread_keys_thread_num_;
VLOG(0) << "total len: " << total_len;
auto gen_dynamic_mf_func = [this](const std::deque<SlotRecord>& total_data,
int begin_index,
int end_index,
int i) {
for (auto iter = total_data.begin() + begin_index;
iter != total_data.begin() + end_index;
iter++) {
const auto& ins = *iter;
const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values;
const auto& slot_offset = ins->slot_uint64_feasigns_.slot_offsets;
for (size_t slot_idx = 0; slot_idx < slot_offset_vector_.size();
slot_idx++) {
for (size_t j = slot_offset[slot_offset_vector_[slot_idx]];
j < slot_offset[slot_offset_vector_[slot_idx] + 1];
j++) {
int shard_id = feasign_v[j] % thread_keys_shard_num_;
int dim_id = slot_index_vec_[slot_idx];
if (feasign_v[j] != 0) {
this->thread_dim_keys_[i][shard_id][dim_id].insert(feasign_v[j]);
VLOG(0) << "gpu_graph_mode_:" << gpu_graph_mode_;
if (!gpu_graph_mode_) {
if (data_set_name.find("SlotRecordDataset") != std::string::npos) {
VLOG(0) << "ps_gpu_wrapper use SlotRecordDataset";
SlotRecordDataset* dataset = (SlotRecordDataset*)(dataset_);
auto input_channel = dataset->GetInputChannel();
VLOG(0) << "psgpu wrapperinputslotchannle size: "
<< input_channel->Size();
const std::deque<SlotRecord>& vec_data = input_channel->GetData();
total_len = vec_data.size();
len_per_thread = total_len / thread_keys_thread_num_;
remain = total_len % thread_keys_thread_num_;
VLOG(0) << "total len: " << total_len;
auto gen_dynamic_mf_func = [this](
const std::deque<SlotRecord>& total_data,
int begin_index,
int end_index,
int i) {
for (auto iter = total_data.begin() + begin_index;
iter != total_data.begin() + end_index;
iter++) {
const auto& ins = *iter;
const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values;
const auto& slot_offset = ins->slot_uint64_feasigns_.slot_offsets;
for (size_t slot_idx = 0; slot_idx < slot_offset_vector_.size();
slot_idx++) {
for (size_t j = slot_offset[slot_offset_vector_[slot_idx]];
j < slot_offset[slot_offset_vector_[slot_idx] + 1];
j++) {
int shard_id = feasign_v[j] % thread_keys_shard_num_;
int dim_id = slot_index_vec_[slot_idx];
if (feasign_v[j] != 0) {
this->thread_dim_keys_[i][shard_id][dim_id].insert(
feasign_v[j]);
}
}
}
}
};
for (int i = 0; i < thread_keys_thread_num_; i++) {
threads.push_back(
std::thread(gen_dynamic_mf_func,
std::ref(vec_data),
begin,
begin + len_per_thread + (i < remain ? 1 : 0),
i));
begin += len_per_thread + (i < remain ? 1 : 0);
}
for (std::thread& t : threads) {
t.join();
}
timeline.Pause();
VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec()
<< " seconds.";
} else {
CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos);
VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset";
MultiSlotDataset* dataset = (MultiSlotDataset*)(dataset_);
auto input_channel = dataset->GetInputChannel();
const std::deque<Record>& vec_data = input_channel->GetData();
total_len = vec_data.size();
len_per_thread = total_len / thread_keys_thread_num_;
remain = total_len % thread_keys_thread_num_;
auto gen_func = [this](const std::deque<Record>& total_data,
int begin_index,
int end_index,
int i) {
for (auto iter = total_data.begin() + begin_index;
iter != total_data.begin() + end_index;
iter++) {
const auto& ins = *iter;
const auto& feasign_v = ins.uint64_feasigns_;
for (const auto feasign : feasign_v) {
uint64_t cur_key = feasign.sign().uint64_feasign_;
int shard_id = cur_key % thread_keys_shard_num_;
this->thread_keys_[i][shard_id].insert(cur_key);
}
}
};
for (int i = 0; i < thread_keys_thread_num_; i++) {
threads.push_back(
std::thread(gen_func,
std::ref(vec_data),
begin,
begin + len_per_thread + (i < remain ? 1 : 0),
i));
begin += len_per_thread + (i < remain ? 1 : 0);
}
};
for (int i = 0; i < thread_keys_thread_num_; i++) {
threads.push_back(
std::thread(gen_dynamic_mf_func,
std::ref(vec_data),
begin,
begin + len_per_thread + (i < remain ? 1 : 0),
i));
begin += len_per_thread + (i < remain ? 1 : 0);
}
for (std::thread& t : threads) {
t.join();
for (std::thread& t : threads) {
t.join();
}
timeline.Pause();
VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec()
<< " seconds.";
}
timeline.Pause();
VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds.";
} else {
CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos);
VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset";
MultiSlotDataset* dataset = (MultiSlotDataset*)(dataset_);
auto input_channel = dataset->GetInputChannel();
VLOG(0) << "PreBuild in GpuGraph mode";
SlotRecordDataset* dataset = (SlotRecordDataset*)(dataset_);
const std::vector<uint64_t>& vec_data = dataset->GetGpuGraphTotalKeys();
const std::deque<Record>& vec_data = input_channel->GetData();
total_len = vec_data.size();
len_per_thread = total_len / thread_keys_thread_num_;
VLOG(0) << "GpuGraphTotalKeys: " << total_len;
remain = total_len % thread_keys_thread_num_;
auto gen_func = [this](const std::deque<Record>& total_data,
int begin_index,
int end_index,
int i) {
auto gen_graph_data_func = [this](const std::vector<uint64_t>& total_data,
int begin_index,
int end_index,
int i) {
for (auto iter = total_data.begin() + begin_index;
iter != total_data.begin() + end_index;
iter++) {
const auto& ins = *iter;
const auto& feasign_v = ins.uint64_feasigns_;
for (const auto feasign : feasign_v) {
uint64_t cur_key = feasign.sign().uint64_feasign_;
int shard_id = cur_key % thread_keys_shard_num_;
this->thread_keys_[i][shard_id].insert(cur_key);
}
uint64_t cur_key = *iter;
int shard_id = cur_key % thread_keys_shard_num_;
this->thread_keys_[i][shard_id].insert(cur_key);
}
};
auto gen_graph_dynamic_mf_func =
[this](const std::vector<uint64_t>& total_data,
int begin_index,
int end_index,
int i) {
for (auto iter = total_data.begin() + begin_index;
iter != total_data.begin() + end_index;
iter++) {
uint64_t cur_key = *iter;
int shard_id = cur_key % thread_keys_shard_num_;
// TODO: feasign <-> slot <-> multi_dim
this->thread_dim_keys_[i][shard_id][0].insert(cur_key);
}
};
for (int i = 0; i < thread_keys_thread_num_; i++) {
threads.push_back(
std::thread(gen_func,
std::ref(vec_data),
begin,
begin + len_per_thread + (i < remain ? 1 : 0),
i));
if (!multi_mf_dim_) {
VLOG(1) << "psgpu graph wrapper genfunc";
threads.push_back(
std::thread(gen_graph_data_func,
std::ref(vec_data),
begin,
begin + len_per_thread + (i < remain ? 1 : 0),
i));
} else {
VLOG(1) << "psgpu graph wrapper genfunc with dynamic mf";
threads.push_back(
std::thread(gen_graph_dynamic_mf_func,
std::ref(vec_data),
begin,
begin + len_per_thread + (i < remain ? 1 : 0),
i));
}
begin += len_per_thread + (i < remain ? 1 : 0);
}
for (std::thread& t : threads) {
t.join();
}
timeline.Pause();
VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds.";
}
timeline.Start();
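In gpu-graph mode, PreBuildTask above now iterates the flat key vector returned by GetGpuGraphTotalKeys and shards keys across worker threads by key % thread_keys_shard_num_, each thread writing only its own row of per-shard sets so no locking is needed. A compact CPU sketch of that pattern (container names are illustrative):

#include <cstdint>
#include <cstdio>
#include <set>
#include <thread>
#include <vector>

int main() {
  const int shard_num = 4, thread_num = 2;
  std::vector<uint64_t> total_keys = {1, 2, 3, 4, 5, 6, 7, 8, 2, 4};
  // thread_keys[thread][shard]: each thread owns its own row, so inserts need no lock.
  std::vector<std::vector<std::set<uint64_t>>> thread_keys(
      thread_num, std::vector<std::set<uint64_t>>(shard_num));

  std::vector<std::thread> workers;
  size_t per = total_keys.size() / thread_num;
  for (int t = 0; t < thread_num; ++t) {
    size_t begin = t * per;
    size_t end = (t + 1 == thread_num) ? total_keys.size() : begin + per;
    workers.emplace_back([&, t, begin, end] {
      for (size_t i = begin; i < end; ++i) {
        uint64_t key = total_keys[i];
        thread_keys[t][key % shard_num].insert(key);  // dedup inside the shard
      }
    });
  }
  for (auto& w : workers) w.join();

  size_t total = 0;
  for (auto& row : thread_keys)
    for (auto& s : row) total += s.size();
  printf("collected %zu (thread, shard) entries\n", total);
  return 0;
}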
......@@ -255,6 +323,9 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) {
VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds.";
for (int i = 0; i < thread_keys_shard_num_; i++) {
for (int j = 0; j < multi_mf_dim_; j++) {
if (i == 0 && j == multi_mf_dim_ - 1) {
gpu_task->feature_dim_keys_[i][j].push_back(0);
}
VLOG(0) << "GpuPs shard: " << i << "mf dim: " << index_dim_vec_[j]
<< " key len: " << gpu_task->feature_dim_keys_[i][j].size();
gpu_task->value_dim_ptr_[i][j].resize(
......@@ -640,7 +711,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
}
std::vector<std::thread> threads(device_num);
auto accessor_wrapper_ptr =
GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
HeterPs_ = HeterPsBase::get_instance(
size_max, resource_, fleet_config_, accessor_class_, optimizer_type_);
#ifdef PADDLE_WITH_CUDA
......@@ -824,6 +895,7 @@ void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) {
dataset_->LocalShuffle();
}
InitSlotInfo();
gpu_graph_mode_ = dataset_->GetGpuGraphMode();
std::shared_ptr<HeterContext> gpu_task = gpu_task_pool_.Get();
gpu_task->Reset();
......@@ -890,15 +962,22 @@ void PSGPUWrapper::BeginPass() {
platform::errors::Fatal("[BeginPass] current task is not ended."));
}
debug_gpu_memory_info("befor build task");
build_task();
debug_gpu_memory_info("after build task");
timer.Pause();
if (current_task_ == nullptr) {
PADDLE_THROW(platform::errors::Fatal(
"[BeginPass] after build_task, current task is not null."));
}
VLOG(0) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s";
if (FLAGS_gpugraph_dedup_pull_push_mode) {
VLOG(0) << "BeginPass end, cost time: " << timer.ElapsedSec()
<< "s, enable pull push dedup mode="
<< FLAGS_gpugraph_dedup_pull_push_mode;
} else {
VLOG(0) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s";
}
}
void PSGPUWrapper::EndPass() {
......@@ -919,7 +998,7 @@ void PSGPUWrapper::EndPass() {
}
int thread_num = 8;
auto accessor_wrapper_ptr =
GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
auto dump_pool_to_cpu_func = [this, thread_num, &accessor_wrapper_ptr](
int i, int j, int z) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i)));
......@@ -961,30 +1040,7 @@ void PSGPUWrapper::EndPass() {
size_t local_offset = (i - left) * feature_value_size;
float* gpu_val = (float*)(test_build_values + local_offset);
#ifdef PADDLE_WITH_PSLIB
auto* downpour_value =
(paddle::ps::DownpourFixedFeatureValue*)(gpu_val->cpu_ptr);
int downpour_value_size = downpour_value->size();
if (gpu_val->mf_size > 0 && downpour_value_size == 8) {
downpour_value->resize(gpu_val->mf_dim + 1 + downpour_value_size);
}
float* cpu_val = downpour_value->data();
cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue::
delta_score_index()] = gpu_val->delta_score;
cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue::
show_index()] = gpu_val->show;
cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue::
click_index()] = gpu_val->clk;
cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue::
embed_w_index()] = gpu_val->lr;
cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue::
embed_g2sum_index()] = gpu_val->lr_g2sum;
cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue::
slot_index()] = gpu_val->slot;
if (gpu_val->mf_size > 0) {
for (int x = 0; x < gpu_val->mf_dim + 1; x++) {
cpu_val[x + 8] = gpu_val->mf[x];
}
}
// TODO: PSLIB DumpFill
#endif
#ifdef PADDLE_WITH_PSCORE
accessor_wrapper_ptr->DumpFill(gpu_val, cpu_table_accessor_, mf_dim);
......@@ -1043,102 +1099,220 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
platform::Timer all_timer;
platform::Timer pull_gpups_timer;
all_timer.Start();
size_t total_length =
std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
size_t feature_value_size = 0;
auto accessor_wrapper_ptr =
GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
feature_value_size = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_);
GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
size_t feature_value_size =
accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_);
VLOG(3) << "PullSparse max_dim:" << max_mf_dim_
<< " feature_value_size:" << feature_value_size;
<< " pull_feature_value_size:" << pull_type_size_;
#ifdef PADDLE_WITH_CUDA
VLOG(3) << "Begine Gpu Ps PullSparse";
auto buf = memory::Alloc(place, total_length * feature_value_size);
float* total_values_gpu = reinterpret_cast<float*>(buf->ptr());
#endif
#ifdef PADDLE_WITH_XPU_KP
VLOG(3) << "Begine Xpu Ps PullSparse";
FeatureValue* total_values_gpu = nullptr;
xpu_malloc(reinterpret_cast<void**>(&total_values_gpu),
total_length * feature_value_size);
#endif
if (platform::is_cpu_place(place)) {
PADDLE_THROW(platform::errors::Unimplemented(
"Warning:: CPUPlace is not supported in GpuPs now."));
} else if (platform::is_gpu_place(place)) {
VLOG(3) << "Begin copy keys, key_num[" << total_length << "]";
#ifdef PADDLE_WITH_CUDA
int device_id = place.GetDeviceId();
int devid_2_index = HeterPs_->get_index_by_devid(device_id);
LoDTensor& total_keys_tensor = keys_tensor[devid_2_index];
uint64_t* total_keys =
reinterpret_cast<uint64_t*>(total_keys_tensor.mutable_data<int64_t>(
{int64_t(total_length), 1}, place));
// construct slot_level lod info
auto slot_lengths_lod = slot_lengths;
for (size_t i = 1; i < slot_lengths_lod.size(); i++) {
slot_lengths_lod[i] += slot_lengths_lod[i - 1];
if (FLAGS_gpugraph_dedup_pull_push_mode > 0) {
auto& dev = device_caches_[devid_2_index];
int slot_num = static_cast<int>(slot_lengths.size());
std::vector<int64_t> slot_lengths_lod;
slot_lengths_lod.reserve(slot_num + 1);
slot_lengths_lod.push_back(0);
int64_t total_length = 0;
for (int i = 0; i < slot_num; ++i) {
total_length += slot_lengths[i];
slot_lengths_lod.push_back(total_length);
}
dev.total_key_length = total_length;
VLOG(3) << "[" << device_id << "]Begin copy keys, key_num["
<< total_length << "] dedup mode";
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
uint64_t* total_keys = dev.keys_tensor.mutable_data<uint64_t>(
(total_length * 3) * sizeof(uint64_t), place);
int* gpu_slot_dims = dev.dims_tensor.mutable_data<int>(
slot_dim.size() * sizeof(int), place);
uint64_t** gpu_keys = dev.keys_ptr_tensor.mutable_data<uint64_t*>(
keys.size() * sizeof(uint64_t*), place);
int64_t* slot_lens = dev.slot_lens.mutable_data<int64_t>(
(slot_num + 1) * sizeof(int64_t), place);
cudaMemcpyAsync(gpu_keys,
keys.data(),
keys.size() * sizeof(uint64_t*),
cudaMemcpyHostToDevice,
stream);
cudaMemcpyAsync(slot_lens,
slot_lengths_lod.data(),
slot_lengths_lod.size() * sizeof(int64_t),
cudaMemcpyHostToDevice,
stream);
cudaMemcpyAsync(gpu_slot_dims,
slot_dim.data(),
slot_dim.size() * sizeof(int),
cudaMemcpyHostToDevice,
stream);
float** gpu_values = dev.values_ptr_tensor.mutable_data<float*>(
values.size() * sizeof(float*), place);
cudaMemcpyAsync(gpu_values,
values.data(),
values.size() * sizeof(float*),
cudaMemcpyHostToDevice,
stream);
int* key2slot = dev.keys2slot.mutable_data<int>(
(total_length * 5) * sizeof(int), place);
this->CopyKeys(place,
gpu_keys,
total_keys,
slot_lens,
slot_num,
static_cast<int>(total_length),
key2slot);
uint32_t* d_restore_idx =
reinterpret_cast<uint32_t*>(&key2slot[total_length]);
uint32_t* d_sorted_idx =
reinterpret_cast<uint32_t*>(&d_restore_idx[total_length]);
uint32_t* d_offset =
reinterpret_cast<uint32_t*>(&d_sorted_idx[total_length]);
uint32_t* d_merged_cnts =
reinterpret_cast<uint32_t*>(&d_offset[total_length]);
uint64_t* d_merged_keys =
reinterpret_cast<uint64_t*>(&total_keys[total_length]);
uint64_t* d_sorted_keys =
reinterpret_cast<uint64_t*>(&d_merged_keys[total_length]);
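// Scratch buffer layout in dedup mode (derived from the allocations above):
// dev.keys_tensor holds 3 * total_length uint64 values laid out as
// [total_keys | d_merged_keys | d_sorted_keys], and dev.keys2slot holds
// 5 * total_length 32-bit values laid out as
// [key2slot | d_restore_idx | d_sorted_idx | d_offset | d_merged_cnts].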
int dedup_size = HeterPs_->dedup_keys_and_fillidx(
devid_2_index,
static_cast<int>(total_length),
total_keys, // input
d_merged_keys, // output
d_sorted_keys, // sort keys
d_restore_idx, // pull fill idx
d_sorted_idx, // sort old idx
d_offset, // offset
d_merged_cnts,
FLAGS_gpugraph_dedup_pull_push_mode & 0x02);
// printf("device %d, end dedup_keys_and_fillidx total %d, "
// "dedup_size %d, slot num: %d, value size: %d\n",
// device_id, int(total_length), dedup_size, slot_num,
// int(feature_value_size));
PADDLE_ENFORCE_GT(dedup_size,
0,
platform::errors::PreconditionNotMet(
"dedup keys need more than zero failed in BoxPS."));
dev.dedup_key_length = dedup_size;
int64_t total_bytes = dedup_size * feature_value_size;
float* total_values_gpu =
dev.pull_push_tensor.mutable_data<float>(total_bytes, place);
pull_gpups_timer.Start();
HeterPs_->pull_sparse(
devid_2_index, d_merged_keys, total_values_gpu, dedup_size);
// values.size() may not equal slot_num
accessor_wrapper_ptr->CopyForPull(place,
total_keys,
gpu_values,
total_values_gpu,
slot_lens,
key2slot,
max_mf_dim_ + 3,
total_length,
gpu_slot_dims,
d_restore_idx,
feature_value_size);
} else {
size_t total_length =
std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
auto buf = memory::Alloc(place, total_length * feature_value_size);
float* total_values_gpu = reinterpret_cast<float*>(buf->ptr());
VLOG(3) << "Begin copy keys, key_num[" << total_length << "]";
LoDTensor& total_keys_tensor = keys_tensor[devid_2_index];
uint64_t* total_keys =
reinterpret_cast<uint64_t*>(total_keys_tensor.mutable_data<int64_t>(
{int64_t(total_length), 1}, place));
// construct slot_level lod info
auto slot_lengths_lod = slot_lengths;
for (size_t i = 1; i < slot_lengths_lod.size(); i++) {
slot_lengths_lod[i] += slot_lengths_lod[i - 1];
}
auto buf_key = memory::Alloc(place, keys.size() * sizeof(uint64_t*));
auto buf_length =
memory::Alloc(place, slot_lengths.size() * sizeof(int64_t));
uint64_t** gpu_keys = reinterpret_cast<uint64_t**>(buf_key->ptr());
int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
cudaMemcpy(gpu_keys,
keys.data(),
keys.size() * sizeof(uint64_t*),
cudaMemcpyHostToDevice);
cudaMemcpy(gpu_len,
slot_lengths_lod.data(),
slot_lengths.size() * sizeof(int64_t),
cudaMemcpyHostToDevice);
auto buf_dim = memory::Alloc(place, slot_dim.size() * sizeof(int));
int* gpu_dim = reinterpret_cast<int*>(buf_dim->ptr());
cudaMemcpy(gpu_dim,
slot_dim.data(),
slot_dim.size() * sizeof(int),
cudaMemcpyHostToDevice);
this->CopyKeys(place,
gpu_keys,
total_keys,
gpu_len,
static_cast<int>(slot_lengths.size()),
static_cast<int>(total_length));
VLOG(3) << "Begin call PullSparseGPU in GPUPS, dev: " << devid_2_index
<< " len: " << total_length;
pull_gpups_timer.Start();
HeterPs_->pull_sparse(
devid_2_index, total_keys, total_values_gpu, total_length);
VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length
<< "]";
accessor_wrapper_ptr->CopyForPull(place,
gpu_keys,
values,
total_values_gpu,
gpu_len,
static_cast<int>(slot_lengths.size()),
hidden_size,
total_length,
gpu_dim,
feature_value_size);
}
auto buf_key = memory::Alloc(place, keys.size() * sizeof(uint64_t*));
auto buf_length =
memory::Alloc(place, slot_lengths.size() * sizeof(int64_t));
uint64_t** gpu_keys = reinterpret_cast<uint64_t**>(buf_key->ptr());
int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
cudaMemcpy(gpu_keys,
keys.data(),
keys.size() * sizeof(uint64_t*),
cudaMemcpyHostToDevice);
cudaMemcpy(gpu_len,
slot_lengths_lod.data(),
slot_lengths.size() * sizeof(int64_t),
cudaMemcpyHostToDevice);
auto buf_dim = memory::Alloc(place, slot_dim.size() * sizeof(int));
int* gpu_dim = reinterpret_cast<int*>(buf_dim->ptr());
cudaMemcpy(gpu_dim,
slot_dim.data(),
slot_dim.size() * sizeof(int),
cudaMemcpyHostToDevice);
this->CopyKeys(place,
gpu_keys,
total_keys,
gpu_len,
static_cast<int>(slot_lengths.size()),
static_cast<int>(total_length));
VLOG(3) << "Begin call PullSparseGPU in GPUPS, dev: " << devid_2_index
<< " len: " << total_length;
pull_gpups_timer.Start();
HeterPs_->pull_sparse(
devid_2_index, total_keys, total_values_gpu, total_length);
VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length
<< "]";
accessor_wrapper_ptr->CopyForPull(place,
gpu_keys,
values,
total_values_gpu,
gpu_len,
static_cast<int>(slot_lengths.size()),
hidden_size,
total_length,
gpu_dim,
val_type_size_);
pull_gpups_timer.Pause();
#endif
} else if (platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU_KP
VLOG(3) << "Begine Xpu Ps PullSparse";
size_t total_length =
std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
FeatureValue* total_values_gpu = nullptr;
xpu_malloc(reinterpret_cast<void**>(&total_values_gpu),
total_length * feature_value_size);
VLOG(3) << "Begin copy keys, key_num[" << total_length << "]";
int device_id = place.GetDeviceId();
int devid_2_index = HeterPs_->get_index_by_devid(device_id);
LoDTensor& total_keys_tensor = keys_tensor[devid_2_index];
uint64_t* total_keys = reinterpret_cast<uint64_t*>(
total_keys_tensor.mutable_data<int64_t>({total_length, 1}, place));
uint64_t* total_keys =
reinterpret_cast<uint64_t*>(total_keys_tensor.mutable_data<int64_t>(
{int64_t(total_length), 1}, place));
// construct slot_level lod info
auto slot_lengths_lod = slot_lengths;
......@@ -1185,7 +1359,7 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
static_cast<int>(slot_lengths.size()),
hidden_size,
total_length,
val_type_size_);
feature_value_size);
#endif
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
......@@ -1208,17 +1382,10 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
platform::Timer all_timer;
platform::Timer push_gpups_timer;
all_timer.Start();
int64_t total_length =
std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
// #ifdef PADDLE_WITH_CUDA
VLOG(3) << "Begin GPUPS PushSparseGrad";
auto accessor_wrapper_ptr =
GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
auto buf = memory::Alloc(place, total_length * grad_value_size);
VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_
<< "grad_value_size:" << grad_value_size;
float* total_grad_values_gpu = reinterpret_cast<float*>(buf->ptr());
if (platform::is_cpu_place(place)) {
PADDLE_THROW(platform::errors::Unimplemented(
"Warning:: CPUPlace is not supported in GPUPS now."));
......@@ -1226,36 +1393,142 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
#ifdef PADDLE_WITH_CUDA
int device_id = place.GetDeviceId();
int devid_2_index = HeterPs_->get_index_by_devid(device_id);
LoDTensor& cached_total_keys_tensor = keys_tensor[devid_2_index];
uint64_t* total_keys =
reinterpret_cast<uint64_t*>(cached_total_keys_tensor.data<int64_t>());
VLOG(3) << "Begin copy grad tensor to gpups struct";
accessor_wrapper_ptr->CopyForPush(place,
grad_values,
total_grad_values_gpu,
slot_lengths,
total_length,
batch_size,
grad_value_size,
slot_vector_,
slot_mf_dim_vector_);
if (FLAGS_gpugraph_dedup_pull_push_mode > 0) {
auto& dev = device_caches_[devid_2_index];
int64_t total_length = dev.total_key_length;
VLOG(3) << "Begin push sparse, key_num[" << total_length
<< "] dedup mode, device:" << device_id << ", index"
<< devid_2_index;
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
uint64_t* total_keys = dev.keys_tensor.data<uint64_t>();
int* slot_dims = dev.dims_tensor.data<int>();
int slot_num = static_cast<int>(slot_lengths.size());
if (!dev.d_slot_vector.IsInitialized()) {
int* buf_slot_vector =
dev.d_slot_vector.mutable_data<int>(slot_num * sizeof(int), place);
cudaMemcpyAsync(buf_slot_vector,
slot_vector_.data(),
slot_num * sizeof(int),
cudaMemcpyHostToDevice,
stream);
}
VLOG(3) << "Begin call PushSparseGPU in GPUPS, dev: " << devid_2_index
<< " len: " << total_length;
push_gpups_timer.Start();
HeterPs_->push_sparse(devid_2_index,
total_keys,
total_grad_values_gpu,
static_cast<int>(total_length));
const int64_t* slot_lens = dev.slot_lens.data<int64_t>();
const int* d_slot_vector = dev.d_slot_vector.data<int>();
const int* key2slot = dev.keys2slot.data<int>();
float** gpu_values = dev.values_ptr_tensor.data<float*>();
cudaMemcpyAsync(gpu_values,
grad_values.data(),
grad_values.size() * sizeof(float*),
cudaMemcpyHostToDevice,
stream);
uint64_t* d_merged_keys = &total_keys[total_length];
int64_t dedup_size = dev.dedup_key_length;
int64_t total_bytes = dedup_size * grad_value_size;
float* total_grad_values_gpu =
dev.pull_push_tensor.mutable_data<float>(total_bytes, place);
// when the dedup ratio (total_length / dedup_size) exceeds 3, copy grads
// via the restore index; otherwise use the sorted-index / offset path below
if (total_length > dedup_size * 3) {
const uint32_t* d_restore_idx =
reinterpret_cast<const uint32_t*>(&key2slot[total_length]);
accessor_wrapper_ptr->CopyForPush(place,
total_keys,
gpu_values,
total_grad_values_gpu,
d_slot_vector,
slot_lens,
max_mf_dim_ + 3,
total_length,
dedup_size,
batch_size,
slot_dims,
key2slot,
d_restore_idx,
grad_value_size);
} else {
const uint32_t* d_sorted_idx =
reinterpret_cast<const uint32_t*>(&key2slot[total_length * 2]);
const uint32_t* d_offset =
reinterpret_cast<const uint32_t*>(&d_sorted_idx[total_length]);
const uint32_t* d_merged_cnts =
reinterpret_cast<const uint32_t*>(&d_offset[total_length]);
accessor_wrapper_ptr->CopyForPush(place,
d_merged_keys,
gpu_values,
total_grad_values_gpu,
d_slot_vector,
slot_lens,
max_mf_dim_ + 3,
total_length,
dedup_size,
batch_size,
slot_dims,
key2slot,
d_sorted_idx,
d_offset,
d_merged_cnts,
grad_value_size);
}
push_gpups_timer.Start();
HeterPs_->push_sparse(devid_2_index,
d_merged_keys,
total_grad_values_gpu,
static_cast<int>(dedup_size));
} else {
int64_t total_length =
std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
VLOG(3) << "Begin GPUPS PushSparseGrad";
auto buf = memory::Alloc(place, total_length * grad_value_size);
VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_
<< "grad_value_size:" << grad_value_size;
float* total_grad_values_gpu = reinterpret_cast<float*>(buf->ptr());
LoDTensor& total_keys_tensor = keys_tensor[devid_2_index];
uint64_t* total_keys =
reinterpret_cast<uint64_t*>(total_keys_tensor.data<int64_t>());
VLOG(3) << "Begin copy grad tensor to gpups struct";
accessor_wrapper_ptr->CopyForPush(place,
grad_values,
total_grad_values_gpu,
slot_lengths,
total_length,
batch_size,
grad_value_size,
slot_vector_,
slot_mf_dim_vector_);
VLOG(3) << "Begin call PushSparseGPU in GPUPS, dev: " << devid_2_index
<< " len: " << total_length;
push_gpups_timer.Start();
HeterPs_->push_sparse(devid_2_index,
total_keys,
total_grad_values_gpu,
static_cast<int>(total_length));
}
push_gpups_timer.Pause();
#endif
} else if (platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU_KP
int device_id = place.GetDeviceId();
int devid_2_index = HeterPs_->get_index_by_devid(device_id);
LoDTensor& cached_total_keys_tensor = keys_tensor[devid_2_index];
int64_t total_length =
std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
VLOG(3) << "Begin GPUPS PushSparseGrad";
auto buf = memory::Alloc(place, total_length * grad_value_size);
VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_
<< "grad_value_size:" << grad_value_size;
float* total_grad_values_gpu = reinterpret_cast<float*>(buf->ptr());
LoDTensor& total_keys_tensor = keys_tensor[devid_2_index];
uint64_t* total_keys =
reinterpret_cast<uint64_t*>(cached_total_keys_tensor.data<int64_t>());
reinterpret_cast<uint64_t*>(total_keys_tensor.data<int64_t>());
VLOG(3) << "Begin copy grad tensor to xpups struct";
accessor_wrapper_ptr->CopyForPush(place,
grad_values,
......@@ -1288,6 +1561,6 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
VLOG(3) << "End PushSparseGrad";
}
} // end namespace framework
} // namespace framework
} // end namespace paddle
#endif
......@@ -22,10 +22,15 @@ limitations under the License. */
#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle {
namespace framework {
const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS;
#define GET_BLOCK(N) ((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS)
#define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0
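// CUDA_BLOCK(N) expands to "grid, block, shared-mem" launch arguments, i.e.
// Kernel<<<CUDA_BLOCK(n), stream>>> launches GET_BLOCK(n) blocks of
// CUDA_NUM_THREADS threads with no dynamic shared memory.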
__global__ void CopyKeysKernel(uint64_t** src_keys,
uint64_t* dest_total_keys,
const int64_t* len,
......@@ -93,6 +98,44 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
cudaStreamSynchronize(stream);
}
__global__ void CopyKeysKernel2(const int total_len,
uint64_t** src_keys,
uint64_t* dest_total_keys,
const int slot_num,
const int64_t* slot_lens,
int* key2slots) {
CUDA_KERNEL_LOOP(i, total_len) {
int low = 0;
int high = slot_num - 1;
while (low < high) {
int mid = (low + high) / 2;
if (i < slot_lens[mid + 1]) {
high = mid;
} else {
low = mid + 1;
}
}
key2slots[i] = low;
int y = i - slot_lens[low];
dest_total_keys[i] = src_keys[low][y];
}
}
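// A minimal host-side sketch of the same lookup (hypothetical helper, not
// used by the kernel above): given the prefix sums in slot_lens, it returns
// the slot whose [slot_lens[s], slot_lens[s + 1]) range contains index i.
//
//   static inline int KeyToSlotHost(int64_t i,
//                                   const int64_t* slot_lens,
//                                   int slot_num) {
//     int low = 0;
//     int high = slot_num - 1;
//     while (low < high) {
//       int mid = (low + high) / 2;
//       if (i < slot_lens[mid + 1]) {
//         high = mid;
//       } else {
//         low = mid + 1;
//       }
//     }
//     return low;
//   }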
void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
uint64_t** origin_keys,
uint64_t* total_keys,
const int64_t* slot_lens,
int slot_num,
int total_len,
int* key2slot) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
CopyKeysKernel2<<<CUDA_BLOCK(total_len), stream>>>(
total_len, origin_keys, total_keys, slot_num, slot_lens, key2slot);
cudaStreamSynchronize(stream);
}
void PSGPUWrapper::SetSparseSGD(float nonclk_coeff,
float clk_coeff,
float min_bound,
......@@ -123,7 +166,9 @@ void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds,
float mf_max_bound,
float mf_beta1_decay_rate,
float mf_beta2_decay_rate,
float mf_ada_epsilon) {
float mf_ada_epsilon,
float nodeid_slot,
float feature_learning_rate) {
optimizer_config_.set_embedx_sgd(mf_create_thresholds,
mf_learning_rate,
mf_initial_g2sum,
......@@ -132,7 +177,9 @@ void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds,
mf_max_bound,
mf_beta1_decay_rate,
mf_beta2_decay_rate,
mf_ada_epsilon);
mf_ada_epsilon,
nodeid_slot,
feature_learning_rate);
}
} // end namespace framework
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_HETERPS
#include <atomic>
......@@ -98,20 +97,61 @@ class AfsWrapper {
#endif
class PSGPUWrapper {
class DCacheBuffer {
public:
DCacheBuffer() : buf_(nullptr) {}
~DCacheBuffer() {}
/**
 * @brief Return a typed pointer to a device buffer of at least total_bytes,
 * allocating it on first use and growing it when a larger size is needed.
 */
template <typename T>
T* mutable_data(const size_t total_bytes,
const paddle::platform::Place& place) {
if (buf_ == nullptr) {
buf_ = memory::AllocShared(place, total_bytes);
} else if (buf_->size() < total_bytes) {
buf_.reset();
buf_ = memory::AllocShared(place, total_bytes);
}
return reinterpret_cast<T*>(buf_->ptr());
}
template <typename T>
T* data() {
return reinterpret_cast<T*>(buf_->ptr());
}
size_t memory_size() {
if (buf_ == nullptr) {
return 0;
}
return buf_->size();
}
bool IsInitialized(void) { return (buf_ != nullptr); }
private:
std::shared_ptr<memory::Allocation> buf_ = nullptr;
};
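// Usage note: mutable_data<T>(bytes, place) allocates on first use and
// re-allocates only when a larger size is requested, so each DCacheBuffer
// behaves as a grow-only per-device scratch buffer reused across passes;
// data<T>() assumes the buffer was already initialized for this pass.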
struct PSDeviceData {
DCacheBuffer keys_tensor;
DCacheBuffer dims_tensor;
DCacheBuffer keys_ptr_tensor;
DCacheBuffer values_ptr_tensor;
DCacheBuffer pull_push_tensor;
DCacheBuffer slot_lens;
DCacheBuffer d_slot_vector;
DCacheBuffer keys2slot;
int64_t total_key_length = 0;
int64_t dedup_key_length = 0;
};
PSDeviceData* device_caches_ = nullptr;
public:
~PSGPUWrapper();
PSGPUWrapper() {
HeterPs_ = NULL;
sleep_seconds_before_fail_exit_ = 300;
pull_thread_pool_.resize(thread_keys_shard_num_);
for (size_t i = 0; i < pull_thread_pool_.size(); i++) {
pull_thread_pool_[i].reset(new ::ThreadPool(1));
}
hbm_thread_pool_.resize(thread_keys_shard_num_);
for (size_t i = 0; i < hbm_thread_pool_.size(); i++) {
hbm_thread_pool_[i].reset(new ::ThreadPool(1));
}
}
void PullSparse(const paddle::platform::Place& place,
......@@ -140,6 +180,13 @@ class PSGPUWrapper {
const int64_t* gpu_len,
int slot_num,
int total_len);
void CopyKeys(const paddle::platform::Place& place,
uint64_t** origin_keys,
uint64_t* total_keys,
const int64_t* gpu_len,
int slot_num,
int total_len,
int* key2slot);
void BuildGPUTask(std::shared_ptr<HeterContext> gpu_task);
void PreBuildTask(std::shared_ptr<HeterContext> gpu_task);
......@@ -164,6 +211,11 @@ class PSGPUWrapper {
pre_build_threads_.join();
s_instance_ = nullptr;
VLOG(3) << "PSGPUWrapper Finalize Finished.";
HeterPs_->show_table_collisions();
if (device_caches_ != nullptr) {
delete[] device_caches_;
device_caches_ = nullptr;
}
}
void InitializeGPU(const std::vector<int>& dev_ids) {
......@@ -173,6 +225,7 @@ class PSGPUWrapper {
resource_ = std::make_shared<HeterPsResource>(dev_ids);
resource_->enable_p2p();
keys_tensor.resize(resource_->total_device());
device_caches_ = new PSDeviceData[resource_->total_device()];
#ifdef PADDLE_WITH_GLOO
auto gloo = paddle::framework::GlooWrapper::GetInstance();
if (gloo->Size() > 1) {
......@@ -256,7 +309,9 @@ class PSGPUWrapper {
float mf_max_bound,
float mf_beta1_decay_rate,
float mf_beta2_decay_rate,
float mf_ada_epsilon);
float mf_ada_epsilon,
float nodeid_slot,
float feature_learning_rate);
#ifdef PADDLE_WITH_PSCORE
void add_sparse_optimizer(
......@@ -308,6 +363,21 @@ class PSGPUWrapper {
void InitializeGPUServer(paddle::distributed::PSParameter ps_param) {
auto sparse_table =
ps_param.server_param().downpour_server_param().downpour_table_param(0);
// set build thread_num and shard_num
thread_keys_thread_num_ = sparse_table.shard_num();
thread_keys_shard_num_ = sparse_table.shard_num();
VLOG(1) << "ps_gpu build phase thread_num:" << thread_keys_thread_num_
<< " shard_num:" << thread_keys_shard_num_;
pull_thread_pool_.resize(thread_keys_shard_num_);
for (size_t i = 0; i < pull_thread_pool_.size(); i++) {
pull_thread_pool_[i].reset(new ::ThreadPool(1));
}
hbm_thread_pool_.resize(thread_keys_shard_num_);
for (size_t i = 0; i < hbm_thread_pool_.size(); i++) {
hbm_thread_pool_[i].reset(new ::ThreadPool(1));
}
auto sparse_table_accessor = sparse_table.accessor();
auto sparse_table_accessor_parameter =
sparse_table_accessor.ctr_accessor_param();
......@@ -319,6 +389,11 @@ class PSGPUWrapper {
config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff();
config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold();
config["nodeid_slot"] =
sparse_table_accessor.graph_sgd_param().nodeid_slot();
config["feature_learning_rate"] =
sparse_table_accessor.graph_sgd_param().feature_learning_rate();
if (accessor_class_ == "CtrDymfAccessor") {
// optimizer config for embed_w and embedx
add_sparse_optimizer(config, sparse_table_accessor.embed_sgd_param());
......@@ -327,8 +402,8 @@ class PSGPUWrapper {
}
fleet_config_ = config;
GlobalAccessorTransfor::GetInstance().Init(accessor_class_);
GlobalAccessorTransfor::GetInstance().GetAccessorWrapper()->Configure(
GlobalAccessorFactory::GetInstance().Init(accessor_class_);
GlobalAccessorFactory::GetInstance().GetAccessorWrapper()->Configure(
config);
InitializeGPUServer(config);
}
......@@ -394,6 +469,16 @@ class PSGPUWrapper {
float mf_ada_epsilon = (config.find("mf_ada_epsilon") == config.end())
? 1e-8
: config["mf_ada_epsilon"];
float feature_learning_rate =
(config.find("feature_learning_rate") == config.end())
? 0.05
: config["feature_learning_rate"];
float nodeid_slot = (config.find("nodeid_slot") == config.end())
? 9008
: config["nodeid_slot"];
this->SetSparseSGD(nonclk_coeff,
clk_coeff,
min_bound,
......@@ -412,12 +497,18 @@ class PSGPUWrapper {
mf_max_bound,
mf_beta1_decay_rate,
mf_beta2_decay_rate,
mf_ada_epsilon);
mf_ada_epsilon,
nodeid_slot,
feature_learning_rate);
// set optimizer type(naive,adagrad,std_adagrad,adam,share_adam)
optimizer_type_ = (config.find("optimizer_type") == config.end())
? 1
: static_cast<int>(config["optimizer_type"]);
: int(config["optimizer_type"]);
VLOG(0) << "InitializeGPUServer optimizer_type_:" << optimizer_type_
<< " nodeid_slot:" << nodeid_slot
<< " feature_learning_rate:" << feature_learning_rate;
}
void SetDate(int year, int month, int day) {
......@@ -508,11 +599,13 @@ class PSGPUWrapper {
}
auto accessor_wrapper_ptr =
GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
val_type_size_ = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_);
grad_type_size_ = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
pull_type_size_ = accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_);
VLOG(0) << "InitSlotInfo: val_type_size_" << val_type_size_
<< " grad_type_size_:" << grad_type_size_;
<< " grad_type_size_:" << grad_type_size_
<< " pull_type_size_:" << pull_type_size_;
slot_info_initialized_ = true;
}
#endif
......@@ -564,6 +657,7 @@ class PSGPUWrapper {
int max_mf_dim_{0};
size_t val_type_size_{0};
size_t grad_type_size_{0};
size_t pull_type_size_{0};
double time_1 = 0.0;
double time_2 = 0.0;
......@@ -573,6 +667,7 @@ class PSGPUWrapper {
int multi_node_{0};
int node_size_;
uint64_t table_id_;
int gpu_graph_mode_ = 0;
#ifdef PADDLE_WITH_CUDA
std::vector<ncclComm_t> inner_comms_;
std::vector<ncclComm_t> inter_comms_;
......
......@@ -220,52 +220,6 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
xpu_wait(stream);
}
void PSGPUWrapper::SetSparseSGD(float nonclk_coeff,
float clk_coeff,
float min_bound,
float max_bound,
float learning_rate,
float initial_g2sum,
float initial_range,
float beta1_decay_rate,
float beta2_decay_rate,
float ada_epsilon) {
OptimizerConfig optimizer_config;
optimizer_config.set_sparse_sgd(nonclk_coeff,
clk_coeff,
min_bound,
max_bound,
learning_rate,
initial_g2sum,
initial_range,
beta1_decay_rate,
beta2_decay_rate,
ada_epsilon);
HeterPs_->set_sparse_sgd(optimizer_config);
}
void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds,
float mf_learning_rate,
float mf_initial_g2sum,
float mf_initial_range,
float mf_min_bound,
float mf_max_bound,
float mf_beta1_decay_rate,
float mf_beta2_decay_rate,
float mf_ada_epsilon) {
OptimizerConfig optimizer_config;
optimizer_config.set_embedx_sgd(mf_create_thresholds,
mf_learning_rate,
mf_initial_g2sum,
mf_initial_range,
mf_min_bound,
mf_max_bound,
mf_beta1_decay_rate,
mf_beta2_decay_rate,
mf_ada_epsilon);
HeterPs_->set_embedx_sgd(optimizer_config);
}
} // end namespace framework
} // end namespace paddle
#endif
......@@ -119,6 +119,12 @@ void HogwildWorker::CreateDeviceResource(const ProgramDesc &main_prog) {
void HogwildWorker::TrainFilesWithProfiler() {
platform::SetNumThreads(1);
#if defined(PADDLE_WITH_HETERPS) && \
(defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL))
platform::SetDeviceId(thread_id_);
#elif defined(PADDLE_WITH_HETERPS) && defined(PADDLE_WITH_XPU_BKCL)
platform::SetXPUDeviceId(thread_id_);
#endif
device_reader_->Start();
std::vector<double> op_total_time;
std::vector<std::string> op_name;
......@@ -175,8 +181,6 @@ void HogwildWorker::TrainFilesWithProfiler() {
PrintFetchVars();
#ifdef PADDLE_WITH_HETERPS
dev_ctx_->Wait();
VLOG(1) << "GpuPs worker " << thread_id_ << " train cost " << total_time
<< " seconds, ins_num: " << total_inst;
for (size_t i = 0; i < op_name.size(); ++i) {
VLOG(1) << "card:" << thread_id_ << ", op: " << op_name[i]
<< ", mean time: " << op_total_time[i] / total_inst
......@@ -201,6 +205,9 @@ void HogwildWorker::TrainFilesWithProfiler() {
thread_scope_->DropKids();
timeline.Start();
}
VLOG(0) << "GpuPs worker " << thread_id_ << " train cost " << total_time
<< " seconds, ins_num: " << total_inst << " read time: " << read_time
<< "seconds ";
if (need_dump_field_ || need_dump_param_) {
writer_.Flush();
......@@ -217,16 +224,19 @@ void HogwildWorker::TrainFiles() {
platform::SetNumThreads(1);
platform::Timer timeline;
timeline.Start();
#if defined(PADDLE_WITH_HETERPS) && \
(defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL))
platform::SetDeviceId(thread_id_);
#elif defined(PADDLE_WITH_HETERPS) && defined(PADDLE_WITH_XPU_BKCL)
platform::SetXPUDeviceId(thread_id_);
#endif
int total_ins_num = 0;
int total_batch_num = 0;
// how to accumulate fetched values here
device_reader_->Start();
int cur_batch;
int batch_cnt = 0;
#if defined(PADDLE_WITH_HETERPS) && defined(PADDLE_WITH_CUDA)
platform::SetDeviceId(thread_id_);
#endif
while ((cur_batch = device_reader_->Next()) > 0) {
for (auto &op : ops_) {
bool need_skip = false;
......@@ -248,7 +258,7 @@ void HogwildWorker::TrainFiles() {
DumpParam(*thread_scope_, batch_cnt);
}
total_ins_num += cur_batch;
total_batch_num += cur_batch;
++batch_cnt;
PrintFetchVars();
thread_scope_->DropKids();
......@@ -257,8 +267,8 @@ void HogwildWorker::TrainFiles() {
#endif
}
timeline.Pause();
VLOG(1) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec()
<< " seconds, ins_num: " << total_ins_num;
VLOG(0) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec()
<< " seconds, batch_num: " << total_batch_num;
if (need_dump_field_ || need_dump_param_) {
writer_.Flush();
......
......@@ -157,7 +157,7 @@ std::vector<std::string> localfs_list(const std::string& path) {
std::shared_ptr<FILE> pipe;
int err_no = 0;
pipe = shell_popen(
string::format_string("find %s -type f -maxdepth 1", path.c_str()),
string::format_string("find %s -type f -maxdepth 1 | sort", path.c_str()),
"r",
&err_no);
string::LineFileReader reader;
......
......@@ -128,16 +128,16 @@ void PSGPUWorker::TrainFiles() {
timeline.Start();
int total_ins_num = 0;
// how to accumulate fetched values here
device_reader_->Start();
int cur_batch;
int batch_cnt = 0;
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
platform::SetDeviceId(thread_id_);
#elif defined(PADDLE_WITH_XPU_BKCL)
platform::SetXPUDeviceId(thread_id_);
#endif
// how to accumulate fetched values here
device_reader_->Start();
int cur_batch;
int batch_cnt = 0;
while ((cur_batch = device_reader_->Next()) > 0) {
total_ins_num += cur_batch;
for (auto& op : ops_) {
......
......@@ -58,7 +58,6 @@ void TrainerBase::DumpWork(int tid) {
int err_no = 0;
// GetDumpPath is implemented in each Trainer
std::string path = GetDumpPath(tid);
std::shared_ptr<FILE> fp = fs_open_write(path, &err_no, dump_converter_);
while (1) {
std::string out_str;
......
......@@ -68,7 +68,7 @@ message TrainerDesc {
// add for gpu
optional string fleet_desc = 37;
optional bool is_dump_in_simple_mode = 38 [ default = false ];
// device worker parameters
optional HogwildWorkerParameter hogwild_param = 101;
optional DownpourWorkerParameter downpour_param = 103;
......
......@@ -32,7 +32,7 @@ cc_library(
if(WITH_TESTING AND NOT WIN32)
add_custom_target(
jit_download_program
COMMAND wget -nc -q
COMMAND wget -nc -q --no-check-certificate
https://paddle-ci.gz.bcebos.com/dy2st/multi_program_load.tar.gz
COMMAND tar zxf multi_program_load.tar.gz)
set(JIT_DEPS
......
......@@ -170,7 +170,7 @@ if(WITH_TESTING)
if(NOT WIN32)
add_custom_target(
download_data
COMMAND wget -nc
COMMAND wget -nc --no-check-certificate
https://paddle-ci.cdn.bcebos.com/buddy_allocator_test_data.tar
COMMAND tar -xf buddy_allocator_test_data.tar)
add_dependencies(buddy_allocator_test download_data)
......
......@@ -68,6 +68,20 @@ PADDLE_DEFINE_EXPORTED_bool(
"Checking whether operator produce NAN/INF or not. It will be "
"extremely slow so please use this flag wisely.");
/**
 * Distributed related FLAG
 * Name: FLAGS_enable_opt_get_features
 * Value Range: bool, default=false
 * Example:
 * Note: Used in gpu graph mode. It controls whether to enable the optimized
 * path for fetching features.
 */
PADDLE_DEFINE_EXPORTED_bool(
    enable_opt_get_features,
    false,
    "It controls whether to enable the optimized path for fetching features "
    "in gpu graph mode.");
// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
// flags.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
......@@ -785,6 +799,34 @@ PADDLE_DEFINE_EXPORTED_bool(
false,
"It controls whether to apply IR pass to program when using Fleet APIs");
/**
* Distributed related FLAG
* Name: FLAGS_graph_load_in_parallel
* Since Version: 2.2.0
* Value Range: bool, default=false
* Example:
 * Note: It controls whether graph nodes and edges are loaded by multiple
 * threads in parallel. If it is not set, graph data is loaded by one thread.
 */
PADDLE_DEFINE_EXPORTED_bool(graph_load_in_parallel,
                            false,
                            "It controls whether graph nodes and edges are "
                            "loaded by multiple threads in parallel.");
/**
* Distributed related FLAG
* Name: FLAGS_graph_get_neighbor_id
* Since Version: 2.2.0
* Value Range: bool, default=false
* Example:
 * Note: It controls whether all neighbor ids are fetched when running on a
 * sub-part of the graph. If it is not set, neighbor ids are not fetched when
 * the whole graph is run.
 */
PADDLE_DEFINE_EXPORTED_bool(
    graph_get_neighbor_id,
    false,
    "It controls whether to get all neighbor ids when running a sub-part of "
    "the graph.");
/**
* KP kernel related FLAG
* Name: FLAGS_run_kp_kernel
......@@ -893,7 +935,33 @@ DEFINE_bool(enable_slotrecord_reset_shrink,
"enable slotrecord obejct reset shrink memory, default false");
DEFINE_bool(enable_ins_parser_file,
false,
"enable parser ins file , default false");
"enable parser ins file, default false");
PADDLE_DEFINE_EXPORTED_bool(
gpugraph_enable_hbm_table_collision_stat,
false,
"enable hash collisions stat for hbm table, default false");
PADDLE_DEFINE_EXPORTED_double(gpugraph_hbm_table_load_factor,
0.75,
"the load factor of hbm table, default 0.75");
PADDLE_DEFINE_EXPORTED_bool(
gpugraph_enable_gpu_direct_access,
false,
"enable direct access bwtween multi gpu cards, default false");
PADDLE_DEFINE_EXPORTED_bool(
gpugraph_enable_segment_merge_grads,
false,
"enable segment merge gradients while push sparse, default false");
PADDLE_DEFINE_EXPORTED_uint64(
gpugraph_merge_grads_segment_size,
128,
"segment size with segment gradient merge, default 128");
PADDLE_DEFINE_EXPORTED_int32(
gpugraph_dedup_pull_push_mode,
0,
"enable dedup keys while pull push sparse, default 0");
PADDLE_DEFINE_EXPORTED_bool(gpugraph_load_node_list_into_hbm,
true,
"enable load_node_list_into_hbm, default true");
/**
* ProcessGroupNCCL related FLAG
......
......@@ -365,6 +365,9 @@ void BindDataset(py::module *m) {
py::call_guard<py::gil_scoped_release>())
.def("enable_pv_merge",
&framework::Dataset::EnablePvMerge,
py::call_guard<py::gil_scoped_release>())
.def("set_gpu_graph_mode",
&framework::Dataset::SetGpuGraphMode,
py::call_guard<py::gil_scoped_release>());
py::class_<IterableDatasetWrapper>(*m, "IterableDatasetWrapper")
......
......@@ -199,13 +199,13 @@ void BindHeterClient(py::module* m) {
void BindGraphNode(py::module* m) {
py::class_<GraphNode>(*m, "GraphNode")
.def(py::init<>())
.def("get_id", &GraphNode::get_id)
.def("get_id", &GraphNode::get_py_id)
.def("get_feature", &GraphNode::get_feature);
}
void BindGraphPyFeatureNode(py::module* m) {
py::class_<FeatureNode>(*m, "FeatureNode")
.def(py::init<>())
.def("get_id", &GraphNode::get_id)
.def("get_id", &GraphNode::get_py_id)
.def("get_feature", &GraphNode::get_feature);
}
......@@ -359,17 +359,32 @@ void BindGraphGpuWrapper(py::module* m) {
*m, "GraphGpuWrapper")
.def(py::init([]() { return GraphGpuWrapper::GetInstance(); }))
.def("neighbor_sample", &GraphGpuWrapper::graph_neighbor_sample_v3)
.def("graph_neighbor_sample", &GraphGpuWrapper::graph_neighbor_sample)
.def("graph_neighbor_sample",
py::overload_cast<int, uint64_t*, int, int>(
&GraphGpuWrapper::graph_neighbor_sample))
.def("graph_neighbor_sample",
py::overload_cast<int, int, std::vector<uint64_t>&, int>(
&GraphGpuWrapper::graph_neighbor_sample))
.def("set_device", &GraphGpuWrapper::set_device)
.def("set_feature_separator", &GraphGpuWrapper::set_feature_separator)
.def("init_service", &GraphGpuWrapper::init_service)
.def("set_up_types", &GraphGpuWrapper::set_up_types)
.def("query_node_list", &GraphGpuWrapper::query_node_list)
.def("add_table_feat_conf", &GraphGpuWrapper::add_table_feat_conf)
.def("load_edge_file", &GraphGpuWrapper::load_edge_file)
.def("upload_batch", &GraphGpuWrapper::upload_batch)
.def("get_all_id", &GraphGpuWrapper::get_all_id)
.def("init_sample_status", &GraphGpuWrapper::init_sample_status)
.def("free_sample_status", &GraphGpuWrapper::free_sample_status)
.def("load_node_and_edge", &GraphGpuWrapper::load_node_and_edge)
.def("upload_batch",
py::overload_cast<int, int, int, const std::string&>(
&GraphGpuWrapper::upload_batch))
.def("upload_batch",
py::overload_cast<int, int, int>(&GraphGpuWrapper::upload_batch))
.def(
"get_all_id",
py::overload_cast<int, int, int, std::vector<std::vector<uint64_t>>*>(
&GraphGpuWrapper::get_all_id))
.def("get_all_id",
py::overload_cast<int, int, std::vector<std::vector<uint64_t>>*>(
&GraphGpuWrapper::get_all_id))
.def("load_next_partition", &GraphGpuWrapper::load_next_partition)
.def("make_partitions", &GraphGpuWrapper::make_partitions)
.def("make_complementary_graph",
......@@ -380,7 +395,8 @@ void BindGraphGpuWrapper(py::module* m) {
.def("get_partition", &GraphGpuWrapper::get_partition)
.def("load_node_weight", &GraphGpuWrapper::load_node_weight)
.def("export_partition_files", &GraphGpuWrapper::export_partition_files)
.def("load_node_file", &GraphGpuWrapper::load_node_file);
.def("load_node_file", &GraphGpuWrapper::load_node_file)
.def("finalize", &GraphGpuWrapper::finalize);
}
#endif
......
......@@ -18,6 +18,7 @@
#include <ctype.h>
#include <stdio.h>
#include <algorithm>
#include <cstring>
#include <sstream>
#include <string>
......@@ -221,6 +222,117 @@ std::string join_strings(const Container& strs,
return ss.str();
}
struct str_ptr {
const char* ptr;
size_t len;
str_ptr(const char* p, size_t n) : ptr(p), len(n) {}
str_ptr(const str_ptr& other) {
ptr = other.ptr;
len = other.len;
}
str_ptr(str_ptr&& other) {
ptr = other.ptr;
len = other.len;
}
size_t find_ptr(const char c) {
for (size_t i = 0; i < len; ++i) {
if (ptr[i] == c) {
return i;
}
}
return -1;
}
std::string to_string(void) { return std::string(ptr, len); }
};
struct str_ptr_stream {
char* ptr = NULL;
char* end = NULL;
str_ptr_stream() {}
str_ptr_stream(const str_ptr& p) { reset(p.ptr, p.len); }
void reset(const str_ptr& p) { reset(p.ptr, p.len); }
void reset(const char* p, size_t len) {
ptr = const_cast<char*>(p);
end = ptr + len;
}
char* cursor(void) { return ptr; }
char* finish(void) { return end; }
void set_cursor(char* p) { ptr = p; }
bool is_finish(void) { return (ptr == end); }
template <typename T>
str_ptr_stream& operator>>(T& x) {
*this >> x;
return *this;
}
};
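// NOTE: the non-template operator>> overloads below are exact matches and are
// therefore preferred over the member template above, so numeric extraction
// is dispatched to the strtof/strtod/strtol-family parsers without copying
// the token.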
inline str_ptr_stream& operator>>(str_ptr_stream& ar, float& c) {
char* next = NULL;
c = strtof(ar.cursor(), &next);
ar.set_cursor(std::min(++next, ar.finish()));
return ar;
}
inline str_ptr_stream& operator>>(str_ptr_stream& ar, double& c) {
char* next = NULL;
c = strtod(ar.cursor(), &next);
ar.set_cursor(std::min(++next, ar.finish()));
return ar;
}
inline str_ptr_stream& operator>>(str_ptr_stream& ar, int32_t& c) {
char* next = NULL;
c = strtol(ar.cursor(), &next, 10);
ar.set_cursor(std::min(++next, ar.finish()));
return ar;
}
inline str_ptr_stream& operator>>(str_ptr_stream& ar, uint32_t& c) {
char* next = NULL;
c = strtoul(ar.cursor(), &next, 10);
ar.set_cursor(std::min(++next, ar.finish()));
return ar;
}
inline str_ptr_stream& operator>>(str_ptr_stream& ar, uint64_t& c) {
char* next = NULL;
c = strtoul(ar.cursor(), &next, 10);
ar.set_cursor(std::min(++next, ar.finish()));
return ar;
}
inline str_ptr_stream& operator>>(str_ptr_stream& ar, int64_t& c) {
char* next = NULL;
c = strtoll(ar.cursor(), &next, 10);
ar.set_cursor(std::min(++next, ar.finish()));
return ar;
}
inline int split_string_ptr(const char* str,
size_t len,
char delim,
std::vector<str_ptr>* values) {
if (len <= 0) {
return 0;
}
int num = 0;
const char* p = str;
const char* end = str + len;
const char* last = str;
while (p < end) {
if (*p != delim) {
++p;
continue;
}
values->emplace_back(last, (size_t)(p - last));
++num;
++p;
// skip consecutive delimiters
while (p < end && *p == delim) {
++p;
}
last = p;
}
if (p > last) {
values->emplace_back(last, (size_t)(p - last));
++num;
}
return num;
}
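// A minimal usage sketch (illustrative, names are hypothetical): split a
// space-separated line of floats without copying, then parse each token
// numerically with str_ptr_stream.
//
//   std::string line = "0.5 1.5 2.5";
//   std::vector<str_ptr> cols;
//   int n = split_string_ptr(line.c_str(), line.size(), ' ', &cols);
//   std::vector<float> vals(n);
//   for (int i = 0; i < n; ++i) {
//     str_ptr_stream is(cols[i]);
//     is >> vals[i];
//   }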
// A helper class for reading lines from file. A line buffer is maintained. It
// doesn't need to know the maximum possible length of a line.
......
......@@ -530,7 +530,7 @@ class DistributedStrategy(object):
'embed_sparse_initial_range', 'embed_sparse_initial_g2sum', 'embed_sparse_beta1_decay_rate', \
'embed_sparse_beta2_decay_rate', 'embedx_sparse_optimizer', 'embedx_sparse_learning_rate', \
'embedx_sparse_weight_bounds', 'embedx_sparse_initial_range', 'embedx_sparse_initial_g2sum', \
'embedx_sparse_beta1_decay_rate', 'embedx_sparse_beta2_decay_rate']
'embedx_sparse_beta1_decay_rate', 'embedx_sparse_beta2_decay_rate', 'feature_learning_rate', 'nodeid_slot']
support_sparse_table_class = ['DownpourSparseTable']
support_sparse_accessor_class = [
'DownpourSparseValueAccessor', 'DownpourCtrAccessor',
......@@ -540,6 +540,11 @@ class DistributedStrategy(object):
from google.protobuf.descriptor import FieldDescriptor
table_param = self.strategy.downpour_table_param
def add_graph_config(graph, strategy):
graph.feature_learning_rate = strategy.get('feature_learning_rate',
0.05)
graph.nodeid_slot = strategy.get('nodeid_slot', 9008)
def sparse_optimizer_config(sgd, strategy, prefix):
optimizer_name = strategy.get(prefix + "sparse_optimizer",
"adagrad")
......@@ -691,6 +696,7 @@ class DistributedStrategy(object):
config, 'embed_')
sparse_optimizer_config(table_data.accessor.embedx_sgd_param,
config, 'embedx_')
add_graph_config(table_data.accessor.graph_sgd_param, config)
if not configs:
print("fleet desc config is empty")
......
......@@ -155,6 +155,12 @@ class Accessor:
if not accessor_proto.HasField("embedx_threshold"):
accessor_proto.embedx_threshold = 0
graph_sgd_param = accessor_proto.graph_sgd_param
if not graph_sgd_param.HasField("nodeid_slot"):
graph_sgd_param.nodeid_slot = 9008
if not graph_sgd_param.HasField("feature_learning_rate"):
graph_sgd_param.feature_learning_rate = 0.05
ctr_accessor_param = accessor_proto.ctr_accessor_param
if not ctr_accessor_param.HasField("nonclk_coeff"):
ctr_accessor_param.nonclk_coeff = 0.1
......
......@@ -933,7 +933,7 @@ def shuffle_batch(x, seed=None):
seed = helper.create_variable(
name=unique_name.generate("shuffle_batch_seed"),
dtype="int64",
persistable=True)
persistable=False)
helper.append_op(type='shuffle_batch',
inputs={
'X': x,
......
......@@ -1037,6 +1037,51 @@ class InMemoryDataset(DatasetBase):
"""
self.dataset.set_heter_ps(enable_heter_ps)
def set_graph_config(self, config):
"""
Set graph config. Users can set the graph config in gpu graph mode.
Args:
config(dict): graph config dict.
Returns:
None.
Examples:
.. code-block:: python
# required: skiptest
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
graph_config = {"walk_len": 24,
"walk_degree": 10,
"once_sample_startid_len": 80000,
"sample_times_one_chunk": 5,
"window": 3,
"debug_mode": 0,
"batch_size": 800,
"meta_path": "cuid2clk-clk2cuid;cuid2conv-conv2cuid;clk2cuid-cuid2clk;clk2cuid-cuid2conv",
"gpu_graph_training": 1}
dataset.set_graph_config(graph_config)
"""
self.proto_desc.graph_config.walk_degree = config.get("walk_degree", 1)
self.proto_desc.graph_config.walk_len = config.get("walk_len", 20)
self.proto_desc.graph_config.window = config.get("window", 5)
self.proto_desc.graph_config.once_sample_startid_len = config.get(
"once_sample_startid_len", 8000)
self.proto_desc.graph_config.sample_times_one_chunk = config.get(
"sample_times_one_chunk", 10)
self.proto_desc.graph_config.batch_size = config.get("batch_size", 1)
self.proto_desc.graph_config.debug_mode = config.get("debug_mode", 0)
self.proto_desc.graph_config.first_node_type = config.get(
"first_node_type", "")
self.proto_desc.graph_config.meta_path = config.get("meta_path", "")
self.proto_desc.graph_config.gpu_graph_training = config.get(
"gpu_graph_training", True)
self.dataset.set_gpu_graph_mode(True)
class QueueDataset(DatasetBase):
"""
......
......@@ -744,6 +744,65 @@ class TestDataset(unittest.TestCase):
temp_dir.cleanup()
def test_run_with_inmemory_dataset_train_debug_mode(self):
"""
Testcase for InMemoryDataset from create to run.
"""
temp_dir = tempfile.TemporaryDirectory()
dump_a_path = os.path.join(temp_dir.name, 'test_run_with_dump_a.txt')
dump_b_path = os.path.join(temp_dir.name, 'test_run_with_dump_b.txt')
with open(dump_a_path, "w") as f:
data = "1 a 1 a 1 1 2 3 3 4 5 5 5 5 1 1\n"
data += "1 b 1 b 1 2 2 3 4 4 6 6 6 6 1 2\n"
data += "1 c 1 c 1 3 2 3 5 4 7 7 7 7 1 3\n"
f.write(data)
with open(dump_b_path, "w") as f:
data = "1 d 1 d 1 4 2 3 3 4 5 5 5 5 1 4\n"
data += "1 e 1 e 1 5 2 3 4 4 6 6 6 6 1 5\n"
data += "1 f 1 f 1 6 2 3 5 4 7 7 7 7 1 6\n"
data += "1 g 1 g 1 7 2 3 6 4 8 8 8 8 1 7\n"
f.write(data)
slots = ["slot1", "slot2", "slot3", "slot4"]
slots_vars = []
for slot in slots:
var = fluid.layers.data(name=slot,
shape=[1],
dtype="int64",
lod_level=1)
slots_vars.append(var)
dataset = paddle.distributed.InMemoryDataset()
dataset.init(batch_size=32,
thread_num=1,
pipe_command="cat",
data_feed_type="SlotRecordInMemoryDataFeed",
use_var=slots_vars)
dataset._init_distributed_settings(parse_ins_id=True,
parse_content=True,
fea_eval=True,
candidate_size=10000)
dataset.set_filelist([dump_a_path, dump_b_path])
dataset.load_into_memory()
paddle.enable_static()
exe = paddle.static.Executor(paddle.CPUPlace())
startup_program = paddle.static.Program()
main_program = paddle.static.Program()
exe.run(startup_program)
for i in range(2):
try:
exe.train_from_dataset(main_program, dataset, debug=True)
except ImportError as e:
pass
except Exception as e:
self.assertTrue(False)
temp_dir.cleanup()
class TestDatasetWithDataLoader(TestDataset):
"""
......
......@@ -45,6 +45,17 @@ class TestTrainerDesc(unittest.TestCase):
self.assertEqual(mpi_rank, 1)
self.assertEqual(dump_fields_path, "path")
def test_config_dump_simple(self):
"""
Testcase for dump_in_simple_mode
"""
trainer_desc = fluid.trainer_desc.TrainerDesc()
trainer_desc._set_dump_fields(["a", "b"])
trainer_desc._set_is_dump_in_simple_mode(True)
is_dump_in_simple_mode = trainer_desc.proto_desc.is_dump_in_simple_mode
self.assertEqual(is_dump_in_simple_mode, 1)
if __name__ == '__main__':
unittest.main()
......@@ -156,6 +156,9 @@ class TrainerDesc(object):
for field in dump_fields:
self.proto_desc.dump_fields.append(field)
def _set_is_dump_in_simple_mode(self, is_dump_in_simple_mode):
self.proto_desc.is_dump_in_simple_mode = is_dump_in_simple_mode
def _set_dump_fields_path(self, path):
self.proto_desc.dump_fields_path = path
......
......@@ -84,6 +84,9 @@ class TrainerFactory(object):
trainer._set_worker_places(opt_info["worker_places"])
if opt_info.get("use_ps_gpu") is not None:
trainer._set_use_ps_gpu(opt_info["use_ps_gpu"])
if opt_info.get("is_dump_in_simple_mode") is not None:
trainer._set_is_dump_in_simple_mode(
opt_info["is_dump_in_simple_mode"])
if opt_info.get("enable_random_dump") is not None:
trainer._set_enable_random_dump(
opt_info["enable_random_dump"])
......