optimize graph_engine pybind (#42192)

* extract sub-graph * graph-engine merging * fix * fix * fix heter-ps config * test performance * test performance * test performance * test * test * update bfs * change cmake * test * test gpu speed * gpu_graph_engine optimization * add dsm sample method * add graph_neighbor_sample_v2 * Add graph_neighbor_sample_v2 * fix for loop * add cpu sample interface * fix kernel judgement * add ssd layer to graph_engine * fix allocation * fix syntax error * fix syntax error * fix pscore class * fix * change index settings * recover test * recover test * fix spelling * recover * fix * move cudamemcpy after cuda stream sync * fix linking problem * remove comment * add cpu test * test * add cpu test * change comment * combine feature table and graph table * test * test * pybind * test * test * test * test * pybind * pybind * fix cmake * pybind * fix * fix * add pybind * add pybind * optimize pybind * test * fix pybind * fix Co-authored-by: N DesmonDay <908660116@qq.com>

optimize graph_engine pybind (#42192)
* extract sub-graph * graph-engine merging * fix * fix * fix heter-ps config * test performance * test performance * test performance * test * test * update bfs * change cmake * test * test gpu speed * gpu_graph_engine optimization * add dsm sample method * add graph_neighbor_sample_v2 * Add graph_neighbor_sample_v2 * fix for loop * add cpu sample interface * fix kernel judgement * add ssd layer to graph_engine * fix allocation * fix syntax error * fix syntax error * fix pscore class * fix * change index settings * recover test * recover test * fix spelling * recover * fix * move cudamemcpy after cuda stream sync * fix linking problem * remove comment * add cpu test * test * add cpu test * change comment * combine feature table and graph table * test * test * pybind * test * test * test * test * pybind * pybind * fix cmake * pybind * fix * fix * add pybind * add pybind * optimize pybind * test * fix pybind * fix Co-authored-by: N DesmonDay <908660116@qq.com>
1bf08eca · seemingwang · GitHub · 06694953 · 1bf08eca · 1bf08eca
10 changed files
--- a/paddle/fluid/distributed/ps/service/CMakeLists.txt
+++ b/paddle/fluid/distributed/ps/service/CMakeLists.txt
 set(BRPC_SRCS ps_client.cc server.cc)
 set_source_files_properties(${BRPC_SRCS})
 if(WITH_HETERPS)
    set(BRPC_DEPS brpc ssl crypto protobuf gflags glog zlib leveldb snappy gflags glog device_context rocksdb)
 else()
    set(BRPC_DEPS brpc ssl crypto protobuf gflags glog zlib leveldb snappy gflags glog device_context)
 endif()
 brpc_library(sendrecv_rpc SRCS

--- a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h
+++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h
@@ -64,11 +64,9 @@ struct GpuPsCommGraph {
 /*
 suppose we have a graph like this
 0----3-----5----7
 \   |\         |\
 17  8 9        1 2
 we save the nodes in arbitrary order,
 in this example,the order is
 [0,5,1,2,7,3,8,9,17]
@@ -83,7 +81,6 @@ we record each node's neighbors:
 8:3
 9:3
 17:0
 by concatenating each node's neighbor_list in the order we save the node id.
 we get [3,17,3,7,7,7,1,2,5,0,5,8,9,3,3,0]
 this is the neighbor_list of GpuPsCommGraph
@@ -114,6 +111,32 @@ node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13
 node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14
 node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15
 */
+// Describes one neighbor-sampling request: the gpu to run on, the keys to
+// sample for, how many neighbors to draw per key and how many keys there are.
+// Members are zero-initialised so that a default-constructed query (the
+// Python binding exposes the default constructor) can be displayed safely.
+struct NeighborSampleQuery {
+  int gpu_id = 0;
+  // display() copies `len` entries from here with cudaMemcpyDeviceToHost.
+  int64_t *key = nullptr;
+  int sample_size = 0;
+  int len = 0;
+  // Fills in every field. `key` is an integer carrying a pointer value (for
+  // example the result of NodeQueryResult::get_val()); it is reinterpreted
+  // as int64_t* and the query does not take ownership of that memory.
+  void initialize(int gpu_id, int64_t key, int sample_size, int len) {
+    this->gpu_id = gpu_id;
+    this->key = (int64_t *)key;
+    this->sample_size = sample_size;
+    this->len = len;
+  }
+  // Debug helper: logs the query parameters followed by the keys joined
+  // with ';'. If the keys cannot be copied to the host the CUDA error is
+  // logged instead of printing an uninitialised buffer.
+  void display() {
+    VLOG(0) << "device_id " << gpu_id << " sample_size = " << sample_size;
+    VLOG(0) << "there are " << len << " keys ";
+    std::string key_str;
+    // Nothing to copy for an empty, negative-length or unset query.
+    if (len > 0 && key != nullptr) {
+      std::unique_ptr<int64_t[]> sample_keys(new int64_t[len]);
+      cudaError_t err =
+          cudaMemcpy(sample_keys.get(), key,
+                     static_cast<size_t>(len) * sizeof(int64_t),
+                     cudaMemcpyDeviceToHost);
+      if (err != cudaSuccess) {
+        VLOG(0) << "NeighborSampleQuery::display cudaMemcpy failed: "
+                << cudaGetErrorString(err);
+        return;
+      }
+      for (int i = 0; i < len; i++) {
+        if (key_str.size() > 0) key_str += ";";
+        key_str += std::to_string(sample_keys[i]);
+      }
+    }
+    VLOG(0) << key_str;
+  }
+};
 struct NeighborSampleResult {
  int64_t *val;
  int *actual_sample_size, sample_size, key_size;
@@ -134,6 +157,29 @@ struct NeighborSampleResult {
        memory::AllocShared(place, _key_size * sizeof(int));
    actual_sample_size = (int *)actual_sample_size_mem->ptr();
  }
+  // Debug helper: copies the sampling result to the host and logs, for each
+  // key, its actual sample size followed by the sampled neighbors joined
+  // with ';'. Neighbors of key i are read from val[i * sample_size + j].
+  // A failed copy is reported instead of printing an uninitialised buffer.
+  void display() {
+    VLOG(0) << "in node sample result display ------------------";
+    // Sizes are computed in size_t so the product cannot overflow int and a
+    // negative field never reaches new[].
+    const size_t key_num = key_size > 0 ? static_cast<size_t>(key_size) : 0;
+    const size_t stride =
+        sample_size > 0 ? static_cast<size_t>(sample_size) : 0;
+    std::unique_ptr<int64_t[]> res(new int64_t[key_num * stride]);
+    std::unique_ptr<int[]> ac_size(new int[key_num]);
+    cudaError_t err = cudaSuccess;
+    if (key_num * stride > 0) {
+      err = cudaMemcpy(res.get(), val, key_num * stride * sizeof(int64_t),
+                       cudaMemcpyDeviceToHost);
+    }
+    if (err == cudaSuccess && key_num > 0) {
+      err = cudaMemcpy(ac_size.get(), actual_sample_size,
+                       key_num * sizeof(int), cudaMemcpyDeviceToHost);
+    }
+    if (err != cudaSuccess) {
+      VLOG(0) << "NeighborSampleResult::display cudaMemcpy failed: "
+              << cudaGetErrorString(err);
+    } else {
+      for (size_t i = 0; i < key_num; i++) {
+        VLOG(0) << "actual sample size for " << i << "th key is "
+                << ac_size[i];
+        VLOG(0) << "sampled neighbors are ";
+        // Never read past this key's slot, even if the reported count is
+        // negative or larger than sample_size.
+        size_t shown = ac_size[i] > 0 ? static_cast<size_t>(ac_size[i]) : 0;
+        if (shown > stride) shown = stride;
+        std::string neighbor;
+        for (size_t j = 0; j < shown; j++) {
+          if (neighbor.size() > 0) neighbor += ";";
+          neighbor += std::to_string(res[i * stride + j]);
+        }
+        VLOG(0) << neighbor;
+      }
+    }
+    VLOG(0) << " ------------------";
+  }
  NeighborSampleResult(){};
  ~NeighborSampleResult() {
    // if (val != NULL) cudaFree(val);
@@ -145,13 +191,39 @@ struct NeighborSampleResult {
 struct NodeQueryResult {
  int64_t *val;
  int actual_sample_size;
+  // Returns the `val` pointer converted to an integer.
+  // NOTE(review): presumably so the address can cross the Python binding and
+  // be handed to NeighborSampleQuery::initialize - confirm against callers.
+  int64_t get_val() { return (int64_t)val; }
+  // Returns actual_sample_size, i.e. the number of entries display() reads
+  // from `val`.
+  int get_len() { return actual_sample_size; }
+  // Allocation that `val` points into once initialize() has run.
+  std::shared_ptr<memory::Allocation> val_mem;
+  // Allocates room for `query_size` int64_t values on CUDAPlace(dev_id),
+  // points `val` at that buffer and resets actual_sample_size to 0.
+  // The buffer is held by `val_mem`.
+  // NOTE(review): query_size is not validated here - confirm callers never
+  // pass a non-positive value.
+  void initialize(int query_size, int dev_id) {
+    // presumably makes dev_id the current device for this scope - TODO confirm
+    platform::CUDADeviceGuard guard(dev_id);
+    platform::CUDAPlace place = platform::CUDAPlace(dev_id);
+    val_mem = memory::AllocShared(place, query_size * sizeof(int64_t));
+    val = (int64_t *)val_mem->ptr();
+    // cudaMalloc((void **)&val, query_size * sizeof(int64_t));
+    actual_sample_size = 0;
+  }
+  // Debug helper: copies the first actual_sample_size entries of `val` to
+  // the host and logs them joined with ';'. A failed copy is reported
+  // instead of printing an uninitialised buffer.
+  void display() {
+    VLOG(0) << "in node query result display ------------------";
+    // A negative count is treated as empty so it never reaches new[].
+    const size_t count =
+        actual_sample_size > 0 ? static_cast<size_t>(actual_sample_size) : 0;
+    std::unique_ptr<int64_t[]> res(new int64_t[count]);
+    cudaError_t err = cudaSuccess;
+    if (count > 0) {
+      err = cudaMemcpy(res.get(), val, count * sizeof(int64_t),
+                       cudaMemcpyDeviceToHost);
+    }
+    VLOG(0) << "actual_sample_size =" << actual_sample_size;
+    if (err != cudaSuccess) {
+      VLOG(0) << "NodeQueryResult::display cudaMemcpy failed: "
+              << cudaGetErrorString(err);
+    } else {
+      std::string str;
+      for (size_t i = 0; i < count; i++) {
+        if (str.size() > 0) str += ";";
+        str += std::to_string(res[i]);
+      }
+      VLOG(0) << str;
+    }
+    VLOG(0) << " ------------------";
+  }
  NodeQueryResult() {
    val = NULL;
    actual_sample_size = 0;
  };
-  ~NodeQueryResult() {
+  ~NodeQueryResult() {}
-    if (val != NULL) cudaFree(val);
-  }
 };
 }
 };

--- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h
+++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h
@@ -83,13 +83,15 @@ class GpuPsGraphTable : public HeterComm<int64_t, int, int> {
    // }
  }
  void build_graph_from_cpu(std::vector<GpuPsCommGraph> &cpu_node_list);
-  NodeQueryResult *graph_node_sample(int gpu_id, int sample_size);
+  NodeQueryResult graph_node_sample(int gpu_id, int sample_size);
-  NeighborSampleResult *graph_neighbor_sample(int gpu_id, int64_t *key,
+  NeighborSampleResult graph_neighbor_sample_v3(NeighborSampleQuery q,
+                                                bool cpu_switch);
+  NeighborSampleResult graph_neighbor_sample(int gpu_id, int64_t *key,
                                             int sample_size, int len);
-  NeighborSampleResult *graph_neighbor_sample_v2(int gpu_id, int64_t *key,
+  NeighborSampleResult graph_neighbor_sample_v2(int gpu_id, int64_t *key,
                                                int sample_size, int len,
                                                bool cpu_query_switch);
-  NodeQueryResult *query_node_list(int gpu_id, int start, int query_size);
+  NodeQueryResult query_node_list(int gpu_id, int start, int query_size);
  void clear_graph_info();
  void move_neighbor_sample_result_to_source_gpu(int gpu_id, int gpu_num,
                                                 int sample_size, int *h_left,

--- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h
+++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h
@@ -13,7 +13,7 @@
 // limitations under the License.
 #include <thrust/device_vector.h>
+#include <functional>
 #pragma once
 #ifdef PADDLE_WITH_HETERPS
 //#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h"
@@ -28,7 +28,6 @@ actual_size[0,len) is to save the sample size of each node.
 for ith node in index, actual_size[i] = min(node i's neighbor size, sample size)
 sample_result is to save the neighbor sampling result, its size is len *
 sample_size;
 */
 __global__ void get_cpu_id_index(int64_t* key, int* val, int64_t* cpu_key,
@@ -198,7 +197,6 @@ int GpuPsGraphTable::init_cpu_table(
 // }
 /*
 comment 1
 gpu i triggers a neighbor_sample task,
 when this task is done,
 this function is called to move the sample result on other gpu back
@@ -211,13 +209,11 @@ int GpuPsGraphTable::init_cpu_table(
 smaller than sample_size,
 is saved on src_sample_res [x*sample_size, x*sample_size +
 actual_sample_size[x])
 since before each gpu runs the neighbor_sample task,the key array is shuffled,
 but we have the idx array to save the original order.
 when the gpu i gets all the sample results from other gpus, it relies on
 idx array to recover the original order.
 that's what fill_dvals does.
 */
 void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu(
@@ -404,10 +400,8 @@ void GpuPsGraphTable::clear_graph_info() {
 /*
 the parameter std::vector<GpuPsCommGraph> cpu_graph_list is generated by cpu.
 it saves the graph to be saved on each gpu.
 for the ith GpuPsCommGraph, any the node's key satisfies that key % gpu_number
 == i
 In this function, memory is allocated on each gpu to save the graphs,
 gpu i saves the ith graph from cpu_graph_list
 */
@@ -468,7 +462,12 @@ void GpuPsGraphTable::build_graph_from_cpu(
  cudaDeviceSynchronize();
 }
-NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
+// Query-object flavour of graph_neighbor_sample_v2: unpacks the fields of
+// `q` and forwards them, together with `cpu_switch`, unchanged.
+NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v3(
+    NeighborSampleQuery q, bool cpu_switch) {
+  const int target_gpu = q.gpu_id;
+  int64_t* query_keys = q.key;
+  const int per_key_samples = q.sample_size;
+  const int key_count = q.len;
+  return graph_neighbor_sample_v2(target_gpu, query_keys, per_key_samples,
+                                  key_count, cpu_switch);
+}
+NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
                                                            int64_t* key,
                                                            int sample_size,
                                                            int len) {
@@ -479,7 +478,6 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
  gpu_id:the id of gpu.
  len:how many keys are used,(the length of array key)
  sample_size:how many neighbors should be sampled for each node in key.
  the code below shuffle the key array to make the keys
    that belong to a gpu-card stay together,
    the shuffled result is saved on d_shard_keys,
@@ -489,18 +487,16 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
    if keys in range [a,b] belong to ith-gpu, then h_left[i] = a, h_right[i] =
 b,
    if no keys are allocated for ith-gpu, then h_left[i] == h_right[i] == -1
    for example, suppose key = [0,1,2,3,4,5,6,7,8], gpu_num = 2
    when we run this neighbor_sample function,
    the key is shuffled to [0,2,4,6,8,1,3,5,7]
    the first part (0,2,4,6,8) % 2 == 0,thus should be handled by gpu 0,
    the rest part should be handled by gpu1, because (1,3,5,7) % 2 == 1,
    h_left = [0,5],h_right = [4,8]
  */
-  NeighborSampleResult* result = new NeighborSampleResult();
+  NeighborSampleResult result;
-  result->initialize(sample_size, len, resource_->dev_id(gpu_id));
+  result.initialize(sample_size, len, resource_->dev_id(gpu_id));
  if (len == 0) {
    return result;
  }
@@ -508,8 +504,8 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
  platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id));
  // cudaMalloc((void**)&result->val, len * sample_size * sizeof(int64_t));
  // cudaMalloc((void**)&result->actual_sample_size, len * sizeof(int));
-  int* actual_sample_size = result->actual_sample_size;
+  int* actual_sample_size = result.actual_sample_size;
-  int64_t* val = result->val;
+  int64_t* val = result.val;
  int total_gpu = resource_->total_device();
  // int dev_id = resource_->dev_id(gpu_id);
  auto stream = resource_->local_stream(gpu_id, 0);
@@ -686,10 +682,10 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
  return result;
 }
-NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample_v2(
+NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
    int gpu_id, int64_t* key, int sample_size, int len, bool cpu_query_switch) {
-  NeighborSampleResult* result = new NeighborSampleResult();
+  NeighborSampleResult result;
-  result->initialize(sample_size, len, resource_->dev_id(gpu_id));
+  result.initialize(sample_size, len, resource_->dev_id(gpu_id));
  if (len == 0) {
    return result;
@@ -697,8 +693,8 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample_v2(
  platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id));
  platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id));
-  int* actual_sample_size = result->actual_sample_size;
+  int* actual_sample_size = result.actual_sample_size;
-  int64_t* val = result->val;
+  int64_t* val = result.val;
  int total_gpu = resource_->total_device();
  auto stream = resource_->local_stream(gpu_id, 0);
@@ -861,17 +857,19 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample_v2(
  return result;
 }
-NodeQueryResult* GpuPsGraphTable::graph_node_sample(int gpu_id,
+NodeQueryResult GpuPsGraphTable::graph_node_sample(int gpu_id,
-                                                    int sample_size) {}
+                                                   int sample_size) {
+  return NodeQueryResult();
+}
-NodeQueryResult* GpuPsGraphTable::query_node_list(int gpu_id, int start,
+NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id, int start,
                                                 int query_size) {
-  NodeQueryResult* result = new NodeQueryResult();
+  NodeQueryResult result;
  if (query_size <= 0) return result;
-  int& actual_size = result->actual_sample_size;
+  int& actual_size = result.actual_sample_size;
  actual_size = 0;
-  cudaMalloc((void**)&result->val, query_size * sizeof(int64_t));
+  result.initialize(query_size, resource_->dev_id(gpu_id));
-  int64_t* val = result->val;
+  int64_t* val = result.val;
  // int dev_id = resource_->dev_id(gpu_id);
  // platform::CUDADeviceGuard guard(dev_id);
  platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id));
@@ -883,7 +881,6 @@ NodeQueryResult* GpuPsGraphTable::query_node_list(int gpu_id, int start,
  sample_size[i] = s;
  then on gpu a, the nodes of positions [p1,p1 + s) should be returned
  and saved from the p2 position on the sample_result array
  for example:
  suppose
  gpu 0 saves [0,2,4,6,8], gpu1 saves [1,3,5,7]
@@ -893,23 +890,29 @@ NodeQueryResult* GpuPsGraphTable::query_node_list(int gpu_id, int start,
  gpu_begin_pos = [3,0]
  local_begin_pos = [0,3]
  sample_size = [2,3]
  */
+  std::function<int(int, int, int, int, int&, int&)> range_check = [](
+      int x, int y, int x1, int y1, int& x2, int& y2) {
+    if (y <= x1 || x >= y1) return 0;
+    y2 = min(y, y1);
+    x2 = max(x1, x);
+    return y2 - x2;
+  };
  for (int i = 0; i < gpu_graph_list.size() && query_size != 0; i++) {
    auto graph = gpu_graph_list[i];
    if (graph.node_size == 0) {
      continue;
    }
-    if (graph.node_size + size > start) {
+    int x2, y2;
-      int cur_size = min(query_size, graph.node_size + size - start);
+    int len = range_check(start, start + query_size, size,
-      query_size -= cur_size;
+                          size + graph.node_size, x2, y2);
-      idx.emplace_back(i);
+    if (len > 0) {
-      gpu_begin_pos.emplace_back(start - size);
+      idx.push_back(i);
+      gpu_begin_pos.emplace_back(x2 - size);
      local_begin_pos.emplace_back(actual_size);
-      start += cur_size;
+      sample_size.push_back(len);
-      actual_size += cur_size;
+      actual_size += len;
-      sample_size.emplace_back(cur_size);
+      create_storage(gpu_id, i, 1, len * sizeof(int64_t));
-      create_storage(gpu_id, i, 1, cur_size * sizeof(int64_t));
    }
    size += graph.node_size;
  }
@@ -936,6 +939,9 @@ NodeQueryResult* GpuPsGraphTable::query_node_list(int gpu_id, int start,
    auto& node = path_[gpu_id][idx[i]].nodes_.front();
    cudaStreamSynchronize(node.out_stream);
  }
+  for (auto x : idx) {
+    destroy_storage(gpu_id, x);
+  }
  return result;
 }
 }

--- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu
+++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu
@@ -158,14 +158,16 @@ void GraphGpuWrapper::init_service() {
  graph_table = (char *)g;
 }
-void GraphGpuWrapper::upload_batch(std::vector<std::vector<int64_t>> &ids) {
+void GraphGpuWrapper::upload_batch(int idx,
+                                   std::vector<std::vector<int64_t>> &ids) {
  GpuPsGraphTable *g = (GpuPsGraphTable *)graph_table;
  std::vector<paddle::framework::GpuPsCommGraph> vec;
  for (int i = 0; i < ids.size(); i++) {
-    vec.push_back(g->cpu_graph_table->make_gpu_ps_graph(0, ids[i]));
+    vec.push_back(g->cpu_graph_table->make_gpu_ps_graph(idx, ids[i]));
  }
  g->build_graph_from_cpu(vec);
 }
 void GraphGpuWrapper::initialize() {
  std::vector<int> device_id_mapping;
  for (int i = 0; i < 2; i++) device_id_mapping.push_back(i);
@@ -238,10 +240,10 @@ void GraphGpuWrapper::test() {
      ((GpuPsGraphTable *)graph_table)
          ->graph_neighbor_sample(0, (int64_t *)key, 2, 3);
  int64_t *res = new int64_t[7];
-  cudaMemcpy(res, neighbor_sample_res->val, 3 * 2 * sizeof(int64_t),
+  cudaMemcpy(res, neighbor_sample_res.val, 3 * 2 * sizeof(int64_t),
             cudaMemcpyDeviceToHost);
  int *actual_sample_size = new int[3];
-  cudaMemcpy(actual_sample_size, neighbor_sample_res->actual_sample_size,
+  cudaMemcpy(actual_sample_size, neighbor_sample_res.actual_sample_size,
             3 * sizeof(int),
             cudaMemcpyDeviceToHost);  // 3, 1, 3
@@ -256,12 +258,60 @@ void GraphGpuWrapper::test() {
    }
  }
 }
-NeighborSampleResult *GraphGpuWrapper::graph_neighbor_sample(int gpu_id,
+NeighborSampleResult GraphGpuWrapper::graph_neighbor_sample_v3(
-                                                             int64_t *key,
+    NeighborSampleQuery q, bool cpu_switch) {
-                                                             int sample_size,
+  return ((GpuPsGraphTable *)graph_table)
-                                                             int len) {
+      ->graph_neighbor_sample_v3(q, cpu_switch);
+}
+// this function is contributed by Liwb5
+// Samples up to `sample_size` neighbors for every id in `key` and returns
+// the edges as one flat vector of 2 * E entries, where E is the total number
+// of sampled neighbors: res[0, E) holds the source id of each edge and
+// res[E, 2E) holds the matching neighbor id, in key order.
+// The device-side result stores the neighbors of key i in the fixed slot
+// [i * sample_size, i * sample_size + actual_sample_size[i]), so the slots
+// are compacted on the host; copying the first E device entries directly
+// would pick up unused slot padding whenever a key has fewer than
+// sample_size neighbors.
+// Returns an empty vector when `key` is empty or a CUDA call fails (the
+// failure is logged).
+// NOTE(review): gpu_id is used directly as the CUDA device id for the guard
+// below - confirm that matches the table's gpu_id -> device mapping.
+std::vector<int64_t> GraphGpuWrapper::graph_neighbor_sample(
+    int gpu_id, std::vector<int64_t> &key, int sample_size) {
+  std::vector<int64_t> res;
+  if (key.empty()) return res;
+  const size_t key_num = key.size();
+  const size_t stride =
+      sample_size > 0 ? static_cast<size_t>(sample_size) : 0;
+  int64_t *cuda_key = nullptr;
+  platform::CUDADeviceGuard guard(gpu_id);
+  cudaError_t err = cudaMalloc(&cuda_key, key_num * sizeof(int64_t));
+  if (err != cudaSuccess) {
+    VLOG(0) << "graph_neighbor_sample cudaMalloc failed: "
+            << cudaGetErrorString(err);
+    return res;
+  }
+  err = cudaMemcpy(cuda_key, key.data(), key_num * sizeof(int64_t),
+                   cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    VLOG(0) << "graph_neighbor_sample key upload failed: "
+            << cudaGetErrorString(err);
+    cudaFree(cuda_key);
+    return res;
+  }
+  auto neighbor_sample_res =
+      ((GpuPsGraphTable *)graph_table)
+          ->graph_neighbor_sample(gpu_id, cuda_key, sample_size, key.size());
+  // Host copies live in vectors so nothing leaks on any return path.
+  std::vector<int> actual_sample_size(key_num);
+  std::vector<int64_t> sampled(key_num * stride);
+  err = cudaMemcpy(actual_sample_size.data(),
+                   neighbor_sample_res.actual_sample_size,
+                   key_num * sizeof(int), cudaMemcpyDeviceToHost);
+  if (err == cudaSuccess && !sampled.empty()) {
+    err = cudaMemcpy(sampled.data(), neighbor_sample_res.val,
+                     sampled.size() * sizeof(int64_t),
+                     cudaMemcpyDeviceToHost);
+  }
+  cudaFree(cuda_key);
+  if (err != cudaSuccess) {
+    VLOG(0) << "graph_neighbor_sample result download failed: "
+            << cudaGetErrorString(err);
+    return res;
+  }
+  // Clamp each count to [0, sample_size] so a bad value can never index
+  // outside that key's slot.
+  size_t cumsum = 0;
+  for (size_t i = 0; i < key_num; i++) {
+    size_t cnt = actual_sample_size[i] > 0
+                     ? static_cast<size_t>(actual_sample_size[i])
+                     : 0;
+    if (cnt > stride) cnt = stride;
+    actual_sample_size[i] = static_cast<int>(cnt);
+    cumsum += cnt;
+  }
+  res.resize(cumsum * 2);
+  size_t count = 0;
+  for (size_t i = 0; i < key_num; i++) {
+    const size_t cnt = static_cast<size_t>(actual_sample_size[i]);
+    for (size_t j = 0; j < cnt; j++) {
+      res[count] = key[i];
+      res[cumsum + count] = sampled[i * stride + j];
+      count += 1;
+    }
+  }
+  return res;
+}
+NodeQueryResult GraphGpuWrapper::query_node_list(int gpu_id, int start,
+                                                 int query_size) {
  return ((GpuPsGraphTable *)graph_table)
-      ->graph_neighbor_sample(gpu_id, key, sample_size, len);
+      ->query_node_list(gpu_id, start, query_size);
 }
 #endif
 }

--- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h
+++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h
@@ -29,13 +29,17 @@ class GraphGpuWrapper {
  void init_service();
  void set_up_types(std::vector<std::string>& edge_type,
                    std::vector<std::string>& node_type);
-  void upload_batch(std::vector<std::vector<int64_t>>& ids);
+  void upload_batch(int idx, std::vector<std::vector<int64_t>>& ids);
  void add_table_feat_conf(std::string table_name, std::string feat_name,
                           std::string feat_dtype, int feat_shape);
  void load_edge_file(std::string name, std::string filepath, bool reverse);
  void load_node_file(std::string name, std::string filepath);
-  NeighborSampleResult* graph_neighbor_sample(int gpu_id, int64_t* key,
+  NodeQueryResult query_node_list(int gpu_id, int start, int query_size);
-                                              int sample_size, int len);
+  NeighborSampleResult graph_neighbor_sample_v3(NeighborSampleQuery q,
+                                                bool cpu_switch);
+  std::vector<int64_t> graph_neighbor_sample(int gpu_id,
+                                             std::vector<int64_t>& key,
+                                             int sample_size);
  std::unordered_map<std::string, int> edge_to_id, feature_to_id;
  std::vector<std::string> id_to_feature, id_to_edge;
  std::vector<std::unordered_map<std::string, int>> table_feat_mapping;

--- a/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu
+++ b/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu
@@ -139,23 +139,17 @@ TEST(TEST_FLEET, test_cpu_cache) {
  platform::CUDADeviceGuard guard(0);
  cudaMalloc((void **)&key, 3 * sizeof(int64_t));
  cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice);
-  auto neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 2, 3);
+  auto neighbor_sample_res =
-  int64_t *res = new int64_t[7];
+      g.graph_neighbor_sample_v2(0, (int64_t *)key, 2, 3, true);
-  cudaMemcpy(res, neighbor_sample_res->val, 3 * 2 * sizeof(int64_t),
+  neighbor_sample_res.display();
-             cudaMemcpyDeviceToHost);
+  //{1,9} or {9,1} is expected for key 0
-  int *actual_sample_size = new int[3];
-  cudaMemcpy(actual_sample_size, neighbor_sample_res->actual_sample_size,
-             3 * sizeof(int),
-             cudaMemcpyDeviceToHost);  // 3, 1, 3
-  //{0,9} or {9,0} is expected for key 0
  //{0,2} or {2,0} is expected for key 1
  //{1,3} or {3,1} is expected for key 2
-  for (int i = 0; i < 3; i++) {
+  auto node_query_res = g.query_node_list(0, 0, 4);
-    VLOG(0) << "actual sample size for " << i << " is "
+  node_query_res.display();
-            << actual_sample_size[i];
+  NeighborSampleQuery query;
-    for (int j = 0; j < actual_sample_size[i]; j++) {
+  query.initialize(0, node_query_res.get_val(), 2, node_query_res.get_len());
-      VLOG(0) << "sampled an neighbor for node" << i << " : " << res[i * 2 + j];
+  query.display();
-    }
+  auto c = g.graph_neighbor_sample_v3(query, false);
-  }
+  c.display();
 }
--- a/paddle/fluid/pybind/fleet_py.cc
+++ b/paddle/fluid/pybind/fleet_py.cc
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -255,6 +252,8 @@ using paddle::distributed::IndexNode;
 #ifdef PADDLE_WITH_HETERPS
 using paddle::framework::GraphGpuWrapper;
 using paddle::framework::NeighborSampleResult;
+using paddle::framework::NeighborSampleQuery;
+using paddle::framework::NodeQueryResult;
 #endif
 void BindIndexNode(py::module* m) {
@@ -307,21 +306,39 @@ void BindIndexWrapper(py::module* m) {
 }
 #ifdef PADDLE_WITH_HETERPS
+// Registers NodeQueryResult on module `m` under the Python name
+// "NodeQueryResult": default constructor plus initialize, display, get_val
+// and get_len.
+void BindNodeQueryResult(py::module* m) {
+  py::class_<NodeQueryResult> node_query_result(*m, "NodeQueryResult");
+  node_query_result.def(py::init<>());
+  node_query_result.def("initialize", &NodeQueryResult::initialize);
+  node_query_result.def("display", &NodeQueryResult::display);
+  node_query_result.def("get_val", &NodeQueryResult::get_val);
+  node_query_result.def("get_len", &NodeQueryResult::get_len);
+}
+// Registers NeighborSampleQuery on module `m` under the Python name
+// "NeighborSampleQuery": default constructor plus initialize and display.
+void BindNeighborSampleQuery(py::module* m) {
+  py::class_<NeighborSampleQuery> sample_query(*m, "NeighborSampleQuery");
+  sample_query.def(py::init<>());
+  sample_query.def("initialize", &NeighborSampleQuery::initialize);
+  sample_query.def("display", &NeighborSampleQuery::display);
+}
 void BindNeighborSampleResult(py::module* m) {
  py::class_<NeighborSampleResult>(*m, "NeighborSampleResult")
      .def(py::init<>())
-      .def("initialize", &NeighborSampleResult::initialize);
+      .def("initialize", &NeighborSampleResult::initialize)
+      .def("display", &NeighborSampleResult::display);
 }
 void BindGraphGpuWrapper(py::module* m) {
  py::class_<GraphGpuWrapper>(*m, "GraphGpuWrapper")
      .def(py::init<>())
-      .def("test", &GraphGpuWrapper::test)
+      //.def("test", &GraphGpuWrapper::test)
      .def("initialize", &GraphGpuWrapper::initialize)
+      .def("neighbor_sample", &GraphGpuWrapper::graph_neighbor_sample_v3)
      .def("graph_neighbor_sample", &GraphGpuWrapper::graph_neighbor_sample)
      .def("set_device", &GraphGpuWrapper::set_device)
      .def("init_service", &GraphGpuWrapper::init_service)
      .def("set_up_types", &GraphGpuWrapper::set_up_types)
+      .def("query_node_list", &GraphGpuWrapper::query_node_list)
      .def("add_table_feat_conf", &GraphGpuWrapper::add_table_feat_conf)
      .def("load_edge_file", &GraphGpuWrapper::load_edge_file)
      .def("upload_batch", &GraphGpuWrapper::upload_batch)

--- a/paddle/fluid/pybind/fleet_py.h
+++ b/paddle/fluid/pybind/fleet_py.h
@@ -39,6 +39,8 @@ void BindIndexSampler(py::module* m);
 #ifdef PADDLE_WITH_HETERPS
 void BindNeighborSampleResult(py::module* m);
 void BindGraphGpuWrapper(py::module* m);
+void BindNodeQueryResult(py::module* m);
+void BindNeighborSampleQuery(py::module* m);
 #endif
 }  // namespace pybind
 }  // namespace paddle
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -4578,6 +4578,8 @@ All parameter, weight, gradient are variables in Paddle.
  BindIndexWrapper(&m);
  BindIndexSampler(&m);
 #ifdef PADDLE_WITH_HETERPS
+  BindNodeQueryResult(&m);
+  BindNeighborSampleQuery(&m);
  BindNeighborSampleResult(&m);
  BindGraphGpuWrapper(&m);
 #endif