Unverified commit cb525d4e, authored by zmxdream, committed by GitHub

[Pglbox2.0] merge gpugraph to develop (#49946)

* add set slot_num for psgpuwrapper (#177)

* add set slot_num_for_pull_feature for psgpuwrapper

* Add get_epoch_finish python interface (#182)

* add get_epoch_finish interface

* add return

* delete return

* add unzip op (#183)

* fix miss key for error dataset (#186)

* fix miss key for error dataset

* fix miss key for error dataset
Co-authored-by: yangjunchao <yangjunchao@baidu.com>

* add excluded_train_pair and infer_node_type (#187)

* support return of degree (#188)

* fix task stuck in barrier (#189)
Co-authored-by: yangjunchao <yangjunchao@baidu.com>

* check node/feature format when loading (#190)

* check node&feature format when loading

* check node&feature format when loading (2)

* degrade log (#191)

* [PGLBOX]fix conflict

* [PGLBOX]fix conflict

* [PGLBOX]replace LodTensor with phi::DenseTensor

* [PGLBOX]fix gpu_primitives.h include path

* [PGLBOX]from platform::PADDLE_CUDA_NUM_THREADS to phi::PADDLE_CUDA_NUM_THREADS

* [PGLBOX]fix unzip example code

* [PGLBOX]fix unzip example code

* [PGLBOX]fix unzip example code

* [PGLBOX]fix unzip example code

* [PGLBOX]fix unzip ut

* [PGLBOX]fix unzip ut

* [PGLBOX]fix code style

* [PGLBOX]fix code style

* [PGLBOX]fix code style

* fix code style

* fix code style

* fix unzip ut

* fix unzip ut

* fix unzip ut

* fix unzip

* fix code style

* add ut

* add c++ ut & fix train_mode_ set

* fix load into memory

* fix c++ ut

* fix c++ ut

* fix c++ ut

* fix c++ ut

* fix code style

* fix collective

* fix unzip_op.cc

* fix barrier

* fix code style

* fix barrier

* fix barrier

* fix code style

* fix unzip

* add unzip.py

* add unzip.py

* fix unzip.py

---------
Co-authored-by: chao9527 <33347532+chao9527@users.noreply.github.com>
Co-authored-by: Siming Dai <908660116@qq.com>
Co-authored-by: huwei02 <53012141+huwei02@users.noreply.github.com>
Co-authored-by: yangjunchao <yangjunchao@baidu.com>
Parent 382e9a06
......@@ -306,10 +306,10 @@ int32_t PsLocalClient::Initialize() {
size_t threshold) {
auto* table_ptr = GetTable(table_id);
std::pair<int64_t, int64_t> ret = table_ptr->PrintTableStat();
VLOG(0) << "table id: " << table_id << ", feasign size: " << ret.first
VLOG(1) << "table id: " << table_id << ", feasign size: " << ret.first
<< ", mf size: " << ret.second;
if (ret.first > (int64_t)threshold) {
VLOG(0) << "run cache table";
VLOG(1) << "run cache table";
table_ptr->CacheTable(pass_id);
}
return done();
......
......@@ -124,11 +124,13 @@ paddle::framework::GpuPsCommGraphFea GraphTable::make_gpu_ps_graph_fea(
}
for (size_t i = 0; i < tasks.size(); i++) tasks[i].get();
std::stringstream ss;
for (int k = 0; k < slot_num; ++k) {
ss << slot_feature_num_map_[k] << " ";
if (FLAGS_v > 0) {
std::stringstream ss;
for (int k = 0; k < slot_num; ++k) {
ss << slot_feature_num_map_[k] << " ";
}
VLOG(1) << "slot_feature_num_map: " << ss.str();
}
VLOG(0) << "slot_feature_num_map: " << ss.str();
tasks.clear();
......@@ -137,7 +139,7 @@ paddle::framework::GpuPsCommGraphFea GraphTable::make_gpu_ps_graph_fea(
for (size_t i = 0; i < shard_num; i++) {
tot_len += feature_array[i].size();
}
VLOG(0) << "Loaded feature table on cpu, feature_list_size[" << tot_len
VLOG(1) << "Loaded feature table on cpu, feature_list_size[" << tot_len
<< "] node_ids_size[" << node_ids.size() << "]";
res.init_on_cpu(tot_len, (unsigned int)node_ids.size(), slot_num);
unsigned int offset = 0, ind = 0;
......@@ -494,6 +496,8 @@ void GraphTable::export_partition_files(int idx, std::string file_path) {
for (size_t i = 0; i < tasks.size(); i++) tasks[i].get();
}
#endif
void GraphTable::clear_graph(int idx) {
for (auto p : edge_shards[idx]) {
p->clear();
......@@ -506,6 +510,7 @@ void GraphTable::clear_graph(int idx) {
}
}
#ifdef PADDLE_WITH_HETERPS
void GraphTable::release_graph() {
// Before releasing graph, prepare for sampling ids and embedding keys.
build_graph_type_keys();
......@@ -545,6 +550,7 @@ void GraphTable::release_graph_node() {
feature_shrink_to_fit();
}
}
#endif
void GraphTable::clear_edge_shard() {
VLOG(0) << "begin clear edge shard";
......@@ -590,6 +596,7 @@ void GraphTable::clear_feature_shard() {
VLOG(0) << "finish clear feature shard";
}
#ifdef PADDLE_WITH_HETERPS
void GraphTable::feature_shrink_to_fit() {
std::vector<std::future<int>> tasks;
for (auto &type_shards : feature_shards) {
......@@ -619,6 +626,8 @@ void GraphTable::merge_feature_shard() {
feature_shards.resize(1);
}
#endif
void GraphTable::clear_graph() {
VLOG(0) << "begin clear_graph";
clear_edge_shard();
......@@ -626,6 +635,7 @@ void GraphTable::clear_graph() {
VLOG(0) << "finish clear_graph";
}
#ifdef PADDLE_WITH_HETERPS
int32_t GraphTable::load_next_partition(int idx) {
if (next_partition >= static_cast<int>(partitions[idx].size())) {
VLOG(0) << "partition iteration is done";
......@@ -1203,11 +1213,21 @@ int32_t GraphTable::Load(const std::string &path, const std::string &param) {
if (load_edge) {
bool reverse_edge = (param[1] == '<');
std::string edge_type = param.substr(2);
return this->load_edges(path, reverse_edge, edge_type);
int ret = this->load_edges(path, reverse_edge, edge_type);
if (ret != 0) {
VLOG(0) << "Fail to load edges, path[" << path << "] edge_type["
<< edge_type << "]";
return -1;
}
}
if (load_node) {
std::string node_type = param.substr(1);
return this->load_nodes(path, node_type);
int ret = this->load_nodes(path, node_type);
if (ret != 0) {
VLOG(0) << "Fail to load nodes, path[" << path << "] node_type["
<< node_type << "]";
return -1;
}
}
return 0;
}
......@@ -1319,10 +1339,19 @@ int32_t GraphTable::parse_node_and_load(std::string ntype2files,
return 0;
}
if (FLAGS_graph_load_in_parallel) {
this->load_nodes(npath_str, "");
int ret = this->load_nodes(npath_str, "");
if (ret != 0) {
VLOG(0) << "Fail to load nodes, path[" << npath << "]";
return -1;
}
} else {
for (size_t j = 0; j < ntypes.size(); j++) {
this->load_nodes(npath_str, ntypes[j]);
int ret = this->load_nodes(npath_str, ntypes[j]);
if (ret != 0) {
VLOG(0) << "Fail to load nodes, path[" << npath << "], ntypes["
<< ntypes[j] << "]";
return -1;
}
}
}
return 0;
......@@ -1397,10 +1426,19 @@ int32_t GraphTable::load_node_and_edge_file(std::string etype2files,
return 0;
}
if (FLAGS_graph_load_in_parallel) {
this->load_nodes(npath_str, "");
int ret = this->load_nodes(npath_str, "");
if (ret != 0) {
VLOG(0) << "Fail to load nodes, path[" << npath_str << "]";
return -1;
}
} else {
for (size_t j = 0; j < ntypes.size(); j++) {
this->load_nodes(npath_str, ntypes[j]);
int ret = this->load_nodes(npath_str, ntypes[j]);
if (ret != 0) {
VLOG(0) << "Fail to load nodes, path[" << npath_str
<< "], ntypes[" << ntypes[j] << "]";
return -1;
}
}
}
}
......@@ -1408,6 +1446,10 @@ int32_t GraphTable::load_node_and_edge_file(std::string etype2files,
}));
}
for (size_t i = 0; i < tasks.size(); i++) tasks[i].get();
if (is_parse_node_fail_) {
VLOG(0) << "Fail to load node_and_edge_file";
return -1;
}
return 0;
}
......@@ -1499,7 +1541,12 @@ std::pair<uint64_t, uint64_t> GraphTable::parse_node_file(
node->set_feature_size(feat_name[idx].size());
for (int i = 1; i < num; ++i) {
auto &v = vals[i];
parse_feature(idx, v.ptr, v.len, node);
int ret = parse_feature(idx, v.ptr, v.len, node);
if (ret != 0) {
VLOG(0) << "Fail to parse feature, node_id[" << id << "]";
is_parse_node_fail_ = true;
return {0, 0};
}
}
}
local_valid_count++;
......@@ -1551,7 +1598,12 @@ std::pair<uint64_t, uint64_t> GraphTable::parse_node_file(
if (node != NULL) {
for (int i = 2; i < num; ++i) {
auto &v = vals[i];
parse_feature(idx, v.ptr, v.len, node);
int ret = parse_feature(idx, v.ptr, v.len, node);
if (ret != 0) {
VLOG(0) << "Fail to parse feature, node_id[" << id << "]";
is_parse_node_fail_ = true;
return {0, 0};
}
}
}
local_valid_count++;
......@@ -1603,6 +1655,11 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) {
valid_count += res.second;
}
}
if (is_parse_node_fail_) {
VLOG(0) << "Fail to load nodes, path[" << paths[0] << ".."
<< paths[paths.size() - 1] << "] node_type[" << node_type << "]";
return -1;
}
VLOG(0) << valid_count << "/" << count << " nodes in node_type[ " << node_type
<< "] are loaded successfully!";
......@@ -2103,28 +2160,48 @@ int GraphTable::parse_feature(int idx,
if (dtype == "feasign") {
// string_vector_2_string(fields.begin() + 1, fields.end(), ' ',
// fea_ptr);
FeatureNode::parse_value_to_bytes<uint64_t>(
int ret = FeatureNode::parse_value_to_bytes<uint64_t>(
fea_fields.begin(), fea_fields.end(), fea_ptr);
if (ret != 0) {
VLOG(0) << "Fail to parse value";
return -1;
}
return 0;
} else if (dtype == "string") {
string_vector_2_string(
fea_fields.begin(), fea_fields.end(), ' ', fea_ptr);
return 0;
} else if (dtype == "float32") {
FeatureNode::parse_value_to_bytes<float>(
int ret = FeatureNode::parse_value_to_bytes<float>(
fea_fields.begin(), fea_fields.end(), fea_ptr);
if (ret != 0) {
VLOG(0) << "Fail to parse value";
return -1;
}
return 0;
} else if (dtype == "float64") {
FeatureNode::parse_value_to_bytes<double>(
int ret = FeatureNode::parse_value_to_bytes<double>(
fea_fields.begin(), fea_fields.end(), fea_ptr);
if (ret != 0) {
VLOG(0) << "Fail to parse value";
return -1;
}
return 0;
} else if (dtype == "int32") {
FeatureNode::parse_value_to_bytes<int32_t>(
int ret = FeatureNode::parse_value_to_bytes<int32_t>(
fea_fields.begin(), fea_fields.end(), fea_ptr);
if (ret != 0) {
VLOG(0) << "Fail to parse value";
return -1;
}
return 0;
} else if (dtype == "int64") {
FeatureNode::parse_value_to_bytes<uint64_t>(
int ret = FeatureNode::parse_value_to_bytes<uint64_t>(
fea_fields.begin(), fea_fields.end(), fea_ptr);
if (ret != 0) {
VLOG(0) << "Fail to parse value";
return -1;
}
return 0;
}
} else {
......@@ -2132,7 +2209,7 @@ int GraphTable::parse_feature(int idx,
<< idx << "] feat_id_map_size[" << feat_id_map.size() << "]";
}
return -1;
return 0;
}
// thread safe shard vector merge
class MergeShardVector {
......
......@@ -789,6 +789,7 @@ class GraphTable : public Table {
std::string slot_feature_separator_ = std::string(" ");
std::string feature_separator_ = std::string(" ");
std::vector<int> slot_feature_num_map_;
bool is_parse_node_fail_ = false;
};
/*
......
......@@ -255,7 +255,7 @@ class FeatureNode : public Node {
}
template <typename T>
static void parse_value_to_bytes(
static int parse_value_to_bytes(
std::vector<paddle::string::str_ptr>::iterator feat_str_begin,
std::vector<paddle::string::str_ptr>::iterator feat_str_end,
std::string *output) {
......@@ -269,8 +269,14 @@ class FeatureNode : public Node {
thread_local paddle::string::str_ptr_stream ss;
for (size_t i = 0; i < feat_str_size; i++) {
ss.reset(*(feat_str_begin + i));
int len = ss.end - ss.ptr;
char *old_ptr = ss.ptr;
ss >> fea_ptrs[i];
if (ss.ptr - old_ptr != len) {
return -1;
}
}
return 0;
}
protected:
......
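For context, the new return-value contract of parse_value_to_bytes can be illustrated with a small standalone sketch (not part of the commit; parse_token and the sample tokens are hypothetical names). A token is accepted only when the numeric read consumes its full length, mirroring the `ss.ptr - old_ptr != len` check above; any failure propagates up through parse_feature, parse_node_file and load_nodes as -1:

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <string>
#include <vector>

// Strict-parse rule: the numeric conversion must consume the whole token,
// otherwise the token is treated as a malformed feature value.
static int parse_token(const char* ptr, int len, uint64_t* out) {
  char* end = nullptr;
  *out = std::strtoull(ptr, &end, 10);
  return (end - ptr == len) ? 0 : -1;  // same idea as `ss.ptr - old_ptr != len`
}

int main() {
  std::vector<std::string> tokens = {"123", "456", "78x"};  // last one is malformed
  for (const auto& t : tokens) {
    uint64_t v = 0;
    if (parse_token(t.c_str(), static_cast<int>(t.size()), &v) != 0) {
      std::printf("Fail to parse value [%s]\n", t.c_str());
      return -1;  // analogous to load_nodes()/Load() returning -1
    }
    std::printf("parsed %llu\n", static_cast<unsigned long long>(v));
  }
  return 0;
}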
......@@ -866,7 +866,8 @@ if(WITH_DISTRIBUTE)
fleet
heter_server
brpc
fleet_executor)
fleet_executor
flags)
set(DISTRIBUTE_COMPILE_FLAGS
"-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=parentheses"
)
......
......@@ -14,6 +14,11 @@
#pragma once
#if defined _WIN32 || defined __APPLE__
#else
#define __LINUX__
#endif
#ifdef __LINUX__
#include <pthread.h>
#include <semaphore.h>
......@@ -48,7 +53,7 @@ class Barrier {
void wait() {
#ifdef __LINUX__
int err = pthread_barrier_wait(&_barrier);
if (err != 0 && err != PTHREAD_BARRIER_SERIAL_THREAD)) {
if (err != 0 && err != PTHREAD_BARRIER_SERIAL_THREAD) {
CHECK_EQ(1, 0);
}
#endif
......
......@@ -2112,15 +2112,24 @@ void SlotRecordInMemoryDataFeed::Init(const DataFeedDesc& data_feed_desc) {
#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS)
gpu_graph_data_generator_.SetConfig(data_feed_desc);
#endif
if (gpu_graph_mode_) {
train_mode_ = true;
} else {
train_mode_ = data_feed_desc.graph_config().gpu_graph_training();
}
}
#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS)
void SlotRecordInMemoryDataFeed::InitGraphResource() {
#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS)
gpu_graph_data_generator_.AllocResource(thread_id_, feed_vec_);
#endif
}
void SlotRecordInMemoryDataFeed::InitGraphTrainResource() {
#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS)
gpu_graph_data_generator_.AllocTrainResource(thread_id_);
#endif
}
#endif
......
This diff is collapsed.
......@@ -914,7 +914,9 @@ class GraphDataGenerator {
int FillFeatureBuf(std::shared_ptr<phi::Allocation> d_walk,
std::shared_ptr<phi::Allocation> d_feature);
void FillOneStep(uint64_t* start_ids,
int etype_id,
uint64_t* walk,
uint8_t* walk_ntype,
int len,
NeighborSampleResult& sample_res, // NOLINT
int cur_degree,
......@@ -966,6 +968,7 @@ class GraphDataGenerator {
int len,
int* uniq_len,
std::shared_ptr<phi::Allocation>& inverse); // NOLINT
std::shared_ptr<phi::Allocation> GetNodeDegree(uint64_t* node_ids, int len);
int InsertTable(const uint64_t* d_keys,
uint64_t len,
std::shared_ptr<phi::Allocation> d_uniq_node_num);
......@@ -988,6 +991,7 @@ class GraphDataGenerator {
int* index_tensor_ptr_;
int64_t* show_tensor_ptr_;
int64_t* clk_tensor_ptr_;
int* degree_tensor_ptr_;
cudaStream_t train_stream_;
cudaStream_t sample_stream_;
......@@ -999,6 +1003,8 @@ class GraphDataGenerator {
std::shared_ptr<phi::Allocation> d_train_metapath_keys_;
std::shared_ptr<phi::Allocation> d_walk_;
std::shared_ptr<phi::Allocation> d_walk_ntype_;
std::shared_ptr<phi::Allocation> d_excluded_train_pair_;
std::shared_ptr<phi::Allocation> d_feature_list_;
std::shared_ptr<phi::Allocation> d_feature_;
std::shared_ptr<phi::Allocation> d_len_per_row_;
......@@ -1033,11 +1039,13 @@ class GraphDataGenerator {
// sage mode batch data
std::vector<std::shared_ptr<phi::Allocation>> inverse_vec_;
std::vector<std::shared_ptr<phi::Allocation>> final_sage_nodes_vec_;
std::vector<std::shared_ptr<phi::Allocation>> node_degree_vec_;
std::vector<int> uniq_instance_vec_;
std::vector<int> total_instance_vec_;
std::vector<std::vector<std::shared_ptr<phi::Allocation>>> graph_edges_vec_;
std::vector<std::vector<std::vector<int>>> edges_split_num_vec_;
int excluded_train_pair_len_;
int64_t reindex_table_size_;
int sage_batch_count_;
int sage_batch_num_;
......@@ -1067,6 +1075,9 @@ class GraphDataGenerator {
int total_row_;
size_t infer_node_start_;
size_t infer_node_end_;
std::set<int> infer_node_type_index_set_;
std::string infer_node_type_;
bool get_degree_;
};
class DataFeed {
......
......@@ -42,6 +42,9 @@ message GraphConfig {
optional string samples = 12;
optional int64 train_table_cap = 13 [ default = 80000 ];
optional int64 infer_table_cap = 14 [ default = 80000 ];
optional string excluded_train_pair = 15;
optional string infer_node_type = 16;
optional bool get_degree = 17 [ default = false ];
}
message DataFeedDesc {
......
......@@ -457,7 +457,7 @@ void DatasetImpl<T>::LoadIntoMemory() {
timeline.Start();
std::vector<std::thread> load_threads;
if (gpu_graph_mode_) {
VLOG(0) << "in gpu_graph_mode";
VLOG(1) << "in gpu_graph_mode";
#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS)
for (size_t i = 0; i < readers_.size(); i++) {
readers_[i]->SetGpuGraphMode(gpu_graph_mode_);
......@@ -470,7 +470,6 @@ void DatasetImpl<T>::LoadIntoMemory() {
readers_[i]->ResetPathNum();
readers_[i]->ResetEpochFinish();
}
return;
}
for (int64_t i = 0; i < thread_num_; ++i) {
......
......@@ -30,7 +30,7 @@ limitations under the License. */
#if defined(PADDLE_WITH_PSCORE)
#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#endif
#include "paddle/fluid/framework/barrier.h"
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/heter_util.h"
......@@ -212,6 +212,9 @@ class DeviceWorker {
virtual void SetDeviceContext(platform::DeviceContext* dev_ctx) {
dev_ctx_ = dev_ctx;
}
virtual void SetThreadNum(int thread_num) { thread_num_ = thread_num; }
virtual Scope* GetThreadScope() { return thread_scope_; }
DataFeed* device_reader_ = nullptr;
......@@ -290,7 +293,8 @@ class HogwildWorker : public CPUWorkerBase {
HogwildWorkerParameter param_;
std::vector<std::string> skip_ops_;
std::map<std::string, int> stat_var_name_map_;
static std::atomic<uint64_t> worker_num_stat_;
static std::atomic<bool> quit_flag_;
// static bool quit_flag_2;
};
class DownpourWorker : public HogwildWorker {
......@@ -724,7 +728,7 @@ class HeterSectionWorker : public DeviceWorker {
const platform::Place& place() const { return place_; }
void SetDeviceIndex(int tid) override { thread_id_ = tid; }
void SetThreadNum(int thread_num) { thread_num_ = thread_num; }
// void SetThreadNum(int thread_num) { thread_num_ = thread_num; }
void SetMicrobatchNum(int num) { num_microbatches_ = num; }
void SetPipelineStageNum(int num) { num_pipeline_stages_ = num; }
void SetPipelineStage(int stage) { pipeline_stage_ = stage; }
......@@ -767,7 +771,7 @@ class HeterSectionWorker : public DeviceWorker {
protected:
int trainer_id_;
int trainers_;
int thread_num_;
// int thread_num_;
int thread_id_;
int num_microbatches_;
int num_pipeline_stages_;
......
......@@ -14,6 +14,7 @@
#include "gtest/gtest.h"
#include "paddle/fluid/framework/trainer.h"
#include "paddle/phi/core/flags.h"
#ifdef PADDLE_WITH_GLOO
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
......@@ -21,6 +22,7 @@
#else
#define _LINUX
#endif
DECLARE_bool(enable_exit_when_partial_worker);
namespace paddle {
namespace framework {
......@@ -82,5 +84,92 @@ TEST(DisMultiTrainerTest, testforgpugraph) {
#endif
}
TEST(DisMultiTrainerTest, test2) {
#ifdef _LINUX
FLAGS_enable_exit_when_partial_worker = true;
std::shared_ptr<MultiTrainer> tmp1 = std::make_shared<MultiTrainer>();
TrainerDesc t;
t.set_class_name("MultiTrainer");
t.set_device_worker_name("HogwildWorker");
t.set_thread_num(1);
auto* m = t.mutable_downpour_param()->add_program_config();
m->set_program_id("123");
std::string str;
// str += "name: \"MultiSlotDataFeed\"\nbatch_size: 2\nmulti_slot_desc {\n";
str +=
"name: \"SlotRecordInMemoryDataFeed\"\nbatch_size: 2\nmulti_slot_desc "
"{\n";
str += "slots {\nname: \"words\"\ntype: \"uint64\"\nis_dense: false\n";
str += "is_used: true\n}\nslots {\nname: \"label\"\ntype: \"uint64\"\n";
str += "is_dense: false\nis_used: true\n}\n}\n";
str += "graph_config {\n";
str += "gpu_graph_training: true\n}";
// std::shared_ptr<MultiSlotDataset> dataset =
// std::make_shared<MultiSlotDataset>();
std::shared_ptr<SlotRecordDataset> dataset =
std::make_shared<SlotRecordDataset>();
dataset->SetFileList(std::vector<std::string>());
dataset->SetThreadNum(1);
dataset->SetTrainerNum(1);
dataset->SetDataFeedDesc(str);
dataset->CreateChannel();
dataset->CreateReaders();
Scope root_scope;
tmp1->SetScope(&root_scope);
tmp1->Initialize(t, dataset.get());
tmp1->SetDebug(false);
ProgramDesc p;
tmp1->InitOtherEnv(p);
tmp1->Run();
tmp1->Finalize();
#endif
}
TEST(DisMultiTrainerTest, test3) {
#ifdef _LINUX
FLAGS_enable_exit_when_partial_worker = true;
std::shared_ptr<MultiTrainer> tmp1 = std::make_shared<MultiTrainer>();
TrainerDesc t;
t.set_class_name("MultiTrainer");
t.set_device_worker_name("HogwildWorker");
t.set_thread_num(1);
auto* m = t.mutable_downpour_param()->add_program_config();
m->set_program_id("123");
std::string str;
// str += "name: \"MultiSlotDataFeed\"\nbatch_size: 2\nmulti_slot_desc {\n";
str +=
"name: \"SlotRecordInMemoryDataFeed\"\nbatch_size: 2\nmulti_slot_desc "
"{\n";
str += "slots {\nname: \"words\"\ntype: \"uint64\"\nis_dense: false\n";
str += "is_used: true\n}\nslots {\nname: \"label\"\ntype: \"uint64\"\n";
str += "is_dense: false\nis_used: true\n}\n}\n";
str += "graph_config {\n";
str += "gpu_graph_training: true\n}";
// std::shared_ptr<MultiSlotDataset> dataset =
// std::make_shared<MultiSlotDataset>();
std::shared_ptr<SlotRecordDataset> dataset =
std::make_shared<SlotRecordDataset>();
dataset->SetFileList(std::vector<std::string>());
dataset->SetThreadNum(1);
dataset->SetTrainerNum(1);
dataset->SetDataFeedDesc(str);
dataset->CreateChannel();
dataset->SetGpuGraphMode(true);
dataset->CreateReaders();
auto readers = dataset->GetReaders();
readers[0]->SetGpuGraphMode(true);
Scope root_scope;
tmp1->SetScope(&root_scope);
tmp1->Initialize(t, dataset.get());
tmp1->SetDebug(true);
ProgramDesc p;
tmp1->InitOtherEnv(p);
tmp1->Run();
tmp1->Finalize();
#endif
}
} // namespace framework
} // namespace paddle
......@@ -109,6 +109,11 @@ class GpuPsGraphTable
std::vector<std::shared_ptr<phi::Allocation>> edge_type_graphs);
std::vector<std::shared_ptr<phi::Allocation>> get_edge_type_graph(
int gpu_id, int edge_type_len);
void get_node_degree(int gpu_id,
int edge_idx,
uint64_t *key,
int len,
std::shared_ptr<phi::Allocation> node_degree);
int get_feature_of_nodes(int gpu_id,
uint64_t *d_walk,
uint64_t *d_offset,
......@@ -146,6 +151,8 @@ class GpuPsGraphTable
uint32_t *actual_feature_size,
uint64_t *feature_list,
uint8_t *slot_list);
void move_degree_to_source_gpu(
int gpu_id, int gpu_num, int *h_left, int *h_right, int *node_degree);
void move_result_to_source_gpu_all_edge_type(int gpu_id,
int gpu_num,
int sample_size,
......
......@@ -155,6 +155,15 @@ __global__ void get_features_kernel(GpuPsCommGraphFea graph,
}
}
__global__ void get_node_degree_kernel(GpuPsNodeInfo* node_info_list,
int* node_degree,
int n) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) {
node_degree[i] = node_info_list[i].neighbor_size;
}
}
template <int WARP_SIZE, int BLOCK_WARPS, int TILE_SIZE>
__global__ void neighbor_sample_kernel_walking(GpuPsCommGraph graph,
GpuPsNodeInfo* node_info_list,
......@@ -455,6 +464,41 @@ void GpuPsGraphTable::move_result_to_source_gpu(int start_index,
}
}
void GpuPsGraphTable::move_degree_to_source_gpu(
int start_index, int gpu_num, int* h_left, int* h_right, int* node_degree) {
int shard_len[gpu_num];
for (int i = 0; i < gpu_num; i++) {
if (h_left[i] == -1 || h_right[i] == -1) {
continue;
}
shard_len[i] = h_right[i] - h_left[i] + 1;
int cur_step = (int)path_[start_index][i].nodes_.size() - 1;
for (int j = cur_step; j > 0; j--) {
CUDA_CHECK(
cudaMemcpyAsync(path_[start_index][i].nodes_[j - 1].val_storage,
path_[start_index][i].nodes_[j].val_storage,
path_[start_index][i].nodes_[j - 1].val_bytes_len,
cudaMemcpyDefault,
path_[start_index][i].nodes_[j - 1].out_stream));
}
auto& node = path_[start_index][i].nodes_.front();
CUDA_CHECK(
cudaMemcpyAsync(reinterpret_cast<char*>(node_degree + h_left[i]),
node.val_storage + sizeof(int64_t) * shard_len[i],
sizeof(int) * shard_len[i],
cudaMemcpyDefault,
node.out_stream));
}
for (int i = 0; i < gpu_num; ++i) {
if (h_left[i] == -1 || h_right[i] == -1) {
continue;
}
auto& node = path_[start_index][i].nodes_.front();
CUDA_CHECK(cudaStreamSynchronize(node.out_stream));
}
}
void GpuPsGraphTable::move_result_to_source_gpu_all_edge_type(
int start_index,
int gpu_num,
......@@ -570,6 +614,16 @@ __global__ void fill_dvalues(uint64_t* d_shard_vals,
}
}
__global__ void fill_dvalues(int* d_shard_degree,
int* d_degree,
int* idx,
int len) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < len) {
d_degree[idx[i]] = d_shard_degree[i];
}
}
__global__ void fill_dvalues_with_edge_type(uint64_t* d_shard_vals,
uint64_t* d_vals,
int* d_shard_actual_sample_size,
......@@ -756,7 +810,7 @@ void GpuPsGraphTable::build_graph_fea_on_single_gpu(const GpuPsCommGraphFea& g,
} else {
gpu_graph_fea_list_[offset].feature_size = 0;
}
VLOG(0) << "gpu node_feature info card :" << gpu_id << " ,node_size is "
VLOG(1) << "gpu node_feature info card :" << gpu_id << " ,node_size is "
<< gpu_graph_fea_list_[offset].node_size << ", feature_size is "
<< gpu_graph_fea_list_[offset].feature_size;
}
......@@ -1340,6 +1394,7 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
return result;
}
// only for graphsage
NeighborSampleResultV2 GpuPsGraphTable::graph_neighbor_sample_all_edge_type(
int gpu_id,
int edge_type_len,
......@@ -1537,6 +1592,125 @@ NeighborSampleResultV2 GpuPsGraphTable::graph_neighbor_sample_all_edge_type(
return result;
}
void GpuPsGraphTable::get_node_degree(
int gpu_id,
int edge_idx,
uint64_t* key,
int len,
std::shared_ptr<phi::Allocation> node_degree) {
int* node_degree_ptr =
reinterpret_cast<int*>(node_degree->ptr()) + edge_idx * len;
int total_gpu = resource_->total_device();
platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id));
platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id));
auto stream = resource_->local_stream(gpu_id, 0);
int grid_size = (len - 1) / block_size_ + 1;
int h_left[total_gpu]; // NOLINT
int h_right[total_gpu]; // NOLINT
auto d_left =
memory::Alloc(place,
total_gpu * sizeof(int),
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
auto d_right =
memory::Alloc(place,
total_gpu * sizeof(int),
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
int* d_left_ptr = reinterpret_cast<int*>(d_left->ptr());
int* d_right_ptr = reinterpret_cast<int*>(d_right->ptr());
CUDA_CHECK(cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream));
CUDA_CHECK(cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream));
auto d_idx =
memory::Alloc(place,
len * sizeof(int),
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
int* d_idx_ptr = reinterpret_cast<int*>(d_idx->ptr());
auto d_shard_keys =
memory::Alloc(place,
len * sizeof(uint64_t),
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
uint64_t* d_shard_keys_ptr = reinterpret_cast<uint64_t*>(d_shard_keys->ptr());
auto d_shard_degree =
memory::Alloc(place,
len * sizeof(int),
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
int* d_shard_degree_ptr = reinterpret_cast<int*>(d_shard_degree->ptr());
split_input_to_shard(
(uint64_t*)(key), d_idx_ptr, len, d_left_ptr, d_right_ptr, gpu_id);
heter_comm_kernel_->fill_shard_key(
d_shard_keys_ptr, key, d_idx_ptr, len, stream);
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(cudaMemcpyAsync(h_left,
d_left_ptr,
total_gpu * sizeof(int),
cudaMemcpyDeviceToHost,
stream));
CUDA_CHECK(cudaMemcpyAsync(h_right,
d_right_ptr,
total_gpu * sizeof(int),
cudaMemcpyDeviceToHost,
stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
device_mutex_[gpu_id]->lock();
for (int i = 0; i < total_gpu; ++i) {
int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
if (shard_len == 0) {
continue;
}
create_storage(
gpu_id,
i,
shard_len * sizeof(uint64_t),
shard_len * sizeof(uint64_t) + sizeof(int) * shard_len + shard_len % 2);
}
walk_to_dest(
gpu_id, total_gpu, h_left, h_right, (uint64_t*)(d_shard_keys_ptr), NULL);
for (int i = 0; i < total_gpu; ++i) {
if (h_left[i] == -1) {
continue;
}
int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
auto& node = path_[gpu_id][i].nodes_.back();
CUDA_CHECK(cudaMemsetAsync(
node.val_storage, 0, shard_len * sizeof(uint64_t), node.in_stream));
CUDA_CHECK(cudaStreamSynchronize(node.in_stream));
platform::CUDADeviceGuard guard(resource_->dev_id(i));
int table_offset =
get_table_offset(i, GraphTableType::EDGE_TABLE, edge_idx);
tables_[table_offset]->get(reinterpret_cast<uint64_t*>(node.key_storage),
reinterpret_cast<uint64_t*>(node.val_storage),
(size_t)(h_right[i] - h_left[i] + 1),
resource_->remote_stream(i, gpu_id));
GpuPsNodeInfo* node_info_list =
reinterpret_cast<GpuPsNodeInfo*>(node.val_storage);
int* node_degree_array = (int*)(node_info_list + shard_len);
int grid_size_ = (shard_len - 1) / block_size_ + 1;
get_node_degree_kernel<<<grid_size_,
block_size_,
0,
resource_->remote_stream(i, gpu_id)>>>(
node_info_list, node_degree_array, shard_len);
}
for (int i = 0; i < total_gpu; ++i) {
if (h_left[i] == -1) {
continue;
}
CUDA_CHECK(cudaStreamSynchronize(resource_->remote_stream(i, gpu_id)));
}
move_degree_to_source_gpu(
gpu_id, total_gpu, h_left, h_right, d_shard_degree_ptr);
fill_dvalues<<<grid_size, block_size_, 0, stream>>>(
d_shard_degree_ptr, node_degree_ptr, d_idx_ptr, len);
CUDA_CHECK(cudaStreamSynchronize(stream));
for (int i = 0; i < total_gpu; i++) {
int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
if (shard_len == 0) {
continue;
}
destroy_storage(gpu_id, i);
}
device_mutex_[gpu_id]->unlock();
}
NodeQueryResult GpuPsGraphTable::graph_node_sample(int gpu_id,
int sample_size) {
return NodeQueryResult();
......
......@@ -32,7 +32,8 @@ void GraphGpuWrapper::set_device(std::vector<int> ids) {
}
void GraphGpuWrapper::init_conf(const std::string &first_node_type,
const std::string &meta_path) {
const std::string &meta_path,
const std::string &excluded_train_pair) {
static std::mutex mutex;
{
std::lock_guard<std::mutex> lock(mutex);
......@@ -45,12 +46,12 @@ void GraphGpuWrapper::init_conf(const std::string &first_node_type,
paddle::string::split_string<std::string>(first_node_type, ";");
VLOG(2) << "node_types: " << first_node_type;
for (auto &type : node_types) {
auto iter = feature_to_id.find(type);
PADDLE_ENFORCE_NE(iter,
feature_to_id.end(),
platform::errors::NotFound(
"(%s) is not found in feature_to_id.", type));
VLOG(2) << "feature_to_id[" << type << "] = " << iter->second;
auto iter = node_to_id.find(type);
PADDLE_ENFORCE_NE(
iter,
node_to_id.end(),
platform::errors::NotFound("(%s) is not found in node_to_id.", type));
VLOG(2) << "node_to_id[" << type << "] = " << iter->second;
first_node_type_.push_back(iter->second);
}
meta_path_.resize(first_node_type_.size());
......@@ -58,17 +59,40 @@ void GraphGpuWrapper::init_conf(const std::string &first_node_type,
for (size_t i = 0; i < meta_paths.size(); i++) {
auto path = meta_paths[i];
auto nodes = paddle::string::split_string<std::string>(path, "-");
auto edges = paddle::string::split_string<std::string>(path, "-");
for (auto &edge : edges) {
auto iter = edge_to_id.find(edge);
PADDLE_ENFORCE_NE(iter,
edge_to_id.end(),
platform::errors::NotFound(
"(%s) is not found in edge_to_id.", edge));
VLOG(2) << "edge_to_id[" << edge << "] = " << iter->second;
meta_path_[i].push_back(iter->second);
if (edge_to_node_map_.find(iter->second) == edge_to_node_map_.end()) {
auto nodes = paddle::string::split_string<std::string>(edge, "2");
uint64_t src_node_id = node_to_id.find(nodes[0])->second;
uint64_t dst_node_id = node_to_id.find(nodes[1])->second;
edge_to_node_map_[iter->second] = src_node_id << 32 | dst_node_id;
}
}
}
auto paths =
paddle::string::split_string<std::string>(excluded_train_pair, ";");
VLOG(2) << "excluded_train_pair[" << excluded_train_pair << "]";
for (auto &path : paths) {
auto nodes = paddle::string::split_string<std::string>(path, "2");
for (auto &node : nodes) {
auto iter = edge_to_id.find(node);
auto iter = node_to_id.find(node);
PADDLE_ENFORCE_NE(iter,
edge_to_id.end(),
platform::errors::NotFound(
"(%s) is not found in edge_to_id.", node));
VLOG(2) << "edge_to_id[" << node << "] = " << iter->second;
meta_path_[i].push_back(iter->second);
excluded_train_pair_.push_back(iter->second);
}
}
int max_dev_id = 0;
for (size_t i = 0; i < device_id_mapping.size(); i++) {
if (device_id_mapping[i] > max_dev_id) {
......@@ -85,11 +109,11 @@ void GraphGpuWrapper::init_conf(const std::string &first_node_type,
auto &finish_node_type = finish_node_type_[i];
finish_node_type.clear();
for (size_t idx = 0; idx < feature_to_id.size(); idx++) {
for (size_t idx = 0; idx < node_to_id.size(); idx++) {
infer_node_type_start[idx] = 0;
}
for (auto &type : node_types) {
auto iter = feature_to_id.find(type);
auto iter = node_to_id.find(type);
node_type_start[iter->second] = 0;
infer_node_type_start[iter->second] = 0;
}
......@@ -188,7 +212,7 @@ void GraphGpuWrapper::init_metapath(std::string cur_metapath,
int first_node_idx;
std::string first_node =
paddle::string::split_string<std::string>(cur_metapath_, "2")[0];
auto it = feature_to_id.find(first_node);
auto it = node_to_id.find(first_node);
first_node_idx = it->second;
d_graph_train_total_keys_.resize(thread_num);
h_graph_train_keys_len_.resize(thread_num);
......@@ -309,8 +333,8 @@ void GraphGpuWrapper::set_up_types(const std::vector<std::string> &edge_types,
}
id_to_feature = node_types;
for (size_t table_id = 0; table_id < node_types.size(); table_id++) {
int res = feature_to_id.size();
feature_to_id[node_types[table_id]] = res;
int res = node_to_id.size();
node_to_id[node_types[table_id]] = res;
}
table_feat_mapping.resize(node_types.size());
this->table_feat_conf_feat_name.resize(node_types.size());
......@@ -389,21 +413,22 @@ void GraphGpuWrapper::load_edge_file(std::string etype2files,
etype2files, graph_data_local_path, part_num, reverse);
}
void GraphGpuWrapper::load_node_file(std::string name, std::string filepath) {
int GraphGpuWrapper::load_node_file(std::string name, std::string filepath) {
// 'n' means load nodes and 'node_type' follows
std::string params = "n" + name;
if (feature_to_id.find(name) != feature_to_id.end()) {
reinterpret_cast<GpuPsGraphTable *>(graph_table)
if (node_to_id.find(name) != node_to_id.end()) {
return reinterpret_cast<GpuPsGraphTable *>(graph_table)
->cpu_graph_table_->Load(std::string(filepath), params);
}
return 0;
}
void GraphGpuWrapper::load_node_file(std::string ntype2files,
std::string graph_data_local_path,
int part_num) {
reinterpret_cast<GpuPsGraphTable *>(graph_table)
int GraphGpuWrapper::load_node_file(std::string ntype2files,
std::string graph_data_local_path,
int part_num) {
return reinterpret_cast<GpuPsGraphTable *>(graph_table)
->cpu_graph_table_->parse_node_and_load(
ntype2files, graph_data_local_path, part_num);
}
......@@ -422,8 +447,8 @@ void GraphGpuWrapper::add_table_feat_conf(std::string table_name,
std::string feat_name,
std::string feat_dtype,
int feat_shape) {
if (feature_to_id.find(table_name) != feature_to_id.end()) {
int idx = feature_to_id[table_name];
if (node_to_id.find(table_name) != node_to_id.end()) {
int idx = node_to_id[table_name];
if (table_feat_mapping[idx].find(feat_name) ==
table_feat_mapping[idx].end()) {
int res = table_feat_mapping[idx].size();
......@@ -512,7 +537,7 @@ void GraphGpuWrapper::upload_batch(int type,
g->cpu_graph_table_->make_gpu_ps_graph(idx, ids[i]);
g->build_graph_on_single_gpu(sub_graph, i, idx);
sub_graph.release_on_cpu();
VLOG(0) << "sub graph on gpu " << i << " is built";
VLOG(1) << "sub graph on gpu " << i << " is built";
return 0;
}));
}
......@@ -579,7 +604,7 @@ void GraphGpuWrapper::build_gpu_graph_fea(GpuPsCommGraphFea &sub_graph_fea,
GpuPsGraphTable *g = reinterpret_cast<GpuPsGraphTable *>(graph_table);
g->build_graph_fea_on_single_gpu(sub_graph_fea, i);
sub_graph_fea.release_on_cpu();
VLOG(0) << "sub graph fea on gpu " << i << " is built";
VLOG(1) << "sub graph fea on gpu " << i << " is built";
return;
}
......@@ -607,6 +632,16 @@ GraphGpuWrapper::get_edge_type_graph(int gpu_id, int edge_type_len) {
->get_edge_type_graph(gpu_id, edge_type_len);
}
void GraphGpuWrapper::get_node_degree(
int gpu_id,
int edge_idx,
uint64_t *key,
int len,
std::shared_ptr<phi::Allocation> node_degree) {
return ((GpuPsGraphTable *)graph_table)
->get_node_degree(gpu_id, edge_idx, key, len, node_degree);
}
int GraphGpuWrapper::get_feature_info_of_nodes(
int gpu_id,
uint64_t *d_nodes,
......@@ -776,7 +811,7 @@ std::string &GraphGpuWrapper::get_node_type_size(std::string first_node_type) {
auto &type_to_index = get_graph_type_to_index();
std::vector<std::string> node_type_size;
for (auto node : uniq_first_node_) {
auto it = feature_to_id.find(node);
auto it = node_to_id.find(node);
auto first_node_idx = it->second;
size_t f_idx = type_to_index[first_node_idx];
int type_total_key_size = graph_all_type_total_keys[f_idx].size();
......
......@@ -41,7 +41,8 @@ class GraphGpuWrapper {
}
static std::shared_ptr<GraphGpuWrapper> s_instance_;
void init_conf(const std::string& first_node_type,
const std::string& meta_path);
const std::string& meta_path,
const std::string& excluded_train_pair);
void initialize();
void finalize();
void set_device(std::vector<int> ids);
......@@ -66,10 +67,10 @@ class GraphGpuWrapper {
int part_num,
bool reverse);
void load_node_file(std::string name, std::string filepath);
void load_node_file(std::string ntype2files,
std::string graph_data_local_path,
int part_num);
int load_node_file(std::string name, std::string filepath);
int load_node_file(std::string ntype2files,
std::string graph_data_local_path,
int part_num);
void load_node_and_edge(std::string etype2files,
std::string ntype2files,
std::string graph_data_local_path,
......@@ -120,6 +121,11 @@ class GraphGpuWrapper {
int sample_size,
int len,
std::vector<std::shared_ptr<phi::Allocation>> edge_type_graphs);
void get_node_degree(int gpu_id,
int edge_idx,
uint64_t* key,
int len,
std::shared_ptr<phi::Allocation> node_degree);
gpuStream_t get_local_stream(int gpuid);
std::vector<uint64_t> graph_neighbor_sample(
int gpu_id,
......@@ -160,7 +166,7 @@ class GraphGpuWrapper {
std::string& get_node_type_size(std::string first_node_type);
std::string& get_edge_type_size();
std::unordered_map<std::string, int> edge_to_id, feature_to_id;
std::unordered_map<std::string, int> edge_to_id, node_to_id;
std::vector<std::string> id_to_feature, id_to_edge;
std::vector<std::unordered_map<std::string, int>> table_feat_mapping;
std::vector<std::vector<std::string>> table_feat_conf_feat_name;
......@@ -175,6 +181,7 @@ class GraphGpuWrapper {
std::string feature_separator_ = std::string(" ");
bool conf_initialized_ = false;
std::vector<int> first_node_type_;
std::vector<uint8_t> excluded_train_pair_;
std::vector<std::vector<int>> meta_path_;
std::vector<std::set<int>> finish_node_type_;
......@@ -187,6 +194,11 @@ class GraphGpuWrapper {
std::vector<size_t> h_graph_train_keys_len_;
std::vector<std::vector<std::shared_ptr<phi::Allocation>>>
d_graph_all_type_total_keys_;
std::map<uint64_t, // edge_id
uint64_t // src_node_id << 32 | dst_node_id
>
edge_to_node_map_;
std::vector<std::vector<uint64_t>> h_graph_all_type_keys_len_;
std::string slot_feature_separator_ = std::string(" ");
......
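One detail worth spelling out from the header above: edge_to_node_map_ stores, per edge type id, the source and destination node-type ids packed into a single 64-bit value (src_node_id << 32 | dst_node_id), as built in init_conf. A minimal sketch of that packing and unpacking (the helper names are hypothetical, not part of the commit):

#include <cassert>
#include <cstdint>

// Pack two node-type ids into one 64-bit key, as edge_to_node_map_ stores them
// (src_node_id << 32 | dst_node_id).
inline uint64_t pack_edge_nodes(uint64_t src_node_id, uint64_t dst_node_id) {
  return (src_node_id << 32) | dst_node_id;
}

inline void unpack_edge_nodes(uint64_t packed, uint32_t* src, uint32_t* dst) {
  *src = static_cast<uint32_t>(packed >> 32);
  *dst = static_cast<uint32_t>(packed & 0xffffffffULL);
}

int main() {
  uint64_t key = pack_edge_nodes(/*src=*/3, /*dst=*/7);
  uint32_t src = 0, dst = 0;
  unpack_edge_nodes(key, &src, &dst);
  assert(src == 3 && dst == 7);
  return 0;
}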
......@@ -193,17 +193,17 @@ void PSGPUWrapper::add_key_to_gputask(std::shared_ptr<HeterContext> gpu_task) {
}
timeline.Pause();
VLOG(0) << "GpuPs task add keys cost " << timeline.ElapsedSec()
VLOG(1) << "GpuPs task add keys cost " << timeline.ElapsedSec()
<< " seconds.";
timeline.Start();
size_t slot_num = slot_vector_.size() - 1;
size_t slot_num = (size_t)slot_num_for_pull_feature_;
// no slot_fea mode and whole_hbm mode, only keep one unique_sort action
if (slot_num > 0 && FLAGS_gpugraph_storage_mode !=
paddle::framework::GpuGraphStorageMode::WHOLE_HBM) {
gpu_task->UniqueKeys();
}
timeline.Pause();
VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds.";
VLOG(1) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds.";
}
void PSGPUWrapper::resize_gputask(std::shared_ptr<HeterContext> gpu_task) {
......@@ -218,7 +218,8 @@ void PSGPUWrapper::resize_gputask(std::shared_ptr<HeterContext> gpu_task) {
}
}
void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) {
void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task,
Dataset* dataset_for_pull) {
VLOG(3) << "PSGPUWrapper::BuildGPUPSTask begin";
platform::Timer timeline;
timeline.Start();
......@@ -341,12 +342,13 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) {
<< " seconds.";
}
} else {
SlotRecordDataset* dataset = reinterpret_cast<SlotRecordDataset*>(dataset_);
SlotRecordDataset* dataset =
reinterpret_cast<SlotRecordDataset*>(dataset_for_pull);
const std::vector<uint64_t>& vec_data = dataset->GetGpuGraphTotalKeys();
timeline.Start();
add_key_to_local(vec_data);
timeline.Pause();
VLOG(0) << "GpuGraphTotalKeys: " << vec_data.size()
VLOG(1) << "GpuGraphTotalKeys: " << vec_data.size()
<< ", add_key_to_local cost " << timeline.ElapsedSec()
<< " seconds.";
}
......@@ -361,7 +363,8 @@ void PSGPUWrapper::add_slot_feature(std::shared_ptr<HeterContext> gpu_task) {
// shard data across 8 GPU cards
size_t device_num = heter_devices_.size();
std::vector<std::thread> threads;
size_t slot_num = slot_vector_.size() - 1; // node slot 9008 in slot_vector
size_t slot_num =
(size_t)slot_num_for_pull_feature_; // node slot 9008 in slot_vector
auto& local_dim_keys = gpu_task->feature_dim_keys_; // [shard_num, 0, keys]]
double divide_nodeid_cost = 0;
double get_feature_id_cost = 0;
......@@ -532,8 +535,7 @@ void PSGPUWrapper::add_slot_feature(std::shared_ptr<HeterContext> gpu_task) {
for (size_t i = 0; i < device_num; i++) {
feature_num += feature_list_size[i];
}
VLOG(0) << "feature_num is " << feature_num << " node_num num is "
<< node_num;
VLOG(1) << "feature_num is " << feature_num << " node_num is " << node_num;
size_t set_num = thread_keys_shard_num_;
std::vector<std::unordered_set<uint64_t>> feature_id_set(set_num);
......@@ -635,7 +637,7 @@ void PSGPUWrapper::add_slot_feature(std::shared_ptr<HeterContext> gpu_task) {
add_feature_to_key_cost = time_stage.ElapsedSec();
threads.clear();
timeline.Pause();
VLOG(0) << " add_slot_feature costs: " << timeline.ElapsedSec() << " s."
VLOG(1) << " add_slot_feature costs: " << timeline.ElapsedSec() << " s."
<< " divide_nodeid_cost " << divide_nodeid_cost
<< " get_feature_id_cost " << get_feature_id_cost
<< " add_feature_to_set_cost " << add_feature_to_set_cost
......@@ -644,9 +646,9 @@ void PSGPUWrapper::add_slot_feature(std::shared_ptr<HeterContext> gpu_task) {
void PSGPUWrapper::BuildPull(std::shared_ptr<HeterContext> gpu_task) {
platform::Timer timeline;
size_t slot_num = slot_vector_.size() - 1; // node slot 9008 in slot_vector
if (slot_num > 0 && FLAGS_gpugraph_storage_mode !=
paddle::framework::GpuGraphStorageMode::WHOLE_HBM) {
if (slot_num_for_pull_feature_ > 0 &&
FLAGS_gpugraph_storage_mode !=
paddle::framework::GpuGraphStorageMode::WHOLE_HBM) {
add_slot_feature(gpu_task);
}
......@@ -656,7 +658,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr<HeterContext> gpu_task) {
time_stage.Start();
gpu_task->UniqueKeys();
time_stage.Pause();
VLOG(0) << "BuildPull slot feature uniq and sort cost time: "
VLOG(1) << "BuildPull slot feature uniq and sort cost time: "
<< time_stage.ElapsedSec();
auto& local_dim_keys = gpu_task->feature_dim_keys_;
......@@ -795,7 +797,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr<HeterContext> gpu_task) {
}
task_futures.clear();
timeline.Pause();
VLOG(0) << "pull sparse from CpuPS into GpuPS total keys " << total_key
VLOG(1) << "pull sparse from CpuPS into GpuPS total keys " << total_key
<< ", cost " << timeline.ElapsedSec() << " seconds.";
if (multi_node_) {
auto gloo_wrapper = paddle::framework::GlooWrapper::GetInstance();
......@@ -879,7 +881,7 @@ void PSGPUWrapper::divide_to_device(std::shared_ptr<HeterContext> gpu_task) {
}
}
timeline.Pause();
VLOG(0) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec()
VLOG(1) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec()
<< " seconds.";
}
......@@ -1052,7 +1054,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
<< " dim index: " << j << " contains feasign nums: "
<< gpu_task->device_dim_ptr_[i][j].size();
}
VLOG(0) << i << " card with dynamic mf contains feasign nums total: "
VLOG(1) << i << " card with dynamic mf contains feasign nums total: "
<< feature_keys_count[i];
size_max = std::max(size_max, feature_keys_count[i]);
}
......@@ -1074,7 +1076,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
#endif
}
stagetime.Pause();
VLOG(0) << "card: "
VLOG(1) << "card: "
<< " BuildGPUTask create HeterPs_ costs: " << stagetime.ElapsedSec()
<< " s.";
stagetime.Start();
......@@ -1191,8 +1193,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
threads[j] = std::thread(build_ps_thread, i, j, len, feature_value_size);
}
// build feature table
size_t slot_num = slot_vector_.size() - 1; // node slot 9008 in slot_vector
if (slot_num > 0 &&
if (slot_num_for_pull_feature_ > 0 &&
(FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::
MEM_EMB_FEATURE_AND_GPU_GRAPH ||
FLAGS_gpugraph_storage_mode ==
......@@ -1228,7 +1229,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
t.join();
}
stagetime.Pause();
VLOG(0) << "card: " << i
VLOG(1) << "card: " << i
<< " BuildGPUTask build_ps async costs: " << stagetime.ElapsedSec()
<< " s.";
};
......@@ -1262,7 +1263,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
}
cpu_task_futures.clear();
stagetime.Pause();
VLOG(0) << " BuildGPUTask build_dynamic_mf_func "
VLOG(1) << " BuildGPUTask build_dynamic_mf_func "
<< " cost " << stagetime.ElapsedSec() << " s.";
for (int i = 0; i < device_num; i++) {
cpu_reday_channels_[i]->Close();
......@@ -1282,7 +1283,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
gpu_task->sub_graph_feas = NULL;
}
stagetime.Pause();
VLOG(0) << " build_dymf_hbm_pool "
VLOG(1) << " build_dymf_hbm_pool "
<< " cost " << stagetime.ElapsedSec() << " s.";
}
......@@ -1292,7 +1293,7 @@ void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) {
timer.Start();
dataset_->LoadIntoMemory();
timer.Pause();
VLOG(0) << "LoadIntoMemory cost: " << timer.ElapsedSec() << "s";
VLOG(1) << "LoadIntoMemory cost: " << timer.ElapsedSec() << "s";
gpu_graph_mode_ = dataset_->GetGpuGraphMode();
if (dataset_->GetMemoryDataSize() == 0) {
VLOG(0) << "GetMemoryDataSize == 0";
......@@ -1309,7 +1310,7 @@ void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) {
std::shared_ptr<HeterContext> gpu_task = gpu_task_pool_.Get();
gpu_task->Reset();
gpu_task->pass_id_ = (uint16_t)(dataset_->GetPassID());
data_ready_channel_->Put(gpu_task);
data_ready_channel_->Put(std::make_pair(gpu_task, dataset_));
} else if (hbm_sparse_table_initialized_ == false) {
SparseTableToHbm();
}
......@@ -1317,7 +1318,7 @@ void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) {
std::shared_ptr<HeterContext> gpu_task = gpu_task_pool_.Get();
gpu_task->Reset();
gpu_task->pass_id_ = (uint16_t)(dataset_->GetPassID());
data_ready_channel_->Put(gpu_task);
data_ready_channel_->Put(std::make_pair(gpu_task, dataset_));
#endif
VLOG(3) << "End LoadIntoMemory(), dataset[" << dataset_ << "]";
}
......@@ -1332,17 +1333,20 @@ void PSGPUWrapper::start_build_thread() {
void PSGPUWrapper::pre_build_thread() {
// prebuild: process load_data
while (running_) {
std::pair<std::shared_ptr<HeterContext>, Dataset*> task =
std::make_pair(nullptr, nullptr);
std::shared_ptr<HeterContext> gpu_task = nullptr;
if (!data_ready_channel_->Get(gpu_task)) {
if (!data_ready_channel_->Get(task)) {
continue;
}
gpu_task = task.first;
VLOG(3) << "thread PreBuildTask start.";
platform::Timer timer;
timer.Start();
// build cpu ps data process
PreBuildTask(gpu_task);
PreBuildTask(gpu_task, task.second);
timer.Pause();
VLOG(0) << "thread PreBuildTask end, cost time: " << timer.ElapsedSec()
VLOG(1) << "thread PreBuildTask end, cost time: " << timer.ElapsedSec()
<< " s";
buildcpu_ready_channel_->Put(gpu_task);
}
......@@ -1382,7 +1386,7 @@ void PSGPUWrapper::build_task() {
return;
}
VLOG(0) << "PrepareGPUTask start.";
VLOG(1) << "PrepareGPUTask start.";
platform::Timer timer;
timer.Start();
if (!multi_mf_dim_) {
......@@ -1390,7 +1394,7 @@ void PSGPUWrapper::build_task() {
}
BuildGPUTask(gpu_task);
timer.Pause();
VLOG(0) << "PrepareGPUTask + BuildGPUTask end, cost time: "
VLOG(1) << "PrepareGPUTask + BuildGPUTask end, cost time: "
<< timer.ElapsedSec() << "s";
current_task_ = gpu_task;
......@@ -1419,11 +1423,11 @@ void PSGPUWrapper::BeginPass() {
"[BeginPass] after build_task, current task is not null."));
}
if (FLAGS_gpugraph_dedup_pull_push_mode) {
VLOG(0) << "BeginPass end, cost time: " << timer.ElapsedSec()
VLOG(1) << "BeginPass end, cost time: " << timer.ElapsedSec()
<< "s, enable pull push dedup mode="
<< FLAGS_gpugraph_dedup_pull_push_mode;
} else {
VLOG(0) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s";
VLOG(1) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s";
}
}
......@@ -1433,11 +1437,14 @@ void PSGPUWrapper::EndPass() {
return;
}
#endif
if (current_task_ == nullptr) {
return;
}
platform::Timer stagetime;
stagetime.Start();
HbmToSparseTable();
stagetime.Pause();
VLOG(0) << "EndPass HbmToSparseTable cost time: " << stagetime.ElapsedSec()
VLOG(1) << "EndPass HbmToSparseTable cost time: " << stagetime.ElapsedSec()
<< "s";
gpu_task_pool_.Push(current_task_);
......@@ -1453,7 +1460,7 @@ void PSGPUWrapper::SparseTableToHbm() {
gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_);
gpu_task->pass_id_ = (uint16_t)(dataset_->GetPassID());
auto gpu_graph_ptr = GraphGpuWrapper::GetInstance();
auto node_to_id = gpu_graph_ptr->feature_to_id;
auto node_to_id = gpu_graph_ptr->node_to_id;
auto edge_to_id = gpu_graph_ptr->edge_to_id;
std::vector<uint64_t> vec_data = gpu_graph_ptr->get_graph_total_keys();
......@@ -1569,7 +1576,7 @@ void PSGPUWrapper::HbmToSparseTable() {
size_t thread_num = 16;
size_t device_num = heter_devices_.size();
if (multi_mf_dim_) {
VLOG(0) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_;
VLOG(1) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_;
for (size_t i = 0; i < device_num; i++) {
cpu_reday_channels_[i]->Open();
for (int j = 0; j < multi_mf_dim_; j++) {
......@@ -1593,7 +1600,7 @@ void PSGPUWrapper::HbmToSparseTable() {
f.wait();
}
timer.Pause();
VLOG(0) << " EndPass dump_pool_to_cpu_func "
VLOG(1) << " EndPass dump_pool_to_cpu_func "
<< " cost " << timer.ElapsedSec() << " s.";
for (size_t i = 0; i < device_num; i++) {
cpu_reday_channels_[i]->Close();
......@@ -1605,7 +1612,7 @@ void PSGPUWrapper::HbmToSparseTable() {
}
cpu_task_futures.clear();
timer.Pause();
VLOG(0) << " EndPass cpu_func "
VLOG(1) << " EndPass cpu_func "
<< " cost " << timer.ElapsedSec() << " s.";
if (keysize_max != 0) {
HeterPs_->end_pass();
......
......@@ -24,6 +24,7 @@ limitations under the License. */
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#ifdef PADDLE_WITH_GLOO
#include <gloo/broadcast.h>
......@@ -202,7 +203,8 @@ class PSGPUWrapper {
void divide_to_device(std::shared_ptr<HeterContext> gpu_task);
void add_slot_feature(std::shared_ptr<HeterContext> gpu_task);
void BuildGPUTask(std::shared_ptr<HeterContext> gpu_task);
void PreBuildTask(std::shared_ptr<HeterContext> gpu_task);
void PreBuildTask(std::shared_ptr<HeterContext> gpu_task,
Dataset* dataset_for_pull);
void BuildPull(std::shared_ptr<HeterContext> gpu_task);
void PrepareGPUTask(std::shared_ptr<HeterContext> gpu_task);
void LoadIntoMemory(bool is_shuffle);
......@@ -251,8 +253,8 @@ class PSGPUWrapper {
buildpull_threads_.join();
s_instance_ = nullptr;
VLOG(3) << "PSGPUWrapper Finalize Finished.";
HeterPs_->show_table_collisions();
if (HeterPs_ != NULL) {
HeterPs_->show_table_collisions();
delete HeterPs_;
HeterPs_ = NULL;
}
......@@ -605,7 +607,10 @@ class PSGPUWrapper {
slot_vector_ = slot_vector;
VLOG(0) << "slot_vector size is " << slot_vector_.size();
}
void SetPullFeatureSlotNum(int slot_num) {
slot_num_for_pull_feature_ = slot_num;
VLOG(0) << "slot_num_for_pull_feature_ is " << slot_num_for_pull_feature_;
}
void SetSlotOffsetVector(const std::vector<int>& slot_offset_vector) {
slot_offset_vector_ = slot_offset_vector;
std::cout << "yxf set: ";
......@@ -734,6 +739,7 @@ class PSGPUWrapper {
std::vector<int> index_dim_vec_;
int multi_mf_dim_{0};
int max_mf_dim_{0};
int slot_num_for_pull_feature_{0};
size_t val_type_size_{0};
size_t grad_type_size_{0};
size_t pull_type_size_{0};
......@@ -781,10 +787,10 @@ class PSGPUWrapper {
// hbm pools of totol dims number
#endif
std::shared_ptr<
paddle::framework::ChannelObject<std::shared_ptr<HeterContext>>>
data_ready_channel_ =
paddle::framework::MakeChannel<std::shared_ptr<HeterContext>>();
std::shared_ptr<paddle::framework::ChannelObject<
std::pair<std::shared_ptr<HeterContext>, Dataset*>>>
data_ready_channel_ = paddle::framework::MakeChannel<
std::pair<std::shared_ptr<HeterContext>, Dataset*>>();
std::shared_ptr<
paddle::framework::ChannelObject<std::shared_ptr<HeterContext>>>
buildcpu_ready_channel_ =
......
......@@ -31,7 +31,7 @@ DECLARE_bool(enable_exit_when_partial_worker);
namespace paddle {
namespace framework {
std::atomic<uint64_t> HogwildWorker::worker_num_stat_(0);
std::atomic<bool> HogwildWorker::quit_flag_(false);
Barrier g_barrier;
void HogwildWorker::Initialize(const TrainerDesc &desc) {
......@@ -148,7 +148,7 @@ void HogwildWorker::TrainFilesWithProfiler() {
int cur_batch;
int batch_cnt = 0;
if (thread_id_ == 0) {
worker_num_stat_.store(0);
quit_flag_.store(false);
}
g_barrier.wait();
bool train_mode = device_reader_->IsTrainMode();
......@@ -160,11 +160,11 @@ void HogwildWorker::TrainFilesWithProfiler() {
while (1) {
cur_batch = device_reader_->Next();
if (FLAGS_enable_exit_when_partial_worker && train_mode) {
if (cur_batch > 0) {
worker_num_stat_.fetch_add(1, std::memory_order_relaxed);
if (cur_batch <= 0) {
quit_flag_.store(true, std::memory_order_relaxed);
}
g_barrier.wait();
if (worker_num_stat_.load(std::memory_order_relaxed) % thread_num_ != 0) {
if (quit_flag_.load(std::memory_order_relaxed) == true) {
break;
}
}
......@@ -265,7 +265,8 @@ void HogwildWorker::TrainFiles() {
int cur_batch;
int batch_cnt = 0;
if (thread_id_ == 0) {
worker_num_stat_.store(0);
quit_flag_.store(false);
// quit_flag_2 = false;
}
g_barrier.wait();
......@@ -280,11 +281,11 @@ void HogwildWorker::TrainFiles() {
while (1) {
cur_batch = device_reader_->Next();
if (FLAGS_enable_exit_when_partial_worker && train_mode) {
if (cur_batch > 0) {
worker_num_stat_.fetch_add(1, std::memory_order_relaxed);
if (cur_batch <= 0) {
quit_flag_.store(true, std::memory_order_relaxed);
}
g_barrier.wait();
if (worker_num_stat_.load(std::memory_order_relaxed) % thread_num_ != 0) {
if (quit_flag_.load(std::memory_order_relaxed) == true) {
break;
}
}
......@@ -320,7 +321,7 @@ void HogwildWorker::TrainFiles() {
#endif
}
timeline.Pause();
VLOG(0) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec()
VLOG(1) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec()
<< " seconds, batch_num: " << total_batch_num;
if (need_dump_field_ || need_dump_param_) {
......
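The HogwildWorker change above swaps the per-pass worker_num_stat_ counter for a shared quit_flag_ plus a barrier: whichever worker runs out of batches first sets the flag, every worker then reaches the barrier, and all of them exit on the same iteration, which matches the "fix task stuck in barrier" item in the commit message. A self-contained sketch of that protocol follows (SimpleBarrier and the worker loop are illustrative stand-ins for paddle::framework::Barrier and the DataFeed loop, not the actual implementation):

#include <atomic>
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>

// Minimal cyclic barrier (stand-in for paddle::framework::Barrier).
class SimpleBarrier {
 public:
  explicit SimpleBarrier(int n) : count_(n), waiting_(0), generation_(0) {}
  void wait() {
    std::unique_lock<std::mutex> lock(mu_);
    int gen = generation_;
    if (++waiting_ == count_) {
      waiting_ = 0;
      ++generation_;
      cv_.notify_all();
    } else {
      cv_.wait(lock, [&] { return gen != generation_; });
    }
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  int count_, waiting_, generation_;
};

std::atomic<bool> quit_flag(false);

// Each worker reads batches; when any worker runs out, every worker exits
// right after the same barrier, mirroring the quit_flag_ logic in the diff.
void worker(int id, int batches, SimpleBarrier* barrier) {
  for (int step = 0;; ++step) {
    int cur_batch = (step < batches) ? 1 : 0;  // 0 means "no more data"
    if (cur_batch <= 0) quit_flag.store(true, std::memory_order_relaxed);
    barrier->wait();
    if (quit_flag.load(std::memory_order_relaxed)) break;
    // ... train on cur_batch ...
  }
  std::printf("worker %d done\n", id);
}

int main() {
  SimpleBarrier barrier(2);
  // Worker 1 has fewer batches, so both workers stop together on the same step.
  std::thread t0(worker, 0, 5, &barrier), t1(worker, 1, 3, &barrier);
  t0.join();
  t1.join();
  return 0;
}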
......@@ -25,6 +25,8 @@ limitations under the License. */
namespace paddle {
namespace framework {
extern Barrier g_barrier;
void MultiTrainer::Initialize(const TrainerDesc& trainer_desc,
Dataset* dataset) {
thread_num_ = trainer_desc.thread_num();
......@@ -62,7 +64,7 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc,
thread_num_);
}
#endif
g_barrier.reset(thread_num_);
for (int i = 0; i < thread_num_; ++i) {
workers_[i] = DeviceWorkerFactory::CreateDeviceWorker(
trainer_desc.device_worker_name());
......@@ -74,6 +76,7 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc,
workers_[i]->Initialize(trainer_desc);
workers_[i]->SetDeviceIndex(i);
workers_[i]->SetDataFeed(readers[i]);
workers_[i]->SetThreadNum(thread_num_);
}
// set debug here
......@@ -177,7 +180,7 @@ void MultiTrainer::InitOtherEnv(const ProgramDesc& main_program) {
// for unittest which call train_from_dataset but does not call
// fleet.init_worker() first
if (communicator == nullptr) {
VLOG(0) << "MultiTrainer::InitOtherEnv Communicator is null!";
VLOG(1) << "MultiTrainer::InitOtherEnv Communicator is null!";
} else {
auto& recv_ctx = communicator->GetRecvCtxMap();
communicator->PullDense(recv_ctx);
......@@ -299,13 +302,13 @@ void MultiTrainer::Finalize() {
auto communicator = paddle::distributed::Communicator::GetInstance();
// for unittest which does not call fleet.init_worker() first
if (communicator == nullptr) {
VLOG(0) << "MultiTrainer::Finalize communicator is null!";
VLOG(1) << "MultiTrainer::Finalize communicator is null!";
} else {
if (communicator->_worker_ptr != nullptr) {
communicator->_worker_ptr->Flush();
VLOG(1) << "MultiTrainer::Finalize ps client flush done";
} else {
VLOG(0) << "communicator->_worker_ptr is null";
VLOG(1) << "communicator->_worker_ptr is null";
}
}
#endif
......
......@@ -64,6 +64,7 @@ register_unity_group(
cudnn_lstm_op.cc
cumsum_op.cc
cvm_op.cc
unzip_op.cc
data_norm_op.cc
deformable_conv_op.cc
deformable_conv_v1_op.cc
......@@ -402,6 +403,7 @@ register_unity_group(
ctc_align_op.cu
cumsum_op.cu
cvm_op.cu
unzip_op.cu
data_norm_op.cu
deformable_conv_op.cu
deformable_conv_v1_op.cu
......@@ -579,3 +581,5 @@ register_unity_group(cu expand_op.cu)
register_unity_group(cu matmul_v2_op.cu)
register_unity_group(cu top_k_v2_op.cu)
register_unity_group(cu set_value_op.cu)
register_unity_group(cu unzip.cu)
register_unity_group(cc unzip.cc)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/unzip_op.h"
#include <memory>
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
class unzipOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "lod");
OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "lod");
auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(
x_dims.size(),
2UL,
platform::errors::InvalidArgument(
"Input(X)'s rank should be 2, but got %d", x_dims.size()));
auto lod_dims = ctx->GetInputDim("lod");
PADDLE_ENFORCE_EQ(
lod_dims.size(),
1UL,
platform::errors::InvalidArgument(
"Input(X)'s rank should be 1, but got %d", lod_dims.size()));
ctx->SetOutputDim("Y", {lod_dims[0] - 1, x_dims[1]});
}
protected:
// Explicitly set that the data type of computation kernel of
// unzip
// is determined by its input "X".
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"),
ctx.device_context().GetPlace());
}
};
class unzipGradientOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "unzipGradient");
OP_INOUT_CHECK(ctx->HasInput("lod"), "Input", "unzip", "unzipGradient");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")),
"Input",
framework::GradVarName("Y"),
"unzipGradient");
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")),
"Output",
framework::GradVarName("X"),
"unzipGradient");
auto x_dims = ctx->GetInputDim("X");
auto lod_dims = ctx->GetInputDim("lod");
auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
PADDLE_ENFORCE_EQ(
x_dims.size(),
2,
platform::errors::InvalidArgument(
"Expect Input(X)'s rank == 2, but got %d", x_dims.size()));
PADDLE_ENFORCE_EQ(
dy_dims.size(),
2,
platform::errors::InvalidArgument(
"Expect Input(X)'s rank == 2, but got %d", dy_dims.size()));
PADDLE_ENFORCE_EQ(
lod_dims.size(),
1,
platform::errors::InvalidArgument(
"Expect Input(X)'s rank == 1, but got %d", lod_dims.size()));
PADDLE_ENFORCE_EQ(
x_dims[1],
dy_dims[1],
platform::errors::InvalidArgument(
"The 1st dimension of Input(X) and Input(Y@Grad) should "
"be equal, X is %d, Y@Grad is %d",
x_dims[1],
dy_dims[1]));
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD("X", framework::GradVarName("X"));
}
protected:
  // Explicitly set that the data type of the computation kernel of
  // unzip_grad is determined by its input "Y@Grad".
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Y")),
ctx.device_context().GetPlace());
}
};
class unzipOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(LodTensor, default LodTensor<float>), a 2-D tensor with shape "
"[M x N],"
" where N is the batch size and D is the emebdding dim. ");
AddInput("lod", "(Tensor), a 1-D Tensor with shape [K]");
AddOutput("Y",
"(LodTensor, default LodTensor<float>), a 2-D tensor with shape "
"[K-1 x N].");
AddComment(R"DOC(
unzip Operator. Expands the rows of Input(X) into Output(Y) according to the
offsets in Input(lod): each non-empty segment copies the next row of X, and
each zero-length segment produces a zero row.
)DOC");
}
};
template <typename T>
class unzipGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("unzip_grad");
op->SetInput("X", this->Input("X"));
op->SetInput("lod", this->Input("lod"));
op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
op->SetAttrMap(this->Attrs());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(unzip,
ops::unzipOp,
ops::unzipOpMaker,
ops::unzipGradOpMaker<paddle::framework::OpDesc>,
ops::unzipGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(unzip_grad, ops::unzipGradientOp);
REGISTER_OP_CPU_KERNEL(unzip, ops::unzipOpKernel<int64_t>);
REGISTER_OP_CPU_KERNEL(unzip_grad, ops::unzipGradOpKernel<int64_t>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/operators/unzip_op.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
using phi::PADDLE_CUDA_NUM_THREADS;
template <typename T, typename LodType>
__global__ void unzipKernel(
const T* X, const LodType* lod, T* Y, size_t col_size, size_t n) {
  CUDA_KERNEL_LOOP(i, n) {
    // Each thread writes one element of Y; lod_idx is the output row index.
    int lod_idx = i / col_size;
    if ((lod[lod_idx + 1] - lod[lod_idx]) > 0) {
      assert((lod[lod_idx + 1] - lod[lod_idx]) == col_size);
      // The matching row of X is the count of non-empty segments before this one.
      int x_idx = 0;
      for (int j = 0; j < lod_idx; ++j) {
        if ((lod[j + 1] - lod[j]) > 0) {
          x_idx++;
        }
      }
      Y[i] = X[x_idx * col_size + (i % col_size)];
    } else {
      // Empty segment: the output row is filled with zeros.
      Y[i] = 0;
    }
  }
}
template <typename T, typename LodType>
class unzipCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const auto* x = context.Input<phi::DenseTensor>("X");
const T* x_data = x->data<T>();
const auto* lod = context.Input<phi::DenseTensor>("lod");
const LodType* lod_data = lod->data<LodType>();
auto col_size = x->dims()[1];
auto row_size = lod->dims()[0] - 1;
auto y_numel = col_size * row_size;
auto* y = context.Output<phi::DenseTensor>("Y");
T* y_data = y->mutable_data<T>(context.GetPlace());
    // Input X carries no LoD information; the explicit lod tensor supplies the offsets.
auto stream = context.template device_context<phi::GPUContext>().stream();
unzipKernel<<<(y_numel + PADDLE_CUDA_NUM_THREADS - 1) /
PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS,
0,
stream>>>(x_data, lod_data, y_data, col_size, y_numel);
}
};
template <typename T>
class unzipGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_THROW(phi::errors::Unimplemented("unzip_grad is unimplemented"));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
unzip,
ops::unzipCUDAKernel<float, int>,
ops::unzipCUDAKernel<double, int>,
ops::unzipCUDAKernel<paddle::platform::float16, int>,
ops::unzipCUDAKernel<int, int>,
ops::unzipCUDAKernel<bool, int>,
ops::unzipCUDAKernel<int64_t, int>,
ops::unzipCUDAKernel<float, int64_t>,
ops::unzipCUDAKernel<double, int64_t>,
ops::unzipCUDAKernel<paddle::platform::float16, int64_t>,
ops::unzipCUDAKernel<int, int64_t>,
ops::unzipCUDAKernel<bool, int64_t>,
ops::unzipCUDAKernel<int64_t, int64_t>);
REGISTER_OP_CUDA_KERNEL(unzip_grad,
ops::unzipGradCUDAKernel<float>,
ops::unzipGradCUDAKernel<double>,
ops::unzipGradCUDAKernel<paddle::platform::float16>,
ops::unzipGradCUDAKernel<int>,
ops::unzipGradCUDAKernel<bool>,
ops::unzipGradCUDAKernel<int64_t>);
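For reference, the forward semantics implemented by unzipKernel can be written in a few lines of NumPy. This is only an illustrative sketch derived from the kernel above and the unit test shipped with this patch, not code that is part of the operator:

import numpy as np

def unzip_reference(x, lod):
    """NumPy sketch of the unzip forward pass: x is [M, N], lod holds K offsets."""
    rows, cols = len(lod) - 1, x.shape[1]
    y = np.zeros((rows, cols), dtype=x.dtype)
    x_idx = 0
    for r in range(rows):
        if lod[r + 1] - lod[r] > 0:   # non-empty segment: take the next row of x
            y[r] = x[x_idx]
            x_idx += 1
    return y

x = np.array([[1.0, 2.0, 3.0, 4.0],
              [10.0, 20.0, 30.0, 40.0],
              [100.0, 200.0, 300.0, 400.0]])
lod = np.array([0, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12])
print(unzip_reference(x, lod))   # matches the expected output of the unit test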
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename T>
class unzipOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_THROW(phi::errors::Unimplemented("unzip is unimplemented"));
}
};
template <typename T>
class unzipGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_THROW(phi::errors::Unimplemented("unzip_grad is unimplemented"));
}
};
} // namespace operators
} // namespace paddle
......@@ -290,6 +290,9 @@ void BindDataset(py::module *m) {
.def("get_memory_data_size",
&framework::Dataset::GetMemoryDataSize,
py::call_guard<py::gil_scoped_release>())
.def("get_epoch_finish",
&framework::Dataset::GetEpochFinish,
py::call_guard<py::gil_scoped_release>())
.def("get_pv_data_size",
&framework::Dataset::GetPvDataSize,
py::call_guard<py::gil_scoped_release>())
......
......@@ -41,6 +41,9 @@ void BindPSGPUWrapper(py::module* m) {
.def("set_slot_vector",
&framework::PSGPUWrapper::SetSlotVector,
py::call_guard<py::gil_scoped_release>())
.def("set_slot_num_for_pull_feature",
&framework::PSGPUWrapper::SetPullFeatureSlotNum,
py::call_guard<py::gil_scoped_release>())
#ifdef PADDLE_WITH_CUDA
.def("set_slot_dim_vector",
&framework::PSGPUWrapper::SetSlotDimVector,
......
......@@ -19,10 +19,10 @@ from _collections import defaultdict
import paddle
import paddle.fluid.framework as framework
from paddle.distributed.passes.pass_base import PassBase, register_pass
from paddle.fluid.transpiler.collective import SingleProcessMultiThread
from paddle.framework import core
from paddle.static import Parameter, Program
from ..ps.utils.collective_transpiler import SingleProcessMultiThread
from ..ps.utils.public import * # noqa: F403
......
......@@ -967,6 +967,9 @@ class InMemoryDataset(DatasetBase):
"""
return self.dataset.get_pv_data_size()
def get_epoch_finish(self):
return self.dataset.get_epoch_finish()
@deprecated(
since="2.0.0",
update_to="paddle.distributed.InMemoryDataset.get_memory_data_size",
......@@ -1121,6 +1124,15 @@ class InMemoryDataset(DatasetBase):
self.proto_desc.graph_config.infer_table_cap = config.get(
"infer_table_cap", 800000
)
self.proto_desc.graph_config.excluded_train_pair = config.get(
"excluded_train_pair", ""
)
self.proto_desc.graph_config.infer_node_type = config.get(
"infer_node_type", ""
)
self.proto_desc.graph_config.get_degree = config.get(
"get_degree", False
)
self.dataset.set_gpu_graph_mode(True)
def set_pass_id(self, pass_id):
......
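The three new keys are read from the same graph_config dictionary that set_graph_config already consumes. A sketch of how they might be passed (the values and the surrounding keys are placeholders for illustration, not recommendations from this patch):

# Hypothetical configuration values, for illustration only.
graph_config = {
    "infer_table_cap": 800000,
    "excluded_train_pair": "item2item",  # edge types to skip during training (format is illustrative)
    "infer_node_type": "user",           # node type to run inference on (value is illustrative)
    "get_degree": True,                  # also return node degrees when sampling
}
dataset.set_graph_config(graph_config)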
......@@ -313,6 +313,7 @@ class TestDataset(unittest.TestCase):
dataset.set_graph_config(graph_config)
dataset.set_pass_id(0)
dataset.get_pass_id()
dataset.get_epoch_finish()
def test_in_memory_dataset_masterpatch(self):
"""
......
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
class TestUnzipOp(unittest.TestCase):
def test_result(self):
"""
For unzip op
"""
paddle.enable_static()
if core.is_compiled_with_cuda():
place = fluid.CUDAPlace(0)
x = fluid.data(name='X', shape=[3, 4], dtype='float64')
lod = fluid.data(name='lod', shape=[11], dtype='int64')
output = paddle.incubate.unzip(x, lod)
input = [
[1.0, 2.0, 3.0, 4.0],
[10.0, 20.0, 30.0, 40.0],
[100.0, 200.0, 300.0, 400.0],
]
lod = [0, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12]
feed = {
'X': np.array(input).astype("float64"),
'lod': np.array(lod).astype("int64"),
}
exe = fluid.Executor(place=place)
exe.run(fluid.default_startup_program())
res = exe.run(feed=feed, fetch_list=[output])
out = [
[1.0, 2.0, 3.0, 4.0],
[0.0, 0.0, 0.0, 0.0],
[10.0, 20.0, 30.0, 40.0],
[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0],
[100.0, 200.0, 300.0, 400.0],
[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0],
]
out_np = np.array(out, dtype="float64")
assert (res == out_np).all(), "output is not right"
if __name__ == '__main__':
unittest.main()
......@@ -23,6 +23,7 @@ from .operators import graph_send_recv
from .operators import graph_khop_sampler
from .operators import graph_sample_neighbors
from .operators import graph_reindex
from .operators import unzip
from .tensor import segment_sum
from .tensor import segment_mean
from .tensor import segment_max
......@@ -55,4 +56,5 @@ __all__ = [
'segment_max',
'segment_min',
'identity_loss',
'unzip',
]
......@@ -21,3 +21,4 @@ from .graph_send_recv import graph_send_recv # noqa: F401
from .graph_khop_sampler import graph_khop_sampler # noqa: F401
from .graph_sample_neighbors import graph_sample_neighbors # noqa: F401
from .graph_reindex import graph_reindex # noqa: F401
from .unzip import unzip # noqa: F401
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.fluid.layer_helper import LayerHelper
def unzip(input, lod):
r"""
**unzip layers**
    Unzip 'input' according to 'lod'.
Args:
        input (Variable): The zipped input, a 2-D LodTensor with shape [N, M].
        lod (Variable): The original lod of the unzipped input, a 1-D LodTensor with shape [K].
    Returns:
        Variable: The original unzipped tensor, a 2-D LodTensor with shape [K-1, M].
Examples:
.. code-block:: python
import numpy as np
import paddle
import paddle.fluid as fluid
paddle.enable_static()
input_np = np.array([
[1.0, 2.0, 3.0, 4.0],
[10.0, 20.0, 30.0, 40.0],
[100.0, 200.0, 300.0, 400.0]
])
lod_np = np.array([0, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12])
input = paddle.to_tensor(input_np, "int64")
lod = paddle.to_tensor(lod_np, "int64")
unzipped_input = paddle.incubate.unzip(input, lod)
'''
unzipped_input is [
[1.0, 2.0, 3.0, 4.0],
[0.0, 0.0, 0.0, 0.0],
[10.0, 20.0, 30.0, 40.0],
[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0],
[100.0, 200.0, 300.0, 400.0],
[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0]
]
'''
"""
helper = LayerHelper('unzip', **locals())
out = helper.create_variable(dtype=input.dtype)
check_variable_and_dtype(
input,
'input',
['float16', 'float32', 'float64', 'int', 'bool', 'int64'],
'unzip',
)
check_variable_and_dtype(lod, 'lod', ['int', 'int64'], 'unzip')
helper.append_op(
type='unzip', inputs={'X': [input], 'lod': [lod]}, outputs={'Y': [out]}
)
return out
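Because the docstring example mixes paddle.to_tensor with static mode, a runnable static-graph usage that mirrors the unit test in this patch may be clearer. This is a sketch of one way to feed and fetch the op; the variable names and data are illustrative:

import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()
x = fluid.data(name='X', shape=[3, 4], dtype='float64')
lod = fluid.data(name='lod', shape=[11], dtype='int64')
y = paddle.incubate.unzip(x, lod)

exe = fluid.Executor(fluid.CUDAPlace(0))  # only the CUDA kernel is implemented
exe.run(fluid.default_startup_program())
out, = exe.run(
    feed={
        'X': np.arange(12, dtype='float64').reshape(3, 4),
        'lod': np.array([0, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12], dtype='int64'),
    },
    fetch_list=[y],
)
print(out.shape)  # (10, 4): rows for empty segments are filled with zeros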