Commit 1bebc092 (unverified)

solve build gpu task core (#30626)

* build gpu task core
* format

Authored by Thunderbrook on Jan 21, 2021; committed by GitHub on Jan 21, 2021.
Parent commit: 33bf6eb7

Showing 5 changed files with 113 additions and 55 deletions (+113, -55):
paddle/fluid/framework/fleet/heter_context.h     +22  -5
paddle/fluid/framework/fleet/ps_gpu_wrapper.cc   +85  -47
paddle/fluid/framework/fleet/ps_gpu_wrapper.h    +2   -1
paddle/fluid/framework/ps_gpu_trainer.cc         +0   -2
paddle/fluid/pybind/ps_gpu_wrapper_py.cc         +4   -0
paddle/fluid/framework/fleet/heter_context.h
@@ -30,11 +30,19 @@ namespace framework {
 class HeterContext {
  public:
+  ~HeterContext() {
+    for (size_t i = 0; i < mutex_.size(); ++i) {
+      delete mutex_[i];
+    }
+    mutex_.clear();
+  }
   Scope* scope_{nullptr};
   std::vector<std::vector<FeatureKey>> feature_keys_;
   std::vector<std::vector<paddle::ps::DownpourFixedFeatureValue*>> value_ptr_;
   std::vector<std::vector<FeatureValue>> feature_values_;
-  std::vector<std::mutex*> mutex_lock_;
+  std::vector<std::vector<FeatureValue>> device_values_;
+  std::vector<std::vector<FeatureKey>> device_keys_;
+  std::vector<std::mutex*> mutex_;
   uint32_t shard_num_ = 37;
   uint64_t size() {
     uint64_t total_size = 0;
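Note: std::mutex is neither copyable nor movable, which is why mutex_ stores raw pointers rather than mutex objects; the destructor added above frees whatever init() (next hunk) allocates. A minimal standalone sketch of that ownership pattern (not Paddle code, names hypothetical):

#include <mutex>
#include <vector>

// Allocate-in-init, delete-in-destructor, mirroring HeterContext::mutex_.
struct MutexPool {
  std::vector<std::mutex*> mutex_;
  void init(int device_num) {
    mutex_.resize(device_num);
    for (size_t i = 0; i < mutex_.size(); ++i) {
      mutex_[i] = new std::mutex();  // one lock per device
    }
  }
  ~MutexPool() {
    for (size_t i = 0; i < mutex_.size(); ++i) delete mutex_[i];
    mutex_.clear();
  }
};

int main() {
  MutexPool pool;
  pool.init(8);  // e.g. eight GPU cards
  std::lock_guard<std::mutex> guard(*pool.mutex_[0]);
  return 0;
}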
@@ -45,19 +53,28 @@ class HeterContext {
   }
   void SetShardNum(uint32_t shard_num) { shard_num_ = shard_num; }
   uint32_t ShardNum() { return shard_num_; }
-  void init() { feature_keys_.resize(shard_num_); }
+  void init(int shard_num, int device_num) {
+    shard_num_ = shard_num;
+    feature_keys_.resize(shard_num_);
+    value_ptr_.resize(shard_num_);
+    device_values_.resize(device_num);
+    device_keys_.resize(device_num);
+    mutex_.resize(device_num);
+    for (size_t i = 0; i < mutex_.size(); ++i) {
+      mutex_[i] = new std::mutex();
+    }
+  }
   void batch_add_keys(const std::vector<std::vector<uint64_t>>& thread_keys) {
     assert(thread_keys.size() == feature_keys_.size());
     for (uint32_t i = 0; i < shard_num_; i++) {
       int idx = 0;
-      mutex_lock_[i]->lock();
+      // mutex_lock_[i]->lock();
       idx = feature_keys_[i].size();
       feature_keys_[i].resize(feature_keys_[i].size() + thread_keys[i].size());
       for (uint64_t j = 0; j < thread_keys[i].size(); j++) {
         feature_keys_[i][idx + j] = thread_keys[i][j];
       }
-      mutex_lock_[i]->unlock();
+      // mutex_lock_[i]->unlock();
     }
   }
   void UniqueKeys() {
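batch_add_keys assumes each producer thread has already bucketed its keys into shard_num_ lists and simply appends them shard by shard; with the locks commented out, callers must serialize calls themselves. A standalone sketch of the append logic (stand-in data, not Paddle code):

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const uint32_t shard_num = 4;
  std::vector<std::vector<uint64_t>> feature_keys(shard_num);
  // One producer thread's keys, already bucketed by key % shard_num:
  std::vector<std::vector<uint64_t>> thread_keys = {
      {0, 4}, {1}, {2, 6, 10}, {3}};
  assert(thread_keys.size() == feature_keys.size());
  for (uint32_t i = 0; i < shard_num; i++) {
    // Append thread_keys[i] onto the end of feature_keys[i].
    size_t idx = feature_keys[i].size();
    feature_keys[i].resize(idx + thread_keys[i].size());
    for (uint64_t j = 0; j < thread_keys[i].size(); j++) {
      feature_keys[i][idx + j] = thread_keys[i][j];
    }
  }
  return 0;
}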
paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
@@ -40,16 +40,22 @@ namespace framework {
 std::shared_ptr<PSGPUWrapper> PSGPUWrapper::s_instance_ = NULL;
 bool PSGPUWrapper::is_initialized_ = false;
-void PSGPUWrapper::BuildTask(uint64_t table_id, int feature_dim) {
+void PSGPUWrapper::BuildTask(std::shared_ptr<HeterContext> gpu_task,
+                             uint64_t table_id, int feature_dim) {
   VLOG(3) << "PSGPUWrapper::BuildGPUPSTask begin";
   platform::Timer timeline;
   timeline.Start();
+  int device_num = heter_devices_.size();
   MultiSlotDataset* dataset = dynamic_cast<MultiSlotDataset*>(dataset_);
-  std::shared_ptr<HeterContext> gpu_task = gpu_task_pool_.Get();
+  gpu_task->init(thread_keys_shard_num_, device_num);
   auto input_channel = dataset->GetInputChannel();
   auto& local_keys = gpu_task->feature_keys_;
-  auto& local_values = gpu_task->feature_values_;
   auto& local_ptr = gpu_task->value_ptr_;
+  auto& device_keys = gpu_task->device_keys_;
+  auto& device_vals = gpu_task->device_values_;
+  auto& device_mutex = gpu_task->mutex_;
   std::vector<std::thread> threads;
   auto fleet_ptr = FleetWrapper::GetInstance();
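Note: BuildTask no longer pulls a HeterContext out of gpu_task_pool_ itself. The caller (BuildGPUPS, further down) now does the gpu_task_pool_.Get() and passes the same context in, so the per-shard keys built here and the per-device tables built later live in one object, and init(thread_keys_shard_num_, device_num) sizes both the shard-side and device-side vectors up front.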
@@ -91,12 +97,11 @@ void PSGPUWrapper::BuildTask(uint64_t table_id, int feature_dim) {
     t.join();
   }
   timeline.Pause();
-  VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds.";
+  VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds.";
   timeline.Start();
   // merge thread_keys to shard_keys
-  gpu_task->init();
   for (size_t i = 0; i < thread_keys_.size(); i++) {
     gpu_task->batch_add_keys(thread_keys_[i]);
     for (int j = 0; j < thread_keys_thread_num_; j++) {
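The VLOG(0) to VLOG(1) changes throughout this commit demote the stage timings from always-on to opt-in. With glog-style logging (which Paddle's VLOG is), a message prints only when the verbosity level is at least the message's level; a minimal sketch, assuming glog is linked:

#include <glog/logging.h>

int main(int argc, char* argv[]) {
  google::InitGoogleLogging(argv[0]);
  FLAGS_logtostderr = true;  // show messages on stderr instead of log files
  FLAGS_v = 1;               // same effect as running Paddle with GLOG_v=1
  VLOG(0) << "printed at the default verbosity";
  VLOG(1) << "printed only because FLAGS_v >= 1";
  return 0;
}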
@@ -105,21 +110,20 @@ void PSGPUWrapper::BuildTask(uint64_t table_id, int feature_dim) {
   }
   timeline.Pause();
-  VLOG(0) << "GpuPs task unique11111 cost " << timeline.ElapsedSec()
+  VLOG(1) << "GpuPs task unique11111 cost " << timeline.ElapsedSec()
           << " seconds.";
   VLOG(0) << "FK1";
   timeline.Start();
   gpu_task->UniqueKeys();
   timeline.Pause();
-  VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds.";
+  VLOG(1) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds.";
   for (int i = 0; i < thread_keys_shard_num_; i++) {
-    local_values[i].resize(local_keys[i].size());
     VLOG(3) << "GpuPs shard: " << i << " key len: " << local_keys[i].size();
     local_ptr[i].resize(local_keys[i].size());
   }
-  auto ptl_func = [this, &local_keys, &local_values, &local_ptr, &table_id,
-                   &fleet_ptr](int i) {
+  auto ptl_func = [this, &local_keys, &local_ptr, &table_id,
+                   &fleet_ptr](int i) {
     size_t key_size = local_keys[i].size();
     auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr(
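Note: dropping local_values[i].resize(...) and the &local_values capture follows from the next hunk: the pulled values are no longer unpacked into per-shard FeatureValue buffers inside ptl_func; they are unpacked per device in the new build_func.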
@@ -136,68 +140,102 @@ void PSGPUWrapper::BuildTask(uint64_t table_id, int feature_dim) {
     VLOG(3) << "FleetWrapper Pull sparse to local done with table size: "
             << local_keys[i].size();
   }
-    for (size_t num = 0; num < local_ptr[i].size(); ++num) {
-      float* ptr_val = local_ptr[i][num]->data();
-      FeatureValue& val = local_values[i][num];
-      size_t dim = local_ptr[i][num]->size();
-      val.delta_score = ptr_val[1];
-      val.show = ptr_val[2];
-      val.clk = ptr_val[3];
-      val.slot = ptr_val[6];
-      val.lr = ptr_val[4];
-      val.lr_g2sum = ptr_val[5];
-      if (dim > 7) {
-        val.mf_size = MF_DIM + 1;
-        for (int x = 0; x < val.mf_size; x++) {
-          val.mf[x] = ptr_val[x + 7];
-        }
-      } else {
-        val.mf_size = 0;
-        for (int x = 0; x < MF_DIM + 1; x++) {
-          val.mf[x] = 0;
-        }
-      }
-    }
   };
+  for (size_t i = 0; i < threads.size(); i++) {
+    threads[i] = std::thread(ptl_func, i);
+  }
+  for (std::thread& t : threads) {
+    t.join();
+  }
+  timeline.Pause();
+  VLOG(1) << "GpuPs pull sparse cost " << timeline.ElapsedSec() << " seconds.";
+  timeline.Start();
+  auto build_func = [device_num, &local_keys, &local_ptr, &device_keys,
+                     &device_vals, &device_mutex](int i) {
+    std::vector<std::vector<FeatureKey>> task_keys(device_num);
+    std::vector<std::vector<paddle::ps::DownpourFixedFeatureValue*>> task_ptrs(
+        device_num);
+    for (size_t j = 0; j < local_keys[i].size(); j++) {
+      int shard = local_keys[i][j] % device_num;
+      task_keys[shard].push_back(local_keys[i][j]);
+      task_ptrs[shard].push_back(local_ptr[i][j]);
+    }
+    for (int dev = 0; dev < device_num; dev++) {
+      device_mutex[dev]->lock();
+      int len = task_keys[dev].size();
+      int cur = device_keys[dev].size();
+      device_keys[dev].resize(device_keys[dev].size() + len);
+      device_vals[dev].resize(device_vals[dev].size() + len);
+      for (int j = 0; j < len; ++j) {
+        device_keys[dev][cur + j] = task_keys[dev][j];
+        float* ptr_val = task_ptrs[dev][j]->data();
+        FeatureValue& val = device_vals[dev][cur + j];
+        size_t dim = task_ptrs[dev][j]->size();
+        val.delta_score = ptr_val[1];
+        val.show = ptr_val[2];
+        val.clk = ptr_val[3];
+        val.slot = ptr_val[6];
+        val.lr = ptr_val[4];
+        val.lr_g2sum = ptr_val[5];
+        if (dim > 7) {
+          val.mf_size = MF_DIM + 1;
+          for (int x = 0; x < val.mf_size; x++) {
+            val.mf[x] = ptr_val[x + 7];
+          }
+        } else {
+          val.mf_size = 0;
+          for (int x = 0; x < MF_DIM + 1; x++) {
+            val.mf[x] = 0;
+          }
+        }
+      }
+      device_mutex[dev]->unlock();
+    }
+  };
   for (size_t i = 0; i < threads.size(); i++) {
-    threads[i] = std::thread(ptl_func, i);
+    threads[i] = std::thread(build_func, i);
   }
   for (std::thread& t : threads) {
     t.join();
   }
   timeline.Pause();
-  VLOG(0) << "GpuPs pull sparse cost " << timeline.ElapsedSec() << " seconds.";
+  VLOG(1) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec()
+          << " seconds.";
 }
 
 void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) {
-  BuildTask(table_id, feature_dim);
+  int device_num = heter_devices_.size();
+  std::shared_ptr<HeterContext> gpu_task = gpu_task_pool_.Get();
+  BuildTask(gpu_task, table_id, feature_dim);
   platform::Timer timeline;
   timeline.Start();
-  std::shared_ptr<HeterContext> gpu_task = gpu_task_pool_.Get();
   int shard_num = gpu_task->feature_keys_.size();
   if (shard_num == 0) {
     return;
   }
-  std::vector<size_t> feature_keys_count(shard_num);
+  std::vector<size_t> feature_keys_count(device_num);
   size_t size_max = 0;
-  for (int i = 0; i < shard_num; i++) {
-    feature_keys_count[i] = gpu_task->feature_keys_[i].size();
+  for (int i = 0; i < device_num; i++) {
+    feature_keys_count[i] = gpu_task->device_keys_[i].size();
     size_max = std::max(size_max, feature_keys_count[i]);
   }
   if (HeterPs_) {
     HeterPs_->show_one_table(0);
     return;
   }
-  std::vector<std::thread> threads(shard_num);
+  std::vector<std::thread> threads(device_num);
   HeterPs_ = HeterPsBase::get_instance(size_max, resource_);
   auto build_func = [this, &gpu_task, &feature_keys_count](int i) {
     std::cout << "building table: " << i << std::endl;
-    this->HeterPs_->build_ps(i, gpu_task->feature_keys_[i].data(),
-                             gpu_task->feature_values_[i].data(),
-                             feature_keys_count[i], 10000, 2);
+    this->HeterPs_->build_ps(i, gpu_task->device_keys_[i].data(),
+                             gpu_task->device_values_[i].data(),
+                             feature_keys_count[i], 500000, 2);
     HeterPs_->show_one_table(i);
   };
   for (size_t i = 0; i < threads.size(); i++) {
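The heart of the fix: instead of handing per-shard buffers straight to the GPU tables, each shard thread now re-partitions its keys by key % device_num and appends them to shared per-device vectors under that device's mutex. A stripped-down standalone sketch of this partition-and-append pattern (stand-in data, not Paddle code):

#include <cstdint>
#include <mutex>
#include <thread>
#include <vector>

int main() {
  const int device_num = 2, shard_num = 4;
  std::vector<std::vector<uint64_t>> device_keys(device_num);
  std::vector<std::mutex> device_mutex(device_num);

  auto build_func = [&](int i) {
    // Keys this shard thread is responsible for (stand-in data).
    std::vector<uint64_t> local_keys = {uint64_t(i), uint64_t(i + 10)};
    // Bucket locally by destination device, then append under the lock.
    std::vector<std::vector<uint64_t>> task_keys(device_num);
    for (uint64_t k : local_keys) task_keys[k % device_num].push_back(k);
    for (int dev = 0; dev < device_num; dev++) {
      std::lock_guard<std::mutex> guard(device_mutex[dev]);
      device_keys[dev].insert(device_keys[dev].end(), task_keys[dev].begin(),
                              task_keys[dev].end());
    }
  };

  std::vector<std::thread> threads;
  for (int i = 0; i < shard_num; i++) threads.emplace_back(build_func, i);
  for (auto& t : threads) t.join();
  return 0;
}

Bucketing into thread-local task_keys first keeps each mutex held only for the append, not for the whole partitioning pass.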
@@ -207,7 +245,7 @@ void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) {
     t.join();
   }
   timeline.Pause();
-  VLOG(0) << "GpuPs build table total costs: " << timeline.ElapsedSec()
+  VLOG(1) << "GpuPs build table total costs: " << timeline.ElapsedSec()
           << " s.";
 }
paddle/fluid/framework/fleet/ps_gpu_wrapper.h
@@ -76,7 +76,8 @@ class PSGPUWrapper {
                  const int batch_size);
   void BuildGPUPS(const uint64_t table_id, int feature_dim);
-  void BuildTask(uint64_t table_id, int feature_dim);
+  void BuildTask(std::shared_ptr<HeterContext> gpu_task, uint64_t table_id,
+                 int feature_dim);
   void InitializeGPU(const std::vector<int>& dev_ids) {
     if (s_instance_ != NULL) {
       VLOG(3) << "PSGPUWrapper Begin InitializeGPU";
paddle/fluid/framework/ps_gpu_trainer.cc
@@ -74,8 +74,6 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc,
     workers_[i]->Initialize(trainer_desc);
     workers_[i]->SetWorkerNum(place_num);
   }
-  auto gpu_ps_wrapper = PSGPUWrapper::GetInstance();
-  gpu_ps_wrapper->InitializeGPU(dev_ids);
   return;
 }
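Note: this removal pairs with the new init_gpu_ps binding below; GPU initialization is no longer an implicit side effect of PSGPUTrainer::Initialize but an explicit call made from the Python side before training starts.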
paddle/fluid/pybind/ps_gpu_wrapper_py.cc
@@ -41,6 +41,10 @@ void BindPSGPUWrapper(py::module* m) {
            py::call_guard<py::gil_scoped_release>())
       .def("init_GPU_server", &framework::PSGPUWrapper::InitializeGPUServer,
           py::call_guard<py::gil_scoped_release>())
+      .def("set_dataset", &framework::PSGPUWrapper::SetDataset,
+           py::call_guard<py::gil_scoped_release>())
+      .def("init_gpu_ps", &framework::PSGPUWrapper::InitializeGPU,
+           py::call_guard<py::gil_scoped_release>())
       .def("build_gpu_ps", &framework::PSGPUWrapper::BuildGPUPS,
            py::call_guard<py::gil_scoped_release>());
 }  // end PSGPUWrapper
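With set_dataset and init_gpu_ps exposed, the driver-side call order implied by this commit is roughly the following (a sketch, not verbatim Paddle code; dataset, dev_ids, table_id, and feature_dim are assumed to be supplied by the caller):

// Sketch of the sequence behind the bindings above.
auto wrapper = paddle::framework::PSGPUWrapper::GetInstance();
wrapper->SetDataset(dataset);                // bound as "set_dataset"
wrapper->InitializeGPU(dev_ids);             // bound as "init_gpu_ps"
                                             // (formerly inside PSGPUTrainer)
wrapper->BuildGPUPS(table_id, feature_dim);  // bound as "build_gpu_ps",
                                             // which now drives BuildTask()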