Dump cpu xingneng (performance) (#40068)

* dump cpu
* code format

Dump cpu xingneng (performance) (#40068)
* dump cpu
* code format
5496a7ab · Thunderbrook · GitHub · 1ca379bf · 5496a7ab
显示空白变更内容
内联并排

Showing with 47 additions and 29 deletions

paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h +47 -29

未找到文件。
--- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h
+++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h
@@ -186,10 +186,18 @@ void HashTable<KeyType, ValType>::insert(const KeyType* d_keys, size_t len,
 template <typename KeyType, typename ValType>
 void HashTable<KeyType, ValType>::dump_to_cpu(int devid, cudaStream_t stream) {
  container_->prefetch(cudaCpuDeviceId, stream);
+  std::vector<std::thread> threads;
  size_t num = container_->size();
  KeyType unuse_key = std::numeric_limits<KeyType>::max();
  thrust::pair<KeyType, ValType>* kv = container_->data();
-  for (size_t i = 0; i < num; ++i) {
+  int thread_num = 8;
+  int len_per_thread = num / thread_num;
+  int remain = num % thread_num;
+  int begin = 0;
+  auto dump_func = [unuse_key, kv](int left, int right) {
+    for (int i = left; i < right; i++) {
      if (kv[i].first == unuse_key) {
        continue;
      }
@@ -223,8 +231,18 @@ void HashTable<KeyType, ValType>::dump_to_cpu(int devid, cudaStream_t stream) {
      }
 #endif
    }
+  };
+  for (int i = 0; i < thread_num; i++) {
+    threads.push_back(std::thread(
+        dump_func, begin, begin + len_per_thread + (i < remain ? 1 : 0)));
+    begin += len_per_thread + (i < remain ? 1 : 0);
+  }
+  for (std::thread& t : threads) {
+    t.join();
+  }
-  container_->prefetch(devid, stream);
+  // container_->prefetch(devid, stream);
 }
 template <typename KeyType, typename ValType>