From 5496a7ab3beb60e908a2deb6eb5bca9834ac7a8b Mon Sep 17 00:00:00 2001 From: Thunderbrook <52529258+Thunderbrook@users.noreply.github.com> Date: Fri, 4 Mar 2022 17:20:26 +0800 Subject: [PATCH] Dump cpu xingneng (#40068) * dump cpu * code format --- .../framework/fleet/heter_ps/hashtable_inl.h | 76 ++++++++++++------- 1 file changed, 47 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h index b7cb2ce0f0..59220fc9cd 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h @@ -186,45 +186,63 @@ void HashTable::insert(const KeyType* d_keys, size_t len, template void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { container_->prefetch(cudaCpuDeviceId, stream); + std::vector threads; size_t num = container_->size(); KeyType unuse_key = std::numeric_limits::max(); thrust::pair* kv = container_->data(); - for (size_t i = 0; i < num; ++i) { - if (kv[i].first == unuse_key) { - continue; - } - ValType& gpu_val = kv[i].second; + + int thread_num = 8; + int len_per_thread = num / thread_num; + int remain = num % thread_num; + int begin = 0; + + auto dump_func = [unuse_key, kv](int left, int right) { + for (int i = left; i < right; i++) { + if (kv[i].first == unuse_key) { + continue; + } + ValType& gpu_val = kv[i].second; #ifdef PADDLE_WITH_PSLIB - auto* downpour_value = - (paddle::ps::DownpourFixedFeatureValue*)(gpu_val.cpu_ptr); - int downpour_value_size = downpour_value->size(); - if (gpu_val.mf_size > 0 && downpour_value_size == 7) { - downpour_value->resize(gpu_val.mf_size + downpour_value_size); - } - float* cpu_val = downpour_value->data(); - // cpu_val[0] = 0; - cpu_val[1] = gpu_val.delta_score; - cpu_val[2] = gpu_val.show; - cpu_val[3] = gpu_val.clk; - cpu_val[4] = gpu_val.lr; - cpu_val[5] = gpu_val.lr_g2sum; - cpu_val[6] = gpu_val.slot; - if (gpu_val.mf_size > 0) { - for (int x = 0; x < gpu_val.mf_size; x++) { - cpu_val[x + 7] = gpu_val.mf[x]; + auto* downpour_value = + (paddle::ps::DownpourFixedFeatureValue*)(gpu_val.cpu_ptr); + int downpour_value_size = downpour_value->size(); + if (gpu_val.mf_size > 0 && downpour_value_size == 7) { + downpour_value->resize(gpu_val.mf_size + downpour_value_size); + } + float* cpu_val = downpour_value->data(); + // cpu_val[0] = 0; + cpu_val[1] = gpu_val.delta_score; + cpu_val[2] = gpu_val.show; + cpu_val[3] = gpu_val.clk; + cpu_val[4] = gpu_val.lr; + cpu_val[5] = gpu_val.lr_g2sum; + cpu_val[6] = gpu_val.slot; + if (gpu_val.mf_size > 0) { + for (int x = 0; x < gpu_val.mf_size; x++) { + cpu_val[x + 7] = gpu_val.mf[x]; + } } - } #endif #ifdef PADDLE_WITH_PSCORE - auto* downpour_value = (paddle::distributed::VALUE*)(gpu_val.cpu_ptr); - downpour_value->count_ = gpu_val.show; - for (int x = 0; x < gpu_val.mf_size; x++) { - downpour_value->data_[x] = gpu_val.mf[x]; - } + auto* downpour_value = (paddle::distributed::VALUE*)(gpu_val.cpu_ptr); + downpour_value->count_ = gpu_val.show; + for (int x = 0; x < gpu_val.mf_size; x++) { + downpour_value->data_[x] = gpu_val.mf[x]; + } #endif + } + }; + + for (int i = 0; i < thread_num; i++) { + threads.push_back(std::thread( + dump_func, begin, begin + len_per_thread + (i < remain ? 1 : 0))); + begin += len_per_thread + (i < remain ? 1 : 0); + } + for (std::thread& t : threads) { + t.join(); } - container_->prefetch(devid, stream); + // container_->prefetch(devid, stream); } template -- GitLab