未验证 提交 5496a7ab 编写于 作者: T Thunderbrook 提交者: GitHub

Dump cpu performance (#40068)

* dump cpu

* code format
上级 1ca379bf
...@@ -186,45 +186,63 @@ void HashTable<KeyType, ValType>::insert(const KeyType* d_keys, size_t len, ...@@ -186,45 +186,63 @@ void HashTable<KeyType, ValType>::insert(const KeyType* d_keys, size_t len,
template <typename KeyType, typename ValType> template <typename KeyType, typename ValType>
void HashTable<KeyType, ValType>::dump_to_cpu(int devid, cudaStream_t stream) { void HashTable<KeyType, ValType>::dump_to_cpu(int devid, cudaStream_t stream) {
container_->prefetch(cudaCpuDeviceId, stream); container_->prefetch(cudaCpuDeviceId, stream);
std::vector<std::thread> threads;
size_t num = container_->size(); size_t num = container_->size();
KeyType unuse_key = std::numeric_limits<KeyType>::max(); KeyType unuse_key = std::numeric_limits<KeyType>::max();
thrust::pair<KeyType, ValType>* kv = container_->data(); thrust::pair<KeyType, ValType>* kv = container_->data();
for (size_t i = 0; i < num; ++i) {
if (kv[i].first == unuse_key) { int thread_num = 8;
continue; int len_per_thread = num / thread_num;
} int remain = num % thread_num;
ValType& gpu_val = kv[i].second; int begin = 0;
auto dump_func = [unuse_key, kv](int left, int right) {
for (int i = left; i < right; i++) {
if (kv[i].first == unuse_key) {
continue;
}
ValType& gpu_val = kv[i].second;
#ifdef PADDLE_WITH_PSLIB #ifdef PADDLE_WITH_PSLIB
auto* downpour_value = auto* downpour_value =
(paddle::ps::DownpourFixedFeatureValue*)(gpu_val.cpu_ptr); (paddle::ps::DownpourFixedFeatureValue*)(gpu_val.cpu_ptr);
int downpour_value_size = downpour_value->size(); int downpour_value_size = downpour_value->size();
if (gpu_val.mf_size > 0 && downpour_value_size == 7) { if (gpu_val.mf_size > 0 && downpour_value_size == 7) {
downpour_value->resize(gpu_val.mf_size + downpour_value_size); downpour_value->resize(gpu_val.mf_size + downpour_value_size);
} }
float* cpu_val = downpour_value->data(); float* cpu_val = downpour_value->data();
// cpu_val[0] = 0; // cpu_val[0] = 0;
cpu_val[1] = gpu_val.delta_score; cpu_val[1] = gpu_val.delta_score;
cpu_val[2] = gpu_val.show; cpu_val[2] = gpu_val.show;
cpu_val[3] = gpu_val.clk; cpu_val[3] = gpu_val.clk;
cpu_val[4] = gpu_val.lr; cpu_val[4] = gpu_val.lr;
cpu_val[5] = gpu_val.lr_g2sum; cpu_val[5] = gpu_val.lr_g2sum;
cpu_val[6] = gpu_val.slot; cpu_val[6] = gpu_val.slot;
if (gpu_val.mf_size > 0) { if (gpu_val.mf_size > 0) {
for (int x = 0; x < gpu_val.mf_size; x++) { for (int x = 0; x < gpu_val.mf_size; x++) {
cpu_val[x + 7] = gpu_val.mf[x]; cpu_val[x + 7] = gpu_val.mf[x];
}
} }
}
#endif #endif
#ifdef PADDLE_WITH_PSCORE #ifdef PADDLE_WITH_PSCORE
auto* downpour_value = (paddle::distributed::VALUE*)(gpu_val.cpu_ptr); auto* downpour_value = (paddle::distributed::VALUE*)(gpu_val.cpu_ptr);
downpour_value->count_ = gpu_val.show; downpour_value->count_ = gpu_val.show;
for (int x = 0; x < gpu_val.mf_size; x++) { for (int x = 0; x < gpu_val.mf_size; x++) {
downpour_value->data_[x] = gpu_val.mf[x]; downpour_value->data_[x] = gpu_val.mf[x];
} }
#endif #endif
}
};
for (int i = 0; i < thread_num; i++) {
threads.push_back(std::thread(
dump_func, begin, begin + len_per_thread + (i < remain ? 1 : 0)));
begin += len_per_thread + (i < remain ? 1 : 0);
}
for (std::thread& t : threads) {
t.join();
} }
container_->prefetch(devid, stream); // container_->prefetch(devid, stream);
} }
template <typename KeyType, typename ValType> template <typename KeyType, typename ValType>
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册