Unverified · Commit 9c17688a · authored by: Z zmxdream · committed by: GitHub

[GPUPS]Optimize dump_pool_to_cpu for dymf (#43219)

* optimize dump_to_cpu for dymf

* code clean. test=develop

* fix func. test=develop

* fix code style. test=develop

* fix. test=develop
Parent commit: 5434d663
......@@ -838,29 +838,42 @@ void PSGPUWrapper::EndPass() {
std::max(keysize_max, current_task_->device_dim_keys_[i][j].size());
}
}
auto dump_pool_to_cpu_func = [this](int i, int j) {
int thread_num = 8;
auto dump_pool_to_cpu_func = [this, thread_num](int i, int j, int z) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i)));
auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j];
auto& device_keys = this->current_task_->device_dim_keys_[i][j];
size_t len = device_keys.size();
// ====== multi-thread process feasign================
int len_per_thread = len / thread_num;
int remain = len % thread_num;
int left = -1, right = -1;
int real_len = len_per_thread;
if (z < remain) real_len++;
if (z < remain) {
left = z * (len_per_thread + 1);
right = left + real_len;
} else {
left = remain * (len_per_thread + 1) + (z - remain) * len_per_thread;
right = left + real_len;
}
// ============ multi-thread process feasign============
int mf_dim = this->index_dim_vec_[j];
VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim;
size_t feature_value_size =
TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float)));
char* test_build_values = (char*)malloc(feature_value_size * len);
cudaMemcpy(test_build_values, hbm_pool->mem(), feature_value_size * len,
cudaMemcpyDeviceToHost);
char* test_build_values = (char*)malloc(feature_value_size * real_len);
uint64_t offset = left * feature_value_size;
cudaMemcpy(test_build_values, hbm_pool->mem() + offset,
feature_value_size * real_len, cudaMemcpyDeviceToHost);
CHECK(len == hbm_pool->capacity());
uint64_t unuse_key = std::numeric_limits<uint64_t>::max();
for (size_t i = 0; i < len; ++i) {
for (int i = left; i < right; ++i) {
if (device_keys[i] == unuse_key) {
continue;
}
size_t offset = i * feature_value_size;
FeatureValue* gpu_val = (FeatureValue*)(test_build_values + offset);
size_t local_offset = (i - left) * feature_value_size;
FeatureValue* gpu_val = (FeatureValue*)(test_build_values + local_offset);
#ifdef PADDLE_WITH_PSLIB
auto* downpour_value =
(paddle::ps::DownpourFixedFeatureValue*)(gpu_val->cpu_ptr);
......@@ -912,10 +925,13 @@ void PSGPUWrapper::EndPass() {
if (multi_mf_dim_) {
VLOG(0) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_;
size_t device_num = heter_devices_.size();
std::vector<std::thread> threads(device_num * multi_mf_dim_);
std::vector<std::thread> threads(device_num * multi_mf_dim_ * thread_num);
for (size_t i = 0; i < device_num; i++) {
for (int j = 0; j < multi_mf_dim_; j++) {
threads[i + j * device_num] = std::thread(dump_pool_to_cpu_func, i, j);
for (int k = 0; k < thread_num; k++) {
threads[(i + j * device_num) * thread_num + k] =
std::thread(dump_pool_to_cpu_func, i, j, k);
}
}
}
for (std::thread& t : threads) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
Please finish editing this message first!
To comment, please register