From bb0713b2973b5f3183ba2fc5c6561c05061a386a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com>
Date: Mon, 20 Dec 2021 16:36:22 +0800
Subject: [PATCH] changes the call AllocShared to Alloc, test=develop (#38258)

---
 paddle/fluid/framework/fleet/box_wrapper.cu    |   9 +-
 .../fluid/framework/fleet/box_wrapper_impl.h   |   8 +-
 .../framework/fleet/heter_ps/heter_comm.h      |  21 ++--
 .../framework/fleet/heter_ps/heter_comm_inl.h  | 112 +++++++++---------
 paddle/fluid/framework/fleet/ps_gpu_wrapper.cc |   9 +-
 paddle/fluid/framework/fleet/ps_gpu_wrapper.cu |   9 +-
 paddle/fluid/operators/math/math_function.cu   |   4 +-
 paddle/pten/kernels/hybird/transpose.cu        |   4 +-
 8 files changed, 89 insertions(+), 87 deletions(-)

diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu
index c91d371f5a1..0ef77a03876 100644
--- a/paddle/fluid/framework/fleet/box_wrapper.cu
+++ b/paddle/fluid/framework/fleet/box_wrapper.cu
@@ -140,7 +140,7 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
           platform::DeviceContextPool::Instance().Get(
               BOOST_GET_CONST(platform::CUDAPlace, place)))
           ->stream();
-  auto buf_value = memory::AllocShared(place, values.size() * sizeof(float*));
+  auto buf_value = memory::Alloc(place, values.size() * sizeof(float*));
   float** gpu_values = reinterpret_cast<float**>(buf_value->ptr());
 #ifdef PADDLE_WITH_HIP
   hipMemcpy(gpu_values, values.data(), values.size() * sizeof(float*),
@@ -233,11 +233,10 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
     slot_lengths_lod[i] += slot_lengths_lod[i - 1];
   }
   auto buf_grad_value =
-      memory::AllocShared(place, grad_values.size() * sizeof(float*));
-  auto buf_length =
-      memory::AllocShared(place, slot_lengths.size() * sizeof(int64_t));
+      memory::Alloc(place, grad_values.size() * sizeof(float*));
+  auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t));
   auto buf_slot_vector =
-      memory::AllocShared(place, slot_lengths_lod.size() * sizeof(int));
+      memory::Alloc(place, slot_lengths_lod.size() * sizeof(int));
   float** gpu_values = reinterpret_cast<float**>(buf_grad_value->ptr());
   int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
diff --git a/paddle/fluid/framework/fleet/box_wrapper_impl.h b/paddle/fluid/framework/fleet/box_wrapper_impl.h
index 8832f0a20e3..f42b0395eaf 100644
--- a/paddle/fluid/framework/fleet/box_wrapper_impl.h
+++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h
@@ -32,7 +32,7 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place,
   int64_t total_length =
       std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
-  auto buf = memory::AllocShared(
+  auto buf = memory::Alloc(
       place,
       total_length * sizeof(boxps::FeatureValueGpu<EMBEDX_DIM, EXPAND_EMBED_DIM>));
   boxps::FeatureValueGpu<EMBEDX_DIM, EXPAND_EMBED_DIM>* total_values_gpu =
       reinterpret_cast<boxps::FeatureValueGpu<EMBEDX_DIM, EXPAND_EMBED_DIM>*>(buf->ptr());
@@ -55,9 +55,9 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place,
     for (size_t i = 1; i < slot_lengths_lod.size(); i++) {
       slot_lengths_lod[i] += slot_lengths_lod[i - 1];
     }
-    auto buf_key = memory::AllocShared(place, keys.size() * sizeof(uint64_t*));
+    auto buf_key = memory::Alloc(place, keys.size() * sizeof(uint64_t*));
     auto buf_length =
-        memory::AllocShared(place, slot_lengths.size() * sizeof(int64_t));
+        memory::Alloc(place, slot_lengths.size() * sizeof(int64_t));
     uint64_t** gpu_keys = reinterpret_cast<uint64_t**>(buf_key->ptr());
     int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
 #ifdef PADDLE_WITH_HIP
@@ -118,7 +118,7 @@ void BoxWrapper::PushSparseGradCase(
   all_timer.Start();
   int64_t total_length =
       std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
-  auto buf = memory::AllocShared(
+  auto buf = memory::Alloc(
       place,
       total_length * sizeof(boxps::FeaturePushValueGpu<EMBEDX_DIM, EXPAND_EMBED_DIM>));
diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h
index c3fe2dabea9..7b43e68ff01 100644
--- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h
@@ -17,9 +17,10 @@ limitations under the License. */
 #include <vector>
 #include "cub/cub.cuh"
 #include "cub/util_allocator.cuh"
-#include "hashtable.h"
-#include "heter_resource.h"
+#include "hashtable.h"  // NOLINT
+#include "heter_resource.h"  // NOLINT
 #include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h"
+#include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/dynload/nccl.h"
@@ -58,7 +59,7 @@ class HeterComm {
   void split_input_to_shard(KeyType* d_keys, int* d_idx_ptr, size_t len,
                             int* left, int* right, int gpu_num);
   void merge_grad(int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len,
-                  int& uniq_len);
+                  int& uniq_len);  // NOLINT
   void pull_sparse(int num, KeyType* d_keys, ValType* d_vals, size_t len);
   void build_ps(int num, KeyType* h_keys, ValType* h_vals, size_t len,
                 size_t chunk_size, int stream_num);
@@ -68,15 +69,15 @@ class HeterComm {
 
   template <typename Sgd>
   void push_sparse(int num, KeyType* d_keys, GradType* d_grads, size_t len,
-                   Sgd& sgd);
+                   Sgd& sgd);  // NOLINT
 
   template <typename Sgd>
   void push_sparse_multi_node(int num, KeyType* d_keys, GradType* d_grads,
-                              size_t len, Sgd& sgd);
+                              size_t len, Sgd& sgd);  // NOLINT
 
   template <typename Sgd>
   void update_one_table(int num, KeyType* d_keys, GradType* d_grads, size_t len,
-                        Sgd& sgd);
+                        Sgd& sgd);  // NOLINT
 
   int gather_one_node_grad(int num, KeyType* d_keys, GradType* d_grads,
                            int len);
@@ -136,16 +137,16 @@ class HeterComm {
     if (force || size > all_keys_mem->size()) {
       all_keys_mem.reset();
       all_grads_mem.reset();
-      all_keys_mem = memory::AllocShared(place_, size * sizeof(KeyType));
-      all_grads_mem = memory::AllocShared(place_, size * sizeof(GradType));
+      all_keys_mem = memory::Alloc(place_, size * sizeof(KeyType));
+      all_grads_mem = memory::Alloc(place_, size * sizeof(GradType));
       all_keys = reinterpret_cast<KeyType*>(all_keys_mem->ptr());
       all_grads = reinterpret_cast<GradType*>(all_grads_mem->ptr());
     }
     if (force || size > local_keys_mem->size()) {
       local_keys_mem.reset();
       local_grads_mem.reset();
-      local_keys_mem = memory::AllocShared(place_, size * sizeof(KeyType));
-      local_grads_mem = memory::AllocShared(place_, size * sizeof(GradType));
+      local_keys_mem = memory::Alloc(place_, size * sizeof(KeyType));
+      local_grads_mem = memory::Alloc(place_, size * sizeof(GradType));
       local_keys = reinterpret_cast<KeyType*>(local_keys_mem->ptr());
       local_grads = reinterpret_cast<GradType*>(local_grads_mem->ptr());
     }
diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
index c293b07e899..2cf702969f9 100644
--- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
@@ -28,7 +28,7 @@ __global__ void fill_idx(T* idx, size_t len) {
 
 template <typename T>
 void show_tensor(T* input, size_t len, gpuStream_t stream, std::string name) {
-  T tmp[len];
+  T tmp[len];  // NOLINT
   cudaMemcpyAsync(&tmp, input, sizeof(T) * len, cudaMemcpyDeviceToHost, stream);
   cudaStreamSynchronize(stream);
   std::cout << name;
@@ -101,7 +101,7 @@ HeterComm<KeyType, ValType, GradType>::HeterComm(
   for (int i = 0; i < resource_->total_gpu(); ++i) {
     platform::CUDADeviceGuard guard(resource_->dev_id(i));
     allocators_.push_back(std::make_shared<cub::CachingDeviceAllocator>(
-        8, 1, (unsigned int)-1, (size_t)-1, false, false));
+        8, 1, (unsigned int)-1, (size_t)-1, false, false));  // NOLINT
     auto table = new Table(capacity / load_factor_);
     tables_.push_back(table);
     if (multi_node_) {
@@ -174,10 +174,12 @@ void HeterComm<KeyType, ValType, GradType>::create_storage(int start_index,
   for (size_t i = 0; i < nodes.size(); ++i) {
     platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].gpu_num));
     allocator->DeviceAllocate(
-        resource_->dev_id(nodes[i].gpu_num), (void**)&(nodes[i].key_storage),
+        resource_->dev_id(nodes[i].gpu_num),
+        (void**)&(nodes[i].key_storage),  // NOLINT
         keylen, resource_->remote_stream(nodes[i].gpu_num, start_index));
     allocator->DeviceAllocate(
-        resource_->dev_id(nodes[i].gpu_num), (void**)&(nodes[i].val_storage),
+        resource_->dev_id(nodes[i].gpu_num),
+        (void**)&(nodes[i].val_storage),  // NOLINT
         vallen, resource_->remote_stream(nodes[i].gpu_num, start_index));
 
     nodes[i].key_bytes_len = keylen;
@@ -342,16 +344,16 @@ void HeterComm<KeyType, ValType, GradType>::build_ps(int num, KeyType* h_keys,
   platform::CUDAPlace place = platform::CUDAPlace(dev_id);
   platform::CUDADeviceGuard guard(dev_id);
 
-  std::vector<std::shared_ptr<memory::Allocation>> d_key_bufs;
-  std::vector<std::shared_ptr<memory::Allocation>> d_val_bufs;
+  std::vector<memory::allocation::AllocationPtr> d_key_bufs;
+  std::vector<memory::allocation::AllocationPtr> d_val_bufs;
 
-  gpuStream_t streams[stream_num];
+  gpuStream_t streams[stream_num];  // NOLINT
   for (int i = 0; i < stream_num; ++i) {
     PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&(streams[i])));
-    auto d_k_buf = memory::AllocShared(place, chunk_size * sizeof(KeyType));
-    auto d_v_buf = memory::AllocShared(place, chunk_size * sizeof(ValType));
-    d_key_bufs.push_back(d_k_buf);
-    d_val_bufs.push_back(d_v_buf);
+    auto d_k_buf = memory::Alloc(place, chunk_size * sizeof(KeyType));
+    auto d_v_buf = memory::Alloc(place, chunk_size * sizeof(ValType));
+    d_key_bufs.push_back(std::move(d_k_buf));
+    d_val_bufs.push_back(std::move(d_v_buf));
   }
 
   int cur_len = 0;
@@ -383,11 +385,9 @@ void HeterComm<KeyType, ValType, GradType>::build_ps(int num, KeyType* h_keys,
 }
 
 template <typename KeyType, typename ValType, typename GradType>
-void HeterComm<KeyType, ValType, GradType>::merge_grad(int gpu_num,
-                                                       KeyType* d_keys,
-                                                       GradType* d_grads,
-                                                       size_t len,
-                                                       int& uniq_len) {
+void HeterComm<KeyType, ValType, GradType>::merge_grad(
+    int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len,
+    int& uniq_len) {  // NOLINT
   int dev_id = resource_->dev_id(gpu_num);
   platform::CUDAPlace place = platform::CUDAPlace(dev_id);
   platform::CUDADeviceGuard guard(dev_id);
@@ -395,10 +395,10 @@ void HeterComm<KeyType, ValType, GradType>::merge_grad(int gpu_num,
 
   size_t temp_storage_bytes;
 
-  auto d_merge_keys = memory::AllocShared(place, len * sizeof(KeyType));
+  auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType));
   KeyType* d_merge_keys_ptr = reinterpret_cast<KeyType*>(d_merge_keys->ptr());
 
-  auto d_merge_grads = memory::AllocShared(place, len * sizeof(GradType));
+  auto d_merge_grads = memory::Alloc(place, len * sizeof(GradType));
   GradType* d_merge_grads_ptr =
       reinterpret_cast<GradType*>(d_merge_grads->ptr());
@@ -407,14 +407,14 @@ void HeterComm<KeyType, ValType, GradType>::merge_grad(int gpu_num,
       d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false));
 
   void* d_buff = NULL;
-  auto d_temp_storage = memory::AllocShared(place, temp_storage_bytes);
+  auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
 
   PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs(
       d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr,
       d_grads, d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false));
   temp_storage_bytes = 0;
 
-  auto d_num_runs_out_mem = memory::AllocShared(place, sizeof(int));
+  auto d_num_runs_out_mem = memory::Alloc(place, sizeof(int));
   int* d_num_runs_out = reinterpret_cast<int*>(d_num_runs_out_mem->ptr());
 
   PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::ReduceByKey(
@@ -423,7 +423,7 @@
 
   if (d_temp_storage->size() < temp_storage_bytes) {
     d_temp_storage = NULL;
-    d_temp_storage = memory::AllocShared(place, temp_storage_bytes);
+    d_temp_storage = memory::Alloc(place, temp_storage_bytes);
   }
 
   PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::ReduceByKey(
@@ -445,13 +445,13 @@ void HeterComm<KeyType, ValType, GradType>::split_input_to_shard(
   platform::CUDADeviceGuard guard(dev_id);
   auto stream = resource_->local_stream(gpu_num, 0);
 
-  auto d_idx_tmp = memory::AllocShared(place, len * sizeof(int));
+  auto d_idx_tmp = memory::Alloc(place, len * sizeof(int));
   int* d_idx_tmp_ptr = reinterpret_cast<int*>(d_idx_tmp->ptr());
 
-  auto d_shard_index = memory::AllocShared(place, len * sizeof(int));
+  auto d_shard_index = memory::Alloc(place, len * sizeof(int));
   int* d_shard_index_ptr = reinterpret_cast<int*>(d_shard_index->ptr());
 
-  auto d_shard_index_tmp = memory::AllocShared(place, len * sizeof(int));
+  auto d_shard_index_tmp = memory::Alloc(place, len * sizeof(int));
   int* d_shard_index_tmp_ptr = reinterpret_cast<int*>(d_shard_index_tmp->ptr());
 
   int grid_size = (len - 1) / block_size_ + 1;
@@ -465,7 +465,7 @@ void HeterComm<KeyType, ValType, GradType>::split_input_to_shard(
       NULL, temp_storage_bytes, d_shard_index_tmp_ptr, d_shard_index_ptr,
       d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream));
 
-  auto d_temp_storage = memory::AllocShared(place, temp_storage_bytes);
+  auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
   PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs(
       d_temp_storage->ptr(), temp_storage_bytes, d_shard_index_tmp_ptr,
       d_shard_index_ptr, d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream));
@@ -491,23 +491,23 @@ void HeterComm<KeyType, ValType, GradType>::pull_sparse(int num,
 
   int grid_size = (len - 1) / block_size_ + 1;
 
-  int h_left[total_gpu];
-  int h_right[total_gpu];
+  int h_left[total_gpu];  // NOLINT
+  int h_right[total_gpu];  // NOLINT
 
-  auto d_left = memory::AllocShared(place, total_gpu * sizeof(int));
-  auto d_right = memory::AllocShared(place, total_gpu * sizeof(int));
+  auto d_left = memory::Alloc(place, total_gpu * sizeof(int));
+  auto d_right = memory::Alloc(place, total_gpu * sizeof(int));
   int* d_left_ptr = reinterpret_cast<int*>(d_left->ptr());
   int* d_right_ptr = reinterpret_cast<int*>(d_right->ptr());
 
   cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream);
   cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream);
   //
-  auto d_idx = memory::AllocShared(place, len * sizeof(int));
+  auto d_idx = memory::Alloc(place, len * sizeof(int));
   int* d_idx_ptr = reinterpret_cast<int*>(d_idx->ptr());
 
-  auto d_shard_keys = memory::AllocShared(place, len * sizeof(KeyType));
+  auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType));
   KeyType* d_shard_keys_ptr = reinterpret_cast<KeyType*>(d_shard_keys->ptr());
-  auto d_shard_vals = memory::AllocShared(place, len * sizeof(ValType));
+  auto d_shard_vals = memory::Alloc(place, len * sizeof(ValType));
   ValType* d_shard_vals_ptr = reinterpret_cast<ValType*>(d_shard_vals->ptr());
 
   split_input_to_shard(d_keys, d_idx_ptr, len, d_left_ptr, d_right_ptr, num);
@@ -574,7 +574,8 @@ template <typename KeyType, typename ValType, typename GradType>
 template <typename Sgd>
 void HeterComm<KeyType, ValType, GradType>::push_sparse(int gpu_num,
                                                         KeyType* d_keys,
                                                         GradType* d_grads,
-                                                        size_t len, Sgd& sgd) {
+                                                        size_t len,
+                                                        Sgd& sgd) {  // NOLINT
   if (len == 0) {
     return;
   }
@@ -585,23 +586,23 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int gpu_num,
   platform::CUDADeviceGuard guard(dev_id);
   auto stream = resource_->local_stream(gpu_num, 0);
 
-  int h_left[total_gpu];
-  int h_right[total_gpu];
+  int h_left[total_gpu];  // NOLINT
+  int h_right[total_gpu];  // NOLINT
 
-  auto d_left = memory::AllocShared(place, total_gpu * sizeof(int));
-  auto d_right = memory::AllocShared(place, total_gpu * sizeof(int));
+  auto d_left = memory::Alloc(place, total_gpu * sizeof(int));
+  auto d_right = memory::Alloc(place, total_gpu * sizeof(int));
   int* d_left_ptr = reinterpret_cast<int*>(d_left->ptr());
   int* d_right_ptr = reinterpret_cast<int*>(d_right->ptr());
 
   cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream);
   cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream);
   //
-  auto d_idx = memory::AllocShared(place, len * sizeof(int));
+  auto d_idx = memory::Alloc(place, len * sizeof(int));
   int* d_idx_ptr = reinterpret_cast<int*>(d_idx->ptr());
 
-  auto d_shard_keys = memory::AllocShared(place, len * sizeof(KeyType));
+  auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType));
   KeyType* d_shard_keys_ptr = reinterpret_cast<KeyType*>(d_shard_keys->ptr());
-  auto d_shard_grads = memory::AllocShared(place, len * sizeof(GradType));
+  auto d_shard_grads = memory::Alloc(place, len * sizeof(GradType));
   GradType* d_shard_grads_ptr =
       reinterpret_cast<GradType*>(d_shard_grads->ptr());
@@ -664,7 +665,8 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int gpu_num,
 template <typename KeyType, typename ValType, typename GradType>
 template <typename Sgd>
 void HeterComm<KeyType, ValType, GradType>::update_one_table(
-    int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, Sgd& sgd) {
+    int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len,
+    Sgd& sgd) {  // NOLINT
   if (len == 0) {
     return;
   }
@@ -681,7 +683,8 @@ void HeterComm<KeyType, ValType, GradType>::update_one_table(
 template <typename KeyType, typename ValType, typename GradType>
 template <typename Sgd>
 void HeterComm<KeyType, ValType, GradType>::push_sparse_multi_node(
-    int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, Sgd& sgd) {
+    int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len,
+    Sgd& sgd) {  // NOLINT
   if (len == 0) {
     return;
   }
@@ -711,8 +714,8 @@ int HeterComm<KeyType, ValType, GradType>::gather_one_node_grad(
   ncclComm_t nccl_inner_comm = nccl_inner_comms_[gpu_num];
   // alloc for size
-  int h_node_len[total_gpu];
-  auto d_node_len_mem = memory::AllocShared(place, total_gpu * sizeof(int));
+  int h_node_len[total_gpu];  // NOLINT
+  auto d_node_len_mem = memory::Alloc(place, total_gpu * sizeof(int));
   int* d_node_len = reinterpret_cast<int*>(d_node_len_mem->ptr());
   h_node_len[gpu_num] = len;
@@ -721,9 +724,10 @@ int HeterComm<KeyType, ValType, GradType>::gather_one_node_grad(
 
   // allgather grad len
   PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
-  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather(
-      (const void*)(d_node_len + gpu_num), (void*)d_node_len, 1, ncclInt,
-      nccl_inner_comm, stream));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      platform::dynload::ncclAllGather((const void*)(d_node_len + gpu_num),
+                                       (void*)d_node_len, 1, ncclInt,  // NOLINT
+                                       nccl_inner_comm, stream));
   PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
   PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
   cudaMemcpy(h_node_len, d_node_len, sizeof(int) * total_gpu,
@@ -747,17 +751,17 @@ int HeterComm<KeyType, ValType, GradType>::gather_one_node_grad(
   PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
   PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
 
-  int h_left[total_gpu];
-  int h_right[total_gpu];
-  auto d_left = memory::AllocShared(place, total_gpu * sizeof(int));
-  auto d_right = memory::AllocShared(place, total_gpu * sizeof(int));
+  int h_left[total_gpu];  // NOLINT
+  int h_right[total_gpu];  // NOLINT
+  auto d_left = memory::Alloc(place, total_gpu * sizeof(int));
+  auto d_right = memory::Alloc(place, total_gpu * sizeof(int));
   int* d_left_ptr = reinterpret_cast<int*>(d_left->ptr());
   int* d_right_ptr = reinterpret_cast<int*>(d_right->ptr());
 
   int merge_num = 0;
   for (int i = 0; i < total_gpu; ++i) {
     int index = i * max_size;
-    auto d_idx = memory::AllocShared(place, h_node_len[i] * sizeof(int));
+    auto d_idx = memory::Alloc(place, h_node_len[i] * sizeof(int));
     int* d_idx_ptr = reinterpret_cast<int*>(d_idx->ptr());
 
     cudaMemset(d_left_ptr, -1, total_gpu * sizeof(int));
@@ -794,8 +798,8 @@ int HeterComm<KeyType, ValType, GradType>::gather_multi_node_grad(
   int max_size = 0;
   ncclComm_t nccl_inter_comm = nccl_inter_comms_[gpu_num];
   // alloc for size
-  int h_node_len[node_size_];
-  auto d_node_len_mem = memory::AllocShared(place, node_size_ * sizeof(int));
+  int h_node_len[node_size_];  // NOLINT
+  auto d_node_len_mem = memory::Alloc(place, node_size_ * sizeof(int));
   int* d_node_len = reinterpret_cast<int*>(d_node_len_mem->ptr());
   h_node_len[0] = len;
diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
index a5194ce7e29..264155d79c0 100644
--- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
+++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
@@ -592,7 +592,7 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
   all_timer.Start();
   int64_t total_length =
       std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
-  auto buf = memory::AllocShared(place, total_length * sizeof(FeatureValue));
+  auto buf = memory::Alloc(place, total_length * sizeof(FeatureValue));
   FeatureValue* total_values_gpu = reinterpret_cast<FeatureValue*>(buf->ptr());
   if (platform::is_cpu_place(place)) {
     PADDLE_THROW(platform::errors::Unimplemented(
@@ -610,9 +610,9 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
     for (size_t i = 1; i < slot_lengths_lod.size(); i++) {
       slot_lengths_lod[i] += slot_lengths_lod[i - 1];
     }
-    auto buf_key = memory::AllocShared(place, keys.size() * sizeof(uint64_t*));
+    auto buf_key = memory::Alloc(place, keys.size() * sizeof(uint64_t*));
     auto buf_length =
-        memory::AllocShared(place, slot_lengths.size() * sizeof(int64_t));
+        memory::Alloc(place, slot_lengths.size() * sizeof(int64_t));
     uint64_t** gpu_keys = reinterpret_cast<uint64_t**>(buf_key->ptr());
     int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
     cudaMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*),
@@ -660,8 +660,7 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
   all_timer.Start();
   int64_t total_length =
       std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
-  auto buf =
-      memory::AllocShared(place, total_length * sizeof(FeaturePushValue));
+  auto buf = memory::Alloc(place, total_length * sizeof(FeaturePushValue));
   FeaturePushValue* total_grad_values_gpu =
       reinterpret_cast<FeaturePushValue*>(buf->ptr());
   if (platform::is_cpu_place(place)) {
diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu
index a0954ef0709..b4fa09653a3 100644
--- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu
+++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu
@@ -116,7 +116,7 @@ void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place,
       platform::DeviceContextPool::Instance().Get(
          BOOST_GET_CONST(platform::CUDAPlace, place)))
          ->stream();
-  auto buf_value = memory::AllocShared(place, values.size() * sizeof(float*));
+  auto buf_value = memory::Alloc(place, values.size() * sizeof(float*));
   float** gpu_values = reinterpret_cast<float**>(buf_value->ptr());
   cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*),
              cudaMemcpyHostToDevice);
@@ -156,11 +156,10 @@ void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place,
     slot_lengths_lod[i] += slot_lengths_lod[i - 1];
   }
   auto buf_grad_value =
-      memory::AllocShared(place, grad_values.size() * sizeof(float*));
-  auto buf_length =
-      memory::AllocShared(place, slot_lengths.size() * sizeof(int64_t));
+      memory::Alloc(place, grad_values.size() * sizeof(float*));
+  auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t));
   auto buf_slot_vector =
-      memory::AllocShared(place, slot_lengths_lod.size() * sizeof(int));
+      memory::Alloc(place, slot_lengths_lod.size() * sizeof(int));
   float** gpu_values = reinterpret_cast<float**>(buf_grad_value->ptr());
   int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
index fa214c6389c..a692246a06b 100644
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -102,8 +102,8 @@ struct TransposeNormal<platform::CUDADeviceContext, T> {
         BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace());
     platform::CPUPlace cpu_place = platform::CPUPlace();
     size_t size = 3 * rank * sizeof(int64_t);
-    auto cpu_buf_holder = memory::AllocShared(cpu_place, size);
-    auto cuda_buf_holder = memory::AllocShared(cuda_place, size);
+    auto cpu_buf_holder = memory::Alloc(cpu_place, size);
+    auto cuda_buf_holder = memory::Alloc(cuda_place, size);
     REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr());
     REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr());
     for (int i = 0; i < rank; ++i) {
diff --git a/paddle/pten/kernels/hybird/transpose.cu b/paddle/pten/kernels/hybird/transpose.cu
index ad2eda4edd9..5c5bd291176 100644
--- a/paddle/pten/kernels/hybird/transpose.cu
+++ b/paddle/pten/kernels/hybird/transpose.cu
@@ -69,8 +69,8 @@ struct TransposeNormal<paddle::platform::CUDADeviceContext, T> {
         BOOST_GET_CONST(paddle::platform::CUDAPlace, dev_ctx.GetPlace());
     paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace();
     size_t size = 3 * rank * sizeof(int64_t);
-    auto cpu_buf_holder = paddle::memory::AllocShared(cpu_place, size);
-    auto cuda_buf_holder = paddle::memory::AllocShared(cuda_place, size);
+    auto cpu_buf_holder = paddle::memory::Alloc(cpu_place, size);
+    auto cuda_buf_holder = paddle::memory::Alloc(cuda_place, size);
     REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr());
     REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr());
     for (int i = 0; i < rank; ++i) {
-- 
GitLab
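
Why several call sites above also gained std::move: memory::AllocShared returns a
copyable std::shared_ptr<memory::Allocation>, while memory::Alloc returns
memory::allocation::AllocationPtr, a move-only std::unique_ptr with a custom
deleter (hence the new allocation/allocator.h include and the std::move calls in
build_ps). Below is a minimal standalone C++ sketch of that ownership
difference; the Allocation struct is a hypothetical stand-in, not Paddle's real
class.

    #include <memory>
    #include <utility>
    #include <vector>

    // Hypothetical stand-in for paddle::memory::Allocation (illustration only).
    struct Allocation {
      void* ptr() const { return data; }
      void* data = nullptr;
    };

    // AllocShared-style handle: shared ownership, freely copyable.
    using SharedAllocation = std::shared_ptr<Allocation>;
    // Alloc-style handle: unique ownership; must be moved, never copied.
    using AllocationPtr = std::unique_ptr<Allocation>;

    int main() {
      std::vector<AllocationPtr> d_key_bufs;  // was std::vector<SharedAllocation>
      AllocationPtr d_k_buf(new Allocation);
      // d_key_bufs.push_back(d_k_buf);       // ill-formed: unique_ptr is not copyable
      d_key_bufs.push_back(std::move(d_k_buf));  // mirrors the build_ps change
      return 0;
    }

Dropping the shared_ptr wrapper removes a heap-allocated control block and
atomic reference counting per buffer on these hot paths, which is presumably the
motivation for the change.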