diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc
index 327adcc4aac1c50b51942c557d66dae6770e24f2..46aca62a27490b0141245beeea9a3e61d0fee651 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -257,7 +257,7 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
   void *ptr = buddy_allocator->Alloc(size);
 
   if (ptr == nullptr) {
-    LOG(WARNING) << "cudaMallocHost Cannot allocate " << size
+    LOG(WARNING) << "cudaHostAlloc Cannot allocate " << size
                  << " bytes in CUDAPinnedPlace";
   }
   if (FLAGS_init_allocated_mem) {
diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc
index 6ac3aefdd18d6d9a21dc7ce66511013dfb78bc5b..de81d12cca6ca280289371abdec225c9e2b8f4d0 100644
--- a/paddle/fluid/memory/allocation/pinned_allocator.cc
+++ b/paddle/fluid/memory/allocation/pinned_allocator.cc
@@ -32,7 +32,7 @@ Allocation *CPUPinnedAllocator::AllocateImpl(size_t size,
   //     "CPUPinnedAllocator should be used for Cross-Device Communication");
   void *ptr;
-  PADDLE_ENFORCE(cudaMallocHost(&ptr, size));
+  PADDLE_ENFORCE(cudaHostAlloc(&ptr, size, cudaHostAllocPortable));
   return new CPUPinnedAllocation(ptr, size);
 }
 }  // namespace allocation
diff --git a/paddle/fluid/memory/allocation/pinned_allocator.h b/paddle/fluid/memory/allocation/pinned_allocator.h
index 26d12dd91c7fda31802226a84d883b6a6e9abbe4..42d0938f2afbb1efca8bfdd7035bc0eada30f06b 100644
--- a/paddle/fluid/memory/allocation/pinned_allocator.h
+++ b/paddle/fluid/memory/allocation/pinned_allocator.h
@@ -19,7 +19,7 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-// Allocator uses `cudaMallocHost`
+// Allocator uses `cudaHostAlloc`
 class CPUPinnedAllocation : public Allocation {
  public:
   CPUPinnedAllocation(void *ptr, size_t size)
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 3e8fb83e9d5ba2078bcf37e4a4af74708df9c11c..197d1c2f21fd818879aafe17599bc87d33caa198 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -173,14 +173,14 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
   void* p;
   // PINNED memory is visible to all CUDA contexts.
-  cudaError_t result = cudaMallocHost(&p, size);
+  cudaError_t result = cudaHostAlloc(&p, size, cudaHostAllocPortable);
 
   if (result == cudaSuccess) {
     *index = 1;  // PINNED memory
     cuda_pinnd_alloc_size_ += size;
     return p;
   } else {
-    LOG(WARNING) << "cudaMallocHost failed.";
+    LOG(WARNING) << "cudaHostAlloc failed.";
     return nullptr;
   }
diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc
index 26ff221dfa0768bd2bcc9e6485a32485f0212ac6..defc29b91f81cb851fec24c5cd9d62dc72c54147 100644
--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/operators/reader/buffered_reader.h"
 #include <vector>
+#include "paddle/fluid/framework/data_type.h"
 
 namespace paddle {
 namespace operators {
@@ -24,6 +25,13 @@ BufferedReader::~BufferedReader() {
     position_.front().wait();
     position_.pop();
   }
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(place_)) {
+    platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
+    PADDLE_ENFORCE(cudaStreamDestroy(stream));
+    for (auto &event : events) PADDLE_ENFORCE(cudaEventDestroy(event));
+  }
+#endif
 }
 
 BufferedReader::BufferedReader(
@@ -33,6 +41,19 @@ BufferedReader::BufferedReader(
       thread_pool_(1),
       place_(place),
       buffer_size_(buffer_size) {
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(place_)) {
+    platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
+    compute_stream =
+        ((platform::CUDADeviceContext *)(platform::DeviceContextPool::Instance()
+             .Get(place_)))
+            ->stream();
+    events.resize(buffer_size);
+    for (auto &event : events)
+      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+    PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+  }
+#endif
   cpu_buffer_.resize(buffer_size);
   gpu_buffer_.resize(buffer_size);
   ReadTillBufferFullAsync();
@@ -46,6 +67,12 @@
 }
 
 void BufferedReader::ReadAsync(size_t i) {
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(place_)) {
+    platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
+    PADDLE_ENFORCE(cudaEventRecord(events[i], compute_stream));
+  }
+#endif
   position_.emplace(thread_pool_.enqueue([this, i]() -> size_t {
     TensorVec &cpu = cpu_buffer_[i];
     reader_->ReadNext(&cpu);
@@ -54,14 +81,41 @@
       return -1UL;
     }
 
+#ifdef PADDLE_WITH_CUDA
+    // NOTE(liangdun): using async copy instead of TensorCopySync
+    // TensorCopySync would block other stream
     if (platform::is_gpu_place(place_)) {
+      platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
+      PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0));
       TensorVec &gpu = gpu_buffer_[i];
       gpu.resize(cpu.size());
       for (size_t i = 0; i < cpu.size(); ++i) {
-        framework::TensorCopySync(cpu[i], place_, &gpu[i]);
+        gpu[i].Resize(cpu[i].dims());
+        gpu[i].set_layout(cpu[i].layout());
+        auto cpu_place = cpu[i].place();
+        auto cpu_ptr = cpu[i].data<void>();
+        auto gpu_ptr = gpu[i].mutable_data(place_, cpu[i].type());
+        auto size =
+            cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type());
+        if (platform::is_cuda_pinned_place(cpu_place))
+          memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
+                       boost::get<platform::CUDAPinnedPlace>(cpu_place),
+                       cpu_ptr, size, stream);
+        else if ((platform::is_gpu_place(cpu_place)))
+          memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
+                       boost::get<platform::CUDAPlace>(cpu_place), cpu_ptr,
+                       size, stream);
+        else
+          // if cpu place is not pinned, async copy is slower than sync copy,
+          // so we use sync copy instead.
+          memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
+                       boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size,
+                       0);
         gpu[i].set_lod(cpu[i].lod());
       }
+      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
     }
+#endif
     return i;
   }));
 }
diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h
index cbe2bc1b5fdd69d1a843b768e3289acd621369a6..87680da01a1f51cfdfe4d100508440eda9d1877f 100644
--- a/paddle/fluid/operators/reader/buffered_reader.h
+++ b/paddle/fluid/operators/reader/buffered_reader.h
@@ -19,6 +19,9 @@
 #include <vector>
 #include "ThreadPool.h"
 #include "paddle/fluid/framework/reader.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/gpu_info.h"
+#endif
 
 namespace paddle {
 namespace operators {
@@ -59,6 +62,11 @@
   std::vector<TensorVec> cpu_buffer_;
   std::vector<TensorVec> gpu_buffer_;
   size_t prev_pos_{-1UL};
+#ifdef PADDLE_WITH_CUDA
+  cudaStream_t stream;
+  cudaStream_t compute_stream;
+  std::vector<cudaEvent_t> events;
+#endif
 };
 
 }  // namespace reader
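
Reviewer note (not part of the patch): the allocator change swaps cudaMallocHost for cudaHostAlloc with the cudaHostAllocPortable flag. cudaMallocHost behaves like cudaHostAlloc(..., cudaHostAllocDefault), whose pages are pinned only with respect to the allocating CUDA context; the portable flag makes the region count as page-locked for every context, which is what a cross-device staging buffer needs. A minimal standalone sketch of the allocate/use/free pattern (error handling simplified; Paddle wraps these calls in PADDLE_ENFORCE):

    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
      void *ptr = nullptr;
      const size_t size = 1 << 20;  // 1 MiB staging buffer

      // cudaHostAllocPortable: the pinned pages are treated as page-locked
      // by all CUDA contexts, not only the one that allocated them.
      cudaError_t result = cudaHostAlloc(&ptr, size, cudaHostAllocPortable);
      if (result != cudaSuccess) {
        fprintf(stderr, "cudaHostAlloc failed: %s\n",
                cudaGetErrorString(result));
        return 1;
      }

      // ... use ptr as the host side of cudaMemcpyAsync transfers ...

      // Memory from either cudaMallocHost or cudaHostAlloc is released
      // with cudaFreeHost, so the Free paths need no change.
      cudaFreeHost(ptr);
      return 0;
    }

This also explains why the warning strings and the comment in pinned_allocator.h are updated: the call site name changes, but the free path (cudaFreeHost) stays the same.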
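The BufferedReader change follows the standard CUDA compute/copy overlap pattern: record an event on the compute stream, make a dedicated non-blocking copy stream wait on that event, issue cudaMemcpyAsync from pinned host memory on the copy stream, and synchronize only the copy stream before handing the batch over. Below is a standalone sketch of that ordering under illustrative names (copy_stream, pinned_src, dev_dst are not Paddle identifiers):

    #include <cuda_runtime.h>

    int main() {
      cudaStream_t compute_stream, copy_stream;
      cudaEvent_t event;
      cudaStreamCreate(&compute_stream);
      // Non-blocking: the copy stream never implicitly syncs with stream 0.
      cudaStreamCreateWithFlags(&copy_stream, cudaStreamNonBlocking);
      // Timing is disabled because the event is only used for ordering.
      cudaEventCreateWithFlags(&event, cudaEventDisableTiming);

      const size_t size = 1 << 20;
      void *pinned_src = nullptr, *dev_dst = nullptr;
      cudaHostAlloc(&pinned_src, size, cudaHostAllocPortable);
      cudaMalloc(&dev_dst, size);

      // 1. Mark the current tail of the compute stream.
      cudaEventRecord(event, compute_stream);
      // 2. Work queued on copy_stream after this call waits for the event,
      //    so the copy cannot overtake kernels already queued for compute.
      cudaStreamWaitEvent(copy_stream, event, 0);
      // 3. An H2D copy from pinned memory can genuinely overlap compute;
      //    from pageable memory the runtime stages it through an internal
      //    pinned buffer and the overlap is lost, which is why the patch
      //    falls back to a synchronous copy for plain CPUPlace tensors.
      cudaMemcpyAsync(dev_dst, pinned_src, size, cudaMemcpyHostToDevice,
                      copy_stream);
      // 4. Block only this host thread until the copy finishes, as
      //    ReadAsync does before returning the buffer index.
      cudaStreamSynchronize(copy_stream);

      cudaFree(dev_dst);
      cudaFreeHost(pinned_src);
      cudaEventDestroy(event);
      cudaStreamDestroy(copy_stream);
      cudaStreamDestroy(compute_stream);
      return 0;
    }

The key design choice mirrored from the patch is replacing TensorCopySync, which would serialize against other streams, with an event-ordered copy on a separate stream, so the reader's transfers no longer stall the compute stream.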