Fix Pr #15296

test=develop

Fix Pr #15296
test=develop
bc921927 · Dun Liang · db8fcf6b · bc921927 · bc921927 · bc921927
6 changed file
--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -257,7 +257,7 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
  void *ptr = buddy_allocator->Alloc(size);

  if (ptr == nullptr) {
-    LOG(WARNING) << "cudaMallocHost Cannot allocate " << size
+    LOG(WARNING) << "cudaHostAlloc Cannot allocate " << size
                 << " bytes in CUDAPinnedPlace";
  }
  if (FLAGS_init_allocated_mem) {

--- a/paddle/fluid/memory/allocation/pinned_allocator.cc
+++ b/paddle/fluid/memory/allocation/pinned_allocator.cc
@@ -32,7 +32,7 @@ Allocation *CPUPinnedAllocator::AllocateImpl(size_t size,
  //    "CPUPinnedAllocator should be used for Cross-Device Communication");

  void *ptr;
-  PADDLE_ENFORCE(cudaMallocHost(&ptr, size));
+  PADDLE_ENFORCE(cudaHostAlloc(&ptr, size, cudaHostAllocPortable));
  return new CPUPinnedAllocation(ptr, size);
 }
 }  // namespace allocation

--- a/paddle/fluid/memory/allocation/pinned_allocator.h
+++ b/paddle/fluid/memory/allocation/pinned_allocator.h
@@ -19,7 +19,7 @@ namespace paddle {
 namespace memory {
 namespace allocation {

-// Allocator uses `cudaMallocHost`
+// Allocator uses `cudaHostAlloc`
 class CPUPinnedAllocation : public Allocation {
 public:
  CPUPinnedAllocation(void *ptr, size_t size)

--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -173,14 +173,14 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {

  void* p;
  // PINNED memory is visible to all CUDA contexts.
-  cudaError_t result = cudaMallocHost(&p, size);
+  cudaError_t result = cudaHostAlloc(&p, size, cudaHostAllocPortable);

  if (result == cudaSuccess) {
    *index = 1;  // PINNED memory
    cuda_pinnd_alloc_size_ += size;
    return p;
  } else {
-    LOG(WARNING) << "cudaMallocHost failed.";
+    LOG(WARNING) << "cudaHostAlloc failed.";
    return nullptr;
  }


--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@@ -29,6 +29,7 @@ BufferedReader::~BufferedReader() {
  if (platform::is_gpu_place(place_)) {
    platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
    PADDLE_ENFORCE(cudaStreamDestroy(stream));
+    for (auto &event : events) PADDLE_ENFORCE(cudaEventDestroy(event));
  }
 #endif
 }
@@ -43,7 +44,14 @@ BufferedReader::BufferedReader(
 #ifdef PADDLE_WITH_CUDA
  if (platform::is_gpu_place(place_)) {
    platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
-    PADDLE_ENFORCE(cudaStreamCreate(&stream));
+    compute_stream =
+        ((platform::CUDADeviceContext *)(platform::DeviceContextPool::Instance()
+                                             .Get(place_)))
+            ->stream();
+    events.resize(buffer_size);
+    for (auto &event : events)
+      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+    PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  }
 #endif
  cpu_buffer_.resize(buffer_size);
@@ -59,6 +67,12 @@ void BufferedReader::ReadTillBufferFullAsync() {
 }

 void BufferedReader::ReadAsync(size_t i) {
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(place_)) {
+    platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
+    PADDLE_ENFORCE(cudaEventRecord(events[i], compute_stream));
+  }
+#endif
  position_.emplace(thread_pool_.enqueue([this, i]() -> size_t {
    TensorVec &cpu = cpu_buffer_[i];
    reader_->ReadNext(&cpu);
@@ -71,6 +85,8 @@ void BufferedReader::ReadAsync(size_t i) {
    // NOTE(liangdun): using async copy instead of TensorCopySync
    // TensorCopySync would block other stream
    if (platform::is_gpu_place(place_)) {
+      platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
+      PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0));
      TensorVec &gpu = gpu_buffer_[i];
      gpu.resize(cpu.size());
      for (size_t i = 0; i < cpu.size(); ++i) {

--- a/paddle/fluid/operators/reader/buffered_reader.h
+++ b/paddle/fluid/operators/reader/buffered_reader.h
@@ -64,6 +64,8 @@ class BufferedReader : public framework::DecoratedReader {
  size_t prev_pos_{-1UL};
 #ifdef PADDLE_WITH_CUDA
  cudaStream_t stream;
+  cudaStream_t compute_stream;
+  std::vector<cudaEvent_t> events;
 #endif
 };