fix reader default stream,test=develop (#17106)

08773b60 · Zeng Jinle · GitHub · aa5307ce · 08773b60 · 08773b60
2 changed file
--- a/paddle/fluid/memory/allocation/retry_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc
@@ -39,7 +39,7 @@ TEST(RetryAllocator, RetryAllocator) {
  std::unique_ptr<LockedAllocator> locked_allocator(
      new LockedAllocator(std::move(best_fit_allocator)));
-  size_t thread_num = 32;
+  size_t thread_num = 8;
  size_t sleep_time = 40;
  size_t extra_time = 10;

--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@@ -74,12 +74,6 @@ void BufferedReader::ReadTillBufferFullAsync() {
 }
 void BufferedReader::ReadAsync(size_t i) {
-#ifdef PADDLE_WITH_CUDA
-  if (platform::is_gpu_place(place_)) {
-    platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
-    PADDLE_ENFORCE(cudaEventRecord(events_[i], compute_stream_));
-  }
-#endif
  position_.emplace(thread_pool_.enqueue([this, i]() -> size_t {
    TensorVec &cpu = cpu_buffer_[i];
    reader_->ReadNext(&cpu);
@@ -94,17 +88,34 @@ void BufferedReader::ReadAsync(size_t i) {
    // issues the copying command to the default stream, it will make two
    // commands from different streams cannot run concurrently.
    if (platform::is_gpu_place(place_)) {
-      platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
-      PADDLE_ENFORCE(cudaStreamWaitEvent(stream_, events_[i], 0));
      TensorVec &gpu = gpu_buffer_[i];
+      if (gpu.empty()) {
        gpu.resize(cpu.size());
-      platform::RecordEvent record_event("BufferedReader:MemoryCopy");
+      } else {
+        PADDLE_ENFORCE_EQ(gpu.size(), cpu.size(),
+                          "Input tensor number not matched");
+      }
+      std::vector<void *> gpu_ptrs;
+      gpu_ptrs.reserve(cpu.size());
      for (size_t i = 0; i < cpu.size(); ++i) {
        gpu[i].Resize(cpu[i].dims());
        gpu[i].set_layout(cpu[i].layout());
+        gpu_ptrs.emplace_back(gpu[i].mutable_data(place_, cpu[i].type()));
+      }
+      // NOTE(zjl): cudaStreamWaitEvent() must be called after all
+      // gpu[i].mutable_data() is called, since some ops release
+      // gpu memory immediately without waiting gpu kernel ends
+      platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
+      PADDLE_ENFORCE(cudaEventRecord(events_[i], compute_stream_));
+      PADDLE_ENFORCE(cudaStreamWaitEvent(stream_, events_[i], 0));
+      platform::RecordEvent record_event("BufferedReader:MemoryCopy");
+      for (size_t i = 0; i < cpu.size(); ++i) {
        auto cpu_place = cpu[i].place();
        auto cpu_ptr = cpu[i].data<void>();
-        auto gpu_ptr = gpu[i].mutable_data(place_, cpu[i].type());
+        auto gpu_ptr = gpu_ptrs[i];
        auto size =
            cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type());
        if (platform::is_cuda_pinned_place(cpu_place)) {
@@ -116,12 +127,9 @@ void BufferedReader::ReadAsync(size_t i) {
                       boost::get<platform::CUDAPlace>(cpu_place), cpu_ptr,
                       size, stream_);
        } else {
-          // if cpu place is not pinned, async copy is slower than sync copy,
-          // so we use sync copy instead.
-          // TODO(zcd): The default stream should not be used here.
          memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
                       boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size,
-                       0);
+                       stream_);
        }
        gpu[i].set_lod(cpu[i].lod());
      }