未验证 提交 cc343a41 编写于 作者: z8hanghuan's avatar z8hanghuan 提交者: GitHub

add xpu buffer_reader, *test=kunlun (#42578)

* add xpu buffer_reader, *test=kunlun

* xpu buffer_reader, use XPUDeviceGuard, *test=kunlun

* modify xpu.cmake, *test=kunlun

* modify xpu.cmake, *test=kunlun

* modify xpu.cmake, *test=kunlun

* add xpu buffer_reader, *test=kunlun

* add xpu buffer reader, *test=kunlun

* add xpu buffer reader, *test=kunlun
上级 e906eb5b
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -14,6 +14,7 @@
#include "paddle/fluid/operators/reader/buffered_reader.h"
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
......@@ -85,10 +86,27 @@ BufferedReader::BufferedReader(
stream_ = platform::MluStreamResourcePool::Instance().New(dev_idx);
}
#endif
#ifdef PADDLE_WITH_XPU
  if (platform::is_xpu_place(place_)) {
    int dev_idx = place_.device;
    // Borrow the compute stream from the global XPUDeviceContext so the
    // reader can order its copies against kernels already enqueued there.
    // NOTE(review): C-style cast — prefer static_cast<platform::XPUDeviceContext *>.
    compute_stream_ =
        ((platform::XPUDeviceContext *)(platform::DeviceContextPool::Instance()
                                            .Get(place_)))
            ->stream();
    // One event per buffer slot: each slot records on compute_stream_ and is
    // waited on by the copy stream (see ReadAsync).
    events_.resize(buffer_size);
    for (auto &event : events_) {
      event = platform::XpuEventResourcePool::Instance().New(dev_idx);
    }
    // Dedicated copy stream for this reader, leased from the per-device pool.
    stream_ = platform::XpuStreamResourcePool::Instance().New(dev_idx);
  }
#endif
cpu_buffer_.resize(buffer_size);
cuda_buffer_.resize(buffer_size);
npu_buffer_.resize(buffer_size);
mlu_buffer_.resize(buffer_size);
xpu_buffer_.resize(buffer_size);
ReadTillBufferFullAsync();
}
......@@ -322,6 +340,57 @@ void BufferedReader::ReadAsync(size_t i) {
platform::MLUStreamSync(stream_.get());
}
#endif
#ifdef PADDLE_WITH_XPU
    // XPU path: stage the CPU tensors of buffer slot `i` into xpu_buffer_[i].
    if (platform::is_xpu_place(place_)) {
      TensorVec &xpu = xpu_buffer_[i];
      if (xpu.empty()) {
        // First use of this slot: allocate one device tensor per CPU tensor.
        xpu.resize(cpu.size());
      } else {
        PADDLE_ENFORCE_EQ(
            xpu.size(), cpu.size(),
            platform::errors::InvalidArgument(
                "Input tensor number on XPU and CPU devices are not matched. "
                "The number on XPU is %d, on CPU is %d",
                xpu.size(), cpu.size()));
      }
      // Pre-allocate device memory for every tensor before any copy starts.
      std::vector<void *> xpu_ptrs;
      xpu_ptrs.reserve(cpu.size());
      // NOTE(review): this loop's `i` shadows the outer buffer-slot index `i`;
      // after the loop, `i` refers to the slot again (events_[i] below).
      for (size_t i = 0; i < cpu.size(); ++i) {
        xpu[i].Resize(cpu[i].dims());
        xpu[i].set_layout(cpu[i].layout());
        xpu_ptrs.emplace_back(xpu[i].mutable_data(place_, cpu[i].type()));
      }
      // NOTE(review): `gurad` is a typo for `guard` (local variable only).
      platform::XPUDeviceGuard gurad(place_.device);
      // Record an event on the compute stream and make the copy stream wait on
      // it, so copies into slot `i` start only after prior compute finished.
      int r = xpu_event_record(events_[i].get(), compute_stream_);
      PADDLE_ENFORCE_XDNN_SUCCESS(r, "xpu_event_record");
      r = xpu_stream_wait_event(stream_.get(), events_[i].get());
      PADDLE_ENFORCE_XDNN_SUCCESS(r, "xpu_stream_wait_event");

      platform::RecordEvent record_event("BufferedReader:MemoryCopy",
                                         platform::TracerEventType::UserDefined,
                                         1);
      // Again shadows the slot index `i` with the per-tensor index.
      for (size_t i = 0; i < cpu.size(); ++i) {
        auto cpu_place = cpu[i].place();
        auto cpu_ptr = cpu[i].data();
        auto xpu_ptr = xpu_ptrs[i];
        auto size =
            cpu[i].numel() * paddle::framework::DataTypeSize(cpu[i].dtype());
        // TODO(zhanghuan) for now hardware not support xpu_memcpy_async, maybe
        // KL3
        if ((platform::is_xpu_place(cpu_place))) {
          // Device-to-device copy: sync the copy stream right after, since the
          // copy is effectively synchronous on current hardware (see TODO).
          memory::Copy(place_, xpu_ptr, cpu_place, cpu_ptr, size);
          platform::XPUStreamSync(stream_.get());
        } else {
          memory::Copy(place_, xpu_ptr, cpu_place, cpu_ptr, size);
        }
        xpu[i].set_lod(cpu[i].lod());
      }
      // Make sure all copies for this slot are visible before the slot is
      // handed to the consumer.
      platform::XPUStreamSync(stream_.get());
    }
#endif
return i;
}));
}
......@@ -359,6 +428,8 @@ void BufferedReader::ReadNextImpl(std::vector<framework::LoDTensor> *out) {
*out = std::move(npu_buffer_[i]);
} else if (platform::is_mlu_place(place_)) {
*out = std::move(mlu_buffer_[i]);
} else if (platform::is_xpu_place(place_)) {
*out = std::move(xpu_buffer_[i]);
} else {
*out = std::move(cpu_buffer_[i]);
}
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -33,6 +33,10 @@
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/fluid/platform/device/xpu/xpu_resource_pool.h"
#endif
namespace paddle {
namespace operators {
......@@ -76,6 +80,7 @@ class BufferedReader : public framework::DecoratedReader {
std::vector<TensorVec> cuda_buffer_;
std::vector<TensorVec> npu_buffer_;
std::vector<TensorVec> mlu_buffer_;
std::vector<TensorVec> xpu_buffer_;
size_t prev_pos_{-1UL};
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
gpuStream_t compute_stream_;
......@@ -94,6 +99,12 @@ class BufferedReader : public framework::DecoratedReader {
std::shared_ptr<platform::MluStreamObject> stream_;
std::vector<std::shared_ptr<platform::MluEventObject>> events_;
#endif
#ifdef PADDLE_WITH_XPU
  // Compute stream borrowed from the device context (not owned; do not free).
  xpuStream compute_stream_;
  // Copy stream leased from XpuStreamResourcePool; shared_ptr returns it.
  std::shared_ptr<platform::XpuStreamObject> stream_;
  // One event per buffer slot, used to order copies after compute work.
  std::vector<std::shared_ptr<platform::XpuEventObject>> events_;
#endif
};
} // namespace reader
......
......@@ -125,7 +125,7 @@ cc_library(device_context SRCS device_context.cc DEPS simple_threadpool malloc x
place phi_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} eigen3 cpu_context generator)
if(WITH_XPU)
target_link_libraries(device_context xpu_context)
target_link_libraries(device_context xpu_context xpu_resource_pool)
endif()
cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce)
......
......@@ -7,5 +7,6 @@ set(XPU_CTX_DEPS xpulib ssl crypto rt z resolv dl)
cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place phi_xpu_info)
cc_library(xpu_op_list SRCS xpu_op_list.cc DEPS gflags glog enforce xpulib device_context op_kernel_type)
cc_library(xpu_resource_pool SRCS xpu_resource_pool.cc DEPS xpu_info)
add_subdirectory(tests)
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
......@@ -79,6 +79,10 @@ void MemcpySyncD2D(void* dst, const platform::XPUPlace& dst_place,
*dev_ctx);
}
//! Blocks the calling thread until every operation queued on `stream` has
//! completed, raising on any XPU runtime failure.
void XPUStreamSync(xpuStream stream) {
  int ret = xpu_wait(stream);
  PADDLE_ENFORCE_XDNN_SUCCESS(ret, "xpu_wait");
}
/**************************** Others **************************/
phi::backends::xpu::XPUVersion get_xpu_version(int dev_id) {
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
......@@ -14,8 +14,13 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/backends/xpu/xpu_info.h"
#include "xpu/runtime.h"
namespace paddle {
using xpuStream = XPUStream;
using xpuEventHandle = XPUEvent;
namespace platform {
/***** Version Management *****/
......@@ -51,6 +56,9 @@ void MemcpySyncD2D(void *dst, const platform::XPUPlace &dst_place,
const void *src, const platform::XPUPlace &src_place,
size_t count);
//! Blocks until stream has completed all operations.
void XPUStreamSync(xpuStream stream);
using XPUDeviceGuard = phi::backends::xpu::XPUDeviceGuard;
phi::backends::xpu::XPUVersion get_xpu_version(int dev_id);
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(PADDLE_WITH_XPU)
#include "paddle/fluid/platform/device/xpu/xpu_resource_pool.h"
namespace paddle {
namespace platform {
// Builds one lazily-filled stream pool per visible XPU device.  Streams are
// created on first lease and destroyed when the pool itself is destroyed.
XpuStreamResourcePool::XpuStreamResourcePool() {
  int dev_cnt = platform::GetXPUDeviceCount();
  pool_.reserve(dev_cnt);
  for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) {
    auto creator = [dev_idx] {
      platform::XPUDeviceGuard guard(dev_idx);
      xpuStream stream;
      // Check the runtime's return code: silently ignoring a failed
      // xpu_stream_create would hand out an uninitialized stream handle.
      int ret = xpu_stream_create(&stream);
      PADDLE_ENFORCE_EQ(ret, 0,
                        platform::errors::External(
                            "xpu_stream_create failed on device %d, return %d.",
                            dev_idx, ret));
      return stream;
    };
    auto deleter = [dev_idx](xpuStream stream) {
      platform::XPUDeviceGuard guard(dev_idx);
      // Intentionally not enforced: deleters may run during static teardown,
      // where throwing would terminate the process.
      xpu_stream_destroy(stream);
    };
    pool_.emplace_back(ResourcePool<XpuStreamObject>::Create(creator, deleter));
  }
}
// Process-wide singleton accessor (thread-safe magic-static initialization).
XpuStreamResourcePool& XpuStreamResourcePool::Instance() {
  static XpuStreamResourcePool instance;
  return instance;
}
// Leases a stream for device `dev_idx`; the returned shared_ptr gives the
// stream back to the per-device pool when released.
std::shared_ptr<XpuStreamObject> XpuStreamResourcePool::New(int dev_idx) {
  // Bounds-check the device index before indexing into pool_.
  PADDLE_ENFORCE_GE(
      dev_idx, 0,
      platform::errors::InvalidArgument(
          "The dev_idx should be not less than 0, but got %d.", dev_idx));
  PADDLE_ENFORCE_LT(
      dev_idx, pool_.size(),
      platform::errors::OutOfRange(
          "The dev_idx should be less than device count %d, but got %d.",
          pool_.size(), dev_idx));
  auto& dev_pool = pool_[dev_idx];
  return dev_pool->New();
}
// Builds one lazily-filled event pool per visible XPU device, mirroring
// XpuStreamResourcePool.
XpuEventResourcePool::XpuEventResourcePool() {
  int dev_cnt = platform::GetXPUDeviceCount();
  pool_.reserve(dev_cnt);
  for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) {
    auto creator = [dev_idx] {
      platform::XPUDeviceGuard guard(dev_idx);
      xpuEventHandle event;
      // Check the runtime's return code: an ignored xpu_event_create failure
      // would hand out an uninitialized event handle.
      int ret = xpu_event_create(&event);
      PADDLE_ENFORCE_EQ(ret, 0,
                        platform::errors::External(
                            "xpu_event_create failed on device %d, return %d.",
                            dev_idx, ret));
      return event;
    };
    auto deleter = [dev_idx](xpuEventHandle event) {
      platform::XPUDeviceGuard guard(dev_idx);
      // Intentionally not enforced: deleters may run during static teardown,
      // where throwing would terminate the process.
      xpu_event_destroy(event);
    };
    pool_.emplace_back(ResourcePool<XpuEventObject>::Create(creator, deleter));
  }
}
// Process-wide singleton accessor (thread-safe magic-static initialization).
XpuEventResourcePool& XpuEventResourcePool::Instance() {
  static XpuEventResourcePool instance;
  return instance;
}
// Leases an event for device `dev_idx`; the returned shared_ptr gives the
// event back to the per-device pool when released.
std::shared_ptr<XpuEventObject> XpuEventResourcePool::New(int dev_idx) {
  // Bounds-check the device index before indexing into pool_.
  PADDLE_ENFORCE_GE(
      dev_idx, 0,
      platform::errors::InvalidArgument(
          "The dev_idx should be not less than 0, but got %d.", dev_idx));
  PADDLE_ENFORCE_LT(
      dev_idx, pool_.size(),
      platform::errors::OutOfRange(
          "The dev_idx should be less than device count %d, but got %d.",
          pool_.size(), dev_idx));
  auto& dev_pool = pool_[dev_idx];
  return dev_pool->New();
}
} // namespace platform
} // namespace paddle
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#if defined(PADDLE_WITH_XPU)
#include <memory>
#include <type_traits>
#include <vector>
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/fluid/platform/resource_pool.h"
namespace paddle {
namespace platform {
using XpuStreamObject = std::remove_pointer<xpuStream>::type;
using XpuEventObject = std::remove_pointer<xpuEventHandle>::type;
// Per-device cache of raw XPU streams.  Streams are created lazily on first
// lease and recycled through ResourcePool, so callers share a bounded set of
// streams per device instead of creating and destroying one per use.
class XpuStreamResourcePool {
 public:
  // Leases a stream for device `dev_idx` (0 <= dev_idx < device count).
  // The returned shared_ptr hands the stream back to the pool on release.
  std::shared_ptr<XpuStreamObject> New(int dev_idx);

  // Process-wide singleton accessor.
  static XpuStreamResourcePool &Instance();

 private:
  XpuStreamResourcePool();
  DISABLE_COPY_AND_ASSIGN(XpuStreamResourcePool);

 private:
  // One ResourcePool per visible XPU device, indexed by device id.
  std::vector<std::shared_ptr<ResourcePool<XpuStreamObject>>> pool_;
};
// Per-device cache of raw XPU events, mirroring XpuStreamResourcePool:
// events are created lazily and recycled through ResourcePool.
class XpuEventResourcePool {
 public:
  // Leases an event for device `dev_idx` (0 <= dev_idx < device count).
  // The returned shared_ptr hands the event back to the pool on release.
  std::shared_ptr<XpuEventObject> New(int dev_idx);

  // Process-wide singleton accessor.
  static XpuEventResourcePool &Instance();

 private:
  XpuEventResourcePool();
  DISABLE_COPY_AND_ASSIGN(XpuEventResourcePool);

 private:
  // One ResourcePool per visible XPU device, indexed by device id.
  std::vector<std::shared_ptr<ResourcePool<XpuEventObject>>> pool_;
};
} // namespace platform
} // namespace paddle
#endif
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
......@@ -188,6 +188,7 @@ class XPUDeviceContext : public phi::XPUContext {
explicit XPUDeviceContext(XPUPlace place);
virtual ~XPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
  // Exposes the raw XPU stream of the underlying phi::XPUContext (non-owning;
  // BufferedReader uses it as the compute stream to synchronize copies against).
  xpuStream stream() const { return XPUContext::x_context()->xpu_stream; }
};
template <>
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册