Unverified · Commit 9ccb6228, authored by Leo Chen, committed by GitHub

[new-exec] use stream safe allocator in memcpy_h2d (#37777)

* use sync h2d copy

* use stream safe allocator in memcpy_h2d

* remove wait

* add guard
Parent 797d898c
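In short, the destination of the H2D memcpy is now allocated through the stream-safe CUDA allocator and bound to the copy stream, which is what allows the explicit wait to be removed. A minimal sketch of the hazard this addresses, in generic CUDA (illustrative only, not Paddle code; the function name is invented):

#include <cuda_runtime.h>

// Without stream-aware allocation, a buffer referenced by an in-flight
// async copy must not be freed or reused until the stream drains.
void unsafe_without_tracking(float* dev, const float* host, size_t n,
                             cudaStream_t stream) {
  cudaMemcpyAsync(dev, host, n * sizeof(float),
                  cudaMemcpyHostToDevice, stream);
  // Freeing `dev` right here would race with the in-flight copy.
  // Classic fix: block the host until the stream finishes.
  cudaStreamSynchronize(stream);
  cudaFree(dev);
  // A stream-safe allocator instead records `stream` at allocation time
  // and defers reuse of the block until that stream's queued work is
  // done, so the host-side wait becomes unnecessary.
}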
@@ -439,6 +439,7 @@ void InterpreterCore::ExecuteInstructionList(
  if (UNLIKELY(exception_holder_.IsCaught())) {
    VLOG(4) << "Exception caught " << exception_holder_.Type();
    async_work_queue_->Cancel();
    exception_holder_.ReThrow();
  }
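The one line added in this hunk is presumably exception_holder_.ReThrow(): after the async work queue is cancelled, the exception captured on a worker thread is re-raised on the calling thread instead of being swallowed. A generic capture-and-rethrow sketch using std::exception_ptr (a sketch of the pattern, not Paddle's actual ExceptionHolder):

#include <exception>
#include <mutex>
#include <stdexcept>
#include <thread>

class ExceptionBox {
 public:
  // Called from worker threads; keeps only the first captured exception.
  void Catch(std::exception_ptr e) {
    std::lock_guard<std::mutex> guard(mu_);
    if (!eptr_) eptr_ = e;
  }
  bool IsCaught() const { return eptr_ != nullptr; }
  void ReThrow() {
    if (eptr_) std::rethrow_exception(eptr_);
  }

 private:
  std::mutex mu_;
  std::exception_ptr eptr_;
};

int main() {
  ExceptionBox holder;
  std::thread worker([&] {
    try {
      throw std::runtime_error("op failed");
    } catch (...) {
      holder.Catch(std::current_exception());
    }
  });
  worker.join();  // all workers are done before we inspect the holder
  if (holder.IsCaught()) holder.ReThrow();  // failure surfaces here
}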
@@ -14,6 +14,8 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"

DECLARE_bool(use_stream_safe_cuda_allocator);

namespace paddle {
namespace memory {
namespace allocation {
@@ -89,6 +91,35 @@ void* Tensor::mutable_data(const platform::Place& place,
  return mutable_data(place, type_, requested_size);
}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void* Tensor::mutable_data(const platform::CUDAPlace& place,
                           proto::VarType::Type type,
                           const gpuStream_t& stream) {
  if (!FLAGS_use_stream_safe_cuda_allocator) {
    return mutable_data(place, type);
  }

  type_ = type;
  PADDLE_ENFORCE_GE(
      numel(), 0,
      platform::errors::PreconditionNotMet(
          "The Tensor's element number must be equal or greater than zero. "
          "The Tensor's shape is [",
          dims(), "] now"));
  size_t size = numel() * SizeOfType(type);

  /* some versions of boost::variant don't have operator!= */
  if (holder_ == nullptr || !(holder_->place() == place) ||
      holder_->size() < size + offset_) {
    holder_.reset();
    holder_ = memory::AllocShared(place, size, stream);
    offset_ = 0;
  }

  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                 offset_);
}
#endif

Tensor& Tensor::ShareDataWith(const Tensor& src) {
  src.check_memory_size();
  *this = src;
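The overload above reuses the existing holder when it is on the right place and large enough; otherwise it re-allocates through memory::AllocShared(place, size, stream), binding the new block to the given stream. A hypothetical caller (the helper name and surrounding objects are assumptions for illustration, not from this diff; requires a PADDLE_WITH_CUDA build):

#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"

void* PrepareDstOnStream(paddle::framework::Tensor* dst,
                         const paddle::platform::CUDAPlace& place,
                         paddle::framework::proto::VarType::Type dtype,
                         const paddle::platform::CUDADeviceContext& ctx) {
  // With FLAGS_use_stream_safe_cuda_allocator enabled, the returned buffer
  // is bound to ctx.stream(); when disabled, the overload falls back to
  // the plain mutable_data(place, type) path.
  return dst->mutable_data(place, dtype, ctx.stream());
}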
@@ -149,6 +149,11 @@ class Tensor {
  void* mutable_data(const platform::Place& place, size_t requested_size = 0);

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  void* mutable_data(const platform::CUDAPlace& place,
                     proto::VarType::Type type, const gpuStream_t& stream);
#endif

  /**
   * @brief Return a pointer to mutable memory block.
   *
@@ -41,7 +41,12 @@ class MemcpyH2DFunctor {
  void operator()(const framework::LoDTensor &lod_tensor) const {
    auto &out_tensor = *out_->GetMutable<framework::LoDTensor>();

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    out_tensor.mutable_data(
        BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_.GetPlace()),
        lod_tensor.type(),
        static_cast<const platform::CUDADeviceContext *>(&dev_ctx_)->stream());
#endif

    if (dst_place_type_ == 0 || dst_place_type_ == 1) {
      framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_,
                            &out_tensor);
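The functor first materializes out_tensor's buffer on the device context's stream, and TensorCopy then enqueues the actual H2D copy on that same stream; because allocation and copy are ordered on one stream, no host-side wait is needed in between. A rough analogue in generic CUDA using the built-in stream-ordered allocator (cudaMallocAsync/cudaFreeAsync, CUDA 11.2+; illustrative, not the actual TensorCopy internals):

#include <cuda_runtime.h>

void h2d_no_wait(const float* host, size_t n, cudaStream_t stream) {
  float* dev = nullptr;
  // Step 1: allocation ordered on `stream`.
  cudaMallocAsync(reinterpret_cast<void**>(&dev), n * sizeof(float), stream);
  // Step 2: a copy enqueued on the same stream sees the allocation (FIFO).
  cudaMemcpyAsync(dev, host, n * sizeof(float),
                  cudaMemcpyHostToDevice, stream);
  // Step 3: kernels later enqueued on `stream` observe the copied data;
  // stream ordering substitutes for an explicit synchronize.
  cudaFreeAsync(dev, stream);  // block recycled only after prior work ends
}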