From 9ccb622898202fc32bd24c96c119b4e828459960 Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Fri, 3 Dec 2021 17:36:35 +0800
Subject: [PATCH] [new-exec] use stream safe allocator in memcpy_h2d (#37777)

* use sync h2d copy

* use stream safe allocator in memcpy_h2d

* remove wait

* add guard
---
 .../framework/new_executor/interpretercore.cc |  1 +
 paddle/fluid/framework/tensor.cc              | 31 +++++++++++++++++++
 paddle/fluid/framework/tensor.h               |  5 +++
 paddle/fluid/operators/memcpy_h2d_op.h        |  7 ++++-
 4 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index f954b297510..9f6e0557815 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -439,6 +439,7 @@ void InterpreterCore::ExecuteInstructionList(
 
   if (UNLIKELY(exception_holder_.IsCaught())) {
     VLOG(4) << "Exception caught " << exception_holder_.Type();
+    async_work_queue_->Cancel();
     exception_holder_.ReThrow();
   }
 
diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
index 8d927b87c9a..063ede6ffbf 100644
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/tensor.h"
 
+DECLARE_bool(use_stream_safe_cuda_allocator);
+
 namespace paddle {
 namespace memory {
 namespace allocation {
@@ -89,6 +91,35 @@ void* Tensor::mutable_data(const platform::Place& place,
   return mutable_data(place, type_, requested_size);
 }
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+void* Tensor::mutable_data(const platform::CUDAPlace& place,
+                           proto::VarType::Type type,
+                           const gpuStream_t& stream) {
+  if (!FLAGS_use_stream_safe_cuda_allocator) {
+    return mutable_data(place, type);
+  }
+
+  type_ = type;
+  PADDLE_ENFORCE_GE(
+      numel(), 0,
+      platform::errors::PreconditionNotMet(
+          "The Tensor's element number must be equal or greater than zero. "
+          "The Tensor's shape is [",
+          dims(), "] now"));
+  size_t size = numel() * SizeOfType(type);
+
+  /* some versions of boost::variant don't have operator!= */
+  if (holder_ == nullptr || !(holder_->place() == place) ||
+      holder_->size() < size + offset_) {
+    holder_.reset();
+    holder_ = memory::AllocShared(place, size, stream);
+    offset_ = 0;
+  }
+  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
+                                 offset_);
+}
+#endif
+
 Tensor& Tensor::ShareDataWith(const Tensor& src) {
   src.check_memory_size();
   *this = src;
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 2efaa3f37f9..494a02878f1 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -149,6 +149,11 @@ class Tensor {
   void* mutable_data(const platform::Place& place,
                      size_t requested_size = 0);
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  void* mutable_data(const platform::CUDAPlace& place,
+                     proto::VarType::Type type, const gpuStream_t& stream);
+#endif
+
   /**
    * @brief Return a pointer to mutable memory block.
    *
diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h
index 3998db6731b..43ac5984bc8 100644
--- a/paddle/fluid/operators/memcpy_h2d_op.h
+++ b/paddle/fluid/operators/memcpy_h2d_op.h
@@ -41,7 +41,12 @@ class MemcpyH2DFunctor {
   void operator()(const framework::LoDTensor &lod_tensor) const {
     auto &out_tensor = *out_->GetMutable<framework::LoDTensor>();
-
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    out_tensor.mutable_data(
+        BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_.GetPlace()),
+        lod_tensor.type(),
+        static_cast<const platform::CUDADeviceContext *>(&dev_ctx_)->stream());
+#endif
     if (dst_place_type_ == 0 || dst_place_type_ == 1) {
       framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_,
                             &out_tensor);
-- 
GitLab
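For readers applying the same pattern in another kernel, below is a minimal usage sketch of the new overload. It is illustrative only and not part of the patch: CopyHostToDevice is a hypothetical helper, the includes assume a CUDA build of the fluid tree at this commit, and the flag fallback is the one implemented in tensor.cc above.

// Hypothetical usage sketch (not part of the patch). Binds the destination
// allocation to the copy stream via the new overload, then issues the copy.
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"

namespace example {

void CopyHostToDevice(const paddle::framework::Tensor &src,
                      paddle::framework::Tensor *dst,
                      const paddle::platform::CUDADeviceContext &dev_ctx) {
  dst->Resize(src.dims());
  // With FLAGS_use_stream_safe_cuda_allocator=true, the allocation records
  // the stream, so it is not recycled while work queued on that stream may
  // still touch it; with the flag off, this degenerates to the ordinary
  // mutable_data(place, type) path.
  dst->mutable_data(
      BOOST_GET_CONST(paddle::platform::CUDAPlace, dev_ctx.GetPlace()),
      src.type(), dev_ctx.stream());
  // The H2D copy can now run asynchronously on dev_ctx's stream without a
  // trailing device-wide wait (the "remove wait" item in the commit message).
  paddle::framework::TensorCopy(src, dev_ctx.GetPlace(), dev_ctx, dst);
}

}  // namespace example

A subsequent TensorCopy reuses the holder allocated here (the place matches and the size is sufficient), which is why the memcpy_h2d kernel can pre-allocate through the stream-aware overload and then delegate the actual transfer to TensorCopy unchanged.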