diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index f954b2975100713cdd1e25359165e0b3ddfd0306..9f6e0557815062a42ff61a393a603b42abb80f8c 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -439,6 +439,7 @@ void InterpreterCore::ExecuteInstructionList(
 
   if (UNLIKELY(exception_holder_.IsCaught())) {
     VLOG(4) << "Exception caught " << exception_holder_.Type();
+    async_work_queue_->Cancel();
     exception_holder_.ReThrow();
   }
 
diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
index 8d927b87c9abeefa0ac51f348fc65f00b2e934cf..063ede6ffbf3197ec8fad51d25a5ac56ad8a00ad 100644
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/tensor.h"
 
+DECLARE_bool(use_stream_safe_cuda_allocator);
+
 namespace paddle {
 namespace memory {
 namespace allocation {
@@ -89,6 +91,35 @@ void* Tensor::mutable_data(const platform::Place& place,
   return mutable_data(place, type_, requested_size);
 }
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+void* Tensor::mutable_data(const platform::CUDAPlace& place,
+                           proto::VarType::Type type,
+                           const gpuStream_t& stream) {
+  if (!FLAGS_use_stream_safe_cuda_allocator) {
+    return mutable_data(place, type);
+  }
+
+  type_ = type;
+  PADDLE_ENFORCE_GE(
+      numel(), 0,
+      platform::errors::PreconditionNotMet(
+          "The Tensor's element number must be equal or greater than zero. "
+          "The Tensor's shape is [",
+          dims(), "] now"));
+  size_t size = numel() * SizeOfType(type);
+
+  /* some versions of boost::variant don't have operator!= */
+  if (holder_ == nullptr || !(holder_->place() == place) ||
+      holder_->size() < size + offset_) {
+    holder_.reset();
+    holder_ = memory::AllocShared(place, size, stream);
+    offset_ = 0;
+  }
+  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
+                                 offset_);
+}
+#endif
+
 Tensor& Tensor::ShareDataWith(const Tensor& src) {
   src.check_memory_size();
   *this = src;
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 2efaa3f37f9e9ac00d04260c4f15e6cabd293fb0..494a02878f1a2c1fc94a50777d3b4b8676b99e8e 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -149,6 +149,11 @@ class Tensor {
 
   void* mutable_data(const platform::Place& place, size_t requested_size = 0);
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  void* mutable_data(const platform::CUDAPlace& place,
+                     proto::VarType::Type type, const gpuStream_t& stream);
+#endif
+
   /**
    * @brief Return a pointer to mutable memory block.
    *
diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h
index 3998db6731b3d2cccfb474d44319c184e6c60bf2..43ac5984bc8c844f146282a8975e468764ed5129 100644
--- a/paddle/fluid/operators/memcpy_h2d_op.h
+++ b/paddle/fluid/operators/memcpy_h2d_op.h
@@ -41,7 +41,12 @@ class MemcpyH2DFunctor {
   void operator()(const framework::LoDTensor &lod_tensor) const {
     auto &out_tensor = *out_->GetMutable<framework::LoDTensor>();
-
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    out_tensor.mutable_data(
+        BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_.GetPlace()),
+        lod_tensor.type(),
+        static_cast<const platform::CUDADeviceContext *>(&dev_ctx_)->stream());
+#endif
     if (dst_place_type_ == 0 || dst_place_type_ == 1) {
       framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_,
                             &out_tensor);
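
Usage note: a minimal sketch of calling the new stream-aware mutable_data
overload introduced in tensor.h/tensor.cc above. The device index, shape,
FP32 dtype, and variable names here are illustrative assumptions, not part of
this patch; when FLAGS_use_stream_safe_cuda_allocator is off, the overload
simply falls back to mutable_data(place, type).

    // Hypothetical caller: allocate the tensor's memory against the compute
    // stream of the CUDA device context, so the stream-safe allocator can
    // track outstanding work on that stream before the block is reused.
    platform::CUDAPlace place(0);
    auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
        platform::DeviceContextPool::Instance().Get(place));

    framework::Tensor t;
    t.Resize(framework::make_ddim({8, 16}));
    void *ptr = t.mutable_data(place, framework::proto::VarType::FP32,
                               dev_ctx->stream());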