Unverified · Commit 9ccb6228, authored by Leo Chen, committed by GitHub

[new-exec] use stream safe allocator in memcpy_h2d (#37777)

* use sync h2d copy

* use stream safe allocator in memcpy_h2d

* remove wait

* add guard
Parent 797d898c
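In short, the destination of the H2D memcpy is now allocated through the stream-safe CUDA allocator and bound to the copy stream, which is what allows the explicit wait to be removed. A minimal sketch of the hazard this addresses, in generic CUDA (illustrative only, not Paddle code; the function name is invented):

#include <cuda_runtime.h>

// Without stream-aware allocation, a buffer referenced by an in-flight
// async copy must not be freed or reused until the stream drains.
void unsafe_without_tracking(float* dev, const float* host, size_t n,
                             cudaStream_t stream) {
  cudaMemcpyAsync(dev, host, n * sizeof(float),
                  cudaMemcpyHostToDevice, stream);
  // Freeing `dev` right here would race with the in-flight copy.
  // Classic fix: block the host until the stream finishes.
  cudaStreamSynchronize(stream);
  cudaFree(dev);
  // A stream-safe allocator instead records `stream` at allocation time
  // and defers reuse of the block until that stream's queued work is
  // done, so the host-side wait becomes unnecessary.
}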
@@ -439,6 +439,7 @@ void InterpreterCore::ExecuteInstructionList(
  if (UNLIKELY(exception_holder_.IsCaught())) {
    VLOG(4) << "Exception caught " << exception_holder_.Type();
    async_work_queue_->Cancel();
    exception_holder_.ReThrow();
  }
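The one line added in this hunk is presumably exception_holder_.ReThrow(): after the async work queue is cancelled, the exception captured on a worker thread is re-raised on the calling thread instead of being swallowed. A generic capture-and-rethrow sketch using std::exception_ptr (a sketch of the pattern, not Paddle's actual ExceptionHolder):

#include <exception>
#include <mutex>
#include <stdexcept>
#include <thread>

class ExceptionBox {
 public:
  // Called from worker threads; keeps only the first captured exception.
  void Catch(std::exception_ptr e) {
    std::lock_guard<std::mutex> guard(mu_);
    if (!eptr_) eptr_ = e;
  }
  bool IsCaught() const { return eptr_ != nullptr; }
  void ReThrow() {
    if (eptr_) std::rethrow_exception(eptr_);
  }

 private:
  std::mutex mu_;
  std::exception_ptr eptr_;
};

int main() {
  ExceptionBox holder;
  std::thread worker([&] {
    try {
      throw std::runtime_error("op failed");
    } catch (...) {
      holder.Catch(std::current_exception());
    }
  });
  worker.join();  // all workers are done before we inspect the holder
  if (holder.IsCaught()) holder.ReThrow();  // failure surfaces here
}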
@@ -14,6 +14,8 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"

DECLARE_bool(use_stream_safe_cuda_allocator);

namespace paddle {
namespace memory {
namespace allocation {
@@ -89,6 +91,35 @@ void* Tensor::mutable_data(const platform::Place& place,
  return mutable_data(place, type_, requested_size);
}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void* Tensor::mutable_data(const platform::CUDAPlace& place,
                           proto::VarType::Type type,
                           const gpuStream_t& stream) {
  if (!FLAGS_use_stream_safe_cuda_allocator) {
    return mutable_data(place, type);
  }

  type_ = type;
  PADDLE_ENFORCE_GE(
      numel(), 0,
      platform::errors::PreconditionNotMet(
          "The Tensor's element number must be equal or greater than zero. "
          "The Tensor's shape is [",
          dims(), "] now"));
  size_t size = numel() * SizeOfType(type);

  /* some versions of boost::variant don't have operator!= */
  if (holder_ == nullptr || !(holder_->place() == place) ||
      holder_->size() < size + offset_) {
    holder_.reset();
    holder_ = memory::AllocShared(place, size, stream);
    offset_ = 0;
  }

  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                 offset_);
}
#endif

Tensor& Tensor::ShareDataWith(const Tensor& src) {
  src.check_memory_size();
  *this = src;
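The overload above reuses the existing holder when it is on the right place and large enough; otherwise it re-allocates through memory::AllocShared(place, size, stream), binding the new block to the given stream. A hypothetical caller (the helper name and surrounding objects are assumptions for illustration, not from this diff; requires a PADDLE_WITH_CUDA build):

#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"

void* PrepareDstOnStream(paddle::framework::Tensor* dst,
                         const paddle::platform::CUDAPlace& place,
                         paddle::framework::proto::VarType::Type dtype,
                         const paddle::platform::CUDADeviceContext& ctx) {
  // With FLAGS_use_stream_safe_cuda_allocator enabled, the returned buffer
  // is bound to ctx.stream(); when disabled, the overload falls back to
  // the plain mutable_data(place, type) path.
  return dst->mutable_data(place, dtype, ctx.stream());
}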
@@ -149,6 +149,11 @@ class Tensor {
  void* mutable_data(const platform::Place& place, size_t requested_size = 0);

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  void* mutable_data(const platform::CUDAPlace& place,
                     proto::VarType::Type type, const gpuStream_t& stream);
#endif

  /**
   * @brief Return a pointer to mutable memory block.
   *
@@ -41,7 +41,12 @@ class MemcpyH2DFunctor {
  void operator()(const framework::LoDTensor &lod_tensor) const {
    auto &out_tensor = *out_->GetMutable<framework::LoDTensor>();

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    out_tensor.mutable_data(
        BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_.GetPlace()),
        lod_tensor.type(),
        static_cast<const platform::CUDADeviceContext *>(&dev_ctx_)->stream());
#endif

    if (dst_place_type_ == 0 || dst_place_type_ == 1) {
      framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_,
                            &out_tensor);
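The functor first materializes out_tensor's buffer on the device context's stream, and TensorCopy then enqueues the actual H2D copy on that same stream; because allocation and copy are ordered on one stream, no host-side wait is needed in between. A rough analogue in generic CUDA using the built-in stream-ordered allocator (cudaMallocAsync/cudaFreeAsync, CUDA 11.2+; illustrative, not the actual TensorCopy internals):

#include <cuda_runtime.h>

void h2d_no_wait(const float* host, size_t n, cudaStream_t stream) {
  float* dev = nullptr;
  // Step 1: allocation ordered on `stream`.
  cudaMallocAsync(reinterpret_cast<void**>(&dev), n * sizeof(float), stream);
  // Step 2: a copy enqueued on the same stream sees the allocation (FIFO).
  cudaMemcpyAsync(dev, host, n * sizeof(float),
                  cudaMemcpyHostToDevice, stream);
  // Step 3: kernels later enqueued on `stream` observe the copied data;
  // stream ordering substitutes for an explicit synchronize.
  cudaFreeAsync(dev, stream);  // block recycled only after prior work ends
}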