提交 9f11da59 编写于 作者: F fengjiayi

Add synchronous TensorCopy and use it in double buffer

上级 3863c6a9
......@@ -20,7 +20,8 @@ namespace paddle {
namespace framework {
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
const platform::DeviceContext& ctx, Tensor* dst) {
const platform::DeviceContext& ctx, Tensor* dst,
bool sync = false) {
VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
<< dst_place;
src.check_memory_size();
......@@ -47,9 +48,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
memory::Copy(
dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
auto stream =
sync ? nullptr
: reinterpret_cast<const platform::CUDADeviceContext&>(ctx)
.stream();
memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else if (platform::is_cpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
......@@ -58,18 +61,22 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
memory::Copy(
dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
auto stream =
sync ? nullptr
: reinterpret_cast<const platform::CUDADeviceContext&>(ctx)
.stream();
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
} else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
memory::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
auto stream =
sync ? nullptr
: reinterpret_cast<const platform::CUDADeviceContext&>(ctx)
.stream();
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
}
#endif
}
......
......@@ -30,29 +30,46 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
template <>
void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, cudaStream_t stream) {
const void* src, size_t num, cudaStream_t stream = nullptr) {
platform::SetDeviceId(src_place.device);
if (stream) {
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
} else {
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
}
}
template <>
void Copy<platform::CUDAPlace, platform::CPUPlace>(
platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place,
const void* src, size_t num, cudaStream_t stream) {
const void* src, size_t num, cudaStream_t stream = nullptr) {
platform::SetDeviceId(dst_place.device);
if (stream) {
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
} else {
platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
}
}
template <>
void Copy<platform::CUDAPlace, platform::CUDAPlace>(
platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, cudaStream_t stream) {
const void* src, size_t num, cudaStream_t stream = nullptr) {
if (dst_place == src_place) {
platform::SetDeviceId(src_place.device);
if (stream) {
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
} else {
platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num,
stream);
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice);
}
} else {
if (stream) {
platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device,
num, stream);
} else {
platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device,
num, stream);
}
}
}
......@@ -81,18 +98,26 @@ template <>
void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
platform::CUDAPinnedPlace dst_place, void* dst,
platform::CUDAPlace src_place, const void* src, size_t num,
cudaStream_t stream) {
cudaStream_t stream = nullptr) {
platform::SetDeviceId(src_place.device);
if (stream) {
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
} else {
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
}
}
template <>
void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
platform::CUDAPlace dst_place, void* dst,
platform::CUDAPinnedPlace src_place, const void* src, size_t num,
cudaStream_t stream) {
cudaStream_t stream = nullptr) {
platform::SetDeviceId(dst_place.device);
if (stream) {
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
} else {
platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
}
}
#endif
......
......@@ -180,7 +180,8 @@ void DoubleBufferReader::PrefetchThreadFunc() {
auto* gpu_ctx = ctxs_[cached_tensor_id].get();
gpu_batch.resize(cpu_batch.size());
for (size_t i = 0; i < cpu_batch.size(); ++i) {
framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i]);
framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i],
true);
gpu_batch[i].set_lod(cpu_batch[i].lod());
}
}
......
......@@ -127,11 +127,24 @@ void GpuMemcpyAsync(void *dst, const void *src, size_t count,
"cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync");
}
void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
size_t count, cudaStream_t stream) {
void GpuMemcpySync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind) {
PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind),
"cudaMemcpy failed in paddle::platform::GpuMemcpySync");
}
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, cudaStream_t stream) {
PADDLE_ENFORCE(
cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream),
"cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeer");
"cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeerAsync");
}
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count) {
PADDLE_ENFORCE(
cudaMemcpyPeer(dst, dst_device, src, src_device, count),
"cudaMemcpyPeer failed in paddle::platform::GpuMemcpyPeerSync");
}
void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
......
......@@ -57,9 +57,17 @@ size_t GpuMaxChunkSize();
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind, cudaStream_t stream);
//! Copy memory from one device to another device.
void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
size_t count, cudaStream_t stream);
//! Copy memory from address src to dst synchronously.
void GpuMemcpySync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind);
//! Copy memory from one device to another device asynchronously.
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, cudaStream_t stream);
//! Copy memory from one device to another device synchronously.
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count);
//! Set memory dst with value count size asynchronously
void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册