未验证 提交 c0c54ba3 编写于 作者: A Aurelius84 提交者: GitHub

Fix memcpyD2H sync behavior with other stream (#38647)

* Fix memcpyD2H sync behavior with other stream

* add wait

* add wait

* add wait
上级 7c020c71
@@ -69,16 +69,24 @@ class MemcpyD2HFunctor {
} }
private: private:
// Size threshold in bytes (64 * 1024): CopyLoDTensor calls dev_ctx_.Wait()
// after the copy when the source tensor's memory_size() is at or below it.
static constexpr size_t WAIT_THRESHOLD = 64 * 1024;
// Copies `src` into `dst` at the host place selected by dst_place_type_ and
// then mirrors the source LoD onto `dst`.
//
//   dst_place_type_ == 0  ->  platform::CPUPlace()
//   dst_place_type_ == 1  ->  platform::CUDAPinnedPlace()
//
// Any other value is reported through PADDLE_THROW as
// platform::errors::Unimplemented. Both supported paths issue the copy via
// framework::TensorCopy using dev_ctx_.
void CopyLoDTensor(const framework::LoDTensor &src,
                   framework::LoDTensor &dst) const {  // NOLINT
  if (dst_place_type_ == 0) {
    framework::TensorCopy(src, platform::CPUPlace(), dev_ctx_, &dst);
  } else if (dst_place_type_ == 1) {
    framework::TensorCopy(src, platform::CUDAPinnedPlace(), dev_ctx_, &dst);
  } else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "memcpy dst_place_type: %d is not supported yet.", dst_place_type_));
  }

  // NOTE(Aurelius84): host <-> device memory copies of a memory block of 64
  // KB or less are asynchronous, so copies at or below WAIT_THRESHOLD are
  // followed by an explicit wait on the device context. See
  // https://forums.developer.nvidia.com/t/host-device-memory-copies-up-to-64-kb-are-asynchronous/17907
  const bool is_small_copy = src.memory_size() <= WAIT_THRESHOLD;
  if (is_small_copy) {
    dev_ctx_.Wait();
  }

  // Carry the level-of-detail information over to the destination tensor.
  dst.set_lod(src.lod());
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册