From 856ef62721d1f308ab8d9d59a2357e5bd96dbf9d Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Wed, 8 Apr 2020 23:22:34 +0800
Subject: [PATCH] feat(mgb/core): support copy DeviceTensorND from cpu to cuda

GitOrigin-RevId: d56f4ebf1fadccb5f9d6af2497d27744084d3930
---
 src/core/impl/comp_node/cpu/comp_node.cpp  |  3 +-
 src/core/impl/comp_node/cuda/comp_node.cpp | 22 +++++
 src/core/impl/tensor.cpp                   | 56 ++++++++++++++++++----
 src/core/test/graph/misc.cpp               | 33 +++++++------
 src/core/test/tensor.cpp                   | 36 ++++++++++++++
 5 files changed, 125 insertions(+), 25 deletions(-)

diff --git a/src/core/impl/comp_node/cpu/comp_node.cpp b/src/core/impl/comp_node/cpu/comp_node.cpp
index 9e0728bed..b317d1904 100644
--- a/src/core/impl/comp_node/cpu/comp_node.cpp
+++ b/src/core/impl/comp_node/cpu/comp_node.cpp
@@ -836,9 +836,10 @@ void CpuCompNode::CpuDispatchableBase::EventImpl::do_device_wait_by(
     {
         auto type = cn_impl->env().property().type;
         mgb_throw_if(type != CompNode::DeviceType::CPU
+                && type != CompNode::DeviceType::CUDA
                 , MegBrainError,
-                "currently CPU can only wait for CPU"
+                "currently CPU can only wait for CPU, CUDA"
         );
     }

diff --git a/src/core/impl/comp_node/cuda/comp_node.cpp b/src/core/impl/comp_node/cuda/comp_node.cpp
index d4198b398..ae296d98b 100644
--- a/src/core/impl/comp_node/cuda/comp_node.cpp
+++ b/src/core/impl/comp_node/cuda/comp_node.cpp
@@ -40,6 +40,16 @@ namespace {
             return std::max(300 * 1024 * 1024, available / 20);
         }
     }
+    using CudaHostFunc = megdnn::thin_function<void()>;
+    void CUDART_CB cuda_host_func_caller(void* ud) {
+        mgb_assert(ud);
+        CudaHostFunc* func_ptr = reinterpret_cast<CudaHostFunc*>(ud);
+        MGB_TRY {
+            (*func_ptr)();
+        } MGB_FINALLY(
+            delete func_ptr;
+        );
+    }
 } // anonymous namespace

 namespace mgb {
@@ -223,6 +233,18 @@ class CudaCompNode::CompNodeImpl final: public CompNode::Impl {
         Locator locator_logical() override {
             return m_locator_logical;
         }
+
+        void add_callback(CudaHostFunc&& cb) override {
+            activate();
+            CudaHostFunc* func_ptr = new CudaHostFunc(std::move(cb));
+            MGB_TRY {
+                MGB_CUDA_CHECK(cudaLaunchHostFunc(m_env.cuda_env().stream,
+                        cuda_host_func_caller, static_cast<void*>(func_ptr)));
+            } MGB_CATCH(..., {
+                delete func_ptr;
+                throw;
+            });
+        }
 };
 MGB_DYN_TYPE_OBJ_FINAL_IMPL(CudaCompNode::CompNodeImpl);
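
The add_callback() method added above uses CUDA's host-callback mechanism: the closure is copied onto the heap, enqueued on the comp node's stream with cudaLaunchHostFunc(), and freed inside the trampoline once the callback has run, so it fires only after all previously enqueued stream work has completed. The standalone sketch below shows the same pattern outside MegBrain; it assumes CUDA 10 or newer (cudaLaunchHostFunc), and the names CHECK_CUDA, HostFunc, host_func_caller and enqueue_host_callback are illustrative only, not part of the patch.

// Minimal sketch of the host-callback pattern used by add_callback() above.
// Build with nvcc and link against cudart.
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <cuda_runtime.h>

#define CHECK_CUDA(expr)                                             \
    do {                                                             \
        cudaError_t err_ = (expr);                                   \
        if (err_ != cudaSuccess) {                                   \
            std::fprintf(stderr, "%s failed: %s\n", #expr,           \
                         cudaGetErrorString(err_));                  \
            std::exit(1);                                            \
        }                                                            \
    } while (0)

using HostFunc = std::function<void()>;

// Trampoline run by a CUDA-internal thread once prior stream work has
// finished; it owns the heap-allocated closure, mirroring MGB_FINALLY above.
static void CUDART_CB host_func_caller(void* user_data) {
    HostFunc* func = static_cast<HostFunc*>(user_data);
    (*func)();
    delete func;
}

static void enqueue_host_callback(cudaStream_t stream, HostFunc cb) {
    auto* func = new HostFunc(std::move(cb));
    cudaError_t err = cudaLaunchHostFunc(stream, host_func_caller, func);
    if (err != cudaSuccess) {
        delete func;  // if the enqueue itself fails, free the closure here
        std::fprintf(stderr, "cudaLaunchHostFunc: %s\n",
                     cudaGetErrorString(err));
        std::exit(1);
    }
}

int main() {
    cudaStream_t stream;
    CHECK_CUDA(cudaStreamCreate(&stream));
    enqueue_host_callback(stream, [] { std::puts("stream work finished"); });
    CHECK_CUDA(cudaStreamSynchronize(stream));
    CHECK_CUDA(cudaStreamDestroy(stream));
    return 0;
}

Note that CUDA documents that the enqueued host function must not itself make CUDA API calls; that constraint applies equally to closures passed to add_callback().
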
diff --git a/src/core/impl/tensor.cpp b/src/core/impl/tensor.cpp
index 2fc5cba18..2e998460b 100644
--- a/src/core/impl/tensor.cpp
+++ b/src/core/impl/tensor.cpp
@@ -28,15 +28,32 @@ namespace {
     //! implement non-contiguous d2d copy
     void noncont_tensor_copy(
-            const DeviceTensorND &dest, const DeviceTensorND &src, bool, bool) {
-        auto &&src_env = CompNodeEnv::from_comp_node(src.comp_node());
+            const DeviceTensorND &dest, const DeviceTensorND &src,
+            bool contig_dest, bool contig_src) {
+        auto src_cn = src.comp_node();
         auto dst_cn = dest.comp_node();
-        auto relayout = opr::intl::get_megdnn_global_opr<megdnn::Relayout>(
-                dst_cn);
-        dst_cn.activate();
-        relayout->exec(
-                const_cast<DeviceTensorND&>(src).as_megdnn(),
-                dest.as_megdnn(), MegDNNHandle::get(src_env).handle());
+        if (src_cn.device_type() == dst_cn.device_type()) {
+            // perform a relayout op for better performance when src and dst
+            // are placed on comp nodes with the same device type
+            auto &&src_env = CompNodeEnv::from_comp_node(src.comp_node());
+            auto relayout = opr::intl::get_megdnn_global_opr<megdnn::Relayout>(
+                    dst_cn);
+            dst_cn.activate();
+            relayout->exec(
+                    const_cast<DeviceTensorND&>(src).as_megdnn(),
+                    dest.as_megdnn(), MegDNNHandle::get(src_env).handle());
+        } else {
+            if (contig_src) {
+                mgb_assert(!contig_dest);
+                DeviceTensorND tmp{dst_cn};
+                tmp.copy_from(src);
+                dest.copy_from_fixlayout(tmp);
+                return;
+            }
+            DeviceTensorND tmp;
+            tmp.copy_from(src);
+            dest.copy_from_fixlayout(tmp);
+        }
     }

     //! implement non-contiguous h2h copy
@@ -346,7 +363,28 @@ template<> template<>
 void TensorStorage<DeviceTensorStorageTrait>::copy_from(
         const TensorStorage<DeviceTensorStorageTrait> &src, size_t size) const {
     mgb_assert(size <= this->size() && size <= src.size());
-    src.comp_node().peer_copy_to(m_comp_node, ptr(), src.ptr(), size);
+    if (src.comp_node().device_type() == CompNode::DeviceType::CPU &&
+            comp_node().device_type() == CompNode::DeviceType::CUDA) {
+        // the current thread (i.e. the cuda dispatcher thread) should wait for
+        // all operations on src's comp_node to finish; otherwise a race
+        // condition might occur between the worker thread of src's comp_node
+        // and the thread that copies the pageable memory of \p src into a
+        // pinned staging buffer, refer to
+        // https://docs.nvidia.com/cuda/cuda-runtime-api/api-sync-behavior.html
+        //
+        // Note: it is highly recommended to copy tensors from cpu to cuda
+        // with asynchronous dispatching (see graph option async_exec_level);
+        // otherwise the main thread might be blocked by the worker thread of
+        // src's comp_node, resulting in bad performance
+        //
+        // TODO: consider using cudaMallocHost or cudaHostRegister to pin the
+        // memory of the src tensor, so this copy needs no synchronization
+        // and is more efficient
+        src.comp_node().sync();
+        comp_node().copy_to_device(ptr(), src.ptr(), size);
+    } else {
+        src.comp_node().peer_copy_to(m_comp_node, ptr(), src.ptr(), size);
+    }
 }

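The comment block above is the heart of the change: a host-to-device copy whose source lives in pageable memory is staged by the CUDA runtime through an internal pinned buffer, so a worker thread that is still writing the source races with that staging copy; syncing src's comp node first removes the race at the cost of a blocking wait. The sketch below is not MegBrain code and assumes only the plain CUDA runtime API; it contrasts the approach taken by the patch (wait for the producer, then copy) with the pinned-memory alternative mentioned in the TODO (cudaHostRegister), where the transfer itself can stay asynchronous. The names CHECK_CUDA and producer are illustrative only.

// Minimal sketch: copying from pageable host memory that another thread is
// still producing. Build with nvcc and link against cudart.
#include <cstdio>
#include <cstdlib>
#include <thread>
#include <vector>
#include <cuda_runtime.h>

#define CHECK_CUDA(expr)                                             \
    do {                                                             \
        cudaError_t err_ = (expr);                                   \
        if (err_ != cudaSuccess) {                                   \
            std::fprintf(stderr, "%s failed: %s\n", #expr,           \
                         cudaGetErrorString(err_));                  \
            std::exit(1);                                            \
        }                                                            \
    } while (0)

int main() {
    constexpr size_t len = 1 << 20;
    std::vector<float> host(len);  // pageable source buffer
    float* dev = nullptr;
    CHECK_CUDA(cudaMalloc(&dev, len * sizeof(float)));
    cudaStream_t stream;
    CHECK_CUDA(cudaStreamCreate(&stream));

    // plays the role of the worker thread behind src's comp node
    std::thread producer([&] {
        for (size_t i = 0; i < len; ++i)
            host[i] = float(i);
    });

    // Approach taken by the patch: wait for the producer before touching the
    // pageable buffer, otherwise the runtime's staging copy may read
    // half-written data.
    producer.join();
    CHECK_CUDA(cudaMemcpyAsync(dev, host.data(), len * sizeof(float),
                               cudaMemcpyHostToDevice, stream));
    CHECK_CUDA(cudaStreamSynchronize(stream));

    // Alternative sketched in the TODO: pin the buffer so later transfers can
    // be ordered purely on the stream without blocking the host.
    CHECK_CUDA(cudaHostRegister(host.data(), len * sizeof(float),
                                cudaHostRegisterDefault));
    CHECK_CUDA(cudaMemcpyAsync(dev, host.data(), len * sizeof(float),
                               cudaMemcpyHostToDevice, stream));
    CHECK_CUDA(cudaStreamSynchronize(stream));
    CHECK_CUDA(cudaHostUnregister(host.data()));

    CHECK_CUDA(cudaStreamDestroy(stream));
    CHECK_CUDA(cudaFree(dev));
    return 0;
}

Registering and unregistering pageable memory is itself fairly expensive, which may be why the pinned-memory route is only a TODO in the patch.
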
diff --git a/src/core/test/graph/misc.cpp b/src/core/test/graph/misc.cpp
index bd24b63c3..54f0567cc 100644
--- a/src/core/test/graph/misc.cpp
+++ b/src/core/test/graph/misc.cpp
@@ -1733,22 +1733,25 @@ TEST(TestGraph, UpdateStaticAllocPlan) {

 TEST(TestGraph, CPUGPUHybrid) {
     REQUIRE_GPU(1);
-    auto cn_cpu = CompNode::load("cpu:default"),
-         cn_gpu = CompNode::load("gpu0");
-    auto graph = ComputingGraph::make();
-    HostTensorGenerator<> gen;
-    auto host_x = gen({42});
-    auto x = opr::Host2DeviceCopy::make(*graph, host_x, {cn_cpu}),
-         y = x * 2,
-         z = opr::Copy::make(y, cn_gpu) + 1;
-    HostTensorND host_z;
-    auto func = graph->compile({make_callback_copy(z, host_z)});
-    func->execute();
-    for (size_t i = 0; i < 42; ++ i) {
-        MGB_ASSERT_FLOAT_EQ(host_x->ptr<float>()[i] * 2 + 1,
-                            host_z.ptr<float>()[i]);
+    auto cn_gpu = CompNode::load("gpu0");
+    for (auto&& cn_cpu : {CompNode::load("cpu0"), CompNode::default_cpu()}) {
+        auto graph = ComputingGraph::make();
+        HostTensorGenerator<> gen;
+        constexpr size_t length = 23333;
+        auto host_x = gen({length});
+        graph->options().var_sanity_check_first_run = false;
+        auto x = opr::Host2DeviceCopy::make(*graph, host_x, {cn_cpu}),
+             y = opr::Sleep::make(x, 0.5) * 2,
+             z_gpu = opr::Copy::make(y, cn_gpu) + 1,
+             z = opr::Copy::make(z_gpu, cn_cpu) * 2;
+        HostTensorND host_z;
+        auto func = graph->compile({make_callback_copy(z, host_z)});
+        func->execute();
+        for (size_t i = 0; i < length; ++ i) {
+            MGB_ASSERT_FLOAT_EQ((host_x->ptr<float>()[i] * 2 + 1) * 2,
+                                host_z.ptr<float>()[i]);
+        }
     }
-    }
 }

 TEST(TestGraph, In2OutOpStreamPropagate) {
diff --git a/src/core/test/tensor.cpp b/src/core/test/tensor.cpp
index 44f1e9c38..263491b40 100644
--- a/src/core/test/tensor.cpp
+++ b/src/core/test/tensor.cpp
@@ -11,6 +11,7 @@

 #include "megbrain/test/helper.h"

+#include "megbrain/comp_node_env.h"
 #include "megbrain/tensor.h"
 #include "megbrain/opr/utility.h"
 #include "megbrain/utils/timer.h"
@@ -382,4 +383,39 @@ TEST(TestTensor, NegativeIndex) {
     run_negative_index_test();
 }

+TEST(TestTensor, CpuCudaD2DCopy) {
+    REQUIRE_GPU(1);
+    auto cn_cpu = CompNode::load("cpu0"),
+         cn_gpu = CompNode::load("gpu0");
+
+    HostTensorGenerator<> gen;
+    constexpr size_t length = 233333;
+    auto a = gen({length});
+    for (auto config: {true, false}) {
+        DeviceTensorND dev_a{cn_cpu}, dev_b{cn_gpu, a->shape(), a->dtype()};
+        dev_a.copy_from(*a).sync();
+
+        if (!config) {
+            auto subspec = Slice(0, length, 3).apply(a->layout(), 0);
+            dev_a = dev_a.sub(subspec);
+            dev_b = dev_b.sub(subspec);
+        }
+
+        auto iadd = [ptr = dev_a.ptr<float>(), length = dev_a.shape()[0],
+                     stride = dev_a.layout().stride[0]]() {
+            for (size_t i = 0; i < length; ++ i) {
+                ptr[i * stride] += 1;
+            }
+        };
+        CompNodeEnv::from_comp_node(cn_cpu).cpu_env().dispatch(iadd);
+        auto event = cn_cpu.create_event();
+        event->record();
+        cn_gpu.device_wait_event(*event);
+        dev_b.copy_from_fixlayout(dev_a);
+        HostTensorND res;
+        res.copy_from(dev_b).sync();
+        MGB_ASSERT_TENSOR_EQ(HostTensorND::make_proxy(dev_a), res);
+    }
+}
+
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
--
GitLab
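
In terms of user-visible behavior, the patch enables what the new TestTensor.CpuCudaD2DCopy case exercises: a DeviceTensorND placed on a cpu comp node can now be copied into a DeviceTensorND placed on a cuda comp node through the ordinary copy_from()/copy_from_fixlayout() calls, where previously that direction fell through to peer_copy_to(). The usage sketch below is illustrative only; it assumes the MegBrain public headers, a CUDA-enabled build, and the hypothetical function name cpu_to_cuda_copy_demo.

#include "megbrain/comp_node.h"
#include "megbrain/tensor.h"

using namespace mgb;

void cpu_to_cuda_copy_demo() {
    auto cn_cpu = CompNode::load("cpu0");
    auto cn_gpu = CompNode::load("gpu0");

    // prepare a small float tensor on the host side
    HostTensorND host{cn_cpu, TensorShape{16}, dtype::Float32()};
    for (size_t i = 0; i < 16; ++i)
        host.ptr<float>()[i] = float(i);

    // place it on the cpu comp node, then copy across device types
    DeviceTensorND on_cpu{cn_cpu}, on_gpu{cn_gpu};
    on_cpu.copy_from(host).sync();
    on_gpu.copy_from(on_cpu).sync();  // cpu -> cuda, the direction this patch adds

    // read back from the gpu side to check the round trip
    HostTensorND readback;
    readback.copy_from(on_gpu).sync();
}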