From 856ef62721d1f308ab8d9d59a2357e5bd96dbf9d Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Wed, 8 Apr 2020 23:22:34 +0800
Subject: [PATCH] feat(mgb/core): support copy DeviceTensorND from cpu to cuda

GitOrigin-RevId: d56f4ebf1fadccb5f9d6af2497d27744084d3930
---
 src/core/impl/comp_node/cpu/comp_node.cpp  |  3 +-
 src/core/impl/comp_node/cuda/comp_node.cpp | 22 +++++
 src/core/impl/tensor.cpp                   | 56 ++++++++++++++++++----
 src/core/test/graph/misc.cpp               | 33 +++++++------
 src/core/test/tensor.cpp                   | 36 ++++++++++++++
 5 files changed, 125 insertions(+), 25 deletions(-)

diff --git a/src/core/impl/comp_node/cpu/comp_node.cpp b/src/core/impl/comp_node/cpu/comp_node.cpp
index 9e0728bed..b317d1904 100644
--- a/src/core/impl/comp_node/cpu/comp_node.cpp
+++ b/src/core/impl/comp_node/cpu/comp_node.cpp
@@ -836,9 +836,10 @@ void CpuCompNode::CpuDispatchableBase::EventImpl::do_device_wait_by(
     {
         auto type = cn_impl->env().property().type;
         mgb_throw_if(type != CompNode::DeviceType::CPU
+                && type != CompNode::DeviceType::CUDA
                 , MegBrainError,
-                "currently CPU can only wait for CPU"
+                "currently CPU can only wait for CPU, CUDA"
         );
     }

diff --git a/src/core/impl/comp_node/cuda/comp_node.cpp b/src/core/impl/comp_node/cuda/comp_node.cpp
index d4198b398..ae296d98b 100644
--- a/src/core/impl/comp_node/cuda/comp_node.cpp
+++ b/src/core/impl/comp_node/cuda/comp_node.cpp
@@ -40,6 +40,16 @@ namespace {
             return std::max(300 * 1024 * 1024, available / 20);
         }
     }
+    using CudaHostFunc = megdnn::thin_function<void()>;
+    void CUDART_CB cuda_host_func_caller(void* ud) {
+        mgb_assert(ud);
+        CudaHostFunc* func_ptr = reinterpret_cast<CudaHostFunc*>(ud);
+        MGB_TRY {
+            (*func_ptr)();
+        } MGB_FINALLY(
+            delete func_ptr;
+        );
+    }
 } // anonymous namespace

 namespace mgb {
@@ -223,6 +233,18 @@ class CudaCompNode::CompNodeImpl final: public CompNode::Impl {
         Locator locator_logical() override {
             return m_locator_logical;
         }
+
+        void add_callback(CudaHostFunc&& cb) override {
+            activate();
+            CudaHostFunc* func_ptr = new CudaHostFunc(std::move(cb));
+            MGB_TRY {
+                MGB_CUDA_CHECK(cudaLaunchHostFunc(m_env.cuda_env().stream,
+                        cuda_host_func_caller, static_cast<void*>(func_ptr)));
+            } MGB_CATCH(..., {
+                delete func_ptr;
+                throw;
+            });
+        }
 };
 MGB_DYN_TYPE_OBJ_FINAL_IMPL(CudaCompNode::CompNodeImpl);
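
The add_callback() method added above uses CUDA's host-callback mechanism: the closure is copied onto the heap, enqueued on the comp node's stream with cudaLaunchHostFunc(), and freed inside the trampoline once the callback has run, so it fires only after all previously enqueued stream work has completed. The standalone sketch below shows the same pattern outside MegBrain; it assumes CUDA 10 or newer (cudaLaunchHostFunc), and the names CHECK_CUDA, HostFunc, host_func_caller and enqueue_host_callback are illustrative only, not part of the patch.

// Minimal sketch of the host-callback pattern used by add_callback() above.
// Build with nvcc and link against cudart.
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <cuda_runtime.h>

#define CHECK_CUDA(expr)                                             \
    do {                                                             \
        cudaError_t err_ = (expr);                                   \
        if (err_ != cudaSuccess) {                                   \
            std::fprintf(stderr, "%s failed: %s\n", #expr,           \
                         cudaGetErrorString(err_));                  \
            std::exit(1);                                            \
        }                                                            \
    } while (0)

using HostFunc = std::function<void()>;

// Trampoline run by a CUDA-internal thread once prior stream work has
// finished; it owns the heap-allocated closure, mirroring MGB_FINALLY above.
static void CUDART_CB host_func_caller(void* user_data) {
    HostFunc* func = static_cast<HostFunc*>(user_data);
    (*func)();
    delete func;
}

static void enqueue_host_callback(cudaStream_t stream, HostFunc cb) {
    auto* func = new HostFunc(std::move(cb));
    cudaError_t err = cudaLaunchHostFunc(stream, host_func_caller, func);
    if (err != cudaSuccess) {
        delete func;  // if the enqueue itself fails, free the closure here
        std::fprintf(stderr, "cudaLaunchHostFunc: %s\n",
                     cudaGetErrorString(err));
        std::exit(1);
    }
}

int main() {
    cudaStream_t stream;
    CHECK_CUDA(cudaStreamCreate(&stream));
    enqueue_host_callback(stream, [] { std::puts("stream work finished"); });
    CHECK_CUDA(cudaStreamSynchronize(stream));
    CHECK_CUDA(cudaStreamDestroy(stream));
    return 0;
}

Note that CUDA documents that the enqueued host function must not itself make CUDA API calls; that constraint applies equally to closures passed to add_callback().
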
diff --git a/src/core/impl/tensor.cpp b/src/core/impl/tensor.cpp
index 2fc5cba18..2e998460b 100644
--- a/src/core/impl/tensor.cpp
+++ b/src/core/impl/tensor.cpp
@@ -28,15 +28,32 @@ namespace {
     //! implement non-contiguous d2d copy
     void noncont_tensor_copy(
-            const DeviceTensorND &dest, const DeviceTensorND &src, bool, bool) {
-        auto &&src_env = CompNodeEnv::from_comp_node(src.comp_node());
+            const DeviceTensorND &dest, const DeviceTensorND &src,
+            bool contig_dest, bool contig_src) {
+        auto src_cn = src.comp_node();
         auto dst_cn = dest.comp_node();
-        auto relayout = opr::intl::get_megdnn_global_opr<megdnn::Relayout>(
-                dst_cn);
-        dst_cn.activate();
-        relayout->exec(
-                const_cast<DeviceTensorND&>(src).as_megdnn(),
-                dest.as_megdnn(), MegDNNHandle::get(src_env).handle());
+        if (src_cn.device_type() == dst_cn.device_type()) {
+            // perform a relayout op for better performance when src and dst
+            // are placed on comp nodes with the same device type
+            auto &&src_env = CompNodeEnv::from_comp_node(src.comp_node());
+            auto relayout = opr::intl::get_megdnn_global_opr<megdnn::Relayout>(
+                    dst_cn);
+            dst_cn.activate();
+            relayout->exec(
+                    const_cast<DeviceTensorND&>(src).as_megdnn(),
+                    dest.as_megdnn(), MegDNNHandle::get(src_env).handle());
+        } else {
+            if (contig_src) {
+                mgb_assert(!contig_dest);
+                DeviceTensorND tmp{dst_cn};
+                tmp.copy_from(src);
+                dest.copy_from_fixlayout(tmp);
+                return;
+            }
+            DeviceTensorND tmp;
+            tmp.copy_from(src);
+            dest.copy_from_fixlayout(tmp);
+        }
     }

     //! implement non-contiguous h2h copy
@@ -346,7 +363,28 @@ template<> template<>
 void TensorStorage<DeviceTensorStorageTrait>::copy_from(
         const TensorStorage<DeviceTensorStorageTrait> &src, size_t size) const {
     mgb_assert(size <= this->size() && size <= src.size());
-    src.comp_node().peer_copy_to(m_comp_node, ptr(), src.ptr(), size);
+    if (src.comp_node().device_type() == CompNode::DeviceType::CPU &&
+            comp_node().device_type() == CompNode::DeviceType::CUDA) {
+        // the current thread (i.e. the cuda dispatcher thread) should wait for
+        // all operations on src's comp_node to finish; otherwise a race
+        // condition might occur between the worker thread of src's comp_node
+        // and the thread that copies the pageable memory of \p src into a
+        // pinned staging buffer, refer to
+        // https://docs.nvidia.com/cuda/cuda-runtime-api/api-sync-behavior.html
+        //
+        // Note: it is highly recommended to copy tensors from cpu to cuda
+        // with asynchronous dispatching (see graph option async_exec_level);
+        // otherwise the main thread might be blocked by the worker thread of
+        // src's comp_node, resulting in bad performance
+        //
+        // TODO: consider using cudaMallocHost or cudaHostRegister to pin the
+        // memory of the src tensor, so this copy needs no synchronization
+        // and is more efficient
+        src.comp_node().sync();
+        comp_node().copy_to_device(ptr(), src.ptr(), size);
+    } else {
+        src.comp_node().peer_copy_to(m_comp_node, ptr(), src.ptr(), size);
+    }
 }

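The comment block above is the heart of the change: a host-to-device copy whose source lives in pageable memory is staged by the CUDA runtime through an internal pinned buffer, so a worker thread that is still writing the source races with that staging copy; syncing src's comp node first removes the race at the cost of a blocking wait. The sketch below is not MegBrain code and assumes only the plain CUDA runtime API; it contrasts the approach taken by the patch (wait for the producer, then copy) with the pinned-memory alternative mentioned in the TODO (cudaHostRegister), where the transfer itself can stay asynchronous. The names CHECK_CUDA and producer are illustrative only.

// Minimal sketch: copying from pageable host memory that another thread is
// still producing. Build with nvcc and link against cudart.
#include <cstdio>
#include <cstdlib>
#include <thread>
#include <vector>
#include <cuda_runtime.h>

#define CHECK_CUDA(expr)                                             \
    do {                                                             \
        cudaError_t err_ = (expr);                                   \
        if (err_ != cudaSuccess) {                                   \
            std::fprintf(stderr, "%s failed: %s\n", #expr,           \
                         cudaGetErrorString(err_));                  \
            std::exit(1);                                            \
        }                                                            \
    } while (0)

int main() {
    constexpr size_t len = 1 << 20;
    std::vector<float> host(len);  // pageable source buffer
    float* dev = nullptr;
    CHECK_CUDA(cudaMalloc(&dev, len * sizeof(float)));
    cudaStream_t stream;
    CHECK_CUDA(cudaStreamCreate(&stream));

    // plays the role of the worker thread behind src's comp node
    std::thread producer([&] {
        for (size_t i = 0; i < len; ++i)
            host[i] = float(i);
    });

    // Approach taken by the patch: wait for the producer before touching the
    // pageable buffer, otherwise the runtime's staging copy may read
    // half-written data.
    producer.join();
    CHECK_CUDA(cudaMemcpyAsync(dev, host.data(), len * sizeof(float),
                               cudaMemcpyHostToDevice, stream));
    CHECK_CUDA(cudaStreamSynchronize(stream));

    // Alternative sketched in the TODO: pin the buffer so later transfers can
    // be ordered purely on the stream without blocking the host.
    CHECK_CUDA(cudaHostRegister(host.data(), len * sizeof(float),
                                cudaHostRegisterDefault));
    CHECK_CUDA(cudaMemcpyAsync(dev, host.data(), len * sizeof(float),
                               cudaMemcpyHostToDevice, stream));
    CHECK_CUDA(cudaStreamSynchronize(stream));
    CHECK_CUDA(cudaHostUnregister(host.data()));

    CHECK_CUDA(cudaStreamDestroy(stream));
    CHECK_CUDA(cudaFree(dev));
    return 0;
}

Registering and unregistering pageable memory is itself fairly expensive, which may be why the pinned-memory route is only a TODO in the patch.
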
diff --git a/src/core/test/graph/misc.cpp b/src/core/test/graph/misc.cpp
index bd24b63c3..54f0567cc 100644
--- a/src/core/test/graph/misc.cpp
+++ b/src/core/test/graph/misc.cpp
@@ -1733,22 +1733,25 @@ TEST(TestGraph, UpdateStaticAllocPlan) {

 TEST(TestGraph, CPUGPUHybrid) {
     REQUIRE_GPU(1);
-    auto cn_cpu = CompNode::load("cpu:default"),
-         cn_gpu = CompNode::load("gpu0");
-    auto graph = ComputingGraph::make();
-    HostTensorGenerator<> gen;
-    auto host_x = gen({42});
-    auto x = opr::Host2DeviceCopy::make(*graph, host_x, {cn_cpu}),
-         y = x * 2,
-         z = opr::Copy::make(y, cn_gpu) + 1;
-    HostTensorND host_z;
-    auto func = graph->compile({make_callback_copy(z, host_z)});
-    func->execute();
-    for (size_t i = 0; i < 42; ++ i) {
-        MGB_ASSERT_FLOAT_EQ(host_x->ptr<float>()[i] * 2 + 1,
-                            host_z.ptr<float>()[i]);
+    auto cn_gpu = CompNode::load("gpu0");
+    for (auto&& cn_cpu : {CompNode::load("cpu0"), CompNode::default_cpu()}) {
+        auto graph = ComputingGraph::make();
+        HostTensorGenerator<> gen;
+        constexpr size_t length = 23333;
+        auto host_x = gen({length});
+        graph->options().var_sanity_check_first_run = false;
+        auto x = opr::Host2DeviceCopy::make(*graph, host_x, {cn_cpu}),
+             y = opr::Sleep::make(x, 0.5) * 2,
+             z_gpu = opr::Copy::make(y, cn_gpu) + 1,
+             z = opr::Copy::make(z_gpu, cn_cpu) * 2;
+        HostTensorND host_z;
+        auto func = graph->compile({make_callback_copy(z, host_z)});
+        func->execute();
+        for (size_t i = 0; i < length; ++ i) {
+            MGB_ASSERT_FLOAT_EQ((host_x->ptr<float>()[i] * 2 + 1) * 2,
+                                host_z.ptr<float>()[i]);
+        }
     }
-    }
 }

 TEST(TestGraph, In2OutOpStreamPropagate) {
diff --git a/src/core/test/tensor.cpp b/src/core/test/tensor.cpp
index 44f1e9c38..263491b40 100644
--- a/src/core/test/tensor.cpp
+++ b/src/core/test/tensor.cpp
@@ -11,6 +11,7 @@

 #include "megbrain/test/helper.h"

+#include "megbrain/comp_node_env.h"
 #include "megbrain/tensor.h"
 #include "megbrain/opr/utility.h"
 #include "megbrain/utils/timer.h"
@@ -382,4 +383,39 @@ TEST(TestTensor, NegativeIndex) {
     run_negative_index_test();
 }

+TEST(TestTensor, CpuCudaD2DCopy) {
+    REQUIRE_GPU(1);
+    auto cn_cpu = CompNode::load("cpu0"),
+         cn_gpu = CompNode::load("gpu0");
+
+    HostTensorGenerator<> gen;
+    constexpr size_t length = 233333;
+    auto a = gen({length});
+    for (auto config: {true, false}) {
+        DeviceTensorND dev_a{cn_cpu}, dev_b{cn_gpu, a->shape(), a->dtype()};
+        dev_a.copy_from(*a).sync();
+
+        if (!config) {
+            auto subspec = Slice(0, length, 3).apply(a->layout(), 0);
+            dev_a = dev_a.sub(subspec);
+            dev_b = dev_b.sub(subspec);
+        }
+
+        auto iadd = [ptr = dev_a.ptr<float>(), length = dev_a.shape()[0],
+                     stride = dev_a.layout().stride[0]]() {
+            for (size_t i = 0; i < length; ++ i) {
+                ptr[i * stride] += 1;
+            }
+        };
+        CompNodeEnv::from_comp_node(cn_cpu).cpu_env().dispatch(iadd);
+        auto event = cn_cpu.create_event();
+        event->record();
+        cn_gpu.device_wait_event(*event);
+        dev_b.copy_from_fixlayout(dev_a);
+        HostTensorND res;
+        res.copy_from(dev_b).sync();
+        MGB_ASSERT_TENSOR_EQ(HostTensorND::make_proxy(dev_a), res);
+    }
+}
+
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
--
GitLab
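
In terms of user-visible behavior, the patch enables what the new TestTensor.CpuCudaD2DCopy case exercises: a DeviceTensorND placed on a cpu comp node can now be copied into a DeviceTensorND placed on a cuda comp node through the ordinary copy_from()/copy_from_fixlayout() calls, where previously that direction fell through to peer_copy_to(). The usage sketch below is illustrative only; it assumes the MegBrain public headers, a CUDA-enabled build, and the hypothetical function name cpu_to_cuda_copy_demo.

#include "megbrain/comp_node.h"
#include "megbrain/tensor.h"

using namespace mgb;

void cpu_to_cuda_copy_demo() {
    auto cn_cpu = CompNode::load("cpu0");
    auto cn_gpu = CompNode::load("gpu0");

    // prepare a small float tensor on the host side
    HostTensorND host{cn_cpu, TensorShape{16}, dtype::Float32()};
    for (size_t i = 0; i < 16; ++i)
        host.ptr<float>()[i] = float(i);

    // place it on the cpu comp node, then copy across device types
    DeviceTensorND on_cpu{cn_cpu}, on_gpu{cn_gpu};
    on_cpu.copy_from(host).sync();
    on_gpu.copy_from(on_cpu).sync();  // cpu -> cuda, the direction this patch adds

    // read back from the gpu side to check the round trip
    HostTensorND readback;
    readback.copy_from(on_gpu).sync();
}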