Commit 856ef627, authored by: Megvii Engine Team, committed by: Xinran Xu

feat(mgb/core): support copy DeviceTensorND from cpu to cuda

GitOrigin-RevId: d56f4ebf1fadccb5f9d6af2497d27744084d3930
Parent: ca811c2c
@@ -836,9 +836,10 @@ void CpuCompNode::CpuDispatchableBase::EventImpl::do_device_wait_by(
 {
     auto type = cn_impl->env().property().type;
     mgb_throw_if(type != CompNode::DeviceType::CPU
+            && type != CompNode::DeviceType::CUDA
             ,
             MegBrainError,
-            "currently CPU can only wait for CPU"
+            "currently CPU can only wait for CPU, CUDA"
     );
 }
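For context, here is a minimal sketch of how a CPU dispatch queue can honor a wait on a CUDA event. It is not taken from the commit (the wait implementation itself is not part of this hunk), and dispatch/cpu_wait_cuda_event are hypothetical names; the only firm assumption is the CUDA runtime call cudaEventSynchronize, which blocks the calling host thread until the event fires:

    #include <cuda_runtime.h>
    #include <functional>

    using Task = std::function<void()>;

    // Enqueue a blocking wait onto a CPU task queue; every task dispatched
    // after this one is therefore ordered after the CUDA event.
    void cpu_wait_cuda_event(const std::function<void(Task)>& dispatch,
                             cudaEvent_t ev) {
        dispatch([ev] { cudaEventSynchronize(ev); });
    }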
......
......@@ -40,6 +40,16 @@ namespace {
         return std::max<size_t>(300 * 1024 * 1024, available / 20);
     }
 }
+using CudaHostFunc = megdnn::thin_function<void()>;
+void CUDART_CB cuda_host_func_caller(void* ud) {
+    mgb_assert(ud);
+    CudaHostFunc* func_ptr = reinterpret_cast<CudaHostFunc*>(ud);
+    MGB_TRY {
+        (*func_ptr)();
+    } MGB_FINALLY(
+        delete func_ptr;
+    );
+}
 } // anonymous namespace
 namespace mgb {
......@@ -223,6 +233,18 @@ class CudaCompNode::CompNodeImpl final: public CompNode::Impl {
         Locator locator_logical() override {
             return m_locator_logical;
         }
+
+        void add_callback(CudaHostFunc&& cb) override {
+            activate();
+            CudaHostFunc* func_ptr = new CudaHostFunc(std::move(cb));
+            MGB_TRY {
+                MGB_CUDA_CHECK(cudaLaunchHostFunc(m_env.cuda_env().stream,
+                        cuda_host_func_caller, static_cast<void*>(func_ptr)));
+            } MGB_CATCH(..., {
+                delete func_ptr;
+                throw;
+            });
+        }
 };
 MGB_DYN_TYPE_OBJ_FINAL_IMPL(CudaCompNode::CompNodeImpl);
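The two hunks above together implement an enqueue-host-callback primitive: add_callback heap-allocates the closure, and cudaLaunchHostFunc later invokes cuda_host_func_caller, which runs and frees it, on a CUDA-internal host thread once all prior work on the stream has completed. A self-contained sketch of the same pattern against the plain CUDA runtime API (every name other than the CUDA calls is illustrative):

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <functional>

    using HostFunc = std::function<void()>;

    // Trampoline with the signature cudaLaunchHostFunc expects; it invokes
    // and then frees the heap-allocated closure, mirroring the diff's
    // cuda_host_func_caller.
    static void CUDART_CB caller(void* ud) {
        auto* f = static_cast<HostFunc*>(ud);
        (*f)();
        delete f;
    }

    int main() {
        cudaStream_t stream;
        cudaStreamCreate(&stream);
        auto* f = new HostFunc([] { std::printf("stream reached this point\n"); });
        // Runs once all work previously enqueued on `stream` completes.
        cudaLaunchHostFunc(stream, caller, f);
        cudaStreamSynchronize(stream);
        cudaStreamDestroy(stream);
    }

Note that CUDA documents that a host function enqueued this way must not itself call CUDA APIs, which is why the diff uses it only to run opaque host-side callbacks.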
......
......@@ -28,15 +28,32 @@ namespace {
 //! implement non-contiguous d2d copy
 void noncont_tensor_copy(
-        const DeviceTensorND &dest, const DeviceTensorND &src, bool, bool) {
-    auto &&src_env = CompNodeEnv::from_comp_node(src.comp_node());
+        const DeviceTensorND &dest, const DeviceTensorND &src,
+        bool contig_dest, bool contig_src) {
+    auto src_cn = src.comp_node();
     auto dst_cn = dest.comp_node();
-    auto relayout = opr::intl::get_megdnn_global_opr<megdnn::Relayout>(
-            dst_cn);
-    dst_cn.activate();
-    relayout->exec(
-            const_cast<DeviceTensorND&>(src).as_megdnn(),
-            dest.as_megdnn(), MegDNNHandle::get(src_env).handle());
+    if (src_cn.device_type() == dst_cn.device_type()) {
+        // perform relayout op for better performance when src and dst are
+        // placed on comp nodes with the same device type
+        auto &&src_env = CompNodeEnv::from_comp_node(src.comp_node());
+        auto relayout = opr::intl::get_megdnn_global_opr<megdnn::Relayout>(
+                dst_cn);
+        dst_cn.activate();
+        relayout->exec(
+                const_cast<DeviceTensorND&>(src).as_megdnn(),
+                dest.as_megdnn(), MegDNNHandle::get(src_env).handle());
+    } else {
+        if (contig_src) {
+            mgb_assert(!contig_dest);
+            DeviceTensorND tmp{dst_cn};
+            tmp.copy_from(src);
+            dest.copy_from_fixlayout(tmp);
+            return;
+        }
+        DeviceTensorND tmp;
+        tmp.copy_from(src);
+        dest.copy_from_fixlayout(tmp);
+    }
 }
 //! implement non-contiguous h2h copy
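The else branch above handles copies across different device types by staging through a contiguous temporary: flatten the non-contiguous side, do one flat cross-device copy, and restore the layout on the destination. A hedged 1-D sketch of that staging idea in plain CUDA (strided_h2d_copy and its parameters are illustrative; strides are assumed positive, in units of elements):

    #include <cuda_runtime.h>
    #include <vector>

    void strided_h2d_copy(float* dst_dev, long dst_stride,
                          const float* src_host, long src_stride, size_t n) {
        // stage 1: gather the strided host data into a contiguous temporary
        std::vector<float> tmp(n);
        for (size_t i = 0; i < n; ++i)
            tmp[i] = src_host[i * src_stride];
        if (dst_stride == 1) {
            // destination already contiguous: one flat copy suffices
            cudaMemcpy(dst_dev, tmp.data(), n * sizeof(float),
                       cudaMemcpyHostToDevice);
        } else {
            // stage 2: cudaMemcpy2D can express a constant-stride scatter for
            // this simple 1-D case (one element per "row", rows dpitch apart)
            cudaMemcpy2D(dst_dev, dst_stride * sizeof(float), tmp.data(),
                         sizeof(float), sizeof(float), n,
                         cudaMemcpyHostToDevice);
        }
    }

The real tensor code generalizes this per dimension and, as the diff shows, prefers a single on-device relayout when both sides live on the same device type.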
......@@ -346,7 +363,28 @@ template<> template<>
 void TensorStorage<DeviceTensorStorageTrait>::copy_from(
         const TensorStorage<DeviceTensorStorageTrait> &src, size_t size) const {
     mgb_assert(size <= this->size() && size <= src.size());
-    src.comp_node().peer_copy_to(m_comp_node, ptr(), src.ptr(), size);
+    if (src.comp_node().device_type() == CompNode::DeviceType::CPU &&
+            comp_node().device_type() == CompNode::DeviceType::CUDA) {
+        // the current thread (i.e. the CUDA dispatcher thread) should wait
+        // for all operations on src's comp_node to finish; otherwise a race
+        // condition might occur between the worker thread of src's comp_node
+        // and the thread responsible for copying the pageable memory in
+        // \p src to a pinned buffer, refer to
+        // https://docs.nvidia.com/cuda/cuda-runtime-api/api-sync-behavior.html
+        //
+        // Note: it is highly recommended to copy tensors from CPU to CUDA
+        // with asynchronous dispatching (see graph option async_exec_level);
+        // otherwise the main thread might be blocked by the worker thread of
+        // src's comp_node, resulting in bad performance
+        //
+        // TODO: consider using cudaMallocHost or cudaHostRegister to pin the
+        // memory of the src tensor, which would avoid the synchronization
+        // and be more efficient
+        src.comp_node().sync();
+        comp_node().copy_to_device(ptr(), src.ptr(), size);
+    } else {
+        src.comp_node().peer_copy_to(m_comp_node, ptr(), src.ptr(), size);
+    }
 }
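A hedged sketch of the pinning alternative named in the TODO: cudaHostRegister page-locks an existing host buffer so that cudaMemcpyAsync can DMA from it directly, avoiding both the runtime's internal pageable-to-pinned staging copy and the explicit src.comp_node().sync(). Registration is expensive, so a real implementation would cache it rather than register per copy; pinned_h2d_copy is an illustrative name:

    #include <cuda_runtime.h>

    void pinned_h2d_copy(void* dst_dev, void* src_host, size_t size,
                         cudaStream_t stream) {
        // page-lock the existing buffer in place (no copy)
        cudaHostRegister(src_host, size, cudaHostRegisterDefault);
        // true async DMA, since the source is now pinned
        cudaMemcpyAsync(dst_dev, src_host, size, cudaMemcpyHostToDevice,
                        stream);
        // ensure the DMA has finished before unpinning the buffer
        cudaStreamSynchronize(stream);
        cudaHostUnregister(src_host);
    }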
......
......@@ -1733,22 +1733,25 @@ TEST(TestGraph, UpdateStaticAllocPlan) {
 TEST(TestGraph, CPUGPUHybrid) {
     REQUIRE_GPU(1);
-    auto cn_cpu = CompNode::load("cpu:default"),
-         cn_gpu = CompNode::load("gpu0");
-    auto graph = ComputingGraph::make();
-    HostTensorGenerator<> gen;
-    auto host_x = gen({42});
-    auto x = opr::Host2DeviceCopy::make(*graph, host_x, {cn_cpu}),
-         y = x * 2,
-         z = opr::Copy::make(y, cn_gpu) + 1;
-    HostTensorND host_z;
-    auto func = graph->compile({make_callback_copy(z, host_z)});
-    func->execute();
-    for (size_t i = 0; i < 42; ++ i) {
-        MGB_ASSERT_FLOAT_EQ(host_x->ptr<float>()[i] * 2 + 1,
-                host_z.ptr<float>()[i]);
+    auto cn_gpu = CompNode::load("gpu0");
+    for (auto&& cn_cpu : {CompNode::load("cpu0"), CompNode::default_cpu()}) {
+        auto graph = ComputingGraph::make();
+        HostTensorGenerator<> gen;
+        constexpr size_t length = 23333;
+        auto host_x = gen({length});
+        graph->options().var_sanity_check_first_run = false;
+        auto x = opr::Host2DeviceCopy::make(*graph, host_x, {cn_cpu}),
+             y = opr::Sleep::make(x, 0.5) * 2,
+             z_gpu = opr::Copy::make(y, cn_gpu) + 1,
+             z = opr::Copy::make(z_gpu, cn_cpu) * 2;
+        HostTensorND host_z;
+        auto func = graph->compile({make_callback_copy(z, host_z)});
+        func->execute();
+        for (size_t i = 0; i < length; ++ i) {
+            MGB_ASSERT_FLOAT_EQ((host_x->ptr<float>()[i] * 2 + 1) * 2,
+                    host_z.ptr<float>()[i]);
+        }
     }
 }
 TEST(TestGraph, In2OutOpStreamPropagate) {
......
......@@ -11,6 +11,7 @@
 #include "megbrain/test/helper.h"
 #include "megbrain/comp_node_env.h"
 #include "megbrain/tensor.h"
+#include "megbrain/opr/utility.h"
 #include "megbrain/utils/timer.h"
......@@ -382,4 +383,39 @@ TEST(TestTensor, NegativeIndex) {
     run_negative_index_test<HostTensorND, DeviceTensorND>();
 }
+
+TEST(TestTensor, CpuCudaD2DCopy) {
+    REQUIRE_GPU(1);
+    auto cn_cpu = CompNode::load("cpu0"),
+         cn_gpu = CompNode::load("gpu0");
+    HostTensorGenerator<> gen;
+    constexpr size_t length = 233333;
+    auto a = gen({length});
+    for (auto config : {true, false}) {
+        DeviceTensorND dev_a{cn_cpu}, dev_b{cn_gpu, a->shape(), a->dtype()};
+        dev_a.copy_from(*a).sync();
+        if (!config) {
+            auto subspec = Slice(0, length, 3).apply(a->layout(), 0);
+            dev_a = dev_a.sub(subspec);
+            dev_b = dev_b.sub(subspec);
+        }
+        auto iadd = [ptr = dev_a.ptr<float>(), length = dev_a.shape()[0],
+                     stride = dev_a.layout().stride[0]]() {
+            for (size_t i = 0; i < length; ++ i) {
+                ptr[i * stride] += 1;
+            }
+        };
+        CompNodeEnv::from_comp_node(cn_cpu).cpu_env().dispatch(iadd);
+        auto event = cn_cpu.create_event();
+        event->record();
+        cn_gpu.device_wait_event(*event);
+        dev_b.copy_from_fixlayout(dev_a);
+        HostTensorND res;
+        res.copy_from(dev_b).sync();
+        MGB_ASSERT_TENSOR_EQ(HostTensorND::make_proxy(dev_a), res);
+    }
+}
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}