Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
856ef627
MegEngine
项目概览
MegEngine 天元
/
MegEngine
1 年多 前同步成功
通知
404
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
856ef627
编写于
4月 08, 2020
作者:
M
Megvii Engine Team
提交者:
Xinran Xu
4月 22, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feat(mgb/core): support copy DeviceTensorND from cpu to cuda
GitOrigin-RevId: d56f4ebf1fadccb5f9d6af2497d27744084d3930
上级
ca811c2c
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
125 addition
and
25 deletion
+125
-25
src/core/impl/comp_node/cpu/comp_node.cpp
src/core/impl/comp_node/cpu/comp_node.cpp
+2
-1
src/core/impl/comp_node/cuda/comp_node.cpp
src/core/impl/comp_node/cuda/comp_node.cpp
+22
-0
src/core/impl/tensor.cpp
src/core/impl/tensor.cpp
+47
-9
src/core/test/graph/misc.cpp
src/core/test/graph/misc.cpp
+18
-15
src/core/test/tensor.cpp
src/core/test/tensor.cpp
+36
-0
未找到文件。
src/core/impl/comp_node/cpu/comp_node.cpp
浏览文件 @
856ef627
...
...
@@ -836,9 +836,10 @@ void CpuCompNode::CpuDispatchableBase::EventImpl::do_device_wait_by(
{
auto
type
=
cn_impl
->
env
().
property
().
type
;
mgb_throw_if
(
type
!=
CompNode
::
DeviceType
::
CPU
&&
type
!=
CompNode
::
DeviceType
::
CUDA
,
MegBrainError
,
"currently CPU can only wait for CPU"
"currently CPU can only wait for CPU
, CUDA
"
);
}
...
...
src/core/impl/comp_node/cuda/comp_node.cpp
浏览文件 @
856ef627
...
...
@@ -40,6 +40,16 @@ namespace {
return
std
::
max
<
size_t
>
(
300
*
1024
*
1024
,
available
/
20
);
}
}
//! type-erased nullary callable used as the payload of a CUDA host callback
using CudaHostFunc = megdnn::thin_function<void()>;

/*!
 * \brief trampoline that runs a heap-allocated CudaHostFunc
 *
 * \param ud pointer to a CudaHostFunc allocated with new; must not be null.
 *      The callable is invoked once, and the MGB_FINALLY clause deletes it
 *      (presumably also when the call throws -- confirm against the macro
 *      definition), so the caller must not touch it afterwards.
 */
void CUDART_CB cuda_host_func_caller(void* ud) {
    mgb_assert(ud);
    auto* callback = static_cast<CudaHostFunc*>(ud);
    MGB_TRY { (*callback)(); }
    MGB_FINALLY(delete callback;);
}
}
// anonymous namespace
namespace
mgb
{
...
...
@@ -223,6 +233,18 @@ class CudaCompNode::CompNodeImpl final: public CompNode::Impl {
Locator
locator_logical
()
override
{
return
m_locator_logical
;
}
/*!
 * \brief enqueue a host-side callback on this comp node's CUDA stream
 *
 * The callable is moved into a heap allocation and handed to
 * cudaLaunchHostFunc together with cuda_host_func_caller, which deletes the
 * allocation after invoking it. If enqueueing fails, the allocation is
 * released here before the exception is rethrown.
 *
 * \param cb callable to run; it is moved from
 */
void add_callback(CudaHostFunc&& cb) override {
    activate();
    auto* heap_cb = new CudaHostFunc(std::move(cb));
    MGB_TRY {
        // T* converts implicitly to the void* user-data argument
        MGB_CUDA_CHECK(cudaLaunchHostFunc(
                m_env.cuda_env().stream, cuda_host_func_caller, heap_cb));
    }
    MGB_CATCH(..., {
        delete heap_cb;
        throw;
    });
}
};
MGB_DYN_TYPE_OBJ_FINAL_IMPL
(
CudaCompNode
::
CompNodeImpl
);
...
...
src/core/impl/tensor.cpp
浏览文件 @
856ef627
...
...
@@ -28,15 +28,32 @@ namespace {
//! implement non-contiguous d2d copy
void
noncont_tensor_copy
(
const
DeviceTensorND
&
dest
,
const
DeviceTensorND
&
src
,
bool
,
bool
)
{
auto
&&
src_env
=
CompNodeEnv
::
from_comp_node
(
src
.
comp_node
());
const
DeviceTensorND
&
dest
,
const
DeviceTensorND
&
src
,
bool
contig_dest
,
bool
contig_src
)
{
auto
src_cn
=
src
.
comp_node
();
auto
dst_cn
=
dest
.
comp_node
();
auto
relayout
=
opr
::
intl
::
get_megdnn_global_opr
<
megdnn
::
Relayout
>
(
dst_cn
);
dst_cn
.
activate
();
relayout
->
exec
(
const_cast
<
DeviceTensorND
&>
(
src
).
as_megdnn
(),
dest
.
as_megdnn
(),
MegDNNHandle
::
get
(
src_env
).
handle
());
if
(
src_cn
.
device_type
()
==
dst_cn
.
device_type
())
{
// perform relayout op for better performance when src and dst are
// placed on comp nodes with the same device type
auto
&&
src_env
=
CompNodeEnv
::
from_comp_node
(
src
.
comp_node
());
auto
relayout
=
opr
::
intl
::
get_megdnn_global_opr
<
megdnn
::
Relayout
>
(
dst_cn
);
dst_cn
.
activate
();
relayout
->
exec
(
const_cast
<
DeviceTensorND
&>
(
src
).
as_megdnn
(),
dest
.
as_megdnn
(),
MegDNNHandle
::
get
(
src_env
).
handle
());
}
else
{
if
(
contig_src
)
{
mgb_assert
(
!
contig_dest
);
DeviceTensorND
tmp
{
dst_cn
};
tmp
.
copy_from
(
src
);
dest
.
copy_from_fixlayout
(
tmp
);
return
;
}
DeviceTensorND
tmp
;
tmp
.
copy_from
(
src
);
dest
.
copy_from_fixlayout
(
tmp
);
}
}
//! implement non-contiguous h2h copy
...
...
@@ -346,7 +363,28 @@ template<> template<>
void
TensorStorage
<
DeviceTensorStorageTrait
>::
copy_from
(
const
TensorStorage
<
DeviceTensorStorageTrait
>
&
src
,
size_t
size
)
const
{
mgb_assert
(
size
<=
this
->
size
()
&&
size
<=
src
.
size
());
src
.
comp_node
().
peer_copy_to
(
m_comp_node
,
ptr
(),
src
.
ptr
(),
size
);
if
(
src
.
comp_node
().
device_type
()
==
CompNode
::
DeviceType
::
CPU
&&
comp_node
().
device_type
()
==
CompNode
::
DeviceType
::
CUDA
)
{
// current thread(i.e. cuda dispatcher thread) should wait for all
// operations on src's comp_node to finish, otherwise a race condition
// might occur between the worker thread of src's comp_node and the
// thread responsible for copying pageable memory in \p src to a pinned
// buffer, refer to
// https://docs.nvidia.com/cuda/cuda-runtime-api/api-sync-behavior.html
//
// Note: it is highly recommended to copy tensors from cpu to cuda
// with asynchronous dispatching (see graph option async_exec_level),
// otherwise the main thread might be blocked by the worker thread of
// the src's comp_node, resulting in bad performance
//
// TODO: consider using cudaMallocHost or cudaHostRegister
// to pin the memory of src tensor, so it does not require synchronization
// and is more efficient
src
.
comp_node
().
sync
();
comp_node
().
copy_to_device
(
ptr
(),
src
.
ptr
(),
size
);
}
else
{
src
.
comp_node
().
peer_copy_to
(
m_comp_node
,
ptr
(),
src
.
ptr
(),
size
);
}
}
...
...
src/core/test/graph/misc.cpp
浏览文件 @
856ef627
...
...
@@ -1733,22 +1733,25 @@ TEST(TestGraph, UpdateStaticAllocPlan) {
TEST
(
TestGraph
,
CPUGPUHybrid
)
{
REQUIRE_GPU
(
1
);
auto
cn_cpu
=
CompNode
::
load
(
"cpu:default"
),
cn_gpu
=
CompNode
::
load
(
"gpu0"
);
auto
graph
=
ComputingGraph
::
make
();
HostTensorGenerator
<>
gen
;
auto
host_x
=
gen
({
42
});
auto
x
=
opr
::
Host2DeviceCopy
::
make
(
*
graph
,
host_x
,
{
cn_cpu
}),
y
=
x
*
2
,
z
=
opr
::
Copy
::
make
(
y
,
cn_gpu
)
+
1
;
HostTensorND
host_z
;
auto
func
=
graph
->
compile
({
make_callback_copy
(
z
,
host_z
)});
func
->
execute
();
for
(
size_t
i
=
0
;
i
<
42
;
++
i
)
{
MGB_ASSERT_FLOAT_EQ
(
host_x
->
ptr
<
float
>
()[
i
]
*
2
+
1
,
host_z
.
ptr
<
float
>
()[
i
]);
auto
cn_gpu
=
CompNode
::
load
(
"gpu0"
);
for
(
auto
&&
cn_cpu
:
{
CompNode
::
load
(
"cpu0"
),
CompNode
::
default_cpu
()})
{
auto
graph
=
ComputingGraph
::
make
();
HostTensorGenerator
<>
gen
;
constexpr
size_t
length
=
23333
;
auto
host_x
=
gen
({
length
});
graph
->
options
().
var_sanity_check_first_run
=
false
;
auto
x
=
opr
::
Host2DeviceCopy
::
make
(
*
graph
,
host_x
,
{
cn_cpu
}),
y
=
opr
::
Sleep
::
make
(
x
,
0.5
)
*
2
,
z_gpu
=
opr
::
Copy
::
make
(
y
,
cn_gpu
)
+
1
,
z
=
opr
::
Copy
::
make
(
z_gpu
,
cn_cpu
)
*
2
;
HostTensorND
host_z
;
auto
func
=
graph
->
compile
({
make_callback_copy
(
z
,
host_z
)});
func
->
execute
();
for
(
size_t
i
=
0
;
i
<
length
;
++
i
)
{
MGB_ASSERT_FLOAT_EQ
((
host_x
->
ptr
<
float
>
()[
i
]
*
2
+
1
)
*
2
,
host_z
.
ptr
<
float
>
()[
i
]);
}
}
}
TEST
(
TestGraph
,
In2OutOpStreamPropagate
)
{
...
...
src/core/test/tensor.cpp
浏览文件 @
856ef627
...
...
@@ -11,6 +11,7 @@
#include "megbrain/test/helper.h"
#include "megbrain/comp_node_env.h"
#include "megbrain/tensor.h"
#include "megbrain/opr/utility.h"
#include "megbrain/utils/timer.h"
...
...
@@ -382,4 +383,39 @@ TEST(TestTensor, NegativeIndex) {
run_negative_index_test
<
HostTensorND
,
DeviceTensorND
>
();
}
//! Copy a DeviceTensorND living on a cpu comp node to one on a cuda comp node,
//! once with the full (contiguous) tensor and once with a stride-3 slice, and
//! check that the data read back from the gpu equals the cpu-side tensor.
TEST(TestTensor, CpuCudaD2DCopy) {
    REQUIRE_GPU(1);
    auto cn_cpu = CompNode::load("cpu0");
    auto cn_gpu = CompNode::load("gpu0");

    HostTensorGenerator<> gen;
    constexpr size_t length = 233333;
    auto host_src = gen({length});

    for (auto contiguous : {true, false}) {
        DeviceTensorND dev_a{cn_cpu};
        DeviceTensorND dev_b{cn_gpu, host_src->shape(), host_src->dtype()};
        dev_a.copy_from(*host_src).sync();

        if (!contiguous) {
            // take every third element along axis 0 on both sides
            auto subspec = Slice(0, length, 3).apply(host_src->layout(), 0);
            dev_a = dev_a.sub(subspec);
            dev_b = dev_b.sub(subspec);
        }

        // in-place increment of every element of dev_a, dispatched on the
        // cpu comp node so that the copy below has pending work to wait for
        auto iadd = [base = dev_a.ptr<float>(), count = dev_a.shape()[0],
                     step = dev_a.layout().stride[0]]() {
            for (size_t idx = 0; idx < count; ++idx) {
                base[idx * step] += 1;
            }
        };
        CompNodeEnv::from_comp_node(cn_cpu).cpu_env().dispatch(iadd);

        // make the gpu comp node wait for the event recorded on the cpu one
        auto cpu_done = cn_cpu.create_event();
        cpu_done->record();
        cn_gpu.device_wait_event(*cpu_done);

        dev_b.copy_from_fixlayout(dev_a);

        HostTensorND result;
        result.copy_from(dev_b).sync();
        MGB_ASSERT_TENSOR_EQ(HostTensorND::make_proxy(dev_a), result);
    }
}
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录