Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
a0e53118
MegEngine
项目概览
MegEngine 天元
/
MegEngine
大约 1 年 前同步成功
通知
399
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
a0e53118
编写于
4月 21, 2022
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix(src/comp_node): fix calling cuda driver api
GitOrigin-RevId: cc33af2ac4a534cab1b11e1afbe31187330314e5
上级
ccea0e23
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
34 additions
and
74 deletions
+34
-74
src/core/impl/comp_node/cuda/comp_node.cpp
src/core/impl/comp_node/cuda/comp_node.cpp
+34
-74
未找到文件。
src/core/impl/comp_node/cuda/comp_node.cpp
浏览文件 @
a0e53118
...
...
@@ -761,10 +761,10 @@ void CudaCompNode::EventImpl::do_device_wait_by(Impl* cn_impl) {
namespace
{
#ifndef __unix__
template
<
typename
Func
,
typename
...
Args
>
CUresult
call_cuda_forksafe
(
Func
func
,
Args
...
args
)
{
template
<
typename
Func
,
typename
Val
>
CUresult
call_cuda_forksafe
(
Func
func
,
Val
*
val
,
size_t
len
)
{
cuInit
(
0
);
return
func
(
args
...
);
return
func
();
}
#else
struct
RAIICloseFD
:
NonCopyableObj
{
...
...
@@ -780,11 +780,13 @@ struct RAIICloseFD : NonCopyableObj {
}
};
// an implementation that does not call cuInit
template
<
typename
Func
,
typename
Val
,
typename
...
Args
>
CUresult
call_cuda_forksafe
(
Func
func
,
Val
*
val
,
Args
...
args
)
{
auto
err
=
func
(
val
,
args
...);
template
<
typename
Func
,
typename
Val
>
CUresult
call_cuda_forksafe
(
Func
func
,
Val
*
val
,
size_t
len
)
{
int
t_ndev
;
// use cuDeviceGetCount to detect cuda initialization to avoid abnormal behavior
auto
err
=
cuDeviceGetCount
(
&
t_ndev
);
if
(
err
!=
CUDA_ERROR_NOT_INITIALIZED
)
return
err
;
return
func
()
;
// cuInit not called, call it in child process
int
fd
[
2
];
mgb_assert
(
pipe
(
fd
)
==
0
,
"pipe() failed"
);
...
...
@@ -799,51 +801,7 @@ CUresult call_cuda_forksafe(Func func, Val* val, Args... args) {
err
=
cuInit
(
0
);
if
(
err
!=
CUDA_SUCCESS
)
break
;
err
=
func
(
val
,
args
...);
}
while
(
0
);
auto
sz
=
write
(
fdw
,
&
err
,
sizeof
(
err
));
if
(
sz
==
sizeof
(
err
)
&&
err
==
CUDA_SUCCESS
)
{
sz
=
write
(
fdw
,
val
,
sizeof
(
*
val
));
}
fdw_guard
.
close
();
std
::
quick_exit
(
0
);
}
fdw_guard
.
close
();
auto
sz
=
read
(
fdr
,
&
err
,
sizeof
(
err
));
mgb_assert
(
sz
==
sizeof
(
err
),
"failed to read error code from child"
);
if
(
err
==
CUDA_SUCCESS
)
{
sz
=
read
(
fdr
,
val
,
sizeof
(
*
val
));
mgb_assert
(
sz
==
sizeof
(
*
val
),
"failed to read value from child"
);
return
err
;
}
// try again, maybe another thread called cuInit while we fork
auto
err2
=
func
(
val
,
args
...);
if
(
err2
==
CUDA_SUCCESS
)
return
err2
;
if
(
err2
==
CUDA_ERROR_NOT_INITIALIZED
)
return
err
;
return
err2
;
}
template
<
typename
Func
,
typename
...
Args
>
CUresult
call_cuda_forksafe
(
Func
func
,
char
*
val
,
int
len
,
Args
...
args
)
{
auto
err
=
func
(
val
,
len
,
args
...);
if
(
err
!=
CUDA_ERROR_NOT_INITIALIZED
)
return
err
;
// cuInit not called, call it in child process
int
fd
[
2
];
mgb_assert
(
pipe
(
fd
)
==
0
,
"pipe() failed"
);
int
fdr
=
fd
[
0
],
fdw
=
fd
[
1
];
RAIICloseFD
fdr_guard
(
fdr
);
RAIICloseFD
fdw_guard
(
fdw
);
auto
cpid
=
fork
();
mgb_assert
(
cpid
!=
-
1
,
"fork() failed"
);
if
(
cpid
==
0
)
{
fdr_guard
.
close
();
do
{
err
=
cuInit
(
0
);
if
(
err
!=
CUDA_SUCCESS
)
break
;
err
=
func
(
val
,
len
,
args
...);
err
=
func
();
}
while
(
0
);
auto
sz
=
write
(
fdw
,
&
err
,
sizeof
(
err
));
if
(
sz
==
sizeof
(
err
)
&&
err
==
CUDA_SUCCESS
)
{
...
...
@@ -858,12 +816,12 @@ CUresult call_cuda_forksafe(Func func, char* val, int len, Args... args) {
if
(
err
==
CUDA_SUCCESS
)
{
sz
=
read
(
fdr
,
val
,
sizeof
(
*
val
)
*
len
);
mgb_assert
(
static_cast
<
size_t
>
(
sz
)
==
sizeof
(
*
val
)
*
static_cast
<
size_t
>
(
len
)
,
static_cast
<
size_t
>
(
sz
)
==
sizeof
(
*
val
)
*
len
,
"failed to read value from child"
);
return
err
;
}
// try again, maybe another thread called cuInit while we fork
auto
err2
=
func
(
val
,
len
,
args
...
);
auto
err2
=
func
();
if
(
err2
==
CUDA_SUCCESS
)
return
err2
;
if
(
err2
==
CUDA_ERROR_NOT_INITIALIZED
)
...
...
@@ -882,6 +840,17 @@ const char* cu_get_error_string(CUresult err) {
return
ret
;
}
// Calls the CUDA driver API `func(ptr, ...)` through call_cuda_forksafe() and
// evaluates to the CUresult returned by that helper.
//
//   func : driver API entry point; `ptr` is always passed as its first argument.
//   ptr  : output pointer; handed both to `func` and to call_cuda_forksafe().
//   len  : forwarded to call_cuda_forksafe() as its third argument. It is NOT
//          passed to `func`, so an API that needs a length of its own (e.g. a
//          name buffer size) must receive it again through the variadic part.
//          NOTE(review): presumably the number of elements behind `ptr` --
//          confirm against call_cuda_forksafe().
//   ...  : remaining arguments of `func`, appended after `ptr`.
//
// The call is wrapped in a capture-by-reference lambda so that the helper can
// invoke it without arguments. `##__VA_ARGS__` drops the preceding comma when
// no extra arguments are given (compiler extension).
#define MGB_CALL_CUDA_FORKSAFE_NOASSERT(func, ptr, len, ...) \
    call_cuda_forksafe([&]() { return func(ptr, ##__VA_ARGS__); }, ptr, len)

// Statement form of MGB_CALL_CUDA_FORKSAFE_NOASSERT: performs the call and, if
// the result is not CUDA_SUCCESS, logs the stringified function name together
// with the error string and numeric code via mgb_log_error(). The failure is
// only logged; nothing is returned or asserted.
//
// The body is wrapped in do { ... } while (0) so that the macro behaves as a
// single statement: `if (c) MGB_CALL_CUDA_FORKSAFE(...); else ...` compiles,
// and the trailing semicolon at the call site is required.
#define MGB_CALL_CUDA_FORKSAFE(func, ptr, len, ...)                                 \
    do {                                                                            \
        auto err = MGB_CALL_CUDA_FORKSAFE_NOASSERT(func, ptr, len, ##__VA_ARGS__);  \
        if (err != CUDA_SUCCESS) {                                                  \
            auto err_s = cu_get_error_string(err);                                  \
            mgb_log_error(#func " failed: %s (err %d)", err_s, int(err));           \
        }                                                                           \
    } while (0)
}
// namespace
bool
CudaCompNode
::
available
()
{
...
...
@@ -890,7 +859,7 @@ bool CudaCompNode::available() {
MGB_LOCK_GUARD
(
mtx
);
if
(
result
==
-
1
)
{
int
ndev
=
-
1
;
auto
err
=
call_cuda_forksafe
(
cuDeviceGetCount
,
&
ndev
);
auto
err
=
MGB_CALL_CUDA_FORKSAFE_NOASSERT
(
cuDeviceGetCount
,
&
ndev
,
1
);
result
=
err
==
CUDA_SUCCESS
&&
ndev
>
0
;
auto
err_s
=
cu_get_error_string
(
err
);
//! only show !CUDA_SUCCESS log when with valid stub call
...
...
@@ -1042,12 +1011,11 @@ size_t CudaCompNode::get_device_count(bool warn) {
static
Spinlock
mtx
;
MGB_LOCK_GUARD
(
mtx
);
if
(
cnt
==
-
1
)
{
auto
err
=
call_cuda_forksafe
(
cuDeviceGetCount
,
&
cnt
);
auto
err
=
MGB_CALL_CUDA_FORKSAFE_NOASSERT
(
cuDeviceGetCount
,
&
cnt
,
1
);
auto
err_s
=
cu_get_error_string
(
err
);
if
(
err
!=
CUDA_SUCCESS
)
{
if
(
warn
&&
(
std
::
string
(
err_s
)
!=
"invalid_stub_call"
))
mgb_log_error
(
"cudaGetDeviceCount failed: %s (err %d)"
,
err_s
,
int
(
err
));
mgb_log_error
(
"cuDeviceGetCount failed: %s (err %d)"
,
err_s
,
int
(
err
));
cnt
=
0
;
}
mgb_assert
(
cnt
>=
0
);
...
...
@@ -1088,23 +1056,15 @@ CompNode::DeviceProperties CudaCompNode::get_device_prop(int dev) {
if
(
!
rec
.
init
)
{
MGB_LOCK_GUARD
(
rec
.
mtx_com
);
if
(
!
rec
.
init
)
{
MGB_CALL_CUDA_FORKSAFE
(
cuDeviceGetAttribute
,
&
rec
.
prop
.
major
,
1
,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR
,
dev
);
MGB_CALL_CUDA_FORKSAFE
(
cuDeviceGetAttribute
,
&
rec
.
prop
.
minor
,
1
,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR
,
dev
);
MGB_CALL_CUDA_FORKSAFE
(
cuDeviceTotalMem
,
&
rec
.
prop
.
total_memory
,
1
,
dev
);
char
pname
[
256
]
=
{
0
};
mgb_assert
(
call_cuda_forksafe
(
cuDeviceGetAttribute
,
&
rec
.
prop
.
major
,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR
,
dev
)
==
CUDA_SUCCESS
);
mgb_assert
(
call_cuda_forksafe
(
cuDeviceGetAttribute
,
&
rec
.
prop
.
minor
,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR
,
dev
)
==
CUDA_SUCCESS
);
mgb_assert
(
call_cuda_forksafe
(
cuDeviceGetName
,
pname
,
255
,
dev
)
==
CUDA_SUCCESS
);
mgb_assert
(
call_cuda_forksafe
(
cuDeviceTotalMem
,
&
rec
.
prop
.
total_memory
,
dev
)
==
CUDA_SUCCESS
);
MGB_CALL_CUDA_FORKSAFE
(
cuDeviceGetName
,
pname
,
255
,
255
,
dev
);
rec
.
prop
.
name
=
pname
;
rec
.
init
=
true
;
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录