Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
0a665ea4
MegEngine
项目概览
MegEngine 天元
/
MegEngine
1 年多 前同步成功
通知
403
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
0a665ea4
编写于
8月 11, 2021
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feat(mge/device): enable to get cuda compute capability
GitOrigin-RevId: b5d3f2225cf946378c7e26fe67d9c3ed38c065c0
上级
722aecd4
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
85 addition
and
11 deletion
+85
-11
imperative/python/megengine/device.py
imperative/python/megengine/device.py
+13
-0
imperative/python/src/common.cpp
imperative/python/src/common.cpp
+2
-0
imperative/python/test/unit/distributed/test_distributed.py
imperative/python/test/unit/distributed/test_distributed.py
+15
-0
src/core/impl/comp_node/comp_node.cpp
src/core/impl/comp_node/comp_node.cpp
+10
-0
src/core/impl/comp_node/cuda/comp_node.cpp
src/core/impl/comp_node/cuda/comp_node.cpp
+40
-11
src/core/impl/comp_node/cuda/comp_node.h
src/core/impl/comp_node/cuda/comp_node.h
+1
-0
src/core/include/megbrain/comp_node.h
src/core/include/megbrain/comp_node.h
+4
-0
未找到文件。
imperative/python/megengine/device.py
浏览文件 @
0a665ea4
...
@@ -11,6 +11,9 @@ import re
...
@@ -11,6 +11,9 @@ import re
from
typing
import
Optional
from
typing
import
Optional
from
.core._imperative_rt.common
import
CompNode
,
DeviceType
from
.core._imperative_rt.common
import
CompNode
,
DeviceType
from
.core._imperative_rt.common
import
(
get_cuda_compute_capability
as
_get_cuda_compute_capability
,
)
from
.core._imperative_rt.common
import
set_prealloc_config
as
_set_prealloc_config
from
.core._imperative_rt.common
import
set_prealloc_config
as
_set_prealloc_config
from
.core._imperative_rt.common
import
what_is_xpu
as
_what_is_xpu
from
.core._imperative_rt.common
import
what_is_xpu
as
_what_is_xpu
...
@@ -20,6 +23,7 @@ __all__ = [
...
@@ -20,6 +23,7 @@ __all__ = [
"get_default_device"
,
"get_default_device"
,
"set_default_device"
,
"set_default_device"
,
"get_mem_status_bytes"
,
"get_mem_status_bytes"
,
"get_cuda_compute_capability"
,
"set_prealloc_config"
,
"set_prealloc_config"
,
"DeviceType"
,
"DeviceType"
,
]
]
...
@@ -126,6 +130,15 @@ def get_mem_status_bytes(device: Optional[str] = None):
...
@@ -126,6 +130,15 @@ def get_mem_status_bytes(device: Optional[str] = None):
return
tot
,
free
return
tot
,
free
def
get_cuda_compute_capability
(
device
:
int
,
device_type
=
DeviceType
.
CUDA
)
->
int
:
r
"""
Get compute capability of the specified device.
It returns a version number, or `SM version`.
"""
return
_get_cuda_compute_capability
(
device
,
device_type
)
set_default_device
(
os
.
getenv
(
"MGE_DEFAULT_DEVICE"
,
"xpux"
))
set_default_device
(
os
.
getenv
(
"MGE_DEFAULT_DEVICE"
,
"xpux"
))
...
...
imperative/python/src/common.cpp
浏览文件 @
0a665ea4
...
@@ -185,6 +185,8 @@ void init_common(py::module m) {
...
@@ -185,6 +185,8 @@ void init_common(py::module m) {
m
.
def
(
"set_prealloc_config"
,
&
CompNode
::
set_prealloc_config
,
m
.
def
(
"set_prealloc_config"
,
&
CompNode
::
set_prealloc_config
,
"specifies how to pre-allocate from raw dev allocator"
);
"specifies how to pre-allocate from raw dev allocator"
);
m
.
def
(
"get_cuda_compute_capability"
,
&
CompNode
::
get_compute_capability
);
m
.
def
(
"what_is_xpu"
,
[]{
m
.
def
(
"what_is_xpu"
,
[]{
return
CompNode
::
Locator
::
parse
(
"xpux"
).
to_physical
().
type
;
return
CompNode
::
Locator
::
parse
(
"xpux"
).
to_physical
().
type
;
});
});
...
...
imperative/python/test/unit/distributed/test_distributed.py
浏览文件 @
0a665ea4
...
@@ -229,3 +229,18 @@ def test_user_set_pop():
...
@@ -229,3 +229,18 @@ def test_user_set_pop():
assert
ret
==
1
assert
ret
==
1
worker
()
worker
()
@
pytest
.
mark
.
require_ngpu
(
2
)
@
pytest
.
mark
.
isolated_distributed
def
test_get_cuda_compute_capability
():
assert
mge
.
device
.
get_cuda_compute_capability
(
0
)
>
0
assert
mge
.
device
.
get_cuda_compute_capability
(
1
)
>
0
@
dist
.
launcher
def
worker
():
x
=
mge
.
tensor
([
1.0
])
assert
mge
.
device
.
get_cuda_compute_capability
(
dist
.
get_rank
())
>
0
worker
()
src/core/impl/comp_node/comp_node.cpp
浏览文件 @
0a665ea4
...
@@ -444,6 +444,16 @@ void CompNode::set_prealloc_config(
...
@@ -444,6 +444,16 @@ void CompNode::set_prealloc_config(
};
};
}
}
size_t
CompNode
::
get_compute_capability
(
int
dev
,
DeviceType
device_type
)
{
switch
(
device_type
)
{
case
DeviceType
::
CUDA
:
return
CudaCompNode
::
get_compute_capability
(
dev
);
default:
mgb_log_warn
(
"unsupport device type for get_compute_capability"
);
return
0
;
};
}
void
*
CompNode
::
alloc_device
(
size_t
size
)
const
{
void
*
CompNode
::
alloc_device
(
size_t
size
)
const
{
auto
ret
=
m_impl
->
alloc_device
(
size
);
auto
ret
=
m_impl
->
alloc_device
(
size
);
static_cast
<
Impl
*>
(
m_impl
)
->
env
().
on_mem_event
(
size
,
true
,
ret
);
static_cast
<
Impl
*>
(
m_impl
)
->
env
().
on_mem_event
(
size
,
true
,
ret
);
...
...
src/core/impl/comp_node/cuda/comp_node.cpp
浏览文件 @
0a665ea4
...
@@ -202,6 +202,8 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl {
...
@@ -202,6 +202,8 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl {
//! enable peer copy from dev0 to dev1
//! enable peer copy from dev0 to dev1
static
void
enable_peer_access
(
int
dev0
,
int
dev1
);
static
void
enable_peer_access
(
int
dev0
,
int
dev1
);
static
size_t
get_compute_capability
(
int
dev
);
static
void
static_free_device
(
ImplBase
*
self
,
void
*
ptr
)
{
static
void
static_free_device
(
ImplBase
*
self
,
void
*
ptr
)
{
static_cast
<
CompNodeImpl
*>
(
self
)
->
free_device
(
ptr
);
static_cast
<
CompNodeImpl
*>
(
self
)
->
free_device
(
ptr
);
}
}
...
@@ -709,9 +711,10 @@ void CudaCompNode::EventImpl::do_device_wait_by(Impl* cn_impl) {
...
@@ -709,9 +711,10 @@ void CudaCompNode::EventImpl::do_device_wait_by(Impl* cn_impl) {
namespace
{
namespace
{
#ifndef __unix__
#ifndef __unix__
CUresult
get_device_count_forksafe
(
int
*
pcnt
)
{
template
<
typename
Func
,
typename
...
Args
>
CUresult
call_cuda_forksafe
(
Func
func
,
Args
...
args
)
{
cuInit
(
0
);
cuInit
(
0
);
return
cuDeviceGetCount
(
pcnt
);
return
func
(
args
...
);
}
}
#else
#else
struct
RAIICloseFD
:
NonCopyableObj
{
struct
RAIICloseFD
:
NonCopyableObj
{
...
@@ -727,8 +730,9 @@ struct RAIICloseFD : NonCopyableObj {
...
@@ -727,8 +730,9 @@ struct RAIICloseFD : NonCopyableObj {
}
}
};
};
// an implementation that does not call cuInit
// an implementation that does not call cuInit
CUresult
get_device_count_forksafe
(
int
*
pcnt
)
{
template
<
typename
Func
,
typename
Val
,
typename
...
Args
>
auto
err
=
cuDeviceGetCount
(
pcnt
);
CUresult
call_cuda_forksafe
(
Func
func
,
Val
*
val
,
Args
...
args
)
{
auto
err
=
func
(
val
,
args
...);
if
(
err
!=
CUDA_ERROR_NOT_INITIALIZED
)
return
err
;
if
(
err
!=
CUDA_ERROR_NOT_INITIALIZED
)
return
err
;
// cuInit not called, call it in child process
// cuInit not called, call it in child process
int
fd
[
2
];
int
fd
[
2
];
...
@@ -743,11 +747,11 @@ CUresult get_device_count_forksafe(int* pcnt) {
...
@@ -743,11 +747,11 @@ CUresult get_device_count_forksafe(int* pcnt) {
do
{
do
{
err
=
cuInit
(
0
);
err
=
cuInit
(
0
);
if
(
err
!=
CUDA_SUCCESS
)
break
;
if
(
err
!=
CUDA_SUCCESS
)
break
;
err
=
cuDeviceGetCount
(
pcnt
);
err
=
func
(
val
,
args
...
);
}
while
(
0
);
}
while
(
0
);
auto
sz
=
write
(
fdw
,
&
err
,
sizeof
(
err
));
auto
sz
=
write
(
fdw
,
&
err
,
sizeof
(
err
));
if
(
sz
==
sizeof
(
err
)
&&
err
==
CUDA_SUCCESS
)
{
if
(
sz
==
sizeof
(
err
)
&&
err
==
CUDA_SUCCESS
)
{
sz
=
write
(
fdw
,
pcnt
,
sizeof
(
*
pcnt
));
sz
=
write
(
fdw
,
val
,
sizeof
(
*
val
));
}
}
fdw_guard
.
close
();
fdw_guard
.
close
();
std
::
quick_exit
(
0
);
std
::
quick_exit
(
0
);
...
@@ -756,12 +760,12 @@ CUresult get_device_count_forksafe(int* pcnt) {
...
@@ -756,12 +760,12 @@ CUresult get_device_count_forksafe(int* pcnt) {
auto
sz
=
read
(
fdr
,
&
err
,
sizeof
(
err
));
auto
sz
=
read
(
fdr
,
&
err
,
sizeof
(
err
));
mgb_assert
(
sz
==
sizeof
(
err
),
"failed to read error code from child"
);
mgb_assert
(
sz
==
sizeof
(
err
),
"failed to read error code from child"
);
if
(
err
==
CUDA_SUCCESS
)
{
if
(
err
==
CUDA_SUCCESS
)
{
sz
=
read
(
fdr
,
pcnt
,
sizeof
(
*
pcnt
));
sz
=
read
(
fdr
,
val
,
sizeof
(
*
val
));
mgb_assert
(
sz
==
sizeof
(
*
pcnt
),
"failed to read device count
from child"
);
mgb_assert
(
sz
==
sizeof
(
*
val
),
"failed to read value
from child"
);
return
err
;
return
err
;
}
}
// try again, maybe another thread called cuInit while we fork
// try again, maybe another thread called cuInit while we fork
auto
err2
=
cuDeviceGetCount
(
pcnt
);
auto
err2
=
func
(
val
,
args
...
);
if
(
err2
==
CUDA_SUCCESS
)
return
err2
;
if
(
err2
==
CUDA_SUCCESS
)
return
err2
;
if
(
err2
==
CUDA_ERROR_NOT_INITIALIZED
)
return
err
;
if
(
err2
==
CUDA_ERROR_NOT_INITIALIZED
)
return
err
;
return
err2
;
return
err2
;
...
@@ -783,7 +787,7 @@ bool CudaCompNode::available() {
...
@@ -783,7 +787,7 @@ bool CudaCompNode::available() {
MGB_LOCK_GUARD
(
mtx
);
MGB_LOCK_GUARD
(
mtx
);
if
(
result
==
-
1
)
{
if
(
result
==
-
1
)
{
int
ndev
=
-
1
;
int
ndev
=
-
1
;
auto
err
=
get_device_count_forksafe
(
&
ndev
);
auto
err
=
call_cuda_forksafe
(
cuDeviceGetCount
,
&
ndev
);
result
=
err
==
CUDA_SUCCESS
&&
ndev
>
0
;
result
=
err
==
CUDA_SUCCESS
&&
ndev
>
0
;
if
(
!
result
)
{
if
(
!
result
)
{
mgb_log_warn
(
"cuda unavailable: %s(%d) ndev=%d"
,
mgb_log_warn
(
"cuda unavailable: %s(%d) ndev=%d"
,
...
@@ -934,7 +938,7 @@ size_t CudaCompNode::get_device_count(bool warn) {
...
@@ -934,7 +938,7 @@ size_t CudaCompNode::get_device_count(bool warn) {
static
Spinlock
mtx
;
static
Spinlock
mtx
;
MGB_LOCK_GUARD
(
mtx
);
MGB_LOCK_GUARD
(
mtx
);
if
(
cnt
==
-
1
)
{
if
(
cnt
==
-
1
)
{
auto
err
=
get_device_count_forksafe
(
&
cnt
);
auto
err
=
call_cuda_forksafe
(
cuDeviceGetCount
,
&
cnt
);
if
(
err
!=
CUDA_SUCCESS
)
{
if
(
err
!=
CUDA_SUCCESS
)
{
if
(
warn
)
if
(
warn
)
mgb_log_error
(
"cudaGetDeviceCount failed: %s (err %d)"
,
mgb_log_error
(
"cudaGetDeviceCount failed: %s (err %d)"
,
...
@@ -970,6 +974,27 @@ void CudaCompNode::set_prealloc_config(size_t alignment, size_t min_req,
...
@@ -970,6 +974,27 @@ void CudaCompNode::set_prealloc_config(size_t alignment, size_t min_req,
}
}
}
}
size_t
CudaCompNode
::
get_compute_capability
(
int
dev
)
{
size_t
cnt
=
get_device_count
();
if
(
dev
<
0
||
dev
>=
static_cast
<
int
>
(
cnt
))
{
mgb_log_error
(
"request gpu %d out of valid range [0, %lu)"
,
dev
,
cnt
);
return
0
;
}
static
Spinlock
mtx_com
;
MGB_LOCK_GUARD
(
mtx_com
);
int
pmajor
;
int
pminor
;
auto
err
=
call_cuda_forksafe
(
cuDeviceGetAttribute
,
&
pmajor
,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR
,
dev
);
if
(
err
!=
CUDA_SUCCESS
)
{
return
0
;
}
auto
err2
=
call_cuda_forksafe
(
cuDeviceGetAttribute
,
&
pminor
,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR
,
dev
);
if
(
err2
!=
CUDA_SUCCESS
)
{
return
0
;
}
return
pmajor
*
10
+
pminor
;
}
#else
#else
bool
CudaCompNode
::
available
()
{
bool
CudaCompNode
::
available
()
{
...
@@ -990,6 +1015,10 @@ void CudaCompNode::set_prealloc_config(size_t alignment, size_t min_req,
...
@@ -990,6 +1015,10 @@ void CudaCompNode::set_prealloc_config(size_t alignment, size_t min_req,
size_t
max_overhead
,
size_t
max_overhead
,
double
growth_factor
)
{}
double
growth_factor
)
{}
size_t
CudaCompNode
::
get_compute_capability
(
int
dev
)
{
return
0
;
}
#undef err
#undef err
#endif // MGB_CUDA
#endif // MGB_CUDA
...
...
src/core/impl/comp_node/cuda/comp_node.h
浏览文件 @
0a665ea4
...
@@ -33,6 +33,7 @@ namespace mgb {
...
@@ -33,6 +33,7 @@ namespace mgb {
static
Impl
*
load_cuda
(
static
Impl
*
load_cuda
(
const
Locator
&
locator
,
const
Locator
&
locator_logical
);
const
Locator
&
locator
,
const
Locator
&
locator_logical
);
static
void
sync_all
();
static
void
sync_all
();
static
size_t
get_compute_capability
(
int
dev
);
static
void
set_prealloc_config
(
size_t
alignment
,
size_t
min_req
,
static
void
set_prealloc_config
(
size_t
alignment
,
size_t
min_req
,
size_t
max_overhead
,
double
growth_factor
);
size_t
max_overhead
,
double
growth_factor
);
...
...
src/core/include/megbrain/comp_node.h
浏览文件 @
0a665ea4
...
@@ -298,6 +298,10 @@ class CompNode {
...
@@ -298,6 +298,10 @@ class CompNode {
static
void
set_prealloc_config
(
size_t
alignment
,
size_t
min_req
,
static
void
set_prealloc_config
(
size_t
alignment
,
size_t
min_req
,
size_t
max_overhead
,
double
growth_factor
,
size_t
max_overhead
,
double
growth_factor
,
DeviceType
device_type
);
DeviceType
device_type
);
/*!
* \brief get compute capability of the specified device
*/
static
size_t
get_compute_capability
(
int
dev
,
DeviceType
device_type
);
/* =================== synchronization ======================== */
/* =================== synchronization ======================== */
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录