Commit 0a665ea4 authored by Megvii Engine Team

feat(mge/device): enable getting cuda compute capability

GitOrigin-RevId: b5d3f2225cf946378c7e26fe67d9c3ed38c065c0
Parent 722aecd4
@@ -11,6 +11,9 @@ import re
 from typing import Optional
 from .core._imperative_rt.common import CompNode, DeviceType
+from .core._imperative_rt.common import (
+    get_cuda_compute_capability as _get_cuda_compute_capability,
+)
 from .core._imperative_rt.common import set_prealloc_config as _set_prealloc_config
 from .core._imperative_rt.common import what_is_xpu as _what_is_xpu
@@ -20,6 +23,7 @@ __all__ = [
     "get_default_device",
     "set_default_device",
     "get_mem_status_bytes",
+    "get_cuda_compute_capability",
     "set_prealloc_config",
     "DeviceType",
 ]
@@ -126,6 +130,15 @@ def get_mem_status_bytes(device: Optional[str] = None):
     return tot, free
+
+
+def get_cuda_compute_capability(device: int, device_type=DeviceType.CUDA) -> int:
+    r"""
+    Get the compute capability of the specified device.
+    It returns a version number, also known as the ``SM version``.
+    """
+    return _get_cuda_compute_capability(device, device_type)
+
 set_default_device(os.getenv("MGE_DEFAULT_DEVICE", "xpux"))
...
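For orientation, here is a minimal usage sketch of the Python API added above (an illustration, assuming a CUDA build of MegEngine with at least one visible GPU; the major/minor decoding follows the `pmajor * 10 + pminor` encoding in the C++ implementation further down):

```python
import megengine as mge

# The new API returns the SM version as a single integer, e.g. 70 for
# compute capability 7.0 (major * 10 + minor); unsupported device types
# yield 0.
cap = mge.device.get_cuda_compute_capability(0)
print("GPU 0 compute capability: %d.%d" % (cap // 10, cap % 10))
```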
@@ -185,6 +185,8 @@ void init_common(py::module m) {
     m.def("set_prealloc_config", &CompNode::set_prealloc_config,
           "specifies how to pre-allocate from raw dev allocator");
+    m.def("get_cuda_compute_capability", &CompNode::get_compute_capability);
+
     m.def("what_is_xpu", [] {
         return CompNode::Locator::parse("xpux").to_physical().type;
     });
...
@@ -229,3 +229,18 @@ def test_user_set_pop():
         assert ret == 1

     worker()
+
+
+@pytest.mark.require_ngpu(2)
+@pytest.mark.isolated_distributed
+def test_get_cuda_compute_capability():
+    assert mge.device.get_cuda_compute_capability(0) > 0
+    assert mge.device.get_cuda_compute_capability(1) > 0
+
+    @dist.launcher
+    def worker():
+        x = mge.tensor([1.0])
+        assert mge.device.get_cuda_compute_capability(dist.get_rank()) > 0
+
+    worker()
@@ -444,6 +444,16 @@ void CompNode::set_prealloc_config(
     };
 }

+size_t CompNode::get_compute_capability(int dev, DeviceType device_type) {
+    switch (device_type) {
+        case DeviceType::CUDA:
+            return CudaCompNode::get_compute_capability(dev);
+        default:
+            mgb_log_warn("unsupported device type for get_compute_capability");
+            return 0;
+    }
+}
+
 void* CompNode::alloc_device(size_t size) const {
     auto ret = m_impl->alloc_device(size);
     static_cast<Impl*>(m_impl)->env().on_mem_event(size, true, ret);
...
@@ -202,6 +202,8 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl {
     //! enable peer copy from dev0 to dev1
     static void enable_peer_access(int dev0, int dev1);

+    static size_t get_compute_capability(int dev);
+
     static void static_free_device(ImplBase* self, void* ptr) {
         static_cast<CompNodeImpl*>(self)->free_device(ptr);
     }
@@ -709,9 +711,10 @@ void CudaCompNode::EventImpl::do_device_wait_by(Impl* cn_impl) {
 namespace {

 #ifndef __unix__
-CUresult get_device_count_forksafe(int* pcnt) {
+template <typename Func, typename... Args>
+CUresult call_cuda_forksafe(Func func, Args... args) {
     cuInit(0);
-    return cuDeviceGetCount(pcnt);
+    return func(args...);
 }
 #else
 struct RAIICloseFD : NonCopyableObj {
@@ -727,8 +730,9 @@ struct RAIICloseFD : NonCopyableObj {
     }
 };

 // an implementation that does not call cuInit
-CUresult get_device_count_forksafe(int* pcnt) {
-    auto err = cuDeviceGetCount(pcnt);
+template <typename Func, typename Val, typename... Args>
+CUresult call_cuda_forksafe(Func func, Val* val, Args... args) {
+    auto err = func(val, args...);
     if (err != CUDA_ERROR_NOT_INITIALIZED) return err;
     // cuInit not called, call it in child process
     int fd[2];
@@ -743,11 +747,11 @@ CUresult get_device_count_forksafe(int* pcnt) {
     do {
         err = cuInit(0);
         if (err != CUDA_SUCCESS) break;
-        err = cuDeviceGetCount(pcnt);
+        err = func(val, args...);
     } while (0);
     auto sz = write(fdw, &err, sizeof(err));
     if (sz == sizeof(err) && err == CUDA_SUCCESS) {
-        sz = write(fdw, pcnt, sizeof(*pcnt));
+        sz = write(fdw, val, sizeof(*val));
     }
     fdw_guard.close();
     std::quick_exit(0);
@@ -756,12 +760,12 @@ CUresult get_device_count_forksafe(int* pcnt) {
     auto sz = read(fdr, &err, sizeof(err));
     mgb_assert(sz == sizeof(err), "failed to read error code from child");
     if (err == CUDA_SUCCESS) {
-        sz = read(fdr, pcnt, sizeof(*pcnt));
-        mgb_assert(sz == sizeof(*pcnt), "failed to read device count from child");
+        sz = read(fdr, val, sizeof(*val));
+        mgb_assert(sz == sizeof(*val), "failed to read value from child");
         return err;
     }
     // try again, maybe another thread called cuInit while we fork
-    auto err2 = cuDeviceGetCount(pcnt);
+    auto err2 = func(val, args...);
     if (err2 == CUDA_SUCCESS) return err2;
     if (err2 == CUDA_ERROR_NOT_INITIALIZED) return err;
     return err2;
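The hunks above generalize the old `get_device_count_forksafe` helper into a `call_cuda_forksafe` template: the driver call is attempted directly, and only if the driver reports CUDA_ERROR_NOT_INITIALIZED does the code fork a child that runs `cuInit` plus the call and pipes the error code and value back, so the parent process never calls `cuInit` itself. A rough Python sketch of the same fork-and-pipe pattern (illustrative only, not MegEngine code; `func` stands for any probe returning an int):

```python
import os
import struct

def call_forksafe(func, *args):
    """Run func(*args) -> int in a forked child and pipe the value back."""
    rfd, wfd = os.pipe()
    pid = os.fork()
    if pid == 0:
        # Child: safe to do initialization-heavy work here.
        os.close(rfd)
        os.write(wfd, struct.pack("i", func(*args)))
        os._exit(0)
    # Parent: read the child's result without initializing anything itself.
    os.close(wfd)
    data = os.read(rfd, struct.calcsize("i"))
    os.close(rfd)
    os.waitpid(pid, 0)
    return struct.unpack("i", data)[0]
```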
@@ -783,7 +787,7 @@ bool CudaCompNode::available() {
     MGB_LOCK_GUARD(mtx);
     if (result == -1) {
         int ndev = -1;
-        auto err = get_device_count_forksafe(&ndev);
+        auto err = call_cuda_forksafe(cuDeviceGetCount, &ndev);
         result = err == CUDA_SUCCESS && ndev > 0;
         if (!result) {
             mgb_log_warn("cuda unavailable: %s(%d) ndev=%d",
@@ -934,7 +938,7 @@ size_t CudaCompNode::get_device_count(bool warn) {
     static Spinlock mtx;
     MGB_LOCK_GUARD(mtx);
     if (cnt == -1) {
-        auto err = get_device_count_forksafe(&cnt);
+        auto err = call_cuda_forksafe(cuDeviceGetCount, &cnt);
         if (err != CUDA_SUCCESS) {
             if (warn)
                 mgb_log_error("cudaGetDeviceCount failed: %s (err %d)",
@@ -970,6 +974,27 @@ void CudaCompNode::set_prealloc_config(size_t alignment, size_t min_req,
     }
 }

+size_t CudaCompNode::get_compute_capability(int dev) {
+    size_t cnt = get_device_count();
+    if (dev < 0 || dev >= static_cast<int>(cnt)) {
+        mgb_log_error("request gpu %d out of valid range [0, %lu)", dev, cnt);
+        return 0;
+    }
+    static Spinlock mtx_com;
+    MGB_LOCK_GUARD(mtx_com);
+    int pmajor;
+    int pminor;
+    auto err = call_cuda_forksafe(
+            cuDeviceGetAttribute, &pmajor,
+            CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
+    if (err != CUDA_SUCCESS) {
+        return 0;
+    }
+    auto err2 = call_cuda_forksafe(
+            cuDeviceGetAttribute, &pminor,
+            CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
+    if (err2 != CUDA_SUCCESS) {
+        return 0;
+    }
+    return pmajor * 10 + pminor;
+}
 #else
 bool CudaCompNode::available() {
@@ -990,6 +1015,10 @@ void CudaCompNode::set_prealloc_config(size_t alignment, size_t min_req,
                                        size_t max_overhead,
                                        double growth_factor) {}

+size_t CudaCompNode::get_compute_capability(int dev) {
+    return 0;
+}
+
 #undef err
 #endif  // MGB_CUDA
...
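One plausible way callers might consume the returned value, e.g. to gate a code path on hardware generation (a hypothetical check, not part of this commit):

```python
import megengine as mge

cap = mge.device.get_cuda_compute_capability(0)
# Volta reports 70 (7.0), Turing 75 (7.5), Ampere 80 (8.0).
use_tensor_cores = cap >= 70  # hypothetical feature gate for Volta and newer
```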
@@ -33,6 +33,7 @@ namespace mgb {
     static Impl* load_cuda(
             const Locator &locator, const Locator &locator_logical);
     static void sync_all();
+    static size_t get_compute_capability(int dev);

     static void set_prealloc_config(size_t alignment, size_t min_req,
                                     size_t max_overhead, double growth_factor);
...
@@ -298,6 +298,10 @@ class CompNode {
     static void set_prealloc_config(size_t alignment, size_t min_req,
                                     size_t max_overhead, double growth_factor,
                                     DeviceType device_type);
+
+    /*!
+     * \brief get compute capability of the specified device
+     */
+    static size_t get_compute_capability(int dev, DeviceType device_type);

     /* =================== synchronization ======================== */
...