提交 1c2a323e 编写于 作者: M Megvii Engine Team

feat(mge): add warning message when mismatched cuda sm is detected

GitOrigin-RevId: f78c79eb069fb88208af8e8f00d91e2094371b90
上级 877bda41
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import atexit import atexit
import ctypes import ctypes
import re
import os import os
import platform import platform
import sys import sys
...@@ -89,6 +90,9 @@ if sys.platform == "win32": ...@@ -89,6 +90,9 @@ if sys.platform == "win32":
from .core._imperative_rt.core2 import close as _close from .core._imperative_rt.core2 import close as _close
from .core._imperative_rt.core2 import full_sync as _full_sync from .core._imperative_rt.core2 import full_sync as _full_sync
from .core._imperative_rt.core2 import sync as _sync from .core._imperative_rt.core2 import sync as _sync
from .core._imperative_rt.common import (
get_supported_sm_versions as _get_supported_sm_versions,
)
from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func
from .config import * from .config import *
from .device import * from .device import *
...@@ -99,6 +103,25 @@ from .utils import comp_graph_tools as cgtools ...@@ -99,6 +103,25 @@ from .utils import comp_graph_tools as cgtools
from .utils.persistent_cache import PersistentCacheOnServer as _PersistentCacheOnServer from .utils.persistent_cache import PersistentCacheOnServer as _PersistentCacheOnServer
from .version import __version__ from .version import __version__
logger = get_logger(__name__)

# Warn at import time when a visible GPU has a CUDA compute capability
# (SM version) that this MegEngine build was not compiled for.
ngpus = get_device_count("gpu")
# _get_supported_sm_versions() returns the build's gencode string,
# e.g. "... sm_61 sm_70 ..."; extract the bare version numbers.
supported_sm_versions = re.findall(r"sm_(\d+)", _get_supported_sm_versions())
for idx in range(ngpus):
    prop = get_cuda_device_property(idx)
    # e.g. major=7, minor=0 -> "70", matching the sm_70 naming scheme
    cur_sm = str(prop.major * 10 + prop.minor)
    if cur_sm not in supported_sm_versions:
        logger.warning(
            "{} with CUDA capability sm_{} is not compatible with the current MegEngine installation. The current MegEngine install supports CUDA {} {}. If you want to use the {} with MegEngine, please check the instructions at https://github.com/MegEngine/MegEngine/blob/master/scripts/cmake-build/BUILD_README.md".format(
                prop.name,
                cur_sm,
                "capabilities" if len(supported_sm_versions) > 1 else "capability",
                " ".join(["sm_" + v for v in supported_sm_versions]),
                prop.name,
            )
        )
_set_fork_exec_path_for_timed_func( _set_fork_exec_path_for_timed_func(
sys.executable, sys.executable,
os.path.join(os.path.dirname(__file__), "utils", "_timed_func_fork_exec_entry.py"), os.path.join(os.path.dirname(__file__), "utils", "_timed_func_fork_exec_entry.py"),
......
...@@ -11,9 +11,7 @@ import re ...@@ -11,9 +11,7 @@ import re
from typing import Optional from typing import Optional
from .core._imperative_rt.common import CompNode, DeviceType from .core._imperative_rt.common import CompNode, DeviceType
from .core._imperative_rt.common import ( from .core._imperative_rt.common import get_device_prop as _get_device_prop
get_cuda_compute_capability as _get_cuda_compute_capability,
)
from .core._imperative_rt.common import set_prealloc_config as _set_prealloc_config from .core._imperative_rt.common import set_prealloc_config as _set_prealloc_config
from .core._imperative_rt.common import what_is_xpu as _what_is_xpu from .core._imperative_rt.common import what_is_xpu as _what_is_xpu
from .core._imperative_rt.utils import _try_coalesce_all_free_memory from .core._imperative_rt.utils import _try_coalesce_all_free_memory
...@@ -25,6 +23,7 @@ __all__ = [ ...@@ -25,6 +23,7 @@ __all__ = [
"set_default_device", "set_default_device",
"get_mem_status_bytes", "get_mem_status_bytes",
"get_cuda_compute_capability", "get_cuda_compute_capability",
"get_cuda_device_property",
"get_allocated_memory", "get_allocated_memory",
"get_reserved_memory", "get_reserved_memory",
"get_max_reserved_memory", "get_max_reserved_memory",
...@@ -161,7 +160,12 @@ def get_cuda_compute_capability(device: int, device_type=DeviceType.CUDA) -> int ...@@ -161,7 +160,12 @@ def get_cuda_compute_capability(device: int, device_type=DeviceType.CUDA) -> int
Returns: Returns:
a version number, or `SM version`. a version number, or `SM version`.
""" """
return _get_cuda_compute_capability(device, device_type) prop = _get_device_prop(device, device_type)
return prop.major * 10 + prop.minor
def get_cuda_device_property(device: int, device_type=DeviceType.CUDA):
    """Return the raw device properties of the specified device.

    The returned object exposes ``name``, ``total_memory``, ``major`` and
    ``minor`` read-only attributes.

    Args:
        device: device index.
        device_type: device type; CUDA by default.
    """
    return _get_device_prop(device, device_type)
def get_allocated_memory(device: Optional[str] = None): def get_allocated_memory(device: Optional[str] = None):
......
...@@ -123,6 +123,23 @@ void init_common(py::module m) { ...@@ -123,6 +123,23 @@ void init_common(py::module m) {
py::implicitly_convertible<std::string, CompNode>(); py::implicitly_convertible<std::string, CompNode>();
// Expose CompNode::DeviceProperties to Python with read-only
// name / total_memory / major / minor attributes.
// Lambdas take the struct by const reference: the original by-value
// capture copied the whole struct (including the std::string name)
// on every attribute access.
py::class_<CompNode::DeviceProperties>(m, "DeviceProperties")
        .def(py::init())
        .def_property_readonly(
                "name",
                [](const CompNode::DeviceProperties& prop) { return prop.name; })
        .def_property_readonly(
                "total_memory",
                [](const CompNode::DeviceProperties& prop) {
                    return prop.total_memory;
                })
        .def_property_readonly(
                "major",
                [](const CompNode::DeviceProperties& prop) { return prop.major; })
        .def_property_readonly(
                "minor",
                [](const CompNode::DeviceProperties& prop) { return prop.minor; });
def_TensorND<DeviceTensorND>(m, "DeviceTensorND") def_TensorND<DeviceTensorND>(m, "DeviceTensorND")
.def("numpy", [](const DeviceTensorND& self) { .def("numpy", [](const DeviceTensorND& self) {
HostTensorND hv; HostTensorND hv;
...@@ -223,7 +240,12 @@ void init_common(py::module m) { ...@@ -223,7 +240,12 @@ void init_common(py::module m) {
m.def("set_prealloc_config", &CompNode::set_prealloc_config, m.def("set_prealloc_config", &CompNode::set_prealloc_config,
"specifies how to pre-allocate from raw dev allocator"); "specifies how to pre-allocate from raw dev allocator");
m.def("get_cuda_compute_capability", &CompNode::get_compute_capability); m.def("get_device_prop", &CompNode::get_device_prop);
m.def("get_supported_sm_versions", []() {
static const char* mge_gen_code = MGE_CUDA_GENCODE;
return mge_gen_code;
});
m.def("what_is_xpu", m.def("what_is_xpu",
[] { return CompNode::Locator::parse("xpux").to_physical().type; }); [] { return CompNode::Locator::parse("xpux").to_physical().type; });
......
...@@ -431,13 +431,12 @@ void CompNode::set_prealloc_config( ...@@ -431,13 +431,12 @@ void CompNode::set_prealloc_config(
}; };
} }
size_t CompNode::get_compute_capability(int dev, DeviceType device_type) { CompNode::DeviceProperties CompNode::get_device_prop(int dev, DeviceType device_type) {
switch (device_type) { switch (device_type) {
case DeviceType::CUDA: case DeviceType::CUDA:
return CudaCompNode::get_compute_capability(dev); return CudaCompNode::get_device_prop(dev);
default: default:
mgb_log_warn("unsupport device type for get_compute_capability"); mgb_throw(MegBrainError, "unsupport device type for get_device_prop");
return 0;
}; };
} }
......
...@@ -192,11 +192,11 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl { ...@@ -192,11 +192,11 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl {
//! return whether global finalized, and print warning in such case //! return whether global finalized, and print warning in such case
static inline bool check_global_finalized(); static inline bool check_global_finalized();
static CompNode::DeviceProperties get_device_prop(int dev);
//! enable peer copy from dev0 to dev1 //! enable peer copy from dev0 to dev1
static void enable_peer_access(int dev0, int dev1); static void enable_peer_access(int dev0, int dev1);
static size_t get_compute_capability(int dev);
static void static_free_device(ImplBase* self, void* ptr) { static void static_free_device(ImplBase* self, void* ptr) {
static_cast<CompNodeImpl*>(self)->free_device(ptr); static_cast<CompNodeImpl*>(self)->free_device(ptr);
} }
...@@ -208,6 +208,8 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl { ...@@ -208,6 +208,8 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl {
public: public:
CompNodeImpl() : Impl(static_free_device, static_free_host) {} CompNodeImpl() : Impl(static_free_device, static_free_host) {}
static constexpr int MAX_NR_COMP_NODE = 1024, MAX_NR_DEVICE = 64;
void* alloc_device(size_t size) override; void* alloc_device(size_t size) override;
void free_device(void* ptr); void free_device(void* ptr);
...@@ -332,8 +334,6 @@ struct CudaCompNodeImpl::DeviceInfo { ...@@ -332,8 +334,6 @@ struct CudaCompNodeImpl::DeviceInfo {
}; };
struct CudaCompNodeImpl::StaticData { struct CudaCompNodeImpl::StaticData {
static constexpr int MAX_NR_COMP_NODE = 1024, MAX_NR_DEVICE = 64;
std::recursive_mutex mtx; std::recursive_mutex mtx;
mem_alloc::DevMemAlloc::PreAllocConfig prealloc_config; mem_alloc::DevMemAlloc::PreAllocConfig prealloc_config;
...@@ -376,6 +376,13 @@ struct CudaCompNodeImpl::StaticData { ...@@ -376,6 +376,13 @@ struct CudaCompNodeImpl::StaticData {
CudaCompNodeImpl::StaticData* CudaCompNodeImpl::sd = nullptr; CudaCompNodeImpl::StaticData* CudaCompNodeImpl::sd = nullptr;
Spinlock CudaCompNodeImpl::sd_mtx; Spinlock CudaCompNodeImpl::sd_mtx;
//! Per-device cache slot for DeviceProperties, filled lazily the first
//! time a device is queried.
struct DevicePropRec {
    //! set to true once `prop` has been populated.
    //! NOTE(review): read outside `mtx_com` in a double-checked pattern
    //! by the consumer; it is not atomic — confirm this is acceptable
    //! on the targeted platforms.
    bool init = false;
    CompNode::DeviceProperties prop;
    //! guards the one-time initialization of `prop`
    Spinlock mtx_com;
};
//! one cache slot per possible CUDA device index
DevicePropRec device_prop_rec[CudaCompNodeImpl::MAX_NR_DEVICE];
void CudaCompNodeImpl::init(const Locator& locator, const Locator& locator_logical) { void CudaCompNodeImpl::init(const Locator& locator, const Locator& locator_logical) {
m_locator = locator; m_locator = locator;
m_locator_logical = locator_logical; m_locator_logical = locator_logical;
...@@ -564,7 +571,7 @@ void CudaCompNodeImpl::sync() { ...@@ -564,7 +571,7 @@ void CudaCompNodeImpl::sync() {
} }
void CudaCompNodeImpl::enable_peer_access(int dev0, int dev1) { void CudaCompNodeImpl::enable_peer_access(int dev0, int dev1) {
static bool already_enabled[StaticData::MAX_NR_DEVICE][StaticData::MAX_NR_DEVICE]; static bool already_enabled[MAX_NR_DEVICE][MAX_NR_DEVICE];
if (already_enabled[dev0][dev1]) if (already_enabled[dev0][dev1])
return; return;
...@@ -817,6 +824,52 @@ CUresult call_cuda_forksafe(Func func, Val* val, Args... args) { ...@@ -817,6 +824,52 @@ CUresult call_cuda_forksafe(Func func, Val* val, Args... args) {
return err; return err;
return err2; return err2;
} }
//! Overload of call_cuda_forksafe for driver APIs that fill a
//! caller-provided char buffer of `len` elements (e.g. cuDeviceGetName).
//! Calls `func` directly; if the driver reports CUDA_ERROR_NOT_INITIALIZED,
//! retries inside a forked child process (so the parent itself never calls
//! cuInit), piping the resulting error code — and the buffer contents on
//! success — back to the parent.
template <typename Func, typename... Args>
CUresult call_cuda_forksafe(Func func, char* val, int len, Args... args) {
    auto err = func(val, len, args...);
    if (err != CUDA_ERROR_NOT_INITIALIZED)
        return err;
    // cuInit not called, call it in child process
    int fd[2];
    mgb_assert(pipe(fd) == 0, "pipe() failed");
    int fdr = fd[0], fdw = fd[1];
    RAIICloseFD fdr_guard(fdr);
    RAIICloseFD fdw_guard(fdw);
    auto cpid = fork();
    mgb_assert(cpid != -1, "fork() failed");
    if (cpid == 0) {
        // child: init the driver, run the call, then report back via the pipe
        fdr_guard.close();
        do {
            err = cuInit(0);
            if (err != CUDA_SUCCESS)
                break;
            err = func(val, len, args...);
        } while (0);
        // always send the error code; send the buffer only on success
        auto sz = write(fdw, &err, sizeof(err));
        if (sz == sizeof(err) && err == CUDA_SUCCESS) {
            sz = write(fdw, val, sizeof(*val) * len);
        }
        fdw_guard.close();
        // quick_exit: skip atexit handlers inherited from the parent
        std::quick_exit(0);
    }
    // parent: read the child's error code first
    fdw_guard.close();
    auto sz = read(fdr, &err, sizeof(err));
    mgb_assert(sz == sizeof(err), "failed to read error code from child");
    if (err == CUDA_SUCCESS) {
        sz = read(fdr, val, sizeof(*val) * len);
        mgb_assert(
                static_cast<size_t>(sz) == sizeof(*val) * static_cast<size_t>(len),
                "failed to read value from child");
        return err;
    }
    // try again, maybe another thread called cuInit while we fork
    auto err2 = func(val, len, args...);
    if (err2 == CUDA_SUCCESS)
        return err2;
    if (err2 == CUDA_ERROR_NOT_INITIALIZED)
        return err;
    return err2;
}
#endif #endif
const char* cu_get_error_string(CUresult err) { const char* cu_get_error_string(CUresult err) {
...@@ -914,10 +967,12 @@ CompNode::Impl* CudaCompNode::load_cuda( ...@@ -914,10 +967,12 @@ CompNode::Impl* CudaCompNode::load_cuda(
} }
if (!available_node) { if (!available_node) {
mgb_assert(sd.nr_node < sd.MAX_NR_COMP_NODE, "too many CompNode allocated"); mgb_assert(
sd.nr_node < CompNodeImpl::MAX_NR_COMP_NODE,
"too many CompNode allocated");
available_node = &sd.node[sd.nr_node++]; available_node = &sd.node[sd.nr_node++];
} }
mgb_assert(locator.device < sd.MAX_NR_DEVICE, "device number too large"); mgb_assert(locator.device < CompNodeImpl::MAX_NR_DEVICE, "device number too large");
mgb_assert(!available_node->m_initialized); mgb_assert(!available_node->m_initialized);
available_node->init(locator, locator_logical); available_node->init(locator, locator_logical);
...@@ -1023,29 +1078,39 @@ void CudaCompNode::set_prealloc_config( ...@@ -1023,29 +1078,39 @@ void CudaCompNode::set_prealloc_config(
} }
} }
size_t CudaCompNode::get_compute_capability(int dev) { CompNode::DeviceProperties CudaCompNode::get_device_prop(int dev) {
size_t cnt = get_device_count(); int cnt = static_cast<int>(get_device_count());
if (dev < 0 || dev >= static_cast<int>(cnt)) { mgb_assert(
mgb_log_error("request gpu %d out of valid range [0, %lu)", dev, cnt); dev >= 0 && dev < cnt, "request gpu %d out of valid range [0, %d)", dev,
return 0; cnt);
}
static Spinlock mtx_com; auto&& rec = device_prop_rec[dev];
MGB_LOCK_GUARD(mtx_com); if (!rec.init) {
int pmajor; MGB_LOCK_GUARD(rec.mtx_com);
int pminor; if (!rec.init) {
auto err = call_cuda_forksafe( char pname[256] = {0};
cuDeviceGetAttribute, &pmajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, mgb_assert(
dev); call_cuda_forksafe(
if (err != CUDA_SUCCESS) { cuDeviceGetAttribute, &rec.prop.major,
return 0; CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
} dev) == CUDA_SUCCESS);
auto err2 = call_cuda_forksafe( mgb_assert(
cuDeviceGetAttribute, &pminor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, call_cuda_forksafe(
dev); cuDeviceGetAttribute, &rec.prop.minor,
if (err2 != CUDA_SUCCESS) { CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
return 0; dev) == CUDA_SUCCESS);
mgb_assert(
call_cuda_forksafe(cuDeviceGetName, pname, 255, dev) ==
CUDA_SUCCESS);
mgb_assert(
call_cuda_forksafe(cuDeviceTotalMem, &rec.prop.total_memory, dev) ==
CUDA_SUCCESS);
rec.prop.name = pname;
rec.init = true;
}
} }
return pmajor * 10 + pminor;
return rec.prop;
} }
#else #else
...@@ -1067,8 +1132,8 @@ void CudaCompNode::sync_all() {} ...@@ -1067,8 +1132,8 @@ void CudaCompNode::sync_all() {}
void CudaCompNode::set_prealloc_config( void CudaCompNode::set_prealloc_config(
size_t alignment, size_t min_req, size_t max_overhead, double growth_factor) {} size_t alignment, size_t min_req, size_t max_overhead, double growth_factor) {}
size_t CudaCompNode::get_compute_capability(int dev) { CompNode::DeviceProperties CudaCompNode::get_device_prop(int dev) {
return 0; return CompNode::DeviceProperties{};
} }
#undef err #undef err
......
...@@ -31,7 +31,7 @@ public: ...@@ -31,7 +31,7 @@ public:
static size_t get_device_count(bool warn = true); static size_t get_device_count(bool warn = true);
static Impl* load_cuda(const Locator& locator, const Locator& locator_logical); static Impl* load_cuda(const Locator& locator, const Locator& locator_logical);
static void sync_all(); static void sync_all();
static size_t get_compute_capability(int dev); static DeviceProperties get_device_prop(int dev);
static void set_prealloc_config( static void set_prealloc_config(
size_t alignment, size_t min_req, size_t max_overhead, size_t alignment, size_t min_req, size_t max_overhead,
......
...@@ -80,6 +80,20 @@ public: ...@@ -80,6 +80,20 @@ public:
static constexpr size_t NR_DEVICE_TYPE = static constexpr size_t NR_DEVICE_TYPE =
static_cast<size_t>(DeviceType::MAX_DEVICE_ID); static_cast<size_t>(DeviceType::MAX_DEVICE_ID);
//! basic properties of a computing device; numeric fields are
//! currently filled in for CUDA devices only
struct DeviceProperties {
    DeviceProperties() : name{"unspec"}, total_memory{0}, major{0}, minor{0} {}

    //! human-readable device name
    std::string name;
    //! total device memory in bytes
    size_t total_memory;
    //! for cuda: compute-capability major version
    int major;
    //! for cuda: compute-capability minor version
    int minor;
};
/*! /*!
* \brief an identifier to specify a computing node * \brief an identifier to specify a computing node
* *
...@@ -301,10 +315,11 @@ public: ...@@ -301,10 +315,11 @@ public:
MGE_WIN_DECLSPEC_FUC static void set_prealloc_config( MGE_WIN_DECLSPEC_FUC static void set_prealloc_config(
size_t alignment, size_t min_req, size_t max_overhead, double growth_factor, size_t alignment, size_t min_req, size_t max_overhead, double growth_factor,
DeviceType device_type); DeviceType device_type);
/*! /*!
* \brief get compute capability of the specified device * \brief get device property of the specified device
*/ */
MGE_WIN_DECLSPEC_FUC static size_t get_compute_capability( MGE_WIN_DECLSPEC_FUC static DeviceProperties get_device_prop(
int dev, DeviceType device_type); int dev, DeviceType device_type);
/* =================== synchronization ======================== */ /* =================== synchronization ======================== */
......
...@@ -268,5 +268,6 @@ ...@@ -268,5 +268,6 @@
#endif #endif
#define GIT_FULL_HASH "@GIT_FULL_HASH@" #define GIT_FULL_HASH "@GIT_FULL_HASH@"
#define MGE_CUDA_GENCODE "@MGE_CUDA_GENCODE@"
#endif // _HEADER_MGB_BUILD_CONFIG #endif // _HEADER_MGB_BUILD_CONFIG
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册