提交 1c2a323e 编写于 作者: M Megvii Engine Team

feat(mge): add warning message when mismatched cuda sm is detected

GitOrigin-RevId: f78c79eb069fb88208af8e8f00d91e2094371b90
上级 877bda41
......@@ -8,6 +8,7 @@
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import atexit
import ctypes
import re
import os
import platform
import sys
......@@ -89,6 +90,9 @@ if sys.platform == "win32":
from .core._imperative_rt.core2 import close as _close
from .core._imperative_rt.core2 import full_sync as _full_sync
from .core._imperative_rt.core2 import sync as _sync
from .core._imperative_rt.common import (
get_supported_sm_versions as _get_supported_sm_versions,
)
from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func
from .config import *
from .device import *
......@@ -99,6 +103,25 @@ from .utils import comp_graph_tools as cgtools
from .utils.persistent_cache import PersistentCacheOnServer as _PersistentCacheOnServer
from .version import __version__
logger = get_logger(__name__)
# Warn once per visible GPU, at import time, when its compute capability is
# not among the sm_XX targets reported by the native extension.
ngpus = get_device_count("gpu")
# _get_supported_sm_versions() returns a string containing "sm_<digits>"
# entries; keep only the digit parts, e.g. ["61", "70"].
supported_sm_versions = re.findall(r"sm_(\d+)", _get_supported_sm_versions())
for idx in range(ngpus):
    prop = get_cuda_device_property(idx)
    # Encode the capability the same way get_cuda_compute_capability does
    # (major * 10 + minor), as a string so it compares with the regex matches.
    cur_sm = str(prop.major * 10 + prop.minor)
    if cur_sm not in supported_sm_versions:
        logger.warning(
            (
                "{} with CUDA capability sm_{} is not compatible with the current MegEngine installation. "
                "The current MegEngine install supports CUDA {} {}. "
                "If you want to use the {} with MegEngine, please check the instructions at "
                "https://github.com/MegEngine/MegEngine/blob/master/scripts/cmake-build/BUILD_README.md"
            ).format(
                prop.name,
                cur_sm,
                "capabilities" if len(supported_sm_versions) > 1 else "capability",
                " ".join(["sm_" + v for v in supported_sm_versions]),
                prop.name,
            )
        )
_set_fork_exec_path_for_timed_func(
sys.executable,
os.path.join(os.path.dirname(__file__), "utils", "_timed_func_fork_exec_entry.py"),
......
......@@ -11,9 +11,7 @@ import re
from typing import Optional
from .core._imperative_rt.common import CompNode, DeviceType
from .core._imperative_rt.common import (
get_cuda_compute_capability as _get_cuda_compute_capability,
)
from .core._imperative_rt.common import get_device_prop as _get_device_prop
from .core._imperative_rt.common import set_prealloc_config as _set_prealloc_config
from .core._imperative_rt.common import what_is_xpu as _what_is_xpu
from .core._imperative_rt.utils import _try_coalesce_all_free_memory
......@@ -25,6 +23,7 @@ __all__ = [
"set_default_device",
"get_mem_status_bytes",
"get_cuda_compute_capability",
"get_cuda_device_property",
"get_allocated_memory",
"get_reserved_memory",
"get_max_reserved_memory",
......@@ -161,7 +160,12 @@ def get_cuda_compute_capability(device: int, device_type=DeviceType.CUDA) -> int
Returns:
a version number, or `SM version`.
"""
return _get_cuda_compute_capability(device, device_type)
prop = _get_device_prop(device, device_type)
return prop.major * 10 + prop.minor
def get_cuda_device_property(device: int, device_type=DeviceType.CUDA):
    r"""Gets the properties of the specified device.

    Args:
        device: device number. The native implementation asserts that it lies
            in ``[0, device_count)``.
        device_type: device type to query. Default: ``DeviceType.CUDA``. The
            native implementation raises an error for any other device type.

    Returns:
        a ``DeviceProperties`` object with the read-only attributes ``name``,
        ``total_memory``, ``major`` and ``minor``.
    """
    return _get_device_prop(device, device_type)
def get_allocated_memory(device: Optional[str] = None):
......
......@@ -123,6 +123,23 @@ void init_common(py::module m) {
py::implicitly_convertible<std::string, CompNode>();
// Expose CompNode::DeviceProperties to Python as a record with read-only
// attributes. The getters take the object by const reference so that reading
// one attribute does not copy the whole struct (including its std::string).
py::class_<CompNode::DeviceProperties>(m, "DeviceProperties")
        .def(py::init())
        .def_property_readonly(
                "name",
                [](const CompNode::DeviceProperties& prop) { return prop.name; })
        .def_property_readonly(
                "total_memory",
                [](const CompNode::DeviceProperties& prop) {
                    return prop.total_memory;
                })
        .def_property_readonly(
                "major",
                [](const CompNode::DeviceProperties& prop) { return prop.major; })
        .def_property_readonly(
                "minor",
                [](const CompNode::DeviceProperties& prop) { return prop.minor; });
def_TensorND<DeviceTensorND>(m, "DeviceTensorND")
.def("numpy", [](const DeviceTensorND& self) {
HostTensorND hv;
......@@ -223,7 +240,12 @@ void init_common(py::module m) {
m.def("set_prealloc_config", &CompNode::set_prealloc_config,
"specifies how to pre-allocate from raw dev allocator");
m.def("get_cuda_compute_capability", &CompNode::get_compute_capability);
m.def("get_device_prop", &CompNode::get_device_prop);
// Return the MGE_CUDA_GENCODE string literal from the build config header;
// the Python side extracts the "sm_XX" entries from it with a regex.
m.def("get_supported_sm_versions",
      []() -> const char* { return MGE_CUDA_GENCODE; });
m.def("what_is_xpu",
[] { return CompNode::Locator::parse("xpux").to_physical().type; });
......
......@@ -431,13 +431,12 @@ void CompNode::set_prealloc_config(
};
}
size_t CompNode::get_compute_capability(int dev, DeviceType device_type) {
CompNode::DeviceProperties CompNode::get_device_prop(int dev, DeviceType device_type) {
switch (device_type) {
case DeviceType::CUDA:
return CudaCompNode::get_compute_capability(dev);
return CudaCompNode::get_device_prop(dev);
default:
mgb_log_warn("unsupport device type for get_compute_capability");
return 0;
mgb_throw(MegBrainError, "unsupport device type for get_device_prop");
};
}
......
......@@ -192,11 +192,11 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl {
//! return whether global finalized, and print warning in such case
static inline bool check_global_finalized();
static CompNode::DeviceProperties get_device_prop(int dev);
//! enable peer copy from dev0 to dev1
static void enable_peer_access(int dev0, int dev1);
static size_t get_compute_capability(int dev);
static void static_free_device(ImplBase* self, void* ptr) {
static_cast<CompNodeImpl*>(self)->free_device(ptr);
}
......@@ -208,6 +208,8 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl {
public:
CompNodeImpl() : Impl(static_free_device, static_free_host) {}
static constexpr int MAX_NR_COMP_NODE = 1024, MAX_NR_DEVICE = 64;
void* alloc_device(size_t size) override;
void free_device(void* ptr);
......@@ -332,8 +334,6 @@ struct CudaCompNodeImpl::DeviceInfo {
};
struct CudaCompNodeImpl::StaticData {
static constexpr int MAX_NR_COMP_NODE = 1024, MAX_NR_DEVICE = 64;
std::recursive_mutex mtx;
mem_alloc::DevMemAlloc::PreAllocConfig prealloc_config;
......@@ -376,6 +376,13 @@ struct CudaCompNodeImpl::StaticData {
CudaCompNodeImpl::StaticData* CudaCompNodeImpl::sd = nullptr;
Spinlock CudaCompNodeImpl::sd_mtx;
// File-local cache of per-device properties, indexed by device number.
// Kept in an anonymous namespace so the helper type and the mutable global
// get internal linkage and cannot clash with same-named symbols elsewhere.
namespace {
struct DevicePropRec {
    //! set to true once `prop` has been filled in
    //! NOTE(review): get_device_prop reads this flag once before taking
    //! mtx_com; a plain bool gives no cross-thread ordering guarantee for
    //! that unlocked read -- consider std::atomic<bool>.
    bool init = false;
    //! cached properties; holds the default-constructed value until init
    CompNode::DeviceProperties prop;
    //! guards the first-time initialization of `prop`
    Spinlock mtx_com;
};
DevicePropRec device_prop_rec[CudaCompNodeImpl::MAX_NR_DEVICE];
}  // anonymous namespace
void CudaCompNodeImpl::init(const Locator& locator, const Locator& locator_logical) {
m_locator = locator;
m_locator_logical = locator_logical;
......@@ -564,7 +571,7 @@ void CudaCompNodeImpl::sync() {
}
void CudaCompNodeImpl::enable_peer_access(int dev0, int dev1) {
static bool already_enabled[StaticData::MAX_NR_DEVICE][StaticData::MAX_NR_DEVICE];
static bool already_enabled[MAX_NR_DEVICE][MAX_NR_DEVICE];
if (already_enabled[dev0][dev1])
return;
......@@ -817,6 +824,52 @@ CUresult call_cuda_forksafe(Func func, Val* val, Args... args) {
return err;
return err2;
}
/*!
 * \brief call a CUDA driver function that fills a character buffer, without
 *      calling cuInit in the current process
 *
 * \p func is invoked as func(val, len, args...). If it returns anything
 * other than CUDA_ERROR_NOT_INITIALIZED, that result is returned directly.
 * Otherwise a child process is forked which calls cuInit(0) followed by
 * \p func, and sends the error code (and, on success, the buffer contents)
 * back to the parent through a pipe.
 *
 * \param func callable with signature compatible with (char*, int, Args...)
 *      returning CUresult
 * \param val output buffer; the parent reads exactly \p len bytes into it
 *      when the child succeeds
 * \param len number of chars transferred through the pipe
 * \param args extra arguments forwarded to \p func
 * \return the CUresult of the first attempt, of the child, or of the final
 *      retry in the parent, as described in the body comments
 *
 * NOTE(review): no wait()/waitpid() call on the child is visible in this
 * function -- confirm the child is reaped elsewhere.
 */
template <typename Func, typename... Args>
CUresult call_cuda_forksafe(Func func, char* val, int len, Args... args) {
// fast path: return whatever the driver says unless it is uninitialized
auto err = func(val, len, args...);
if (err != CUDA_ERROR_NOT_INITIALIZED)
return err;
// cuInit not called, call it in child process
int fd[2];
mgb_assert(pipe(fd) == 0, "pipe() failed");
int fdr = fd[0], fdw = fd[1];
// presumably these guards close the descriptors on scope exit; explicit
// close() calls below release each end early -- verify against RAIICloseFD
RAIICloseFD fdr_guard(fdr);
RAIICloseFD fdw_guard(fdw);
auto cpid = fork();
mgb_assert(cpid != -1, "fork() failed");
if (cpid == 0) {
// child: only writes, so the read end is closed first
fdr_guard.close();
// single-pass do/while used so that `break` skips func when cuInit fails
do {
err = cuInit(0);
if (err != CUDA_SUCCESS)
break;
err = func(val, len, args...);
} while (0);
// protocol: error code first, then the buffer only when the call succeeded
auto sz = write(fdw, &err, sizeof(err));
if (sz == sizeof(err) && err == CUDA_SUCCESS) {
sz = write(fdw, val, sizeof(*val) * len);
}
fdw_guard.close();
// leave the child via quick_exit instead of returning into the caller
std::quick_exit(0);
}
// parent: only reads, so the write end is closed before reading
fdw_guard.close();
auto sz = read(fdr, &err, sizeof(err));
mgb_assert(sz == sizeof(err), "failed to read error code from child");
if (err == CUDA_SUCCESS) {
// the child sends the buffer only on success; require all len bytes
sz = read(fdr, val, sizeof(*val) * len);
mgb_assert(
static_cast<size_t>(sz) == sizeof(*val) * static_cast<size_t>(len),
"failed to read value from child");
return err;
}
// try again, maybe another thread called cuInit while we fork
auto err2 = func(val, len, args...);
if (err2 == CUDA_SUCCESS)
return err2;
// still uninitialized here: report the child's error rather than
// NOT_INITIALIZED
if (err2 == CUDA_ERROR_NOT_INITIALIZED)
return err;
return err2;
}
#endif
const char* cu_get_error_string(CUresult err) {
......@@ -914,10 +967,12 @@ CompNode::Impl* CudaCompNode::load_cuda(
}
if (!available_node) {
mgb_assert(sd.nr_node < sd.MAX_NR_COMP_NODE, "too many CompNode allocated");
mgb_assert(
sd.nr_node < CompNodeImpl::MAX_NR_COMP_NODE,
"too many CompNode allocated");
available_node = &sd.node[sd.nr_node++];
}
mgb_assert(locator.device < sd.MAX_NR_DEVICE, "device number too large");
mgb_assert(locator.device < CompNodeImpl::MAX_NR_DEVICE, "device number too large");
mgb_assert(!available_node->m_initialized);
available_node->init(locator, locator_logical);
......@@ -1023,29 +1078,39 @@ void CudaCompNode::set_prealloc_config(
}
}
size_t CudaCompNode::get_compute_capability(int dev) {
size_t cnt = get_device_count();
if (dev < 0 || dev >= static_cast<int>(cnt)) {
mgb_log_error("request gpu %d out of valid range [0, %lu)", dev, cnt);
return 0;
}
static Spinlock mtx_com;
MGB_LOCK_GUARD(mtx_com);
int pmajor;
int pminor;
auto err = call_cuda_forksafe(
cuDeviceGetAttribute, &pmajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
dev);
if (err != CUDA_SUCCESS) {
return 0;
CompNode::DeviceProperties CudaCompNode::get_device_prop(int dev) {
int cnt = static_cast<int>(get_device_count());
mgb_assert(
dev >= 0 && dev < cnt, "request gpu %d out of valid range [0, %d)", dev,
cnt);
auto&& rec = device_prop_rec[dev];
if (!rec.init) {
MGB_LOCK_GUARD(rec.mtx_com);
if (!rec.init) {
char pname[256] = {0};
mgb_assert(
call_cuda_forksafe(
cuDeviceGetAttribute, &rec.prop.major,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
dev) == CUDA_SUCCESS);
mgb_assert(
call_cuda_forksafe(
cuDeviceGetAttribute, &rec.prop.minor,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
dev) == CUDA_SUCCESS);
mgb_assert(
call_cuda_forksafe(cuDeviceGetName, pname, 255, dev) ==
CUDA_SUCCESS);
mgb_assert(
call_cuda_forksafe(cuDeviceTotalMem, &rec.prop.total_memory, dev) ==
CUDA_SUCCESS);
rec.prop.name = pname;
rec.init = true;
}
auto err2 = call_cuda_forksafe(
cuDeviceGetAttribute, &pminor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
dev);
if (err2 != CUDA_SUCCESS) {
return 0;
}
return pmajor * 10 + pminor;
return rec.prop;
}
#else
......@@ -1067,8 +1132,8 @@ void CudaCompNode::sync_all() {}
void CudaCompNode::set_prealloc_config(
size_t alignment, size_t min_req, size_t max_overhead, double growth_factor) {}
size_t CudaCompNode::get_compute_capability(int dev) {
return 0;
CompNode::DeviceProperties CudaCompNode::get_device_prop(int dev) {
return CompNode::DeviceProperties{};
}
#undef err
......
......@@ -31,7 +31,7 @@ public:
static size_t get_device_count(bool warn = true);
static Impl* load_cuda(const Locator& locator, const Locator& locator_logical);
static void sync_all();
static size_t get_compute_capability(int dev);
static DeviceProperties get_device_prop(int dev);
static void set_prealloc_config(
size_t alignment, size_t min_req, size_t max_overhead,
......
......@@ -80,6 +80,20 @@ public:
static constexpr size_t NR_DEVICE_TYPE =
static_cast<size_t>(DeviceType::MAX_DEVICE_ID);
/*!
 * \brief static properties of a device
 *
 * A default-constructed object is named "unspec" and has all numeric fields
 * set to zero.
 */
struct DeviceProperties {
    // Brace initializers are used on purpose: `major(0)` / `minor(0)` could be
    // rewritten by function-like macros of the same name on some platforms.
    DeviceProperties() : name{"unspec"}, total_memory{0}, major{0}, minor{0} {}

    //! device name
    std::string name;
    //! total memory of the device
    size_t total_memory;

    //! for cuda: major part of the compute capability
    int major;
    //! for cuda: minor part of the compute capability
    int minor;
};
/*!
* \brief an identifier to specify a computing node
*
......@@ -301,10 +315,11 @@ public:
MGE_WIN_DECLSPEC_FUC static void set_prealloc_config(
size_t alignment, size_t min_req, size_t max_overhead, double growth_factor,
DeviceType device_type);
/*!
* \brief get compute capability of the specified device
* \brief get device property of the specified device
*/
MGE_WIN_DECLSPEC_FUC static size_t get_compute_capability(
MGE_WIN_DECLSPEC_FUC static DeviceProperties get_device_prop(
int dev, DeviceType device_type);
/* =================== synchronization ======================== */
......
......@@ -268,5 +268,6 @@
#endif
#define GIT_FULL_HASH "@GIT_FULL_HASH@"
#define MGE_CUDA_GENCODE "@MGE_CUDA_GENCODE@"
#endif // _HEADER_MGB_BUILD_CONFIG
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册