提交 1c2a323e 编写于 作者: M Megvii Engine Team

feat(mge): add warning message when mismatched cuda sm is detected

GitOrigin-RevId: f78c79eb069fb88208af8e8f00d91e2094371b90
上级 877bda41
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import atexit import atexit
import ctypes import ctypes
import re
import os import os
import platform import platform
import sys import sys
...@@ -89,6 +90,9 @@ if sys.platform == "win32": ...@@ -89,6 +90,9 @@ if sys.platform == "win32":
from .core._imperative_rt.core2 import close as _close from .core._imperative_rt.core2 import close as _close
from .core._imperative_rt.core2 import full_sync as _full_sync from .core._imperative_rt.core2 import full_sync as _full_sync
from .core._imperative_rt.core2 import sync as _sync from .core._imperative_rt.core2 import sync as _sync
from .core._imperative_rt.common import (
get_supported_sm_versions as _get_supported_sm_versions,
)
from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func
from .config import * from .config import *
from .device import * from .device import *
...@@ -99,6 +103,25 @@ from .utils import comp_graph_tools as cgtools ...@@ -99,6 +103,25 @@ from .utils import comp_graph_tools as cgtools
from .utils.persistent_cache import PersistentCacheOnServer as _PersistentCacheOnServer from .utils.persistent_cache import PersistentCacheOnServer as _PersistentCacheOnServer
from .version import __version__ from .version import __version__
logger = get_logger(__name__)

# Warn at import time when a visible GPU has a CUDA compute capability
# (SM version) that this MegEngine build was not compiled for.
ngpus = get_device_count("gpu")
# _get_supported_sm_versions() returns the build's gencode string,
# e.g. "... sm_61 sm_70 ..."; extract the bare version numbers.
supported_sm_versions = re.findall(r"sm_(\d+)", _get_supported_sm_versions())
for idx in range(ngpus):
    prop = get_cuda_device_property(idx)
    # e.g. major=7, minor=0 -> "70", matching the sm_70 naming scheme
    cur_sm = str(prop.major * 10 + prop.minor)
    if cur_sm not in supported_sm_versions:
        logger.warning(
            "{} with CUDA capability sm_{} is not compatible with the current MegEngine installation. The current MegEngine install supports CUDA {} {}. If you want to use the {} with MegEngine, please check the instructions at https://github.com/MegEngine/MegEngine/blob/master/scripts/cmake-build/BUILD_README.md".format(
                prop.name,
                cur_sm,
                "capabilities" if len(supported_sm_versions) > 1 else "capability",
                " ".join(["sm_" + v for v in supported_sm_versions]),
                prop.name,
            )
        )
_set_fork_exec_path_for_timed_func( _set_fork_exec_path_for_timed_func(
sys.executable, sys.executable,
os.path.join(os.path.dirname(__file__), "utils", "_timed_func_fork_exec_entry.py"), os.path.join(os.path.dirname(__file__), "utils", "_timed_func_fork_exec_entry.py"),
......
...@@ -11,9 +11,7 @@ import re ...@@ -11,9 +11,7 @@ import re
from typing import Optional from typing import Optional
from .core._imperative_rt.common import CompNode, DeviceType from .core._imperative_rt.common import CompNode, DeviceType
from .core._imperative_rt.common import ( from .core._imperative_rt.common import get_device_prop as _get_device_prop
get_cuda_compute_capability as _get_cuda_compute_capability,
)
from .core._imperative_rt.common import set_prealloc_config as _set_prealloc_config from .core._imperative_rt.common import set_prealloc_config as _set_prealloc_config
from .core._imperative_rt.common import what_is_xpu as _what_is_xpu from .core._imperative_rt.common import what_is_xpu as _what_is_xpu
from .core._imperative_rt.utils import _try_coalesce_all_free_memory from .core._imperative_rt.utils import _try_coalesce_all_free_memory
...@@ -25,6 +23,7 @@ __all__ = [ ...@@ -25,6 +23,7 @@ __all__ = [
"set_default_device", "set_default_device",
"get_mem_status_bytes", "get_mem_status_bytes",
"get_cuda_compute_capability", "get_cuda_compute_capability",
"get_cuda_device_property",
"get_allocated_memory", "get_allocated_memory",
"get_reserved_memory", "get_reserved_memory",
"get_max_reserved_memory", "get_max_reserved_memory",
...@@ -161,7 +160,12 @@ def get_cuda_compute_capability(device: int, device_type=DeviceType.CUDA) -> int ...@@ -161,7 +160,12 @@ def get_cuda_compute_capability(device: int, device_type=DeviceType.CUDA) -> int
Returns: Returns:
a version number, or `SM version`. a version number, or `SM version`.
""" """
return _get_cuda_compute_capability(device, device_type) prop = _get_device_prop(device, device_type)
return prop.major * 10 + prop.minor
def get_cuda_device_property(device: int, device_type=DeviceType.CUDA):
    """Return the raw device properties of the specified device.

    The returned object exposes ``name``, ``total_memory``, ``major`` and
    ``minor`` read-only attributes.

    Args:
        device: device index.
        device_type: device type; CUDA by default.
    """
    return _get_device_prop(device, device_type)
def get_allocated_memory(device: Optional[str] = None): def get_allocated_memory(device: Optional[str] = None):
......
...@@ -123,6 +123,23 @@ void init_common(py::module m) { ...@@ -123,6 +123,23 @@ void init_common(py::module m) {
py::implicitly_convertible<std::string, CompNode>(); py::implicitly_convertible<std::string, CompNode>();
// Expose CompNode::DeviceProperties to Python with read-only
// name / total_memory / major / minor attributes.
// Lambdas take the struct by const reference: the original by-value
// capture copied the whole struct (including the std::string name)
// on every attribute access.
py::class_<CompNode::DeviceProperties>(m, "DeviceProperties")
        .def(py::init())
        .def_property_readonly(
                "name",
                [](const CompNode::DeviceProperties& prop) { return prop.name; })
        .def_property_readonly(
                "total_memory",
                [](const CompNode::DeviceProperties& prop) {
                    return prop.total_memory;
                })
        .def_property_readonly(
                "major",
                [](const CompNode::DeviceProperties& prop) { return prop.major; })
        .def_property_readonly(
                "minor",
                [](const CompNode::DeviceProperties& prop) { return prop.minor; });
def_TensorND<DeviceTensorND>(m, "DeviceTensorND") def_TensorND<DeviceTensorND>(m, "DeviceTensorND")
.def("numpy", [](const DeviceTensorND& self) { .def("numpy", [](const DeviceTensorND& self) {
HostTensorND hv; HostTensorND hv;
...@@ -223,7 +240,12 @@ void init_common(py::module m) { ...@@ -223,7 +240,12 @@ void init_common(py::module m) {
m.def("set_prealloc_config", &CompNode::set_prealloc_config, m.def("set_prealloc_config", &CompNode::set_prealloc_config,
"specifies how to pre-allocate from raw dev allocator"); "specifies how to pre-allocate from raw dev allocator");
m.def("get_cuda_compute_capability", &CompNode::get_compute_capability); m.def("get_device_prop", &CompNode::get_device_prop);
m.def("get_supported_sm_versions", []() {
static const char* mge_gen_code = MGE_CUDA_GENCODE;
return mge_gen_code;
});
m.def("what_is_xpu", m.def("what_is_xpu",
[] { return CompNode::Locator::parse("xpux").to_physical().type; }); [] { return CompNode::Locator::parse("xpux").to_physical().type; });
......
...@@ -431,13 +431,12 @@ void CompNode::set_prealloc_config( ...@@ -431,13 +431,12 @@ void CompNode::set_prealloc_config(
}; };
} }
size_t CompNode::get_compute_capability(int dev, DeviceType device_type) { CompNode::DeviceProperties CompNode::get_device_prop(int dev, DeviceType device_type) {
switch (device_type) { switch (device_type) {
case DeviceType::CUDA: case DeviceType::CUDA:
return CudaCompNode::get_compute_capability(dev); return CudaCompNode::get_device_prop(dev);
default: default:
mgb_log_warn("unsupport device type for get_compute_capability"); mgb_throw(MegBrainError, "unsupport device type for get_device_prop");
return 0;
}; };
} }
......
...@@ -192,11 +192,11 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl { ...@@ -192,11 +192,11 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl {
//! return whether global finalized, and print warning in such case //! return whether global finalized, and print warning in such case
static inline bool check_global_finalized(); static inline bool check_global_finalized();
static CompNode::DeviceProperties get_device_prop(int dev);
//! enable peer copy from dev0 to dev1 //! enable peer copy from dev0 to dev1
static void enable_peer_access(int dev0, int dev1); static void enable_peer_access(int dev0, int dev1);
static size_t get_compute_capability(int dev);
static void static_free_device(ImplBase* self, void* ptr) { static void static_free_device(ImplBase* self, void* ptr) {
static_cast<CompNodeImpl*>(self)->free_device(ptr); static_cast<CompNodeImpl*>(self)->free_device(ptr);
} }
...@@ -208,6 +208,8 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl { ...@@ -208,6 +208,8 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl {
public: public:
CompNodeImpl() : Impl(static_free_device, static_free_host) {} CompNodeImpl() : Impl(static_free_device, static_free_host) {}
static constexpr int MAX_NR_COMP_NODE = 1024, MAX_NR_DEVICE = 64;
void* alloc_device(size_t size) override; void* alloc_device(size_t size) override;
void free_device(void* ptr); void free_device(void* ptr);
...@@ -332,8 +334,6 @@ struct CudaCompNodeImpl::DeviceInfo { ...@@ -332,8 +334,6 @@ struct CudaCompNodeImpl::DeviceInfo {
}; };
struct CudaCompNodeImpl::StaticData { struct CudaCompNodeImpl::StaticData {
static constexpr int MAX_NR_COMP_NODE = 1024, MAX_NR_DEVICE = 64;
std::recursive_mutex mtx; std::recursive_mutex mtx;
mem_alloc::DevMemAlloc::PreAllocConfig prealloc_config; mem_alloc::DevMemAlloc::PreAllocConfig prealloc_config;
...@@ -376,6 +376,13 @@ struct CudaCompNodeImpl::StaticData { ...@@ -376,6 +376,13 @@ struct CudaCompNodeImpl::StaticData {
CudaCompNodeImpl::StaticData* CudaCompNodeImpl::sd = nullptr; CudaCompNodeImpl::StaticData* CudaCompNodeImpl::sd = nullptr;
Spinlock CudaCompNodeImpl::sd_mtx; Spinlock CudaCompNodeImpl::sd_mtx;
//! Per-device cache slot for DeviceProperties, filled lazily the first
//! time a device is queried.
struct DevicePropRec {
    //! set to true once `prop` has been populated.
    //! NOTE(review): read outside `mtx_com` in a double-checked pattern
    //! by the consumer; it is not atomic — confirm this is acceptable
    //! on the targeted platforms.
    bool init = false;
    CompNode::DeviceProperties prop;
    //! guards the one-time initialization of `prop`
    Spinlock mtx_com;
};
//! one cache slot per possible CUDA device index
DevicePropRec device_prop_rec[CudaCompNodeImpl::MAX_NR_DEVICE];
void CudaCompNodeImpl::init(const Locator& locator, const Locator& locator_logical) { void CudaCompNodeImpl::init(const Locator& locator, const Locator& locator_logical) {
m_locator = locator; m_locator = locator;
m_locator_logical = locator_logical; m_locator_logical = locator_logical;
...@@ -564,7 +571,7 @@ void CudaCompNodeImpl::sync() { ...@@ -564,7 +571,7 @@ void CudaCompNodeImpl::sync() {
} }
void CudaCompNodeImpl::enable_peer_access(int dev0, int dev1) { void CudaCompNodeImpl::enable_peer_access(int dev0, int dev1) {
static bool already_enabled[StaticData::MAX_NR_DEVICE][StaticData::MAX_NR_DEVICE]; static bool already_enabled[MAX_NR_DEVICE][MAX_NR_DEVICE];
if (already_enabled[dev0][dev1]) if (already_enabled[dev0][dev1])
return; return;
...@@ -817,6 +824,52 @@ CUresult call_cuda_forksafe(Func func, Val* val, Args... args) { ...@@ -817,6 +824,52 @@ CUresult call_cuda_forksafe(Func func, Val* val, Args... args) {
return err; return err;
return err2; return err2;
} }
//! Overload of call_cuda_forksafe for driver APIs that fill a
//! caller-provided char buffer of `len` elements (e.g. cuDeviceGetName).
//! Calls `func` directly; if the driver reports CUDA_ERROR_NOT_INITIALIZED,
//! retries inside a forked child process (so the parent itself never calls
//! cuInit), piping the resulting error code — and the buffer contents on
//! success — back to the parent.
template <typename Func, typename... Args>
CUresult call_cuda_forksafe(Func func, char* val, int len, Args... args) {
    auto err = func(val, len, args...);
    if (err != CUDA_ERROR_NOT_INITIALIZED)
        return err;
    // cuInit not called, call it in child process
    int fd[2];
    mgb_assert(pipe(fd) == 0, "pipe() failed");
    int fdr = fd[0], fdw = fd[1];
    RAIICloseFD fdr_guard(fdr);
    RAIICloseFD fdw_guard(fdw);
    auto cpid = fork();
    mgb_assert(cpid != -1, "fork() failed");
    if (cpid == 0) {
        // child: init the driver, run the call, then report back via the pipe
        fdr_guard.close();
        do {
            err = cuInit(0);
            if (err != CUDA_SUCCESS)
                break;
            err = func(val, len, args...);
        } while (0);
        // always send the error code; send the buffer only on success
        auto sz = write(fdw, &err, sizeof(err));
        if (sz == sizeof(err) && err == CUDA_SUCCESS) {
            sz = write(fdw, val, sizeof(*val) * len);
        }
        fdw_guard.close();
        // quick_exit: skip atexit handlers inherited from the parent
        std::quick_exit(0);
    }
    // parent: read the child's error code first
    fdw_guard.close();
    auto sz = read(fdr, &err, sizeof(err));
    mgb_assert(sz == sizeof(err), "failed to read error code from child");
    if (err == CUDA_SUCCESS) {
        sz = read(fdr, val, sizeof(*val) * len);
        mgb_assert(
                static_cast<size_t>(sz) == sizeof(*val) * static_cast<size_t>(len),
                "failed to read value from child");
        return err;
    }
    // try again, maybe another thread called cuInit while we fork
    auto err2 = func(val, len, args...);
    if (err2 == CUDA_SUCCESS)
        return err2;
    if (err2 == CUDA_ERROR_NOT_INITIALIZED)
        return err;
    return err2;
}
#endif #endif
const char* cu_get_error_string(CUresult err) { const char* cu_get_error_string(CUresult err) {
...@@ -914,10 +967,12 @@ CompNode::Impl* CudaCompNode::load_cuda( ...@@ -914,10 +967,12 @@ CompNode::Impl* CudaCompNode::load_cuda(
} }
if (!available_node) { if (!available_node) {
mgb_assert(sd.nr_node < sd.MAX_NR_COMP_NODE, "too many CompNode allocated"); mgb_assert(
sd.nr_node < CompNodeImpl::MAX_NR_COMP_NODE,
"too many CompNode allocated");
available_node = &sd.node[sd.nr_node++]; available_node = &sd.node[sd.nr_node++];
} }
mgb_assert(locator.device < sd.MAX_NR_DEVICE, "device number too large"); mgb_assert(locator.device < CompNodeImpl::MAX_NR_DEVICE, "device number too large");
mgb_assert(!available_node->m_initialized); mgb_assert(!available_node->m_initialized);
available_node->init(locator, locator_logical); available_node->init(locator, locator_logical);
...@@ -1023,29 +1078,39 @@ void CudaCompNode::set_prealloc_config( ...@@ -1023,29 +1078,39 @@ void CudaCompNode::set_prealloc_config(
} }
} }
size_t CudaCompNode::get_compute_capability(int dev) { CompNode::DeviceProperties CudaCompNode::get_device_prop(int dev) {
size_t cnt = get_device_count(); int cnt = static_cast<int>(get_device_count());
if (dev < 0 || dev >= static_cast<int>(cnt)) { mgb_assert(
mgb_log_error("request gpu %d out of valid range [0, %lu)", dev, cnt); dev >= 0 && dev < cnt, "request gpu %d out of valid range [0, %d)", dev,
return 0; cnt);
}
static Spinlock mtx_com; auto&& rec = device_prop_rec[dev];
MGB_LOCK_GUARD(mtx_com); if (!rec.init) {
int pmajor; MGB_LOCK_GUARD(rec.mtx_com);
int pminor; if (!rec.init) {
auto err = call_cuda_forksafe( char pname[256] = {0};
cuDeviceGetAttribute, &pmajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, mgb_assert(
dev); call_cuda_forksafe(
if (err != CUDA_SUCCESS) { cuDeviceGetAttribute, &rec.prop.major,
return 0; CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
} dev) == CUDA_SUCCESS);
auto err2 = call_cuda_forksafe( mgb_assert(
cuDeviceGetAttribute, &pminor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, call_cuda_forksafe(
dev); cuDeviceGetAttribute, &rec.prop.minor,
if (err2 != CUDA_SUCCESS) { CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
return 0; dev) == CUDA_SUCCESS);
mgb_assert(
call_cuda_forksafe(cuDeviceGetName, pname, 255, dev) ==
CUDA_SUCCESS);
mgb_assert(
call_cuda_forksafe(cuDeviceTotalMem, &rec.prop.total_memory, dev) ==
CUDA_SUCCESS);
rec.prop.name = pname;
rec.init = true;
}
} }
return pmajor * 10 + pminor;
return rec.prop;
} }
#else #else
...@@ -1067,8 +1132,8 @@ void CudaCompNode::sync_all() {} ...@@ -1067,8 +1132,8 @@ void CudaCompNode::sync_all() {}
void CudaCompNode::set_prealloc_config( void CudaCompNode::set_prealloc_config(
size_t alignment, size_t min_req, size_t max_overhead, double growth_factor) {} size_t alignment, size_t min_req, size_t max_overhead, double growth_factor) {}
size_t CudaCompNode::get_compute_capability(int dev) { CompNode::DeviceProperties CudaCompNode::get_device_prop(int dev) {
return 0; return CompNode::DeviceProperties{};
} }
#undef err #undef err
......
...@@ -31,7 +31,7 @@ public: ...@@ -31,7 +31,7 @@ public:
static size_t get_device_count(bool warn = true); static size_t get_device_count(bool warn = true);
static Impl* load_cuda(const Locator& locator, const Locator& locator_logical); static Impl* load_cuda(const Locator& locator, const Locator& locator_logical);
static void sync_all(); static void sync_all();
static size_t get_compute_capability(int dev); static DeviceProperties get_device_prop(int dev);
static void set_prealloc_config( static void set_prealloc_config(
size_t alignment, size_t min_req, size_t max_overhead, size_t alignment, size_t min_req, size_t max_overhead,
......
...@@ -80,6 +80,20 @@ public: ...@@ -80,6 +80,20 @@ public:
static constexpr size_t NR_DEVICE_TYPE = static constexpr size_t NR_DEVICE_TYPE =
static_cast<size_t>(DeviceType::MAX_DEVICE_ID); static_cast<size_t>(DeviceType::MAX_DEVICE_ID);
//! basic properties of a computing device; numeric fields are
//! currently filled in for CUDA devices only
struct DeviceProperties {
    DeviceProperties() : name{"unspec"}, total_memory{0}, major{0}, minor{0} {}

    //! human-readable device name
    std::string name;
    //! total device memory in bytes
    size_t total_memory;
    //! for cuda: compute-capability major version
    int major;
    //! for cuda: compute-capability minor version
    int minor;
};
/*! /*!
* \brief an identifier to specify a computing node * \brief an identifier to specify a computing node
* *
...@@ -301,10 +315,11 @@ public: ...@@ -301,10 +315,11 @@ public:
MGE_WIN_DECLSPEC_FUC static void set_prealloc_config( MGE_WIN_DECLSPEC_FUC static void set_prealloc_config(
size_t alignment, size_t min_req, size_t max_overhead, double growth_factor, size_t alignment, size_t min_req, size_t max_overhead, double growth_factor,
DeviceType device_type); DeviceType device_type);
/*! /*!
* \brief get compute capability of the specified device * \brief get device property of the specified device
*/ */
MGE_WIN_DECLSPEC_FUC static size_t get_compute_capability( MGE_WIN_DECLSPEC_FUC static DeviceProperties get_device_prop(
int dev, DeviceType device_type); int dev, DeviceType device_type);
/* =================== synchronization ======================== */ /* =================== synchronization ======================== */
......
...@@ -268,5 +268,6 @@ ...@@ -268,5 +268,6 @@
#endif #endif
#define GIT_FULL_HASH "@GIT_FULL_HASH@" #define GIT_FULL_HASH "@GIT_FULL_HASH@"
#define MGE_CUDA_GENCODE "@MGE_CUDA_GENCODE@"
#endif // _HEADER_MGB_BUILD_CONFIG #endif // _HEADER_MGB_BUILD_CONFIG
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册