提交 2d6827c1 编写于 作者: M Megvii Engine Team

fix(mgb/windows): temporary workaround on cuda-windows python exit

code(127), as the Windows CUDA driver unloads before atexit functions run;
may remove this after upgrading the CUDA runtime

GitOrigin-RevId: cac37ca3ddc569e2a82185c6744da5c676042cc3
上级 517cc684
......@@ -26,6 +26,13 @@ class CompNodeSyncManager : public CompNodeDepedentObject {
ThinHashMap<Blob*, std::unique_ptr<CompNode::Event>> m_blob2event;
std::mutex m_mtx;
public:
#if MGB_CUDA && defined(WIN32)
//! FIXME: on Windows the CUDA driver shuts down before atexit functions are
//! called, even when the atexit function is registered after the CUDA driver
//! has been initialized. As a temporary workaround, resources are left for
//! the OS to reclaim; this may be removed after upgrading the CUDA runtime.
//!
//! Flag that is set to true by the atexit handler registered in inst().
static bool is_into_atexit;
#endif
std::shared_ptr<void> on_comp_node_finalize() override {
MGB_LOCK_GUARD(m_mtx);
m_blob2event.clear();
......@@ -34,6 +41,16 @@ public:
//! Return a reference to the function-local static CompNodeSyncManager.
static CompNodeSyncManager& inst() {
static CompNodeSyncManager sl_inst;
#if MGB_CUDA && defined(WIN32)
//! FIXME: on Windows the CUDA driver shuts down before atexit functions are
//! called, even when the atexit function is registered after the CUDA driver
//! has been initialized. As a temporary workaround, resources are left for
//! the OS to reclaim; this may be removed after upgrading the CUDA runtime.
//!
//! The handler only flips is_into_atexit so that other code can detect that
//! process exit has started.
//! NOTE(review): this branch runs on every inst() call until the flag is
//! set, so a new atexit handler is registered each time -- confirm whether
//! that is intended.
if (!is_into_atexit) {
auto err = atexit([] { is_into_atexit = true; });
mgb_assert(!err, "failed to register atexit function");
}
#endif
return sl_inst;
}
......@@ -52,6 +69,13 @@ public:
m_blob2event.erase(blob);
}
};
#if MGB_CUDA && defined(WIN32)
//! FIXME: on Windows the CUDA driver shuts down before atexit functions are
//! called, even when the atexit function is registered after the CUDA driver
//! has been initialized. As a temporary workaround, resources are left for
//! the OS to reclaim; this may be removed after upgrading the CUDA runtime.
//!
//! Out-of-class definition of the static flag; it starts out false.
bool CompNodeSyncManager::is_into_atexit = false;
#endif
// Cache for small blobs
// 1. A blob has to be seen twice (within a window) to be eligible for cache
......@@ -221,6 +245,15 @@ Blob::Blob(CompNode cn, size_t sz):
//! Unregister this blob from the BlobManager and, unless process exit has
//! already started on Windows/CUDA builds, drop its entry from the
//! CompNodeSyncManager.
Blob::~Blob() {
    BlobManager::inst()->unregister_blob(this);
#if MGB_CUDA && defined(WIN32)
    //! FIXME: on Windows the CUDA driver shuts down before atexit functions
    //! are called, even when the atexit function is registered after the
    //! CUDA driver has been initialized. As a temporary workaround,
    //! resources are left for the OS to reclaim; this may be removed after
    //! upgrading the CUDA runtime.
    const bool skip_sync_cleanup = CompNodeSyncManager::is_into_atexit;
#else
    const bool skip_sync_cleanup = false;
#endif
    if (!skip_sync_cleanup) {
        CompNodeSyncManager::inst().remove(this);
    }
}
......
......@@ -556,6 +556,13 @@ CompNode CompNode::load(const Locator& locator_physical,
}
void CompNode::finalize() {
#if MGB_CUDA && defined(WIN32)
//! FIXME: windows cuda driver shutdown before call atexit function even
//! register atexit function after init cuda driver! as a workround recovery
//! resource by OS temporarily, may need remove this after upgrade cuda
//! runtime
return;
#endif
comp_node_detail::DepedentObjList::invoke_callback_and_clean();
CudaCompNode::finalize();
CpuCompNode::finalize();
......
......@@ -614,6 +614,18 @@ bool CudaCompNodeImpl::check_global_finalized() {
}
return true;
}
#if MGB_CUDA && defined(WIN32)
//! FIXME: windows cuda driver shutdown before call atexit function even
//! register atexit function after init cuda driver! as a workround
//! recovery resource by OS temporarily, may need remove this after
//! upgrade cuda runtime
if (CudaCompNode::is_into_atexit) {
mgb_log_debug(
"windows cudaErrorCudartUnloading happened!!, resource "
"recovery by OS!!");
return true;
}
#endif
return false;
}
......@@ -733,11 +745,29 @@ void CudaCompNode::finalize() {
}
}
CompNode::Impl* CudaCompNode::load_cuda(
const Locator &locator, const Locator &locator_logical) {
#if MGB_CUDA && defined(WIN32)
//! FIXME: on Windows the CUDA driver shuts down before atexit functions are
//! called, even when the atexit function is registered after the CUDA driver
//! has been initialized. As a temporary workaround, resources are left for
//! the OS to reclaim; this may be removed after upgrading the CUDA runtime.
//!
//! Out-of-class definition of the static flag; it starts out false.
bool CudaCompNode::is_into_atexit = false;
#endif
CompNode::Impl* CudaCompNode::load_cuda(const Locator& locator,
const Locator& locator_logical) {
int nr_gpu = get_device_count();
#if MGB_CUDA && defined(WIN32)
//! FIXME: windows cuda driver shutdown before call atexit function even
//! register atexit function after init cuda driver! as a workround
//! recovery resource by OS temporarily, may need remove this after
//! upgrade cuda runtime
if (!is_into_atexit) {
auto err = atexit([] { is_into_atexit = true; });
mgb_assert(!err, "failed to register atexit function");
}
#endif
mgb_assert(locator.device >= 0 && locator.device < nr_gpu,
"request gpu%d out of valid range [0, %d)", locator.device, nr_gpu);
"request gpu%d out of valid range [0, %d)", locator.device,
nr_gpu);
auto &&sdptr = CudaCompNodeImpl::sd;
{
......
......@@ -36,6 +36,13 @@ namespace mgb {
static void set_prealloc_config(size_t alignment, size_t min_req,
size_t max_overhead, double growth_factor);
#if MGB_CUDA && defined(WIN32)
//! FIXME: windows cuda driver shutdown before call atexit function
//! even register atexit function after init cuda driver! as a
//! workround recovery resource by OS temporarily, may need remove
//! this after upgrade cuda runtime
static bool is_into_atexit;
#endif
};
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册