提交 a0e53118 编写于 作者: M Megvii Engine Team

fix(src/comp_node): fix calling cuda driver api

GitOrigin-RevId: cc33af2ac4a534cab1b11e1afbe31187330314e5
上级 ccea0e23
......@@ -761,10 +761,10 @@ void CudaCompNode::EventImpl::do_device_wait_by(Impl* cn_impl) {
namespace {
#ifndef __unix__
// Variant compiled when __unix__ is not defined: calls cuInit(0) in the current
// process and then invokes `func` directly, returning its CUresult.
// The return value of cuInit(0) is not checked here.
//
// NOTE(review): this span looks like a diff whose +/- markers were stripped, so
// two versions of the function are interleaved. The variadic header together
// with `return func(args...)` is presumably the removed version; the
// (func, val, len) header together with `return func()` matches the nullary
// lambda passed by MGB_CALL_CUDA_FORKSAFE_NOASSERT. Confirm against the
// repository before treating either as current.
template <typename Func, typename... Args>
CUresult call_cuda_forksafe(Func func, Args... args) {
template <typename Func, typename Val>
CUresult call_cuda_forksafe(Func func, Val* val, size_t len) {
cuInit(0);
return func(args...);
return func();
}
#else
struct RAIICloseFD : NonCopyableObj {
......@@ -780,11 +780,13 @@ struct RAIICloseFD : NonCopyableObj {
}
};
// an implementation that does not call cuInit
template <typename Func, typename Val, typename... Args>
CUresult call_cuda_forksafe(Func func, Val* val, Args... args) {
auto err = func(val, args...);
template <typename Func, typename Val>
CUresult call_cuda_forksafe(Func func, Val* val, size_t len) {
int t_ndev;
// use cuDeviceGetCount to detect cuda initialization to avoid abnormal behavior
auto err = cuDeviceGetCount(&t_ndev);
if (err != CUDA_ERROR_NOT_INITIALIZED)
return err;
return func();
// cuInit not called, call it in child process
int fd[2];
mgb_assert(pipe(fd) == 0, "pipe() failed");
......@@ -799,51 +801,7 @@ CUresult call_cuda_forksafe(Func func, Val* val, Args... args) {
err = cuInit(0);
if (err != CUDA_SUCCESS)
break;
err = func(val, args...);
} while (0);
auto sz = write(fdw, &err, sizeof(err));
if (sz == sizeof(err) && err == CUDA_SUCCESS) {
sz = write(fdw, val, sizeof(*val));
}
fdw_guard.close();
std::quick_exit(0);
}
fdw_guard.close();
auto sz = read(fdr, &err, sizeof(err));
mgb_assert(sz == sizeof(err), "failed to read error code from child");
if (err == CUDA_SUCCESS) {
sz = read(fdr, val, sizeof(*val));
mgb_assert(sz == sizeof(*val), "failed to read value from child");
return err;
}
// try again, maybe another thread called cuInit while we fork
auto err2 = func(val, args...);
if (err2 == CUDA_SUCCESS)
return err2;
if (err2 == CUDA_ERROR_NOT_INITIALIZED)
return err;
return err2;
}
template <typename Func, typename... Args>
CUresult call_cuda_forksafe(Func func, char* val, int len, Args... args) {
auto err = func(val, len, args...);
if (err != CUDA_ERROR_NOT_INITIALIZED)
return err;
// cuInit not called, call it in child process
int fd[2];
mgb_assert(pipe(fd) == 0, "pipe() failed");
int fdr = fd[0], fdw = fd[1];
RAIICloseFD fdr_guard(fdr);
RAIICloseFD fdw_guard(fdw);
auto cpid = fork();
mgb_assert(cpid != -1, "fork() failed");
if (cpid == 0) {
fdr_guard.close();
do {
err = cuInit(0);
if (err != CUDA_SUCCESS)
break;
err = func(val, len, args...);
err = func();
} while (0);
auto sz = write(fdw, &err, sizeof(err));
if (sz == sizeof(err) && err == CUDA_SUCCESS) {
......@@ -858,12 +816,12 @@ CUresult call_cuda_forksafe(Func func, char* val, int len, Args... args) {
if (err == CUDA_SUCCESS) {
sz = read(fdr, val, sizeof(*val) * len);
mgb_assert(
static_cast<size_t>(sz) == sizeof(*val) * static_cast<size_t>(len),
static_cast<size_t>(sz) == sizeof(*val) * len,
"failed to read value from child");
return err;
}
// try again, maybe another thread called cuInit while we fork
auto err2 = func(val, len, args...);
auto err2 = func();
if (err2 == CUDA_SUCCESS)
return err2;
if (err2 == CUDA_ERROR_NOT_INITIALIZED)
......@@ -882,6 +840,17 @@ const char* cu_get_error_string(CUresult err) {
return ret;
}
// Expression macro: runs the driver call `func(ptr, <extra args>)` through
// call_cuda_forksafe and evaluates to the resulting CUresult.
// The call is wrapped in a lambda that captures by reference; `ptr` and `len`
// are additionally forwarded to call_cuda_forksafe as its `val` / `len`
// arguments, so the `ptr` expression appears twice in the expansion.
#define MGB_CALL_CUDA_FORKSAFE_NOASSERT(func, ptr, len, ...) \
    call_cuda_forksafe([&]() { return func(ptr, ##__VA_ARGS__); }, ptr, len)

// Statement macro built on MGB_CALL_CUDA_FORKSAFE_NOASSERT: when the result is
// not CUDA_SUCCESS it logs the function name, the error string and the numeric
// error code through mgb_log_error. It only logs; it does not assert or throw.
//
// The body is wrapped in do { ... } while (0) so that
// `MGB_CALL_CUDA_FORKSAFE(...);` is exactly one statement and remains valid in
// an unbraced if/else. The locals carry macro-private names so they cannot
// shadow a caller variable that is referenced in the macro arguments.
#define MGB_CALL_CUDA_FORKSAFE(func, ptr, len, ...)                              \
    do {                                                                         \
        auto mgb_forksafe_err_ =                                                 \
                MGB_CALL_CUDA_FORKSAFE_NOASSERT(func, ptr, len, ##__VA_ARGS__);  \
        if (mgb_forksafe_err_ != CUDA_SUCCESS) {                                 \
            auto mgb_forksafe_err_s_ = cu_get_error_string(mgb_forksafe_err_);   \
            mgb_log_error(                                                       \
                    #func " failed: %s (err %d)", mgb_forksafe_err_s_,           \
                    int(mgb_forksafe_err_));                                     \
        }                                                                        \
    } while (0)
} // namespace
bool CudaCompNode::available() {
......@@ -890,7 +859,7 @@ bool CudaCompNode::available() {
MGB_LOCK_GUARD(mtx);
if (result == -1) {
int ndev = -1;
auto err = call_cuda_forksafe(cuDeviceGetCount, &ndev);
auto err = MGB_CALL_CUDA_FORKSAFE_NOASSERT(cuDeviceGetCount, &ndev, 1);
result = err == CUDA_SUCCESS && ndev > 0;
auto err_s = cu_get_error_string(err);
//! only show !CUDA_SUCCESS log when with valid stub call
......@@ -1042,12 +1011,11 @@ size_t CudaCompNode::get_device_count(bool warn) {
static Spinlock mtx;
MGB_LOCK_GUARD(mtx);
if (cnt == -1) {
auto err = call_cuda_forksafe(cuDeviceGetCount, &cnt);
auto err = MGB_CALL_CUDA_FORKSAFE_NOASSERT(cuDeviceGetCount, &cnt, 1);
auto err_s = cu_get_error_string(err);
if (err != CUDA_SUCCESS) {
if (warn && (std::string(err_s) != "invalid_stub_call"))
mgb_log_error(
"cudaGetDeviceCount failed: %s (err %d)", err_s, int(err));
mgb_log_error("cuDeviceGetCount failed: %s (err %d)", err_s, int(err));
cnt = 0;
}
mgb_assert(cnt >= 0);
......@@ -1088,23 +1056,15 @@ CompNode::DeviceProperties CudaCompNode::get_device_prop(int dev) {
if (!rec.init) {
MGB_LOCK_GUARD(rec.mtx_com);
if (!rec.init) {
MGB_CALL_CUDA_FORKSAFE(
cuDeviceGetAttribute, &rec.prop.major, 1,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
MGB_CALL_CUDA_FORKSAFE(
cuDeviceGetAttribute, &rec.prop.minor, 1,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
MGB_CALL_CUDA_FORKSAFE(cuDeviceTotalMem, &rec.prop.total_memory, 1, dev);
char pname[256] = {0};
mgb_assert(
call_cuda_forksafe(
cuDeviceGetAttribute, &rec.prop.major,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
dev) == CUDA_SUCCESS);
mgb_assert(
call_cuda_forksafe(
cuDeviceGetAttribute, &rec.prop.minor,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
dev) == CUDA_SUCCESS);
mgb_assert(
call_cuda_forksafe(cuDeviceGetName, pname, 255, dev) ==
CUDA_SUCCESS);
mgb_assert(
call_cuda_forksafe(cuDeviceTotalMem, &rec.prop.total_memory, dev) ==
CUDA_SUCCESS);
MGB_CALL_CUDA_FORKSAFE(cuDeviceGetName, pname, 255, 255, dev);
rec.prop.name = pname;
rec.init = true;
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册