comp_node.cpp 34.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13
#include "./comp_node.h"
#include "megbrain/comp_node_env.h"
#include "megbrain/utils/thread.h"

#include <string>

using namespace mgb;

#if MGB_CUDA

#include "megbrain/comp_node/alloc.h"

#include <cctype>
14
#include <cstdio>
15 16 17

#include <thread>

18
#include <cuda.h>
19 20
#include <cuda_runtime.h>

21 22
#ifdef __unix__
#include <sys/wait.h>
M
Megvii Engine Team 已提交
23
#include <unistd.h>
24 25
#endif

26 27 28
using CudaCompNodeImpl = CudaCompNode::CompNodeImpl;

namespace {
29 30 31 32 33 34 35
//! Amount of device memory that must be left to the system (not reserved by
//! the MegBrain allocator). Devices with less than 2 GiB available keep a
//! fixed 225 MiB; larger devices keep max(300 MiB, 5% of available).
size_t get_min_system_memory(size_t available) {
    constexpr size_t SMALL_DEVICE_THRESH = 1u << 31;  // 2 GiB
    if (available >= SMALL_DEVICE_THRESH) {
        // max(300 MiB, 0.05 * available)
        size_t five_percent = available / 20;
        size_t floor_bytes = 300 * 1024 * 1024;
        return five_percent > floor_bytes ? five_percent : floor_bytes;
    }
    // small-memory device: keep a fixed 225 MiB
    return 225 * 1024 * 1024;
}
//! type of host callbacks that can be enqueued on a CUDA stream
using CudaHostFunc = megdnn::thin_function<void()>;
//! trampoline for cudaLaunchHostFunc: invokes the heap-allocated CudaHostFunc
//! passed via \p ud and always deletes it, even if the callback throws
void CUDART_CB cuda_host_func_caller(void* ud) {
    mgb_assert(ud);
    CudaHostFunc* func_ptr = reinterpret_cast<CudaHostFunc*>(ud);
    MGB_TRY { (*func_ptr)(); }
    MGB_FINALLY(delete func_ptr;);
}
}  // anonymous namespace
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76

namespace mgb {
namespace mem_alloc {
//! Raw device-memory allocator backed directly by cudaMalloc/cudaFree.
//! Out-of-memory is reported by returning nullptr (so callers can retry after
//! freeing caches); any other CUDA error is fatal and raised as MemAllocError.
class CudaRawAllocator final : public RawAllocator {
public:
    //! allocate \p size bytes of device memory; returns nullptr on OOM
    void* alloc(size_t size) override {
        void* addr;
        cudaError_t cuda_error = cudaMalloc(&addr, size);
        if (cuda_error == cudaSuccess) {
            mgb_assert(addr);
            return addr;
        }
        auto msg = mgb_ssprintf_log(
                "cudaMalloc failed while requesting %zd bytes (%.3fMiB)"
                " of memory; error: %s",
                size, size / (1024.0 * 1024), cudaGetErrorString(cuda_error));
        msg.append(CudaError::get_cuda_extra_info());
        if (cuda_error == cudaErrorMemoryAllocation) {
            mgb_log_error("%s", msg.c_str());
            // clear the sticky cuda error so later calls do not fail spuriously
            cudaGetLastError();
            mgb_assert(cudaGetLastError() == cudaSuccess);
            return nullptr;
        }
        mgb_throw_raw(MemAllocError{msg});
    }

    //! release device memory previously returned by alloc()
    void free(void* ptr) override {
        cudaError_t cuda_error = cudaFree(ptr);
        if (cuda_error == cudaSuccess)
            return;
        auto msg = ssprintf(
                "cudaFree failed for %p: %s", ptr, cudaGetErrorString(cuda_error));
        msg.append(CudaError::get_cuda_extra_info());
        mgb_throw_raw(MemAllocError{msg});
    }

    //! query free/total device memory of the current device
    void get_mem_info(size_t& free, size_t& tot) override {
        cudaError_t cuda_error = cudaMemGetInfo(&free, &tot);
        if (cuda_error == cudaSuccess)
            return;
        auto msg = ssprintf("cudaMemGetInfo failed %s", cudaGetErrorString(cuda_error));
        msg.append(CudaError::get_cuda_extra_info());
        mgb_throw_raw(MegBrainError{msg});
    }
};

93 94 95 96
//! Pinned (page-locked) host-memory allocator backed by
//! cudaHostAlloc/cudaFreeHost. Mirrors CudaRawAllocator's error protocol:
//! nullptr on OOM, MemAllocError on any other CUDA failure.
class CudaHostAllocator : public RawAllocator {
public:
    //! allocate \p size bytes of pinned host memory; returns nullptr on OOM
    void* alloc(size_t size) override {
        void* addr;
        cudaError_t cuda_error = cudaHostAlloc(&addr, size, cudaHostAllocDefault);
        if (cuda_error == cudaSuccess) {
            mgb_assert(addr);
            return addr;
        }
        auto msg = mgb_ssprintf_log(
                "cudaHostAlloc failed while requesting %zd bytes (%.3fMiB)"
                " of pinned host memory; error: %s",
                size, size / (1024.0 * 1024), cudaGetErrorString(cuda_error));
        msg.append(CudaError::get_cuda_extra_info());
        if (cuda_error == cudaErrorMemoryAllocation) {
            mgb_log_error("%s", msg.c_str());
            // clear the sticky cuda error so later calls do not fail spuriously
            cudaGetLastError();
            mgb_assert(cudaGetLastError() == cudaSuccess);
            return nullptr;
        }
        mgb_throw_raw(MemAllocError{msg});
    }

    //! release pinned memory previously returned by alloc()
    void free(void* ptr) override {
        cudaError_t cuda_error = cudaFreeHost(ptr);
        if (cuda_error == cudaSuccess)
            return;
        auto msg = ssprintf(
                "cudaFreeHost failed for %p: %s", ptr, cudaGetErrorString(cuda_error));
        msg.append(CudaError::get_cuda_extra_info());
        mgb_throw_raw(MemAllocError{msg});
    }

    //! host memory stats are not tracked; always reports zero
    void get_mem_info(size_t& free, size_t& tot) override {
        free = 0;
        tot = 0;
    }
};

133 134
//! DeviceRuntimePolicy implementation that maps the generic device-runtime
//! operations onto the CUDA runtime API
class CudaDeviceRuntimePolicy : public DeviceRuntimePolicy {
public:
    CompNode::DeviceType device_type() override { return CompNode::DeviceType::CUDA; }
    void set_device(int device) override { MGB_CUDA_CHECK(cudaSetDevice(device)); }
    //! block until all pending work on \p device finishes
    void device_synchronize(int device) override {
        MGB_CUDA_CHECK(cudaSetDevice(device));
        MGB_CUDA_CHECK(cudaDeviceSynchronize());
    }
};

/* ===================== DevMemAlloc  ===================== */
//! create a DevMemAlloc that simply forwards every request to
//! cudaMalloc/cudaFree (no caching)
std::unique_ptr<DevMemAlloc> DevMemAlloc::make_cuda_alloc() {
    return std::make_unique<FwdDevMemAlloc>(std::make_shared<CudaRawAllocator>());
}
}  // namespace mem_alloc
}  // namespace mgb

/* ===================== CudaCompNodeImpl  ===================== */
151
//! Concrete CompNode implementation for a (device, stream) pair on a CUDA
//! GPU. Instances live in StaticData::node[] static storage and are
//! initialized asynchronously via init(); per-device state is shared through
//! DeviceInfo.
class CudaCompNode::CompNodeImpl final : public CompNode::Impl {
    MGB_DYN_TYPE_OBJ_FINAL_DECL;

    friend class EventImpl;
    friend class CudaCompNode;

    struct DeviceInfo;
    struct StaticData;
    static StaticData* sd;
    static Spinlock sd_mtx;
#if !MGB_BUILD_SLIM_SERVING
    //! protects ptr2size and the DeviceInfo usage counters
    std::mutex m_update_mem;
#endif

    //! set to true when m_locator is assigned; set to false if async init
    //! failed
    bool m_initialized = false;
    Locator m_locator, m_locator_logical;
    //! per-stream allocator of the owning device; set on async init success
    mem_alloc::StreamMemAlloc* m_mem_alloc;
    //! shared per-device state; set on async init success
    DeviceInfo* m_device_info;

    //! lazily-created event used by sync(); guarded by m_sync_event_mtx
    std::unique_ptr<Event> m_sync_event;
    Spinlock m_sync_event_mtx;

    //! make this comp node's CUDA context current (waits for async init)
    void activate() { m_env.cuda_env().activate(); }

    void init(const Locator& locator, const Locator& locator_logical);
    void fini();

    //! return whether global finalized, and print warning in such case
    static inline bool check_global_finalized();

    static CompNode::DeviceProperties get_device_prop(int dev);

    //! enable peer copy from dev0 to dev1
    static void enable_peer_access(int dev0, int dev1);

    static void static_free_device(ImplBase* self, void* ptr) {
        static_cast<CompNodeImpl*>(self)->free_device(ptr);
    }

    static void static_free_host(ImplBase* self, void* ptr) {
        static_cast<CompNodeImpl*>(self)->free_host(ptr);
    }

public:
    CompNodeImpl() : Impl(static_free_device, static_free_host) {}

    static constexpr int MAX_NR_COMP_NODE = 1024, MAX_NR_DEVICE = 64;

    void* alloc_device(size_t size) override;

    void free_device(void* ptr);

    void* alloc_host(size_t size) override;

    void free_host(void* ptr);

    //! async D2H copy on this node's stream; caller must sync before reading
    void copy_to_host(void* host_ptr, const void* device_ptr, size_t size) override {
        activate();
        MGB_CUDA_CHECK(cudaMemcpyAsync(
                host_ptr, device_ptr, size, cudaMemcpyDeviceToHost,
                m_env.cuda_env().stream));
    }

    //! async H2D copy on this node's stream
    void copy_to_device(void* device_ptr, const void* host_ptr, size_t size) override {
        activate();
        MGB_CUDA_CHECK(cudaMemcpyAsync(
                device_ptr, host_ptr, size, cudaMemcpyHostToDevice,
                m_env.cuda_env().stream));
    }

    void peer_copy_to(
            Impl* dest_impl, void* dest, const void* src, size_t size) override;

    size_t get_mem_addr_alignment() override { return m_env.property().mem_alignment; }

    std::unique_ptr<Event> create_event(size_t flags) override;

    void sync() override;

    MemNode mem_node() override;

    //! (total, free) device memory in bytes; free includes memory cached by
    //! the MegBrain allocator
    std::pair<size_t, size_t> get_mem_status_bytes() override {
        // explicitly call cuda_env() to ensure async init is finished
        m_env.cuda_env().activate();
        size_t tot, free;
        MGB_CUDA_CHECK(cudaMemGetInfo(&free, &tot));
        free += m_mem_alloc->get_free_memory_dev().tot;
        return {tot, free};
    }

#if !MGB_BUILD_SLIM_SERVING
    std::pair<size_t, size_t> get_free_left_and_right(
            size_t begin_ptr, size_t end_ptr) override {
        return m_mem_alloc->get_free_left_and_right(begin_ptr, end_ptr);
    }

    size_t get_max_block_size_available() override {
        activate();
        return m_mem_alloc->get_max_block_size_available();
    }

    //! free device memory as reported by the CUDA runtime (excludes cache)
    size_t get_free_mem() override {
        m_env.cuda_env().activate();
        size_t tot, free;
        MGB_CUDA_CHECK(cudaMemGetInfo(&free, &tot));
        return free;
    }
#endif

    Locator locator() override { return m_locator; }

    Locator locator_logical() override { return m_locator_logical; }

    //! enqueue a host callback on this node's stream; requires CUDA >= 10.0
    void add_callback(CudaHostFunc&& cb) override {
#if CUDART_VERSION >= 10000
        activate();
        // ownership of func_ptr passes to cuda_host_func_caller on success
        CudaHostFunc* func_ptr = new CudaHostFunc(std::move(cb));
        MGB_TRY {
            MGB_CUDA_CHECK(cudaLaunchHostFunc(
                    m_env.cuda_env().stream, cuda_host_func_caller,
                    static_cast<void*>(func_ptr)));
        }
        MGB_CATCH(..., {
            delete func_ptr;
            throw;
        });
#else
        MGB_MARK_USED_VAR(cb);
        MGB_MARK_USED_VAR(cuda_host_func_caller);
        mgb_throw(
                MegBrainError,
                "add_callback only support in cuda10.0 and later version");
#endif
    }

    uint64_t get_uid() override { return m_uid; }

#if !MGB_BUILD_SLIM_SERVING
    size_t get_used_memory() override;

    size_t get_max_used_memory() override;

    size_t get_reserved_memory() override;

    size_t get_max_reserved_memory() override;

    void reset_max_used_memory() override;
    void reset_max_reserved_memory() override;
#endif

private:
    //! unique id of this comp node, randomized at init()
    uint64_t m_uid;
#if !MGB_BUILD_SLIM_SERVING
    //! maps live device pointers to their allocation sizes for usage stats
    std::unordered_map<void*, size_t> ptr2size;
#endif
};
MGB_DYN_TYPE_OBJ_FINAL_IMPL(CudaCompNode::CompNodeImpl);

//! per-physical-device state shared by all comp nodes on that device
struct CudaCompNodeImpl::DeviceInfo {
    //! CUDA device ordinal; -1 until init() is called
    int dev_num = -1;
    //! bytes currently allocated through alloc_device() on this device
    std::atomic_size_t m_used_mem{0};
    //! high-water mark of m_used_mem
    std::atomic_size_t m_max_used_mem{0};
    std::unique_ptr<mem_alloc::DevMemAlloc> mem_alloc;

    bool init_done() const { return mem_alloc.get(); }

    void init(const CompNodeEnv& env);

    void fini() { mem_alloc.reset(); }
};

//! Global state for all CUDA comp nodes: the comp-node and per-device pools,
//! the pinned-host allocator, and the preallocation configuration. A single
//! instance lives in static storage (see load_cuda) so it survives global
//! finalization.
struct CudaCompNodeImpl::StaticData {
    std::recursive_mutex mtx;

    mem_alloc::DevMemAlloc::PreAllocConfig prealloc_config;

    //! caching allocator for pinned host memory, shared by all comp nodes
    std::unique_ptr<mem_alloc::SimpleCachingAlloc> host_alloc;
    CudaCompNode::CompNodeImpl node[MAX_NR_COMP_NODE];
    DeviceInfo dev_info[MAX_NR_DEVICE];
    int nr_node = 0,          //!< number of loaded node[]
            nr_dev_used = 0;  //!< number of used dev_info[]

    StaticData()
            : host_alloc(mem_alloc::SimpleCachingAlloc::make(
                      std::make_unique<mem_alloc::CudaHostAllocator>())) {
        prealloc_config.max_overhead = 0;
        prealloc_config.alignment = 1;
        host_alloc->alignment(1);
    }

    ~StaticData() {
        for (int i = 0; i < nr_node; ++i)
            node[i].fini();
        for (int i = 0; i < nr_dev_used; ++i)
            dev_info[i].fini();
    }

    //! Bytes to pre-reserve on each device, controlled by the
    //! MGB_CUDA_RESERVE_MEMORY env var: "b:<bytes>" gives an explicit byte
    //! count; any other non-empty value reserves all currently-free memory
    //! minus a small system share; returns 0 (no reservation) when unset.
    static size_t get_mem_reserve_size() {
        if (auto setting = MGB_GETENV("MGB_CUDA_RESERVE_MEMORY")) {
            if (!strncmp(setting, "b:", 2)) {
                return std::stoull(setting + 2);
            }
            size_t tot, free;
            MGB_CUDA_CHECK(cudaFree(0));
            MGB_CUDA_CHECK(cudaMemGetInfo(&free, &tot));
            // guard against unsigned underflow when free memory is already
            // smaller than the system share we must keep
            size_t sys = get_min_system_memory(free);
            return free > sys ? free - sys : 0;
        } else {
            return 0;
        }
    }
};
CudaCompNodeImpl::StaticData* CudaCompNodeImpl::sd = nullptr;
Spinlock CudaCompNodeImpl::sd_mtx;

367 368 369 370 371 372 373
//! lazily-initialized cache of CompNode::DeviceProperties, one slot per
//! possible CUDA device; mtx_com guards first-time initialization
struct DevicePropRec {
    bool init = false;
    CompNode::DeviceProperties prop;
    Spinlock mtx_com;
};
DevicePropRec device_prop_rec[CudaCompNodeImpl::MAX_NR_DEVICE];

M
Megvii Engine Team 已提交
374
void CudaCompNodeImpl::init(const Locator& locator, const Locator& locator_logical) {
375 376 377 378
    m_locator = locator;
    m_locator_logical = locator_logical;
    m_initialized = true;

379
#if defined(__linux__) || defined(TARGET_OS_MAC)
380
    FILE* fp;
381 382 383 384 385
    fp = fopen("/dev/urandom", "r");
    mgb_assert(fread(&m_uid, sizeof(m_uid), 1, fp) == 1);
    fclose(fp);
#else
    m_uid = std::chrono::duration_cast<std::chrono::nanoseconds>(
386 387
                    std::chrono::system_clock::now().time_since_epoch())
                    .count();
388 389
#endif

390 391 392 393 394
    auto on_succ = [this](cudaStream_t stream) {
        auto locator = m_locator;
        log_comp_node_created(locator, m_locator_logical);

        MGB_LOCK_GUARD(sd->mtx);
395 396
        DeviceInfo* dev_info = nullptr;
        for (int i = 0; i < sd->nr_dev_used; ++i) {
397 398 399 400 401 402 403 404 405 406
            if (sd->dev_info[i].dev_num == locator.device) {
                dev_info = &sd->dev_info[i];
                break;
            }
        }

        if (!dev_info) {
            dev_info = &sd->dev_info[sd->nr_dev_used];
            dev_info->init(m_env);
            // note: add nr_dev_used only after init succeeds
407
            ++sd->nr_dev_used;
408 409
        }
        m_device_info = dev_info;
M
Megvii Engine Team 已提交
410
        m_mem_alloc = dev_info->mem_alloc->add_stream(static_cast<void*>(stream));
411 412 413 414 415 416 417
    };

    auto on_error = [this](std::exception&) {
        MGB_LOCK_GUARD(sd->mtx);
        m_initialized = false;
    };

M
Megvii Engine Team 已提交
418 419
    m_env.init_cuda_async(
            locator.device, make_comp_node_from_impl(this), {on_succ, on_error});
420 421 422 423 424 425 426 427 428 429 430 431 432
}

//! release per-node resources; safe to call on an uninitialized node
void CudaCompNodeImpl::fini() {
    if (!m_initialized)
        return;

    m_sync_event.reset();
    m_env.fini();
    // allocator and device info are owned by StaticData; just drop pointers
    m_mem_alloc = nullptr;
    m_device_info = nullptr;
    m_initialized = false;
}

433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450
//! allocate device memory via the per-stream allocator; in non-slim builds
//! also records the allocation size for usage statistics
void* CudaCompNodeImpl::alloc_device(size_t size) {
    activate();
#if MGB_BUILD_SLIM_SERVING
    return m_mem_alloc->alloc(size);
#else
    void* ptr = m_mem_alloc->alloc(size);
    {
        MGB_LOCK_GUARD(m_update_mem);
        ptr2size[ptr] = size;
        m_device_info->m_used_mem += size;
        // maintain the high-water mark (racy read is fine under m_update_mem)
        if (m_device_info->m_used_mem > m_device_info->m_max_used_mem) {
            m_device_info->m_max_used_mem = m_device_info->m_used_mem.load();
        }
    }
    return ptr;
#endif
}

451
//! return device memory to the allocator; a no-op after global finalize
void CudaCompNodeImpl::free_device(void* ptr) {
    if (check_global_finalized())
        return;

    activate();
#if !MGB_BUILD_SLIM_SERVING
    {
        MGB_LOCK_GUARD(m_update_mem);
        mgb_assert(ptr2size.find(ptr) != ptr2size.end(), "ptr %p not found!", ptr);
        // undo the bookkeeping done in alloc_device
        m_device_info->m_used_mem -= ptr2size.at(ptr);
        ptr2size.erase(ptr);
    }
#endif
    m_mem_alloc->free(ptr);
}

467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492
#if !MGB_BUILD_SLIM_SERVING
//! bytes currently handed out through alloc_device on this node's device
size_t CudaCompNodeImpl::get_used_memory() {
    return m_device_info->m_used_mem.load();
}

//! high-water mark of get_used_memory()
size_t CudaCompNodeImpl::get_max_used_memory() {
    return m_device_info->m_max_used_mem.load();
}

void CudaCompNodeImpl::reset_max_used_memory() {
    m_device_info->m_max_used_mem = 0;
}

//! bytes the caching allocator holds from the CUDA runtime (incl. cache)
size_t CudaCompNodeImpl::get_reserved_memory() {
    return m_device_info->mem_alloc->get_used_memory();
}

//! high-water mark of get_reserved_memory()
size_t CudaCompNodeImpl::get_max_reserved_memory() {
    return m_device_info->mem_alloc->get_max_used_memory();
}

void CudaCompNodeImpl::reset_max_reserved_memory() {
    m_device_info->mem_alloc->reset_max_used_memory();
}
#endif

493
//! allocate pinned host memory from the shared caching host allocator
void* CudaCompNodeImpl::alloc_host(size_t size) {
    // need activate() because it creates a cuda context on the current device
    activate();
    return sd->host_alloc->alloc(size);
}

//! return pinned host memory; a no-op after global finalize
void CudaCompNodeImpl::free_host(void* ptr) {
    if (check_global_finalized())
        return;
    sd->host_alloc->free(ptr);
}

M
Megvii Engine Team 已提交
505 506
//! Copy device memory to another comp node. CUDA->CUDA uses an async
//! device/peer copy on the destination stream; CUDA->CPU dispatches a
//! blocking D2H copy onto the CPU node's dispatcher. Other targets are
//! unsupported.
void CudaCompNodeImpl::peer_copy_to(
        Impl* dest_impl, void* dest, const void* src, size_t size) {
    if (dest_impl->same_type<CudaCompNodeImpl>()) {
        auto&& dst_env = static_cast<CudaCompNodeImpl*>(dest_impl)->m_env.cuda_env();
        auto&& src_env = m_env.cuda_env();
        activate();
        if (dst_env.device == src_env.device) {
            // same physical device: plain async D2D copy
            MGB_CUDA_CHECK(cudaMemcpyAsync(
                    dest, src, size, cudaMemcpyDeviceToDevice, dst_env.stream));
        } else {
            // cross-device: make sure P2P access works in both directions
            enable_peer_access(src_env.device, dst_env.device);
            enable_peer_access(dst_env.device, src_env.device);
            MGB_CUDA_CHECK(cudaMemcpyPeerAsync(
                    dest, dst_env.device, src, src_env.device, size, dst_env.stream));
        }
        return;
    }
    mgb_assert(
            dest_impl->env().property().type == DeviceType::CPU,
            "cuda peer_copy_to only implemented for CPU");
    auto copy = [this, dest, src, size]() {
        auto stream = m_env.cuda_env().stream;
        m_env.cuda_env().activate();
        MGB_CUDA_CHECK(
                cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream));
        // the CPU side must see completed data, so block on the stream
        MGB_CUDA_CHECK(cudaStreamSynchronize(stream));
    };
    dest_impl->env().cpu_env().dispatch(copy);
}

MemNode CudaCompNodeImpl::mem_node() {
    // m_device_info would be null before async init finishes; so we just
    // return a private pointer related to the device number here
    return MemNode{sd->dev_info + m_locator.device};
}

//! wait for all work previously enqueued on this node's stream from the
//! calling thread, using a lazily-created event
void CudaCompNodeImpl::sync() {
    activate();

    // do not use MGB_CUDA_CHECK(cudaStreamSynchronize(m_env->stream)) since
    // other threads may be adding operations into the stream, and we only care
    // about previous operations in current thread. However docs of
    // cudaStreamSynchronize did not describe details of such condition, so we
    // use manual event implementation

    Event* event;
    {
        MGB_LOCK_GUARD(m_sync_event_mtx);
        if (!m_sync_event)
            m_sync_event = create_event(0);
        event = m_sync_event.get();
    }
    event->record();
    event->host_wait();
}

void CudaCompNodeImpl::enable_peer_access(int dev0, int dev1) {
562
    static bool already_enabled[MAX_NR_DEVICE][MAX_NR_DEVICE];
563 564 565 566 567 568 569 570 571 572 573 574 575 576 577
    if (already_enabled[dev0][dev1])
        return;

    static std::mutex global_lock;
    MGB_LOCK_GUARD(global_lock);
    if (already_enabled[dev0][dev1])
        return;

    int can;
    MGB_CUDA_CHECK(cudaDeviceCanAccessPeer(&can, dev0, dev1));
    if (can) {
        mgb_log("enable peer access from GPU %d to GPU %d", dev0, dev1);
        MGB_CUDA_CHECK(cudaSetDevice(dev0));
        auto err = cudaDeviceEnablePeerAccess(dev1, 0);
        if (err != cudaSuccess) {
M
Megvii Engine Team 已提交
578 579 580
            mgb_log_error(
                    "failed to enable peer access from %d to %d: %s(%d)", dev0, dev1,
                    cudaGetErrorString(err), static_cast<int>(err));
581 582 583 584 585 586 587 588 589 590 591 592
            cudaGetLastError();
        }
    }

    // check for cudaMemcpyPeer usable
    int v0 = 1, v1 = 2;

    int *dp0, *dp1;
    MGB_CUDA_CHECK(cudaSetDevice(dev0));
    MGB_CUDA_CHECK(cudaMalloc(&dp0, sizeof(int)));
    MGB_CUDA_CHECK(cudaSetDevice(dev1));
    MGB_CUDA_CHECK(cudaMalloc(&dp1, sizeof(int)));
593 594
    MGB_CUDA_CHECK(cudaMemcpy(dp0, &v0, sizeof(int), cudaMemcpyHostToDevice));
    MGB_CUDA_CHECK(cudaMemcpy(dp1, &v1, sizeof(int), cudaMemcpyHostToDevice));
595 596
    MGB_CUDA_CHECK(cudaMemcpyPeer(dp1, dev1, dp0, dev0, sizeof(int)));
    int get = 0;
597
    MGB_CUDA_CHECK(cudaMemcpy(&get, dp1, sizeof(int), cudaMemcpyDeviceToHost));
598

M
Megvii Engine Team 已提交
599 600 601 602 603
    mgb_throw_if(
            get != 1, CudaError,
            "P2P copy (%d => %d) check failed; consider disabling "
            "Access Control Services(ACS) for the PCI device",
            dev0, dev1);
604 605 606 607 608 609

    already_enabled[dev0][dev1] = true;
}

/* ===================== CudaCompNodeImpl::DeviceInfo  ===================== */

610
//! one-time per-device setup: record the device ordinal and build the
//! caching device allocator with the configured reservation and alignment
void CudaCompNodeImpl::DeviceInfo::init(const CompNodeEnv& env) {
    mgb_assert(!mem_alloc);
#if 0
    // forward cudaMalloc
    mem_alloc = mem_alloc::DevMemAlloc::make_cuda_alloc();
#else
    auto&& cuenv = env.cuda_env();
    cuenv.activate();
    dev_num = cuenv.device;
    auto reserve_size = StaticData::get_mem_reserve_size();
    mem_alloc = mem_alloc::DevMemAlloc::make(
            dev_num, reserve_size, std::make_shared<mem_alloc::CudaRawAllocator>(),
            std::make_shared<mem_alloc::CudaDeviceRuntimePolicy>());
    mem_alloc->prealloc_config(sd->prealloc_config);
    auto align = env.property().mem_alignment;
    mem_alloc->alignment(align);
    mgb_log_debug(
            "cuda: gpu%d: name=`%s' dyn_mem_reserve=%.2fMiB alignment=0x%zx", dev_num,
            cuenv.device_prop.name, reserve_size / 1024.0 / 1024, align);
#endif
}

//! \return true if global finalization already happened (so CUDA resources
//! must not be touched); logs a debug warning the first time this is hit
bool CudaCompNodeImpl::check_global_finalized() {
    if (!sd) {
        static std::atomic_flag warn_printed = ATOMIC_FLAG_INIT;
        if (!warn_printed.test_and_set()) {
            mgb_log_debug("cuda comp node method called after global finalize");
        }
        return true;
    }
#if MGB_CUDA && defined(WIN32)
    //! FIXME: windows cuda driver shutdown before call atexit function even
    //! register atexit function after init cuda driver! as a workround
    //! recovery resource by OS temporarily, may need remove this after
    //! upgrade cuda runtime
    if (CudaCompNode::is_into_atexit) {
        mgb_log_debug(
                "windows cudaErrorCudartUnloading happened!!, resource "
                "recovery by OS!!");
        return true;
    }
    //! FIXME: megengine dynamic with VCRT, atexit fuctions table have
    //! some order issue, which will lead to cuda runtime uploading, this
    //! always happened at python3 unload dll(means python3 will exit),
    //! as a workround, recovery resource by OS temporarily, may need
    //! remove this after upgrade cuda runtime
    int dev = -1;
    if (cudaErrorCudartUnloading == cudaGetDevice(&dev)) {
        mgb_log_debug(
                "windows cudaErrorCudartUnloading happened!!, resource "
                "recovery by OS!!");
        return true;
    }

#endif
    return false;
}

/* ===================== EventImpl  ===================== */

670
//! CompNode::Event implementation backed by a cudaEvent_t; timing is enabled
//! only when the NEED_TIMER flag is requested at creation
class CudaCompNode::EventImpl final : public EventImplHelper {
    //! guards cudaEventDestroy in the destructor if creation failed mid-way
    bool m_init_finished = false;
    CudaCompNodeImpl* const m_comp_node_impl;
    cudaEvent_t m_cuda_event;

    //! record the event on the owning comp node's stream
    void do_record() override {
        m_comp_node_impl->activate();
        auto&& env = m_comp_node_impl->m_env.cuda_env();
        MGB_CUDA_CHECK(cudaEventRecord(m_cuda_event, env.stream));
    }

    //! non-blocking poll; throws CudaError on any state other than
    //! success/not-ready
    bool do_finished() override {
        m_comp_node_impl->activate();
        cudaError_t err = cudaEventQuery(m_cuda_event);
        if (err == cudaSuccess)
            return true;
        if (err == cudaErrorNotReady)
            return false;
        mgb_throw(
                CudaError, "failed to query event: %d: %s", int(err),
                cudaGetErrorString(err));
    }

    void host_wait_cv() override { MGB_CUDA_CHECK(cudaEventSynchronize(m_cuda_event)); }

    //! elapsed seconds between this event and \p end (cuda reports ms)
    double do_elapsed_time_until(EventImplHelper& end) override {
        m_comp_node_impl->activate();
        float ret = 0.0;
        MGB_CUDA_CHECK(cudaEventElapsedTime(
                &ret, m_cuda_event, static_cast<EventImpl&>(end).m_cuda_event));
        return static_cast<double>(ret) * 1e-3;
    }

    void do_device_wait_by(Impl* cn_impl) override;

public:
    EventImpl(CudaCompNodeImpl* comp_node_impl, size_t create_flags)
            : EventImplHelper(comp_node_impl, create_flags),
              m_comp_node_impl{comp_node_impl} {
        m_comp_node_impl->activate();
        // disable timing unless explicitly requested, for lower overhead
        size_t cuda_flags = cudaEventDisableTiming;
        if (create_flags & NEED_TIMER)
            cuda_flags = 0;
        MGB_CUDA_CHECK(cudaEventCreateWithFlags(&m_cuda_event, cuda_flags));
        m_init_finished = true;
    }

    ~EventImpl() {
        if (m_init_finished) {
            MGB_TRY { MGB_CUDA_CHECK(cudaEventDestroy(m_cuda_event)); }
            MGB_CATCH(MegBrainError & exc, {
                mgb_log_error("failed to destroy cuda event: %s", exc.what());
            })
        }
    }
};

727
//! create a new event bound to this comp node's stream
std::unique_ptr<CompNode::Event> CudaCompNodeImpl::create_event(size_t flags) {
    return std::make_unique<EventImpl>(this, flags);
}

731
//! make \p cn_impl wait for this event: stream-level wait for CUDA nodes,
//! a dispatched blocking wait for CPU nodes; other targets are unsupported
void CudaCompNode::EventImpl::do_device_wait_by(Impl* cn_impl) {
    if (cn_impl->dyn_typeinfo() == CudaCompNodeImpl::typeinfo()) {
        auto imp = static_cast<CudaCompNodeImpl*>(cn_impl);
        auto stream = imp->m_env.cuda_env().stream;
        imp->activate();
        MGB_CUDA_CHECK(cudaStreamWaitEvent(stream, m_cuda_event, 0));
        return;
    }
    if (cn_impl->env().property().type == DeviceType::CPU) {
        auto waiter = [this]() { MGB_CUDA_CHECK(cudaEventSynchronize(m_cuda_event)); };
        cn_impl->add_callback(std::move(waiter));
        return;
    }
    mgb_throw(MegBrainError, "unimplemented event device_wait_by config");
}

/* ===================== CudaCompNode static methods ===================== */

749 750 751
namespace {

#ifndef __unix__
752 753
// non-unix fallback: no fork() available, so initialize the driver in the
// current process and invoke func directly (val/len are unused here; they are
// filled by func itself)
template <typename Func, typename Val>
CUresult call_cuda_forksafe(Func func, Val* val, size_t len) {
    cuInit(0);
    return func();
}
#else
//! RAII guard that closes a file descriptor on scope exit; close() may be
//! called early and is idempotent
struct RAIICloseFD : NonCopyableObj {
    int m_fd = -1;

    RAIICloseFD(int fd) : m_fd(fd) {}
    ~RAIICloseFD() { close(); }
    void close() {
        if (m_fd != -1) {
            ::close(m_fd);
            m_fd = -1;
        }
    }
};
// an implementation that does not call cuInit
771 772 773 774 775
// Fork-safe wrapper around a CUDA driver API call: if the driver is not yet
// initialized in this process, run cuInit + func in a forked child (so the
// parent stays fork-safe) and ship err plus the output value(s) back through
// a pipe. \p val points to \p len output elements filled by \p func.
template <typename Func, typename Val>
CUresult call_cuda_forksafe(Func func, Val* val, size_t len) {
    int t_ndev;
    // use cuDeviceGetCount to detect cuda initialization to avoid abnormal behavior
    auto err = cuDeviceGetCount(&t_ndev);
    if (err != CUDA_ERROR_NOT_INITIALIZED)
        return func();
    // cuInit not called, call it in child process
    int fd[2];
    mgb_assert(pipe(fd) == 0, "pipe() failed");
    int fdr = fd[0], fdw = fd[1];
    RAIICloseFD fdr_guard(fdr);
    RAIICloseFD fdw_guard(fdw);
    auto cpid = fork();
    mgb_assert(cpid != -1, "fork() failed");
    if (cpid == 0) {
        // child: init driver, run func, send error code (and value on success)
        fdr_guard.close();
        do {
            err = cuInit(0);
            if (err != CUDA_SUCCESS)
                break;
            err = func();
        } while (0);
        auto sz = write(fdw, &err, sizeof(err));
        if (sz == sizeof(err) && err == CUDA_SUCCESS) {
            sz = write(fdw, val, sizeof(*val) * len);
        }
        fdw_guard.close();
        // quick_exit: skip atexit handlers that must not run in the child
        std::quick_exit(0);
    }
    // parent: read back the child's result
    fdw_guard.close();
    auto sz = read(fdr, &err, sizeof(err));
    mgb_assert(sz == sizeof(err), "failed to read error code from child");
    if (err == CUDA_SUCCESS) {
        sz = read(fdr, val, sizeof(*val) * len);
        mgb_assert(
                static_cast<size_t>(sz) == sizeof(*val) * len,
                "failed to read value from child");
        return err;
    }
    // try again, maybe another thread called cuInit while we fork
    auto err2 = func();
    if (err2 == CUDA_SUCCESS)
        return err2;
    if (err2 == CUDA_ERROR_NOT_INITIALIZED)
        return err;
    return err2;
}
819 820 821 822 823
#endif

//! human-readable string for a CUDA driver error; never returns nullptr
//! (returns the sentinel "invalid_stub_call" when the driver stub cannot
//! resolve the real driver library)
const char* cu_get_error_string(CUresult err) {
    const char* ret = nullptr;
    cuGetErrorString(err, &ret);
    if (!ret) {
        //! caused by cuda stub do not find driver
        ret = "invalid_stub_call";
    }
    return ret;
}

831 832 833 834 835 836 837 838 839 840 841
//! invoke a driver API function fork-safely; \p ptr receives \p len outputs
#define MGB_CALL_CUDA_FORKSAFE_NOASSERT(func, ptr, len, ...) \
    call_cuda_forksafe([&]() { return func(ptr, ##__VA_ARGS__); }, ptr, len)

//! like MGB_CALL_CUDA_FORKSAFE_NOASSERT, but logs an error on failure
#define MGB_CALL_CUDA_FORKSAFE(func, ptr, len, ...)                                \
    {                                                                              \
        auto err = MGB_CALL_CUDA_FORKSAFE_NOASSERT(func, ptr, len, ##__VA_ARGS__); \
        if (err != CUDA_SUCCESS) {                                                 \
            auto err_s = cu_get_error_string(err);                                 \
            mgb_log_error(#func " failed: %s (err %d)", err_s, int(err));          \
        }                                                                          \
    }
M
Megvii Engine Team 已提交
842
}  // namespace
843

844 845 846 847 848 849
//! whether any usable CUDA device exists; the result of the (fork-safe)
//! device-count probe is computed once and cached
bool CudaCompNode::available() {
    static int result = -1;
    static Spinlock mtx;
    MGB_LOCK_GUARD(mtx);
    if (result == -1) {
        int ndev = -1;
        auto err = MGB_CALL_CUDA_FORKSAFE_NOASSERT(cuDeviceGetCount, &ndev, 1);
        result = err == CUDA_SUCCESS && ndev > 0;
        auto err_s = cu_get_error_string(err);
        //! only show !CUDA_SUCCESS log when with valid stub call
        if (!result && (std::string(err_s) != "invalid_stub_call")) {
            mgb_log_warn(
                    "cuda unavailable: %s(%d) ndev=%d", err_s, static_cast<int>(err),
                    ndev);
        }
        if (err == CUDA_ERROR_NOT_INITIALIZED) {
            mgb_throw(std::runtime_error, "cuda initialization error.");
        }
    }
    return result;
}

//! tear down the global CUDA comp-node state, if it was ever created
void CudaCompNode::finalize() {
    if (CudaCompNodeImpl::sd) {
        // flush all pending device work before destroying the shared state
        sync_all();

        auto ptr = CudaCompNodeImpl::sd;
        CudaCompNodeImpl::sd = nullptr;
        // sd was placement-new'ed into function-local static storage (see
        // load_cuda), so only the destructor is run; no deallocation needed
        ptr->~StaticData();
    }
}

876 877 878 879 880 881 882
#if MGB_CUDA && defined(WIN32)
//! FIXME: on Windows the cuda driver shuts down before atexit callbacks run,
//! even when the callback is registered after cuda driver init! As a
//! workaround, let the OS reclaim the resources temporarily; this may be
//! removed after upgrading the cuda runtime.
bool CudaCompNode::is_into_atexit = false;
#endif
//! return the comp-node impl matching (locator, locator_logical), creating
//! it on first use; impls are drawn from a fixed-size pool inside the global
//! StaticData and reused when an uninitialized slot exists
CompNode::Impl* CudaCompNode::load_cuda(
        const Locator& locator, const Locator& locator_logical) {
    int nr_gpu = get_device_count();
#if MGB_CUDA && defined(WIN32)
    //! FIXME: windows cuda driver shutdown before call atexit function even
    //! register atexit function after init cuda driver! as a workround
    //! recovery resource by OS temporarily, may need remove this after
    //! upgrade cuda runtime
    if (!is_into_atexit) {
        auto err = atexit([] { is_into_atexit = true; });
        mgb_assert(!err, "failed to register atexit function");
    }
#endif
    mgb_assert(
            locator.device >= 0 && locator.device < nr_gpu,
            "request gpu%d out of valid range [0, %d)", locator.device, nr_gpu);

    auto&& sdptr = CudaCompNodeImpl::sd;
    {
        // double-checked creation of the global StaticData, guarded by its
        // own mutex (distinct from sd.mtx, which guards the node pool)
        MGB_LOCK_GUARD(CudaCompNodeImpl::sd_mtx);
        if (!sdptr) {
            // use static storage so object can be safely accessed even after
            // global finalize
            using T = CudaCompNodeImpl::StaticData;
            static std::aligned_storage_t<sizeof(T), alignof(T)> storage;
            sdptr = new (&storage) T;
        }
    }
    auto&& sd = *sdptr;
    MGB_LOCK_GUARD(sd.mtx);

    // look for an existing node with the same locators; remember the last
    // uninitialized slot as a candidate for reuse
    CompNodeImpl* available_node = nullptr;
    for (int i = 0; i < sd.nr_node; ++i) {
        auto&& cur = sd.node[i];
        if (cur.m_initialized) {
            if (cur.m_locator == locator && cur.m_locator_logical == locator_logical) {
                return &cur;
            }
        } else {
            available_node = &cur;
        }
    }

    if (!available_node) {
        mgb_assert(
                sd.nr_node < CompNodeImpl::MAX_NR_COMP_NODE,
                "too many CompNode allocated");
        available_node = &sd.node[sd.nr_node++];
    }
    mgb_assert(locator.device < CompNodeImpl::MAX_NR_DEVICE, "device number too large");

    mgb_assert(!available_node->m_initialized);
    available_node->init(locator, locator_logical);

    return available_node;
}

void CudaCompNode::try_coalesce_all_free_memory() {
    // TODO: optimized implementation
    auto sd = CudaCompNodeImpl::sd;
    if (!sd)
        return;

    size_t size = 0;
947
    for (int i = 0; i < sd->nr_dev_used; ++i) {
M
Megvii Engine Team 已提交
948
        size += sd->dev_info[i].mem_alloc->gather_stream_free_blk_and_release_full();
949 950
    }
    if (size) {
M
Megvii Engine Team 已提交
951
        mgb_log_debug("%zu bytes freed by try_coalesce_all_free_memory()", size);
952 953 954 955 956 957 958 959
    }
}

//! block until all work on every used CUDA device has completed
void CudaCompNode::sync_all() {
    auto sd = CudaCompNodeImpl::sd;
    if (!sd)
        return;

    // phase 1: touch every node's cuda env so async initialization finishes;
    // nr_node may grow concurrently, so it is re-read under the lock each
    // iteration, and env->cuda_env() is called outside the lock
    for (int i = 0;; ++i) {
        // ensure async init finished
        CompNodeEnv* env;
        {
            MGB_LOCK_GUARD(sd->mtx);
            if (i >= sd->nr_node) {
                break;
            }
            env = &sd->node[i].env();
        }
        env->cuda_env();
    }

    // phase 2: synchronize each device that has been used
    MGB_LOCK_GUARD(sd->mtx);
    for (int i = 0; i < sd->nr_dev_used; ++i) {
        MGB_CUDA_CHECK(cudaSetDevice(sd->dev_info[i].dev_num));
        MGB_CUDA_CHECK(cudaDeviceSynchronize());
    }
}

//! invoke ``callback`` on every comp node created so far; the node list is
//! sampled under the lock but the callback runs outside it
void CudaCompNode::foreach (thin_function<void(CompNode)> callback) {
    auto sd = CudaCompNodeImpl::sd;
    if (!sd)
        return;

    for (int idx = 0;; ++idx) {
        CompNode cn;
        {
            // re-check the (possibly growing) node count under the lock
            MGB_LOCK_GUARD(sd->mtx);
            if (idx >= sd->nr_node)
                return;
            cn = make_comp_node_from_impl(&sd->node[idx]);
        }
        callback(cn);
    }
}

size_t CudaCompNode::get_device_count(bool warn) {
    static int cnt = -1;
    static Spinlock mtx;
    MGB_LOCK_GUARD(mtx);
    if (cnt == -1) {
1002
        auto err = MGB_CALL_CUDA_FORKSAFE_NOASSERT(cuDeviceGetCount, &cnt, 1);
1003
        auto err_s = cu_get_error_string(err);
1004
        if (err != CUDA_SUCCESS) {
1005
            if (warn && (std::string(err_s) != "invalid_stub_call"))
1006
                mgb_log_error("cuDeviceGetCount failed: %s (err %d)", err_s, int(err));
1007 1008 1009 1010 1011 1012 1013
            cnt = 0;
        }
        mgb_assert(cnt >= 0);
    }
    return cnt;
}

//! configure the memory preallocation strategy; effective only when called
//! before the global StaticData exists (i.e. before any CUDA comp node or
//! allocation is created), otherwise a warning is logged and defaults apply
void CudaCompNode::set_prealloc_config(
        size_t alignment, size_t min_req, size_t max_overhead, double growth_factor) {
    auto&& sdptr = CudaCompNodeImpl::sd;
    {
        MGB_LOCK_GUARD(CudaCompNodeImpl::sd_mtx);
        if (!sdptr) {
            // create StaticData in static storage, same scheme as load_cuda
            using T = CudaCompNodeImpl::StaticData;
            static std::aligned_storage_t<sizeof(T), alignof(T)> storage;
            sdptr = new (&storage) T;
            sdptr->prealloc_config.alignment = alignment;
            sdptr->prealloc_config.min_req = min_req;
            sdptr->prealloc_config.growth_factor = growth_factor;
            sdptr->prealloc_config.max_overhead = max_overhead;
        } else {
            mgb_log_warn(
                    "invalid call to set_prealloc_config, will fallback to "
                    "default config; "
                    "prealloc_config should be specified before any CUDA "
                    "memory allocation");
        }
    }
}

//! query (and cache) basic properties of device ``dev``: compute capability,
//! total memory and name; asserts that ``dev`` is a valid device index
CompNode::DeviceProperties CudaCompNode::get_device_prop(int dev) {
    int cnt = static_cast<int>(get_device_count());
    mgb_assert(
            dev >= 0 && dev < cnt, "request gpu %d out of valid range [0, %d)", dev,
            cnt);

    // NOTE(review): double-checked locking on a plain (non-atomic) rec.init
    // flag — racy under the C++ memory model; confirm whether callers are
    // already serialized or the flag should be std::atomic<bool>
    auto&& rec = device_prop_rec[dev];
    if (!rec.init) {
        MGB_LOCK_GUARD(rec.mtx_com);
        if (!rec.init) {
            MGB_CALL_CUDA_FORKSAFE(
                    cuDeviceGetAttribute, &rec.prop.major, 1,
                    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
            MGB_CALL_CUDA_FORKSAFE(
                    cuDeviceGetAttribute, &rec.prop.minor, 1,
                    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
            MGB_CALL_CUDA_FORKSAFE(cuDeviceTotalMem, &rec.prop.total_memory, 1, dev);
            // 256-byte buffer, at most 255 chars written, so the terminating
            // NUL from the zero-initialization is always preserved
            char pname[256] = {0};
            MGB_CALL_CUDA_FORKSAFE(cuDeviceGetName, pname, 255, 255, dev);
            rec.prop.name = pname;
            rec.init = true;
        }
    }

    return rec.prop;
}

#else

// ---- stub implementations when CUDA support is compiled out ----
// available()/get_device_count() report no devices, sync/foreach/finalize
// are no-ops, and load_cuda() throws.

bool CudaCompNode::available() {
    return false;
}
void CudaCompNode::try_coalesce_all_free_memory() {}
void CudaCompNode::foreach (thin_function<void(CompNode)>) {}
void CudaCompNode::finalize() {}
size_t CudaCompNode::get_device_count(bool warn) {
    return 0;
}
CudaCompNode::Impl* CudaCompNode::load_cuda(const Locator&, const Locator&) {
    mgb_throw(MegBrainError, "cuda disabled at compile time");
}
void CudaCompNode::sync_all() {}

void CudaCompNode::set_prealloc_config(
        size_t alignment, size_t min_req, size_t max_overhead, double growth_factor) {}

CompNode::DeviceProperties CudaCompNode::get_device_prop(int dev) {
    return CompNode::DeviceProperties{};
}

#undef err

#endif  // MGB_CUDA

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}