comp_node.cpp 37.1 KB
Newer Older
1 2 3 4 5 6 7 8 9 10
#include "./comp_node.h"
#include "megbrain/comp_node_env.h"
#include "megbrain/utils/thread.h"

#include <string>

using namespace mgb;

#if MGB_CUDA

11 12 13 14
#if MEGDNN_WITH_CUDA
#include "cuda_sm_gen.h"
#endif

15 16 17
#include "megbrain/comp_node/alloc.h"

#include <cctype>
18
#include <cstdio>
19
#include <regex>
20 21 22

#include <thread>

23
#include <cuda.h>
24 25
#include <cuda_runtime.h>

26 27
#ifdef __unix__
#include <sys/wait.h>
M
Megvii Engine Team 已提交
28
#include <unistd.h>
29 30
#endif

31 32 33
using CudaCompNodeImpl = CudaCompNode::CompNodeImpl;

namespace {
34 35 36 37 38 39 40
//! Amount of device memory that should be left to the system/driver given
//! the currently available amount: a flat 225 MiB for devices with less
//! than 2 GiB available, otherwise max(300 MiB, available / 20).
size_t get_min_system_memory(size_t available) {
    constexpr size_t MiB = 1024 * 1024;
    if (available < (1u << 31)) {
        // small device: keep a fixed 225 MiB
        return 225 * MiB;
    }
    // max(300 MiB, 0.05 * available)
    const size_t proportional = available / 20;
    const size_t floor_amount = 300 * MiB;
    return proportional > floor_amount ? proportional : floor_amount;
}
using CudaHostFunc = megdnn::thin_function<void()>;
//! Trampoline passed to cudaLaunchHostFunc: invokes the heap-allocated
//! callback stored in \p ud and always deletes it, even if the callback
//! throws.
void CUDART_CB cuda_host_func_caller(void* ud) {
    mgb_assert(ud);
    CudaHostFunc* func_ptr = reinterpret_cast<CudaHostFunc*>(ud);
    MGB_TRY { (*func_ptr)(); }
    MGB_FINALLY(delete func_ptr;);
}
}  // anonymous namespace
51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81

namespace mgb {
namespace mem_alloc {
//! RawAllocator backed directly by cudaMalloc/cudaFree on the current device.
class CudaRawAllocator final : public RawAllocator {
public:
    //! Allocate \p size bytes of device memory; returns nullptr on OOM
    //! (so the caching allocator above can retry after releasing memory),
    //! throws MemAllocError on any other CUDA error.
    void* alloc(size_t size) override {
        void* addr;
        cudaError_t cuda_error = cudaMalloc(&addr, size);
        if (cuda_error == cudaSuccess) {
            mgb_assert(addr);
            return addr;
        }
        auto msg = mgb_ssprintf_log(
                "cudaMalloc failed while requesting %zd bytes (%.3fMiB)"
                " of memory; error: %s",
                size, size / (1024.0 * 1024), cudaGetErrorString(cuda_error));
        msg.append(CudaError::get_cuda_extra_info());
        if (cuda_error == cudaErrorMemoryAllocation) {
            mgb_log_error("%s", msg.c_str());
            // clear cuda error so later API calls do not see the sticky OOM
            cudaGetLastError();
            mgb_assert(cudaGetLastError() == cudaSuccess);
            return nullptr;
        }
        mgb_throw_raw(MemAllocError{msg});
    }

    //! Release a pointer obtained from alloc(); throws MemAllocError on failure.
    void free(void* ptr) override {
        cudaError_t cuda_error = cudaFree(ptr);
        if (cuda_error == cudaSuccess)
            return;
        auto msg = ssprintf(
                "cudaFree failed for %p: %s", ptr, cudaGetErrorString(cuda_error));
        msg.append(CudaError::get_cuda_extra_info());
        mgb_throw_raw(MemAllocError{msg});
    }

    //! Query free/total device memory via cudaMemGetInfo.
    void get_mem_info(size_t& free, size_t& tot) override {
        cudaError_t cuda_error = cudaMemGetInfo(&free, &tot);
        if (cuda_error == cudaSuccess)
            return;
        auto msg = ssprintf("cudaMemGetInfo failed %s", cudaGetErrorString(cuda_error));
        msg.append(CudaError::get_cuda_extra_info());
        mgb_throw_raw(MegBrainError{msg});
    }
};

98 99 100 101
//! RawAllocator for pinned (page-locked) host memory via cudaHostAlloc,
//! used by the host-side caching allocator.
class CudaHostAllocator : public RawAllocator {
public:
    //! Allocate \p size bytes of pinned host memory; returns nullptr on OOM,
    //! throws MemAllocError on any other CUDA error.
    void* alloc(size_t size) override {
        void* addr;
        cudaError_t cuda_error = cudaHostAlloc(&addr, size, cudaHostAllocDefault);
        if (cuda_error == cudaSuccess) {
            mgb_assert(addr);
            return addr;
        }
        auto msg = mgb_ssprintf_log(
                "cudaHostAlloc failed while requesting %zd bytes (%.3fMiB)"
                " of pinned host memory; error: %s",
                size, size / (1024.0 * 1024), cudaGetErrorString(cuda_error));
        msg.append(CudaError::get_cuda_extra_info());
        if (cuda_error == cudaErrorMemoryAllocation) {
            mgb_log_error("%s", msg.c_str());
            // clear cuda error so later API calls do not see the sticky OOM
            cudaGetLastError();
            mgb_assert(cudaGetLastError() == cudaSuccess);
            return nullptr;
        }
        mgb_throw_raw(MemAllocError{msg});
    }

    //! Release pinned memory obtained from alloc().
    void free(void* ptr) override {
        cudaError_t cuda_error = cudaFreeHost(ptr);
        if (cuda_error == cudaSuccess)
            return;
        auto msg = ssprintf(
                "cudaFreeHost failed for %p: %s", ptr, cudaGetErrorString(cuda_error));
        msg.append(CudaError::get_cuda_extra_info());
        mgb_throw_raw(MemAllocError{msg});
    }

    //! Host memory stats are not tracked; report zeros.
    void get_mem_info(size_t& free, size_t& tot) override {
        free = 0;
        tot = 0;
    }
};

138 139
//! DeviceRuntimePolicy implementation that maps the generic device-runtime
//! operations onto the CUDA runtime API.
class CudaDeviceRuntimePolicy : public DeviceRuntimePolicy {
public:
    CompNode::DeviceType device_type() override { return CompNode::DeviceType::CUDA; }
    void set_device(int device) override { MGB_CUDA_CHECK(cudaSetDevice(device)); }
    //! Block until all previously issued work on \p device has completed.
    void device_synchronize(int device) override {
        MGB_CUDA_CHECK(cudaSetDevice(device));
        MGB_CUDA_CHECK(cudaDeviceSynchronize());
    }
};

/* ===================== DevMemAlloc  ===================== */
//! Build a DevMemAlloc that simply forwards every request to cudaMalloc /
//! cudaFree (no caching); see DeviceInfo::init for the cached variant.
std::unique_ptr<DevMemAlloc> DevMemAlloc::make_cuda_alloc() {
    return std::make_unique<FwdDevMemAlloc>(std::make_shared<CudaRawAllocator>());
}
}  // namespace mem_alloc
}  // namespace mgb

/* ===================== CudaCompNodeImpl  ===================== */
156
//! Concrete CompNode implementation for a single (device, stream) pair.
//! Instances live in StaticData::node[] and are initialized asynchronously
//! via init(); most methods call activate() first to bind the CUDA context.
class CudaCompNode::CompNodeImpl final : public CompNode::Impl {
    MGB_DYN_TYPE_OBJ_FINAL_DECL;

    friend class EventImpl;
    friend class CudaCompNode;

    struct DeviceInfo;
    struct StaticData;
    //! process-wide state shared by all CUDA comp nodes; null after global
    //! finalize (see check_global_finalized)
    static StaticData* sd;
    static Spinlock sd_mtx;
#if !MGB_BUILD_SLIM_SERVING
    //! guards ptr2size and the per-device usage counters
    std::mutex m_update_mem;
#endif

    //! set to true when m_locator is assigned; set to false if async init
    //! failed
    bool m_initialized = false;
    Locator m_locator, m_locator_logical;
    mem_alloc::StreamMemAlloc* m_mem_alloc;
    DeviceInfo* m_device_info;

    //! lazily-created event used by sync(); guarded by m_sync_event_mtx
    std::unique_ptr<Event> m_sync_event;
    Spinlock m_sync_event_mtx;

    //! make this node's CUDA context current on the calling thread
    void activate() { m_env.cuda_env().activate(); }

    void init(const Locator& locator, const Locator& locator_logical);
    void fini();

    //! return whether global finalized, and print warning in such case
    static inline bool check_global_finalized();

    static CompNode::DeviceProperties get_device_prop(int dev);

    //! enable peer copy from dev0 to dev1
    static void enable_peer_access(int dev0, int dev1);

    static void static_free_device(ImplBase* self, void* ptr) {
        static_cast<CompNodeImpl*>(self)->free_device(ptr);
    }

    static void static_free_host(ImplBase* self, void* ptr) {
        static_cast<CompNodeImpl*>(self)->free_host(ptr);
    }

public:
    CompNodeImpl() : Impl(static_free_device, static_free_host) {}

    static constexpr int MAX_NR_COMP_NODE = 1024, MAX_NR_DEVICE = 64;

    void* alloc_device(size_t size) override;

    void free_device(void* ptr);

    void* alloc_host(size_t size) override;

    void free_host(void* ptr);

    //! async D2H copy on this node's stream; caller must sync before reading
    void copy_to_host(void* host_ptr, const void* device_ptr, size_t size) override {
        activate();
        MGB_CUDA_CHECK(cudaMemcpyAsync(
                host_ptr, device_ptr, size, cudaMemcpyDeviceToHost,
                m_env.cuda_env().stream));
    }

    //! async H2D copy on this node's stream
    void copy_to_device(void* device_ptr, const void* host_ptr, size_t size) override {
        activate();
        MGB_CUDA_CHECK(cudaMemcpyAsync(
                device_ptr, host_ptr, size, cudaMemcpyHostToDevice,
                m_env.cuda_env().stream));
    }

    void peer_copy_to(
            Impl* dest_impl, void* dest, const void* src, size_t size) override;

    size_t get_mem_addr_alignment() override { return m_env.property().mem_alignment; }

    std::unique_ptr<Event> create_event(size_t flags) override;

    void sync() override;

    MemNode mem_node() override;

    //! returns {total, free}; free includes memory cached by the allocator
    std::pair<size_t, size_t> get_mem_status_bytes() override {
        // explicitly call cuda_env() to ensure async init is finished
        m_env.cuda_env().activate();
        size_t tot, free;
        MGB_CUDA_CHECK(cudaMemGetInfo(&free, &tot));
        free += m_mem_alloc->get_free_memory_dev().tot;
        return {tot, free};
    }

#if !MGB_BUILD_SLIM_SERVING
    std::pair<size_t, size_t> get_free_left_and_right(
            size_t begin_ptr, size_t end_ptr) override {
        return m_mem_alloc->get_free_left_and_right(begin_ptr, end_ptr);
    }

    size_t get_max_block_size_available() override {
        activate();
        return m_mem_alloc->get_max_block_size_available();
    }

    //! free memory as reported by the driver (excludes allocator cache)
    size_t get_free_mem() override {
        m_env.cuda_env().activate();
        size_t tot, free;
        MGB_CUDA_CHECK(cudaMemGetInfo(&free, &tot));
        return free;
    }
#endif

    Locator locator() override { return m_locator; }

    Locator locator_logical() override { return m_locator_logical; }

    //! enqueue a host callback on this node's stream; requires CUDA >= 10.0
    void add_callback(CudaHostFunc&& cb) override {
#if CUDART_VERSION >= 10000
        activate();
        CudaHostFunc* func_ptr = new CudaHostFunc(std::move(cb));
        MGB_TRY {
            MGB_CUDA_CHECK(cudaLaunchHostFunc(
                    m_env.cuda_env().stream, cuda_host_func_caller,
                    static_cast<void*>(func_ptr)));
        }
        MGB_CATCH(..., {
            // caller keeps ownership semantics: free on launch failure
            delete func_ptr;
            throw;
        });
#else
        MGB_MARK_USED_VAR(cb);
        MGB_MARK_USED_VAR(cuda_host_func_caller);
        mgb_throw(
                MegBrainError,
                "add_callback only support in cuda10.0 and later version");
#endif
    }

    uint64_t get_uid() override { return m_uid; }

#if !MGB_BUILD_SLIM_SERVING
    size_t get_used_memory() override;

    size_t get_max_used_memory() override;

    size_t get_reserved_memory() override;

    size_t get_max_reserved_memory() override;

    void reset_max_used_memory() override;
    void reset_max_reserved_memory() override;
#endif

private:
    //! random identifier assigned in init(); see init() for the source
    uint64_t m_uid;
#if !MGB_BUILD_SLIM_SERVING
    //! size of each live allocation, for usage accounting in free_device()
    std::unordered_map<void*, size_t> ptr2size;
#endif
};
MGB_DYN_TYPE_OBJ_FINAL_IMPL(CudaCompNode::CompNodeImpl);

//! Per-physical-device state shared by all comp nodes on that device.
struct CudaCompNodeImpl::DeviceInfo {
    int dev_num = -1;
    //! bytes currently allocated through alloc_device()
    std::atomic_size_t m_used_mem{0};
    //! high-water mark of m_used_mem; reset via reset_max_used_memory()
    std::atomic_size_t m_max_used_mem{0};
    std::unique_ptr<mem_alloc::DevMemAlloc> mem_alloc;

    bool init_done() const { return mem_alloc.get(); }

    void init(const CompNodeEnv& env);

    void fini() { mem_alloc.reset(); }
};

//! Process-wide singleton (CudaCompNodeImpl::sd) holding all comp node and
//! device records plus the pinned-host caching allocator.
struct CudaCompNodeImpl::StaticData {
    std::recursive_mutex mtx;

    mem_alloc::DevMemAlloc::PreAllocConfig prealloc_config;

    std::unique_ptr<mem_alloc::SimpleCachingAlloc> host_alloc;
    CudaCompNode::CompNodeImpl node[MAX_NR_COMP_NODE];
    DeviceInfo dev_info[MAX_NR_DEVICE];
    int nr_node = 0,          //!< number of loaded node[]
            nr_dev_used = 0;  //!< number of used dev_info[]

    StaticData()
            : host_alloc(mem_alloc::SimpleCachingAlloc::make(
                      std::make_unique<mem_alloc::CudaHostAllocator>())) {
        prealloc_config.max_overhead = 0;
        prealloc_config.alignment = 1;
        host_alloc->alignment(1);
    }

    ~StaticData() {
        for (int i = 0; i < nr_node; ++i)
            node[i].fini();
        for (int i = 0; i < nr_dev_used; ++i)
            dev_info[i].fini();
    }

    //! Number of bytes to pre-reserve on each device, controlled by the
    //! MGB_CUDA_RESERVE_MEMORY env var: unset -> 0 (no reservation);
    //! "b:<n>" -> exactly n bytes; any other value -> all currently free
    //! memory minus a system reserve.
    static size_t get_mem_reserve_size() {
        if (auto setting = MGB_GETENV("MGB_CUDA_RESERVE_MEMORY")) {
            if (!strncmp(setting, "b:", 2)) {
                return std::stoull(setting + 2);
            }
            size_t tot, free;
            // cudaFree(0) forces context creation so cudaMemGetInfo is valid
            MGB_CUDA_CHECK(cudaFree(0));
            MGB_CUDA_CHECK(cudaMemGetInfo(&free, &tot));
            // NOTE(review): underflows if free < get_min_system_memory(free)
            // (i.e. less than ~225MiB free) — presumably never hit in
            // practice; confirm before relying on this path on tiny devices
            return free - get_min_system_memory(free);
        } else {
            return 0;
        }
    }
};
CudaCompNodeImpl::StaticData* CudaCompNodeImpl::sd = nullptr;
Spinlock CudaCompNodeImpl::sd_mtx;

372 373 374 375 376 377 378
//! Lazily-filled cache entry for CompNode::DeviceProperties of one device;
//! `init` marks whether `prop` is valid, `mtx_com` guards the fill.
struct DevicePropRec {
    bool init = false;
    CompNode::DeviceProperties prop;
    Spinlock mtx_com;
};
DevicePropRec device_prop_rec[CudaCompNodeImpl::MAX_NR_DEVICE];

M
Megvii Engine Team 已提交
379
void CudaCompNodeImpl::init(const Locator& locator, const Locator& locator_logical) {
380 381 382 383
    m_locator = locator;
    m_locator_logical = locator_logical;
    m_initialized = true;

384
#if defined(__linux__) || defined(TARGET_OS_MAC)
385
    FILE* fp;
386 387 388 389 390
    fp = fopen("/dev/urandom", "r");
    mgb_assert(fread(&m_uid, sizeof(m_uid), 1, fp) == 1);
    fclose(fp);
#else
    m_uid = std::chrono::duration_cast<std::chrono::nanoseconds>(
391 392
                    std::chrono::system_clock::now().time_since_epoch())
                    .count();
393 394
#endif

395 396 397 398 399
    auto on_succ = [this](cudaStream_t stream) {
        auto locator = m_locator;
        log_comp_node_created(locator, m_locator_logical);

        MGB_LOCK_GUARD(sd->mtx);
400 401
        DeviceInfo* dev_info = nullptr;
        for (int i = 0; i < sd->nr_dev_used; ++i) {
402 403 404 405 406 407 408 409 410 411
            if (sd->dev_info[i].dev_num == locator.device) {
                dev_info = &sd->dev_info[i];
                break;
            }
        }

        if (!dev_info) {
            dev_info = &sd->dev_info[sd->nr_dev_used];
            dev_info->init(m_env);
            // note: add nr_dev_used only after init succeeds
412
            ++sd->nr_dev_used;
413 414
        }
        m_device_info = dev_info;
M
Megvii Engine Team 已提交
415
        m_mem_alloc = dev_info->mem_alloc->add_stream(static_cast<void*>(stream));
416 417 418 419 420 421 422
    };

    auto on_error = [this](std::exception&) {
        MGB_LOCK_GUARD(sd->mtx);
        m_initialized = false;
    };

M
Megvii Engine Team 已提交
423 424
    m_env.init_cuda_async(
            locator.device, make_comp_node_from_impl(this), {on_succ, on_error});
425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481
#if MEGDNN_WITH_CUDA
    auto cur_prop = CudaCompNode::get_device_prop(locator.device);
    auto cur_sm =
            std::string("sm_") + std::to_string(cur_prop.major * 10 + cur_prop.minor);
    const std::string mge_gen_code = MGE_CUDA_GENCODE;
    std::regex re("sm_([0-9]+)");
    std::vector<std::string> build_sm(
            std::sregex_token_iterator(mge_gen_code.begin(), mge_gen_code.end(), re),
            std::sregex_token_iterator());

    if (std::find(build_sm.begin(), build_sm.end(), cur_sm) == build_sm.end()) {
        std::string build_sm_info = "";
        for (auto&& s : build_sm) {
            build_sm_info += std::string(" ") + s;
        }

        std::vector<int> support_gpu;
        for (int i = 0; i < get_device_count(); i++) {
            auto prop = CudaCompNode::get_device_prop(i);
            auto sm = std::string("sm_") + std::to_string(prop.major * 10 + prop.minor);
            if (std::find(build_sm.begin(), build_sm.end(), sm) != build_sm.end()) {
                support_gpu.emplace_back(i);
            }
        }

        if (support_gpu.size() == 0) {
            mgb_throw(
                    MegBrainError,
                    "%s(gpu%d) with CUDA capability %s is not compatible with the "
                    "current MegEngine installation. The current MegEngine install "
                    "supports CUDA capabilities%s. If you want to use the %s(gpu%d) "
                    "with MegEngine, please check the instructions at "
                    "https://github.com/MegEngine/MegEngine/blob/master/scripts/"
                    "cmake-build/BUILD_README.md",
                    cur_prop.name.c_str(), locator.device, cur_sm.c_str(),
                    build_sm_info.c_str(), cur_prop.name.c_str(), locator.device);
        } else {
            std::string support_gpu_info = "";
            for (auto&& g : support_gpu) {
                support_gpu_info += std::string(" gpu") + std::to_string(g);
            }
            mgb_throw(
                    MegBrainError,
                    "%s(gpu%d) with CUDA capability %s is not compatible with the "
                    "current MegEngine installation. The current MegEngine install "
                    "supports CUDA capabilities%s. You can try to use%s instead or "
                    "config CUDA_VISIBLE_DEVICES to chosse anthor cuda card.If you "
                    "really want to use the %s(gpu%d) with MegEngine, please check the "
                    "instructions at "
                    "https://github.com/MegEngine/MegEngine/blob/master/scripts/"
                    "cmake-build/BUILD_README.md",
                    cur_prop.name.c_str(), locator.device, cur_sm.c_str(),
                    build_sm_info.c_str(), support_gpu_info.c_str(),
                    cur_prop.name.c_str(), locator.device);
        }
    }
#endif
482 483 484 485 486 487 488 489 490 491 492 493 494
}

//! Tear down this comp node: release the sync event before the env so no
//! CUDA object outlives its context, then drop allocator/device references.
void CudaCompNodeImpl::fini() {
    if (!m_initialized)
        return;

    m_sync_event.reset();
    m_env.fini();
    m_mem_alloc = nullptr;
    m_device_info = nullptr;
    m_initialized = false;
}

495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512
//! Allocate \p size bytes of device memory through the caching allocator;
//! in non-slim builds also records the allocation size and updates the
//! per-device used/max-used counters (guarded by m_update_mem).
void* CudaCompNodeImpl::alloc_device(size_t size) {
    activate();
#if MGB_BUILD_SLIM_SERVING
    return m_mem_alloc->alloc(size);
#else
    void* ptr = m_mem_alloc->alloc(size);
    {
        MGB_LOCK_GUARD(m_update_mem);
        ptr2size[ptr] = size;
        m_device_info->m_used_mem += size;
        if (m_device_info->m_used_mem > m_device_info->m_max_used_mem) {
            m_device_info->m_max_used_mem = m_device_info->m_used_mem.load();
        }
    }
    return ptr;
#endif
}

513
//! Return \p ptr to the caching allocator; no-op after global finalize.
//! In non-slim builds also removes the bookkeeping entry added by
//! alloc_device() and decrements the per-device usage counter.
void CudaCompNodeImpl::free_device(void* ptr) {
    if (check_global_finalized())
        return;

    activate();
#if !MGB_BUILD_SLIM_SERVING
    {
        MGB_LOCK_GUARD(m_update_mem);
        mgb_assert(ptr2size.find(ptr) != ptr2size.end(), "ptr %p not found!", ptr);
        m_device_info->m_used_mem -= ptr2size.at(ptr);
        ptr2size.erase(ptr);
    }
#endif
    m_mem_alloc->free(ptr);
}

529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554
#if !MGB_BUILD_SLIM_SERVING
//! bytes currently handed out via alloc_device() on this node's device
size_t CudaCompNodeImpl::get_used_memory() {
    return m_device_info->m_used_mem.load();
}

//! high-water mark of get_used_memory() since last reset
size_t CudaCompNodeImpl::get_max_used_memory() {
    return m_device_info->m_max_used_mem.load();
}

void CudaCompNodeImpl::reset_max_used_memory() {
    m_device_info->m_max_used_mem = 0;
}

//! bytes the caching allocator has reserved from the driver
size_t CudaCompNodeImpl::get_reserved_memory() {
    return m_device_info->mem_alloc->get_used_memory();
}

//! high-water mark of reserved memory since last reset
size_t CudaCompNodeImpl::get_max_reserved_memory() {
    return m_device_info->mem_alloc->get_max_used_memory();
}

void CudaCompNodeImpl::reset_max_reserved_memory() {
    m_device_info->mem_alloc->reset_max_used_memory();
}
#endif

555
//! Allocate pinned host memory from the shared host caching allocator.
void* CudaCompNodeImpl::alloc_host(size_t size) {
    // need activate() because it creates the cuda context on the current
    // device, which cudaHostAlloc requires
    activate();
    return sd->host_alloc->alloc(size);
}

//! Return pinned host memory to the shared allocator; no-op after global
//! finalize.
void CudaCompNodeImpl::free_host(void* ptr) {
    if (check_global_finalized())
        return;
    sd->host_alloc->free(ptr);
}

M
Megvii Engine Team 已提交
567 568
//! Copy \p size bytes from this node's device memory to another comp node:
//! same CUDA device -> plain async D2D copy; different CUDA devices ->
//! enable peer access both ways then cudaMemcpyPeerAsync; CPU destination
//! -> synchronous D2H copy dispatched on the CPU node's queue.
void CudaCompNodeImpl::peer_copy_to(
        Impl* dest_impl, void* dest, const void* src, size_t size) {
    if (dest_impl->same_type<CudaCompNodeImpl>()) {
        auto&& dst_env = static_cast<CudaCompNodeImpl*>(dest_impl)->m_env.cuda_env();
        auto&& src_env = m_env.cuda_env();
        activate();
        if (dst_env.device == src_env.device) {
            MGB_CUDA_CHECK(cudaMemcpyAsync(
                    dest, src, size, cudaMemcpyDeviceToDevice, dst_env.stream));
        } else {
            enable_peer_access(src_env.device, dst_env.device);
            enable_peer_access(dst_env.device, src_env.device);
            MGB_CUDA_CHECK(cudaMemcpyPeerAsync(
                    dest, dst_env.device, src, src_env.device, size, dst_env.stream));
        }
        return;
    }
    mgb_assert(
            dest_impl->env().property().type == DeviceType::CPU,
            "cuda peer_copy_to only implemented for CPU");
    auto copy = [this, dest, src, size]() {
        auto stream = m_env.cuda_env().stream;
        m_env.cuda_env().activate();
        MGB_CUDA_CHECK(
                cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream));
        // block until the copy lands so the CPU side sees complete data
        MGB_CUDA_CHECK(cudaStreamSynchronize(stream));
    };
    dest_impl->env().cpu_env().dispatch(copy);
}

MemNode CudaCompNodeImpl::mem_node() {
    // m_device_info would be null before async init finishes; so we just
    // return a private pointer related to the device number here
    return MemNode{sd->dev_info + m_locator.device};
}

//! Wait until all work previously submitted from the calling thread on this
//! node's stream has finished, via a record/host_wait on a cached event.
void CudaCompNodeImpl::sync() {
    activate();

    // do not use MGB_CUDA_CHECK(cudaStreamSynchronize(m_env->stream)) since
    // other threads may be adding operations into the stream, and we only care
    // about previous operations in current thread. However docs of
    // cudaStreamSynchronize did not describe details of such condition, so we
    // use manual event implementation

    Event* event;
    {
        MGB_LOCK_GUARD(m_sync_event_mtx);
        if (!m_sync_event)
            m_sync_event = create_event(0);
        event = m_sync_event.get();
    }
    event->record();
    event->host_wait();
}

void CudaCompNodeImpl::enable_peer_access(int dev0, int dev1) {
624
    static bool already_enabled[MAX_NR_DEVICE][MAX_NR_DEVICE];
625 626 627 628 629 630 631 632 633 634 635 636 637 638 639
    if (already_enabled[dev0][dev1])
        return;

    static std::mutex global_lock;
    MGB_LOCK_GUARD(global_lock);
    if (already_enabled[dev0][dev1])
        return;

    int can;
    MGB_CUDA_CHECK(cudaDeviceCanAccessPeer(&can, dev0, dev1));
    if (can) {
        mgb_log("enable peer access from GPU %d to GPU %d", dev0, dev1);
        MGB_CUDA_CHECK(cudaSetDevice(dev0));
        auto err = cudaDeviceEnablePeerAccess(dev1, 0);
        if (err != cudaSuccess) {
M
Megvii Engine Team 已提交
640 641 642
            mgb_log_error(
                    "failed to enable peer access from %d to %d: %s(%d)", dev0, dev1,
                    cudaGetErrorString(err), static_cast<int>(err));
643 644 645 646 647 648 649 650 651 652 653 654
            cudaGetLastError();
        }
    }

    // check for cudaMemcpyPeer usable
    int v0 = 1, v1 = 2;

    int *dp0, *dp1;
    MGB_CUDA_CHECK(cudaSetDevice(dev0));
    MGB_CUDA_CHECK(cudaMalloc(&dp0, sizeof(int)));
    MGB_CUDA_CHECK(cudaSetDevice(dev1));
    MGB_CUDA_CHECK(cudaMalloc(&dp1, sizeof(int)));
655 656
    MGB_CUDA_CHECK(cudaMemcpy(dp0, &v0, sizeof(int), cudaMemcpyHostToDevice));
    MGB_CUDA_CHECK(cudaMemcpy(dp1, &v1, sizeof(int), cudaMemcpyHostToDevice));
657 658
    MGB_CUDA_CHECK(cudaMemcpyPeer(dp1, dev1, dp0, dev0, sizeof(int)));
    int get = 0;
659
    MGB_CUDA_CHECK(cudaMemcpy(&get, dp1, sizeof(int), cudaMemcpyDeviceToHost));
660

M
Megvii Engine Team 已提交
661 662 663 664 665
    mgb_throw_if(
            get != 1, CudaError,
            "P2P copy (%d => %d) check failed; consider disabling "
            "Access Control Services(ACS) for the PCI device",
            dev0, dev1);
666 667 668 669 670 671

    already_enabled[dev0][dev1] = true;
}

/* ===================== CudaCompNodeImpl::DeviceInfo  ===================== */

672
//! One-time setup of the per-device caching allocator: activates the
//! device, applies the reservation size from MGB_CUDA_RESERVE_MEMORY and
//! the global prealloc/alignment config.
void CudaCompNodeImpl::DeviceInfo::init(const CompNodeEnv& env) {
    mgb_assert(!mem_alloc);
#if 0
    // forward cudaMalloc
    mem_alloc = mem_alloc::DevMemAlloc::make_cuda_alloc();
#else
    auto&& cuenv = env.cuda_env();
    cuenv.activate();
    dev_num = cuenv.device;
    auto reserve_size = StaticData::get_mem_reserve_size();
    mem_alloc = mem_alloc::DevMemAlloc::make(
            dev_num, reserve_size, std::make_shared<mem_alloc::CudaRawAllocator>(),
            std::make_shared<mem_alloc::CudaDeviceRuntimePolicy>());
    mem_alloc->prealloc_config(sd->prealloc_config);
    auto align = env.property().mem_alignment;
    mem_alloc->alignment(align);
    mgb_log_debug(
            "cuda: gpu%d: name=`%s' dyn_mem_reserve=%.2fMiB alignment=0x%zx", dev_num,
            cuenv.device_prop.name, reserve_size / 1024.0 / 1024, align);
#endif
}

//! Returns true when the global StaticData has already been destroyed (or,
//! on Windows, when the CUDA runtime has been unloaded at process exit), in
//! which case callers must skip CUDA cleanup and let the OS reclaim.
bool CudaCompNodeImpl::check_global_finalized() {
    if (!sd) {
        static std::atomic_flag warn_printed = ATOMIC_FLAG_INIT;
        if (!warn_printed.test_and_set()) {
            mgb_log_debug("cuda comp node method called after global finalize");
        }
        return true;
    }
#if MGB_CUDA && defined(WIN32)
    //! FIXME: windows cuda driver shuts down before atexit functions run,
    //! even when registered after cuda driver init! as a workaround let
    //! the OS recover the resources temporarily; may need to remove this
    //! after upgrading the cuda runtime
    if (CudaCompNode::is_into_atexit) {
        mgb_log_debug(
                "windows cudaErrorCudartUnloading happened!!, resource "
                "recovery by OS!!");
        return true;
    }
    //! FIXME: megengine links dynamically with VCRT; atexit function tables
    //! have an ordering issue which leads to the cuda runtime unloading —
    //! this always happens when python3 unloads the dll (i.e. python3 is
    //! exiting). as a workaround let the OS recover the resources
    //! temporarily; may need to remove this after upgrading the cuda runtime
    int dev = -1;
    if (cudaErrorCudartUnloading == cudaGetDevice(&dev)) {
        mgb_log_debug(
                "windows cudaErrorCudartUnloading happened!!, resource "
                "recovery by OS!!");
        return true;
    }

#endif
    return false;
}

/* ===================== EventImpl  ===================== */

732
//! Event implementation backed by cudaEvent_t; timing support is only
//! enabled when created with the NEED_TIMER flag.
class CudaCompNode::EventImpl final : public EventImplHelper {
    //! set after cudaEventCreateWithFlags succeeds, so the destructor only
    //! destroys a fully-constructed event
    bool m_init_finished = false;
    CudaCompNodeImpl* const m_comp_node_impl;
    cudaEvent_t m_cuda_event;

    //! record the event on the comp node's stream
    void do_record() override {
        m_comp_node_impl->activate();
        auto&& env = m_comp_node_impl->m_env.cuda_env();
        MGB_CUDA_CHECK(cudaEventRecord(m_cuda_event, env.stream));
    }

    //! non-blocking query; throws CudaError on any status other than
    //! success / not-ready
    bool do_finished() override {
        m_comp_node_impl->activate();
        cudaError_t err = cudaEventQuery(m_cuda_event);
        if (err == cudaSuccess)
            return true;
        if (err == cudaErrorNotReady)
            return false;
        mgb_throw(
                CudaError, "failed to query event: %d: %s", int(err),
                cudaGetErrorString(err));
    }

    void host_wait_cv() override { MGB_CUDA_CHECK(cudaEventSynchronize(m_cuda_event)); }

    //! elapsed seconds from this event to \p end (CUDA reports milliseconds)
    double do_elapsed_time_until(EventImplHelper& end) override {
        m_comp_node_impl->activate();
        float ret = 0.0;
        MGB_CUDA_CHECK(cudaEventElapsedTime(
                &ret, m_cuda_event, static_cast<EventImpl&>(end).m_cuda_event));
        return static_cast<double>(ret) * 1e-3;
    }

    void do_device_wait_by(Impl* cn_impl) override;

public:
    EventImpl(CudaCompNodeImpl* comp_node_impl, size_t create_flags)
            : EventImplHelper(comp_node_impl, create_flags),
              m_comp_node_impl{comp_node_impl} {
        m_comp_node_impl->activate();
        // disable timing unless requested: timing-free events are cheaper
        size_t cuda_flags = cudaEventDisableTiming;
        if (create_flags & NEED_TIMER)
            cuda_flags = 0;
        MGB_CUDA_CHECK(cudaEventCreateWithFlags(&m_cuda_event, cuda_flags));
        m_init_finished = true;
    }

    ~EventImpl() {
        if (m_init_finished) {
            MGB_TRY { MGB_CUDA_CHECK(cudaEventDestroy(m_cuda_event)); }
            MGB_CATCH(MegBrainError & exc, {
                // never throw from a destructor; just log
                mgb_log_error("failed to destroy cuda event: %s", exc.what());
            })
        }
    }
};

789
//! Factory for CUDA events on this comp node; \p flags may include
//! NEED_TIMER to enable elapsed-time queries.
std::unique_ptr<CompNode::Event> CudaCompNodeImpl::create_event(size_t flags) {
    return std::make_unique<EventImpl>(this, flags);
}

793
//! Make \p cn_impl wait for this event: CUDA nodes get a stream-side wait
//! (cudaStreamWaitEvent, non-blocking for the host); CPU nodes get a
//! blocking cudaEventSynchronize queued on their dispatch queue.
void CudaCompNode::EventImpl::do_device_wait_by(Impl* cn_impl) {
    if (cn_impl->dyn_typeinfo() == CudaCompNodeImpl::typeinfo()) {
        auto imp = static_cast<CudaCompNodeImpl*>(cn_impl);
        auto stream = imp->m_env.cuda_env().stream;
        imp->activate();
        MGB_CUDA_CHECK(cudaStreamWaitEvent(stream, m_cuda_event, 0));
        return;
    }
    if (cn_impl->env().property().type == DeviceType::CPU) {
        auto waiter = [this]() { MGB_CUDA_CHECK(cudaEventSynchronize(m_cuda_event)); };
        cn_impl->add_callback(std::move(waiter));
        return;
    }
    mgb_throw(MegBrainError, "unimplemented event device_wait_by config");
}

/* ===================== CudaCompNode static methods ===================== */

811 812 813
namespace {

#ifndef __unix__
814 815
//! Non-unix fallback: no fork() available, so just initialize the driver in
//! this process and call \p func directly. \p val / \p len are unused here;
//! they exist to match the unix overload, where the child writes the result
//! through them.
template <typename Func, typename Val>
CUresult call_cuda_forksafe(Func func, Val* val, size_t len) {
    cuInit(0);
    return func();
}
#else
//! RAII guard owning a file descriptor: closes it on destruction unless
//! close() was already called explicitly.
struct RAIICloseFD : NonCopyableObj {
    int m_fd = -1;

    RAIICloseFD(int fd) : m_fd(fd) {}
    ~RAIICloseFD() { close(); }
    //! idempotent explicit close
    void close() {
        if (m_fd != -1) {
            ::close(m_fd);
            m_fd = -1;
        }
    }
};
// an implementation that does not call cuInit in this process: if the CUDA
// driver is still uninitialized, fork a child that runs cuInit + func and
// reports (err, *val) back through a pipe, so the parent process stays
// fork-safe (CUDA contexts do not survive fork()).
template <typename Func, typename Val>
CUresult call_cuda_forksafe(Func func, Val* val, size_t len) {
    int t_ndev;
    // use cuDeviceGetCount to detect cuda initialization to avoid abnormal behavior
    auto err = cuDeviceGetCount(&t_ndev);
    if (err != CUDA_ERROR_NOT_INITIALIZED)
        return func();
    // cuInit not called, call it in child process
    int fd[2];
    mgb_assert(pipe(fd) == 0, "pipe() failed");
    int fdr = fd[0], fdw = fd[1];
    RAIICloseFD fdr_guard(fdr);
    RAIICloseFD fdw_guard(fdw);
    auto cpid = fork();
    mgb_assert(cpid != -1, "fork() failed");
    if (cpid == 0) {
        // child: init the driver, run func, send err (and val on success)
        fdr_guard.close();
        do {
            err = cuInit(0);
            if (err != CUDA_SUCCESS)
                break;
            err = func();
        } while (0);
        auto sz = write(fdw, &err, sizeof(err));
        if (sz == sizeof(err) && err == CUDA_SUCCESS) {
            sz = write(fdw, val, sizeof(*val) * len);
        }
        fdw_guard.close();
        // quick_exit: skip atexit handlers that would touch parent state
        std::quick_exit(0);
    }
    // parent: read the child's error code and, on success, the value
    fdw_guard.close();
    auto sz = read(fdr, &err, sizeof(err));
    mgb_assert(sz == sizeof(err), "failed to read error code from child");
    if (err == CUDA_SUCCESS) {
        sz = read(fdr, val, sizeof(*val) * len);
        mgb_assert(
                static_cast<size_t>(sz) == sizeof(*val) * len,
                "failed to read value from child");
        return err;
    }
    // try again, maybe another thread called cuInit while we fork
    auto err2 = func();
    if (err2 == CUDA_SUCCESS)
        return err2;
    if (err2 == CUDA_ERROR_NOT_INITIALIZED)
        return err;
    return err2;
}
881 882 883 884 885
#endif

const char* cu_get_error_string(CUresult err) {
    const char* ret = nullptr;
    cuGetErrorString(err, &ret);
886 887 888 889
    if (!ret) {
        //! caused by cuda stub do not find driver
        ret = "invalid_stub_call";
    }
890 891 892
    return ret;
}

893 894 895 896 897 898 899 900 901 902 903
#define MGB_CALL_CUDA_FORKSAFE_NOASSERT(func, ptr, len, ...) \
    call_cuda_forksafe([&]() { return func(ptr, ##__VA_ARGS__); }, ptr, len)

#define MGB_CALL_CUDA_FORKSAFE(func, ptr, len, ...)                                \
    {                                                                              \
        auto err = MGB_CALL_CUDA_FORKSAFE_NOASSERT(func, ptr, len, ##__VA_ARGS__); \
        if (err != CUDA_SUCCESS) {                                                 \
            auto err_s = cu_get_error_string(err);                                 \
            mgb_log_error(#func " failed: %s (err %d)", err_s, int(err));          \
        }                                                                          \
    }
M
Megvii Engine Team 已提交
904
}  // namespace
905

906 907 908 909 910 911
bool CudaCompNode::available() {
    static int result = -1;
    static Spinlock mtx;
    MGB_LOCK_GUARD(mtx);
    if (result == -1) {
        int ndev = -1;
912
        auto err = MGB_CALL_CUDA_FORKSAFE_NOASSERT(cuDeviceGetCount, &ndev, 1);
913
        result = err == CUDA_SUCCESS && ndev > 0;
914 915 916
        auto err_s = cu_get_error_string(err);
        //! only show !CUDA_SUCCESS log when with valid stub call
        if (!result && (std::string(err_s) != "invalid_stub_call")) {
M
Megvii Engine Team 已提交
917
            mgb_log_warn(
918 919
                    "cuda unavailable: %s(%d) ndev=%d", err_s, static_cast<int>(err),
                    ndev);
920
        }
921
        if (err == CUDA_ERROR_NOT_INITIALIZED) {
922
            mgb_throw(std::runtime_error, "cuda initialization error.");
923 924 925 926 927 928 929 930 931 932 933 934 935 936 937
        }
    }
    return result;
}

//! tear down the global CUDA comp-node state: synchronize every device,
//! then destroy the static data (no-op when it was never created)
void CudaCompNode::finalize() {
    auto sd_ptr = CudaCompNodeImpl::sd;
    if (!sd_ptr)
        return;
    sync_all();
    // clear the global pointer before destruction so late readers see null
    CudaCompNodeImpl::sd = nullptr;
    sd_ptr->~StaticData();
}

#if MGB_CUDA && defined(WIN32)
//! FIXME: windows cuda driver shutdown before call atexit function even
//! register atexit function after init cuda driver! as a workround
//! recovery resource by OS temporarily, may need remove this after
//! upgrade cuda runtime
bool CudaCompNode::is_into_atexit = false;
#endif
M
Megvii Engine Team 已提交
945 946
CompNode::Impl* CudaCompNode::load_cuda(
        const Locator& locator, const Locator& locator_logical) {
947
    int nr_gpu = get_device_count();
948 949 950 951 952 953 954 955 956 957
#if MGB_CUDA && defined(WIN32)
    //! FIXME: windows cuda driver shutdown before call atexit function even
    //! register atexit function after init cuda driver! as a workround
    //! recovery resource by OS temporarily, may need remove this after
    //! upgrade cuda runtime
    if (!is_into_atexit) {
        auto err = atexit([] { is_into_atexit = true; });
        mgb_assert(!err, "failed to register atexit function");
    }
#endif
M
Megvii Engine Team 已提交
958 959 960
    mgb_assert(
            locator.device >= 0 && locator.device < nr_gpu,
            "request gpu%d out of valid range [0, %d)", locator.device, nr_gpu);
961

962
    auto&& sdptr = CudaCompNodeImpl::sd;
963 964 965 966 967 968 969
    {
        MGB_LOCK_GUARD(CudaCompNodeImpl::sd_mtx);
        if (!sdptr) {
            // use static storage so object can be safely accessed even after
            // global finalize
            using T = CudaCompNodeImpl::StaticData;
            static std::aligned_storage_t<sizeof(T), alignof(T)> storage;
970
            sdptr = new (&storage) T;
971 972
        }
    }
973
    auto&& sd = *sdptr;
974 975
    MGB_LOCK_GUARD(sd.mtx);

976 977 978
    CompNodeImpl* available_node = nullptr;
    for (int i = 0; i < sd.nr_node; ++i) {
        auto&& cur = sd.node[i];
979
        if (cur.m_initialized) {
M
Megvii Engine Team 已提交
980
            if (cur.m_locator == locator && cur.m_locator_logical == locator_logical) {
981 982 983 984 985 986 987 988
                return &cur;
            }
        } else {
            available_node = &cur;
        }
    }

    if (!available_node) {
989 990 991
        mgb_assert(
                sd.nr_node < CompNodeImpl::MAX_NR_COMP_NODE,
                "too many CompNode allocated");
992
        available_node = &sd.node[sd.nr_node++];
993
    }
994
    mgb_assert(locator.device < CompNodeImpl::MAX_NR_DEVICE, "device number too large");
995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008

    mgb_assert(!available_node->m_initialized);
    available_node->init(locator, locator_logical);

    return available_node;
}

void CudaCompNode::try_coalesce_all_free_memory() {
    // TODO: optimized implementation
    auto sd = CudaCompNodeImpl::sd;
    if (!sd)
        return;

    size_t size = 0;
1009
    for (int i = 0; i < sd->nr_dev_used; ++i) {
M
Megvii Engine Team 已提交
1010
        size += sd->dev_info[i].mem_alloc->gather_stream_free_blk_and_release_full();
1011 1012
    }
    if (size) {
M
Megvii Engine Team 已提交
1013
        mgb_log_debug("%zu bytes freed by try_coalesce_all_free_memory()", size);
1014 1015 1016 1017 1018 1019 1020 1021
    }
}

void CudaCompNode::sync_all() {
    auto sd = CudaCompNodeImpl::sd;
    if (!sd)
        return;

1022
    for (int i = 0;; ++i) {
1023
        // ensure async init finished
1024
        CompNodeEnv* env;
1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041
        {
            MGB_LOCK_GUARD(sd->mtx);
            if (i >= sd->nr_node) {
                break;
            }
            env = &sd->node[i].env();
        }
        env->cuda_env();
    }

    MGB_LOCK_GUARD(sd->mtx);
    for (int i = 0; i < sd->nr_dev_used; ++i) {
        MGB_CUDA_CHECK(cudaSetDevice(sd->dev_info[i].dev_num));
        MGB_CUDA_CHECK(cudaDeviceSynchronize());
    }
}

1042
void CudaCompNode::foreach (thin_function<void(CompNode)> callback) {
1043 1044 1045 1046
    auto sd = CudaCompNodeImpl::sd;
    if (!sd)
        return;

1047
    for (int i = 0;; ++i) {
1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063
        CompNode cur;
        {
            MGB_LOCK_GUARD(sd->mtx);
            if (i >= sd->nr_node)
                return;
            cur = make_comp_node_from_impl(&sd->node[i]);
        }
        callback(cur);
    }
}

size_t CudaCompNode::get_device_count(bool warn) {
    static int cnt = -1;
    static Spinlock mtx;
    MGB_LOCK_GUARD(mtx);
    if (cnt == -1) {
1064
        auto err = MGB_CALL_CUDA_FORKSAFE_NOASSERT(cuDeviceGetCount, &cnt, 1);
1065
        auto err_s = cu_get_error_string(err);
1066
        if (err != CUDA_SUCCESS) {
1067
            if (warn && (std::string(err_s) != "invalid_stub_call"))
1068
                mgb_log_error("cuDeviceGetCount failed: %s (err %d)", err_s, int(err));
1069 1070 1071 1072 1073 1074 1075
            cnt = 0;
        }
        mgb_assert(cnt >= 0);
    }
    return cnt;
}

M
Megvii Engine Team 已提交
1076 1077
void CudaCompNode::set_prealloc_config(
        size_t alignment, size_t min_req, size_t max_overhead, double growth_factor) {
1078
    auto&& sdptr = CudaCompNodeImpl::sd;
1079 1080 1081 1082 1083
    {
        MGB_LOCK_GUARD(CudaCompNodeImpl::sd_mtx);
        if (!sdptr) {
            using T = CudaCompNodeImpl::StaticData;
            static std::aligned_storage_t<sizeof(T), alignof(T)> storage;
1084
            sdptr = new (&storage) T;
1085 1086 1087 1088 1089 1090
            sdptr->prealloc_config.alignment = alignment;
            sdptr->prealloc_config.min_req = min_req;
            sdptr->prealloc_config.growth_factor = growth_factor;
            sdptr->prealloc_config.max_overhead = max_overhead;
        } else {
            mgb_log_warn(
1091 1092 1093 1094
                    "invalid call to set_prealloc_config, will fallback to "
                    "default config; "
                    "prealloc_config should be specified before any CUDA "
                    "memory allocation");
1095 1096 1097 1098
        }
    }
}

1099 1100 1101 1102 1103 1104 1105 1106 1107 1108
CompNode::DeviceProperties CudaCompNode::get_device_prop(int dev) {
    int cnt = static_cast<int>(get_device_count());
    mgb_assert(
            dev >= 0 && dev < cnt, "request gpu %d out of valid range [0, %d)", dev,
            cnt);

    auto&& rec = device_prop_rec[dev];
    if (!rec.init) {
        MGB_LOCK_GUARD(rec.mtx_com);
        if (!rec.init) {
1109 1110 1111 1112 1113 1114 1115
            MGB_CALL_CUDA_FORKSAFE(
                    cuDeviceGetAttribute, &rec.prop.major, 1,
                    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
            MGB_CALL_CUDA_FORKSAFE(
                    cuDeviceGetAttribute, &rec.prop.minor, 1,
                    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
            MGB_CALL_CUDA_FORKSAFE(cuDeviceTotalMem, &rec.prop.total_memory, 1, dev);
1116
            char pname[256] = {0};
1117
            MGB_CALL_CUDA_FORKSAFE(cuDeviceGetName, pname, 255, 255, dev);
1118 1119 1120
            rec.prop.name = pname;
            rec.init = true;
        }
1121
    }
1122 1123

    return rec.prop;
1124 1125
}

1126 1127 1128 1129 1130
#else

bool CudaCompNode::available() {
    return false;
}
void CudaCompNode::try_coalesce_all_free_memory() {}
void CudaCompNode::foreach (thin_function<void(CompNode)>) {}
void CudaCompNode::finalize() {}
1134 1135 1136 1137 1138 1139
size_t CudaCompNode::get_device_count(bool warn) {
    return 0;
}
CudaCompNode::Impl* CudaCompNode::load_cuda(const Locator&, const Locator&) {
    mgb_throw(MegBrainError, "cuda disabled at compile time");
}
1140
void CudaCompNode::sync_all() {}
1141

M
Megvii Engine Team 已提交
1142 1143
void CudaCompNode::set_prealloc_config(
        size_t alignment, size_t min_req, size_t max_overhead, double growth_factor) {}
1144

1145 1146
CompNode::DeviceProperties CudaCompNode::get_device_prop(int dev) {
    return CompNode::DeviceProperties{};
1147 1148
}

#undef err

#endif  // MGB_CUDA

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}