/**
 * \file src/core/impl/comp_node/cuda/comp_node.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

#include "./comp_node.h"
#include "megbrain/comp_node_env.h"
#include "megbrain/utils/thread.h"

#include <string>

using namespace mgb;

#if MGB_CUDA

#include "megbrain/comp_node/alloc.h"

#include <cctype>
#include <cstdio>

#include <thread>

#include <cuda.h>
#include <cuda_runtime.h>

#ifdef __unix__
#include <sys/wait.h>
#include <unistd.h>
#endif

using CudaCompNodeImpl = CudaCompNode::CompNodeImpl;

namespace {
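//! minimum amount of device memory to leave to the system / other processes
//! when pre-reserving memory (see StaticData::get_mem_reserve_size() below)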
size_t get_min_system_memory(size_t available) {
    if (available < (1u << 31)) {
        // 225MiB
        return 225 * 1024 * 1024;
    } else {
        // max(300 MiB, 0.05 * available)
        return std::max<size_t>(300 * 1024 * 1024, available / 20);
    }
}
using CudaHostFunc = megdnn::thin_function<void()>;
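//! trampoline passed to cudaLaunchHostFunc: invokes the wrapped callback and
//! always frees it afterwards, even if the callback throws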
void CUDART_CB cuda_host_func_caller(void* ud) {
    mgb_assert(ud);
    CudaHostFunc* func_ptr = reinterpret_cast<CudaHostFunc*>(ud);
    MGB_TRY { (*func_ptr)(); }
    MGB_FINALLY(delete func_ptr;);
}
}  // anonymous namespace

namespace mgb {
namespace mem_alloc {
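//! device allocator backed directly by cudaMalloc/cudaFree; alloc() returns
//! nullptr on cudaErrorMemoryAllocation (so callers may free cached memory and
//! retry) and throws MemAllocError on any other failure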
class CudaRawAllocator final : public RawAllocator {
public:
    void* alloc(size_t size) override {
        void* addr;
        cudaError_t cuda_error = cudaMalloc(&addr, size);
        if (cuda_error == cudaSuccess) {
            mgb_assert(addr);
            return addr;
        }
        auto msg = mgb_ssprintf_log(
                "cudaMalloc failed while requesting %zd bytes (%.3fMiB)"
                " of memory; error: %s",
                size, size / (1024.0 * 1024), cudaGetErrorString(cuda_error));
        msg.append(CudaError::get_cuda_extra_info());
        if (cuda_error == cudaErrorMemoryAllocation) {
            mgb_log_error("%s", msg.c_str());
            // clear cuda error
            cudaGetLastError();
            mgb_assert(cudaGetLastError() == cudaSuccess);
            return nullptr;
        }
        mgb_throw_raw(MemAllocError{msg});
    }

    void free(void* ptr) override {
        cudaError_t cuda_error = cudaFree(ptr);
        if (cuda_error == cudaSuccess)
            return;
        auto msg = ssprintf(
                "cudaFree failed for %p: %s", ptr, cudaGetErrorString(cuda_error));
        msg.append(CudaError::get_cuda_extra_info());
        mgb_throw_raw(MemAllocError{msg});
    }

    void get_mem_info(size_t& free, size_t& tot) override {
        cudaError_t cuda_error = cudaMemGetInfo(&free, &tot);
        if (cuda_error == cudaSuccess)
            return;
        auto msg = ssprintf("cudaMemGetInfo failed %s", cudaGetErrorString(cuda_error));
        msg.append(CudaError::get_cuda_extra_info());
        mgb_throw_raw(MegBrainError{msg});
    }
};

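//! pinned (page-locked) host memory allocator backed by
//! cudaHostAlloc/cudaFreeHost; wrapped by SimpleCachingAlloc in StaticData so
//! host buffers can be reused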
class CudaHostAllocator : public RawAllocator {
public:
    void* alloc(size_t size) override {
        void* addr;
        cudaError_t cuda_error = cudaHostAlloc(&addr, size, cudaHostAllocDefault);
        if (cuda_error == cudaSuccess) {
            mgb_assert(addr);
            return addr;
        }
        auto msg = mgb_ssprintf_log(
                "cudaHostAlloc failed while requesting %zd bytes (%.3fMiB)"
                " of pinned host memory; error: %s",
                size, size / (1024.0 * 1024), cudaGetErrorString(cuda_error));
        msg.append(CudaError::get_cuda_extra_info());
        if (cuda_error == cudaErrorMemoryAllocation) {
            mgb_log_error("%s", msg.c_str());
            // clear cuda error
            cudaGetLastError();
            mgb_assert(cudaGetLastError() == cudaSuccess);
            return nullptr;
        }
        mgb_throw_raw(MemAllocError{msg});
    }

    void free(void* ptr) override {
        cudaError_t cuda_error = cudaFreeHost(ptr);
        if (cuda_error == cudaSuccess)
            return;
        auto msg = ssprintf(
                "cudaFreeHost failed for %p: %s", ptr, cudaGetErrorString(cuda_error));
        msg.append(CudaError::get_cuda_extra_info());
        mgb_throw_raw(MemAllocError{msg});
    }

    void get_mem_info(size_t& free, size_t& tot) override {
        free = 0;
        tot = 0;
    }
};

class CudaDeviceRuntimePolicy : public DeviceRuntimePolicy {
public:
    CompNode::DeviceType device_type() override { return CompNode::DeviceType::CUDA; }
    void set_device(int device) override { MGB_CUDA_CHECK(cudaSetDevice(device)); }
    void device_synchronize(int device) override {
        MGB_CUDA_CHECK(cudaSetDevice(device));
        MGB_CUDA_CHECK(cudaDeviceSynchronize());
    }
};

/* ===================== DevMemAlloc  ===================== */
std::unique_ptr<DevMemAlloc> DevMemAlloc::make_cuda_alloc() {
    return std::make_unique<FwdDevMemAlloc>(std::make_shared<CudaRawAllocator>());
}
}  // namespace mem_alloc
}  // namespace mgb

/* ===================== CudaCompNodeImpl  ===================== */
class CudaCompNode::CompNodeImpl final : public CompNode::Impl {
    MGB_DYN_TYPE_OBJ_FINAL_DECL;

    friend class EventImpl;
    friend class CudaCompNode;

    struct DeviceInfo;
    struct StaticData;
    static StaticData* sd;
    static Spinlock sd_mtx;
#if !MGB_BUILD_SLIM_SERVING
    std::mutex m_update_mem;
#endif

    //! set to true when m_locator is assigned; set to false if async init
    //! failed
    bool m_initialized = false;
    Locator m_locator, m_locator_logical;
    mem_alloc::StreamMemAlloc* m_mem_alloc;
    DeviceInfo* m_device_info;

    std::unique_ptr<Event> m_sync_event;
    Spinlock m_sync_event_mtx;

    void activate() { m_env.cuda_env().activate(); }

    void init(const Locator& locator, const Locator& locator_logical);
    void fini();

    //! return whether global finalized, and print warning in such case
    static inline bool check_global_finalized();

    //! enable peer copy from dev0 to dev1
    static void enable_peer_access(int dev0, int dev1);

    static size_t get_compute_capability(int dev);

    static void static_free_device(ImplBase* self, void* ptr) {
        static_cast<CompNodeImpl*>(self)->free_device(ptr);
    }

    static void static_free_host(ImplBase* self, void* ptr) {
        static_cast<CompNodeImpl*>(self)->free_host(ptr);
    }

public:
    CompNodeImpl() : Impl(static_free_device, static_free_host) {}

    void* alloc_device(size_t size) override;

    void free_device(void* ptr);

    void* alloc_host(size_t size) override;

    void free_host(void* ptr);

    void copy_to_host(void* host_ptr, const void* device_ptr, size_t size) override {
        activate();
        MGB_CUDA_CHECK(cudaMemcpyAsync(
                host_ptr, device_ptr, size, cudaMemcpyDeviceToHost,
                m_env.cuda_env().stream));
    }

    void copy_to_device(void* device_ptr, const void* host_ptr, size_t size) override {
        activate();
        MGB_CUDA_CHECK(cudaMemcpyAsync(
                device_ptr, host_ptr, size, cudaMemcpyHostToDevice,
                m_env.cuda_env().stream));
    }

    void peer_copy_to(
            Impl* dest_impl, void* dest, const void* src, size_t size) override;

    size_t get_mem_addr_alignment() override { return m_env.property().mem_alignment; }

    std::unique_ptr<Event> create_event(size_t flags) override;

    void sync() override;

    MemNode mem_node() override;

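    //! return {total, free} device memory in bytes; memory cached by the
    //! per-stream allocator is added back into "free" since it is still
    //! available to this comp node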
    std::pair<size_t, size_t> get_mem_status_bytes() override {
        // explicitly call cuda_env() to ensure async init is finished
        m_env.cuda_env().activate();
        size_t tot, free;
        MGB_CUDA_CHECK(cudaMemGetInfo(&free, &tot));
        free += m_mem_alloc->get_free_memory_dev().tot;
        return {tot, free};
    }

#if !MGB_BUILD_SLIM_SERVING
    std::pair<size_t, size_t> get_free_left_and_right(
            size_t begin_ptr, size_t end_ptr) override {
        return m_mem_alloc->get_free_left_and_right(begin_ptr, end_ptr);
    }

    size_t get_max_block_size_available() override {
        activate();
        return m_mem_alloc->get_max_block_size_available();
    }

    size_t get_free_mem() override {
        m_env.cuda_env().activate();
        size_t tot, free;
        MGB_CUDA_CHECK(cudaMemGetInfo(&free, &tot));
        return free;
    }
#endif

    Locator locator() override { return m_locator; }

    Locator locator_logical() override { return m_locator_logical; }

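    //! enqueue a host callback on this comp node's stream; requires CUDA 10.0+
    //! (cudaLaunchHostFunc). The callback is heap-allocated here and released
    //! by cuda_host_func_caller after it runs.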
    void add_callback(CudaHostFunc&& cb) override {
#if CUDART_VERSION >= 10000
        activate();
        CudaHostFunc* func_ptr = new CudaHostFunc(std::move(cb));
        MGB_TRY {
            MGB_CUDA_CHECK(cudaLaunchHostFunc(
                    m_env.cuda_env().stream, cuda_host_func_caller,
                    static_cast<void*>(func_ptr)));
        }
        MGB_CATCH(..., {
            delete func_ptr;
            throw;
        });
#else
        MGB_MARK_USED_VAR(cb);
        MGB_MARK_USED_VAR(cuda_host_func_caller);
        mgb_throw(
                MegBrainError,
                "add_callback is only supported with CUDA 10.0 and later");
#endif
    }

    uint64_t get_uid() override { return m_uid; }

#if !MGB_BUILD_SLIM_SERVING
    size_t get_used_memory() override;

    size_t get_max_used_memory() override;

    size_t get_reserved_memory() override;

    size_t get_max_reserved_memory() override;

    void reset_max_used_memory() override;
    void reset_max_reserved_memory() override;
#endif

private:
    uint64_t m_uid;
#if !MGB_BUILD_SLIM_SERVING
    std::unordered_map<void*, size_t> ptr2size;
#endif
};
MGB_DYN_TYPE_OBJ_FINAL_IMPL(CudaCompNode::CompNodeImpl);

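//! per-physical-device state shared by all comp nodes (streams) on that
//! device: the device memory allocator plus used/max-used counters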
struct CudaCompNodeImpl::DeviceInfo {
    int dev_num = -1;
    std::atomic_size_t m_used_mem{0};
    std::atomic_size_t m_max_used_mem{0};
    std::unique_ptr<mem_alloc::DevMemAlloc> mem_alloc;

    bool init_done() const { return mem_alloc.get(); }

    void init(const CompNodeEnv& env);

    void fini() { mem_alloc.reset(); }
};

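//! process-wide singleton created lazily in load_cuda(); holds all comp node
//! instances, per-device info and the cached pinned host allocator. It is
//! constructed in static storage so it remains accessible after global
//! finalization.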
struct CudaCompNodeImpl::StaticData {
    static constexpr int MAX_NR_COMP_NODE = 1024, MAX_NR_DEVICE = 64;

    std::recursive_mutex mtx;

    mem_alloc::DevMemAlloc::PreAllocConfig prealloc_config;

    std::unique_ptr<mem_alloc::SimpleCachingAlloc> host_alloc;
    CudaCompNode::CompNodeImpl node[MAX_NR_COMP_NODE];
    DeviceInfo dev_info[MAX_NR_DEVICE];
    int nr_node = 0,          //!< number of loaded node[]
            nr_dev_used = 0;  //!< number of used dev_info[]

    StaticData()
            : host_alloc(mem_alloc::SimpleCachingAlloc::make(
                      std::make_unique<mem_alloc::CudaHostAllocator>())) {
        prealloc_config.max_overhead = 0;
        prealloc_config.alignment = 1;
        host_alloc->alignment(1);
    }

    ~StaticData() {
        for (int i = 0; i < nr_node; ++i)
            node[i].fini();
        for (int i = 0; i < nr_dev_used; ++i)
            dev_info[i].fini();
    }

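    //! amount of device memory to pre-reserve, controlled by the
    //! MGB_CUDA_RESERVE_MEMORY env var: "b:<bytes>" reserves exactly <bytes>,
    //! any other non-empty value reserves all currently free memory minus
    //! get_min_system_memory(), and an unset var disables pre-reservation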
    static size_t get_mem_reserve_size() {
        if (auto setting = MGB_GETENV("MGB_CUDA_RESERVE_MEMORY")) {
            if (!strncmp(setting, "b:", 2)) {
                return std::stoull(setting + 2);
            }
            size_t tot, free;
            MGB_CUDA_CHECK(cudaFree(0));
            MGB_CUDA_CHECK(cudaMemGetInfo(&free, &tot));
            return free - get_min_system_memory(free);
        } else {
            return 0;
        }
    }
};
CudaCompNodeImpl::StaticData* CudaCompNodeImpl::sd = nullptr;
Spinlock CudaCompNodeImpl::sd_mtx;

void CudaCompNodeImpl::init(const Locator& locator, const Locator& locator_logical) {
    m_locator = locator;
    m_locator_logical = locator_logical;
    m_initialized = true;

#if defined(__linux__) || defined(TARGET_OS_MAC)
    FILE* fp;
    fp = fopen("/dev/urandom", "r");
    mgb_assert(fread(&m_uid, sizeof(m_uid), 1, fp) == 1);
    fclose(fp);
#else
    m_uid = std::chrono::duration_cast<std::chrono::nanoseconds>(
                    std::chrono::system_clock::now().time_since_epoch())
                    .count();
#endif

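    // called once async CUDA init has finished: bind this comp node to the
    // per-device DeviceInfo (creating it on first use) and register the new
    // stream with the device memory allocator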
    auto on_succ = [this](cudaStream_t stream) {
        auto locator = m_locator;
        log_comp_node_created(locator, m_locator_logical);

        MGB_LOCK_GUARD(sd->mtx);
        DeviceInfo* dev_info = nullptr;
        for (int i = 0; i < sd->nr_dev_used; ++i) {
            if (sd->dev_info[i].dev_num == locator.device) {
                dev_info = &sd->dev_info[i];
                break;
            }
        }

        if (!dev_info) {
            dev_info = &sd->dev_info[sd->nr_dev_used];
            dev_info->init(m_env);
            // note: add nr_dev_used only after init succeeds
            ++sd->nr_dev_used;
        }
        m_device_info = dev_info;
        m_mem_alloc = dev_info->mem_alloc->add_stream(static_cast<void*>(stream));
    };

    auto on_error = [this](std::exception&) {
        MGB_LOCK_GUARD(sd->mtx);
        m_initialized = false;
    };

    m_env.init_cuda_async(
            locator.device, make_comp_node_from_impl(this), {on_succ, on_error});
}

void CudaCompNodeImpl::fini() {
    if (!m_initialized)
        return;

    m_sync_event.reset();
    m_env.fini();
    m_mem_alloc = nullptr;
    m_device_info = nullptr;
    m_initialized = false;
}

void* CudaCompNodeImpl::alloc_device(size_t size) {
    activate();
#if MGB_BUILD_SLIM_SERVING
    return m_mem_alloc->alloc(size);
#else
    void* ptr = m_mem_alloc->alloc(size);
    {
        MGB_LOCK_GUARD(m_update_mem);
        ptr2size[ptr] = size;
        m_device_info->m_used_mem += size;
        if (m_device_info->m_used_mem > m_device_info->m_max_used_mem) {
            m_device_info->m_max_used_mem = m_device_info->m_used_mem.load();
        }
    }
    return ptr;
#endif
}

void CudaCompNodeImpl::free_device(void* ptr) {
    if (check_global_finalized())
        return;

    activate();
#if !MGB_BUILD_SLIM_SERVING
    {
        MGB_LOCK_GUARD(m_update_mem);
        mgb_assert(ptr2size.find(ptr) != ptr2size.end(), "ptr %p not found!", ptr);
        m_device_info->m_used_mem -= ptr2size.at(ptr);
        ptr2size.erase(ptr);
    }
#endif
    m_mem_alloc->free(ptr);
}

#if !MGB_BUILD_SLIM_SERVING
size_t CudaCompNodeImpl::get_used_memory() {
    return m_device_info->m_used_mem.load();
}

size_t CudaCompNodeImpl::get_max_used_memory() {
    return m_device_info->m_max_used_mem.load();
}

void CudaCompNodeImpl::reset_max_used_memory() {
    m_device_info->m_max_used_mem = 0;
}

size_t CudaCompNodeImpl::get_reserved_memory() {
    return m_device_info->mem_alloc->get_used_memory();
}

size_t CudaCompNodeImpl::get_max_reserved_memory() {
    return m_device_info->mem_alloc->get_max_used_memory();
}

void CudaCompNodeImpl::reset_max_reserved_memory() {
    m_device_info->mem_alloc->reset_max_used_memory();
}
#endif

void* CudaCompNodeImpl::alloc_host(size_t size) {
    // need to activate because it creates the cuda context on the current device
    activate();
    return sd->host_alloc->alloc(size);
}

void CudaCompNodeImpl::free_host(void* ptr) {
    if (check_global_finalized())
        return;
    sd->host_alloc->free(ptr);
}

void CudaCompNodeImpl::peer_copy_to(
        Impl* dest_impl, void* dest, const void* src, size_t size) {
    if (dest_impl->same_type<CudaCompNodeImpl>()) {
        auto&& dst_env = static_cast<CudaCompNodeImpl*>(dest_impl)->m_env.cuda_env();
        auto&& src_env = m_env.cuda_env();
        activate();
        if (dst_env.device == src_env.device) {
            MGB_CUDA_CHECK(cudaMemcpyAsync(
                    dest, src, size, cudaMemcpyDeviceToDevice, dst_env.stream));
        } else {
            enable_peer_access(src_env.device, dst_env.device);
            enable_peer_access(dst_env.device, src_env.device);
            MGB_CUDA_CHECK(cudaMemcpyPeerAsync(
                    dest, dst_env.device, src, src_env.device, size, dst_env.stream));
        }
        return;
    }
    mgb_assert(
            dest_impl->env().property().type == DeviceType::CPU,
            "cuda peer_copy_to only implemented for CPU");
    auto copy = [this, dest, src, size]() {
        auto stream = m_env.cuda_env().stream;
        m_env.cuda_env().activate();
        MGB_CUDA_CHECK(
                cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream));
        MGB_CUDA_CHECK(cudaStreamSynchronize(stream));
    };
    dest_impl->env().cpu_env().dispatch(copy);
}

MemNode CudaCompNodeImpl::mem_node() {
    // m_device_info would be null before async init finishes, so we just
    // return a private pointer derived from the device number here
    return MemNode{sd->dev_info + m_locator.device};
}

void CudaCompNodeImpl::sync() {
    activate();

    // do not use MGB_CUDA_CHECK(cudaStreamSynchronize(m_env->stream)) since
    // other threads may be adding operations into the stream, and we only care
    // about previous operations in the current thread. However, the docs of
    // cudaStreamSynchronize do not describe the details of such a condition,
    // so we use a manual event implementation

    Event* event;
    {
        MGB_LOCK_GUARD(m_sync_event_mtx);
        if (!m_sync_event)
            m_sync_event = create_event(0);
        event = m_sync_event.get();
    }
    event->record();
    event->host_wait();
}

void CudaCompNodeImpl::enable_peer_access(int dev0, int dev1) {
    static bool already_enabled[StaticData::MAX_NR_DEVICE][StaticData::MAX_NR_DEVICE];
    if (already_enabled[dev0][dev1])
        return;

    static std::mutex global_lock;
    MGB_LOCK_GUARD(global_lock);
    if (already_enabled[dev0][dev1])
        return;

    int can;
    MGB_CUDA_CHECK(cudaDeviceCanAccessPeer(&can, dev0, dev1));
    if (can) {
        mgb_log("enable peer access from GPU %d to GPU %d", dev0, dev1);
        MGB_CUDA_CHECK(cudaSetDevice(dev0));
        auto err = cudaDeviceEnablePeerAccess(dev1, 0);
        if (err != cudaSuccess) {
            mgb_log_error(
                    "failed to enable peer access from %d to %d: %s(%d)", dev0, dev1,
                    cudaGetErrorString(err), static_cast<int>(err));
            cudaGetLastError();
        }
    }

    // verify that cudaMemcpyPeer actually works by copying a known value
    // between the two devices and checking it on the host
    int v0 = 1, v1 = 2;

    int *dp0, *dp1;
    MGB_CUDA_CHECK(cudaSetDevice(dev0));
    MGB_CUDA_CHECK(cudaMalloc(&dp0, sizeof(int)));
    MGB_CUDA_CHECK(cudaSetDevice(dev1));
    MGB_CUDA_CHECK(cudaMalloc(&dp1, sizeof(int)));
    MGB_CUDA_CHECK(cudaMemcpy(dp0, &v0, sizeof(int), cudaMemcpyHostToDevice));
    MGB_CUDA_CHECK(cudaMemcpy(dp1, &v1, sizeof(int), cudaMemcpyHostToDevice));
    MGB_CUDA_CHECK(cudaMemcpyPeer(dp1, dev1, dp0, dev0, sizeof(int)));
    int get = 0;
    MGB_CUDA_CHECK(cudaMemcpy(&get, dp1, sizeof(int), cudaMemcpyDeviceToHost));

    mgb_throw_if(
            get != 1, CudaError,
            "P2P copy (%d => %d) check failed; consider disabling "
            "Access Control Services (ACS) for the PCI device",
            dev0, dev1);

    already_enabled[dev0][dev1] = true;
}

/* ===================== CudaCompNodeImpl::DeviceInfo  ===================== */

void CudaCompNodeImpl::DeviceInfo::init(const CompNodeEnv& env) {
    mgb_assert(!mem_alloc);
#if 0
    // forward cudaMalloc
    mem_alloc = mem_alloc::DevMemAlloc::make_cuda_alloc();
#else
    auto&& cuenv = env.cuda_env();
    cuenv.activate();
    dev_num = cuenv.device;
    auto reserve_size = StaticData::get_mem_reserve_size();
    mem_alloc = mem_alloc::DevMemAlloc::make(
            dev_num, reserve_size, std::make_shared<mem_alloc::CudaRawAllocator>(),
            std::make_shared<mem_alloc::CudaDeviceRuntimePolicy>());
    mem_alloc->prealloc_config(sd->prealloc_config);
    auto align = env.property().mem_alignment;
    mem_alloc->alignment(align);
    mgb_log_debug(
            "cuda: gpu%d: name=`%s' dyn_mem_reserve=%.2fMiB alignment=0x%zx", dev_num,
            cuenv.device_prop.name, reserve_size / 1024.0 / 1024, align);
#endif
}

bool CudaCompNodeImpl::check_global_finalized() {
    if (!sd) {
        static std::atomic_flag warn_printed = ATOMIC_FLAG_INIT;
        if (!warn_printed.test_and_set()) {
            mgb_log_debug("cuda comp node method called after global finalize");
        }
        return true;
    }
#if MGB_CUDA && defined(WIN32)
    //! FIXME: on windows the cuda driver may shut down before atexit callbacks
    //! run, even when the callback is registered after the cuda driver is
    //! initialized; as a workaround, let the OS reclaim the resources for now.
    //! This may be removed after upgrading the cuda runtime.
    if (CudaCompNode::is_into_atexit) {
        mgb_log_debug(
                "windows cudaErrorCudartUnloading happened; resources are "
                "reclaimed by the OS");
        return true;
    }
    //! FIXME: when megengine is linked dynamically against the VC runtime, the
    //! atexit function table has an ordering issue that leads to the cuda
    //! runtime being unloaded early; this always happens when python3 unloads
    //! the dll (i.e. python3 is exiting). As a workaround, let the OS reclaim
    //! the resources for now. This may be removed after upgrading the cuda
    //! runtime.
    int dev = -1;
    if (cudaErrorCudartUnloading == cudaGetDevice(&dev)) {
        mgb_log_debug(
                "windows cudaErrorCudartUnloading happened; resources are "
                "reclaimed by the OS");
        return true;
    }

#endif
    return false;
}

/* ===================== EventImpl  ===================== */

class CudaCompNode::EventImpl final : public EventImplHelper {
    bool m_init_finished = false;
    CudaCompNodeImpl* const m_comp_node_impl;
    cudaEvent_t m_cuda_event;

    void do_record() override {
        m_comp_node_impl->activate();
        auto&& env = m_comp_node_impl->m_env.cuda_env();
        MGB_CUDA_CHECK(cudaEventRecord(m_cuda_event, env.stream));
    }

    bool do_finished() override {
        m_comp_node_impl->activate();
        cudaError_t err = cudaEventQuery(m_cuda_event);
        if (err == cudaSuccess)
            return true;
        if (err == cudaErrorNotReady)
            return false;
        mgb_throw(
                CudaError, "failed to query event: %d: %s", int(err),
                cudaGetErrorString(err));
    }

    void host_wait_cv() override { MGB_CUDA_CHECK(cudaEventSynchronize(m_cuda_event)); }

    double do_elapsed_time_until(EventImplHelper& end) override {
        m_comp_node_impl->activate();
        float ret = 0.0;
        MGB_CUDA_CHECK(cudaEventElapsedTime(
                &ret, m_cuda_event, static_cast<EventImpl&>(end).m_cuda_event));
        return static_cast<double>(ret) * 1e-3;
    }

    void do_device_wait_by(Impl* cn_impl) override;

public:
    EventImpl(CudaCompNodeImpl* comp_node_impl, size_t create_flags)
            : EventImplHelper(comp_node_impl, create_flags),
              m_comp_node_impl{comp_node_impl} {
        m_comp_node_impl->activate();
        size_t cuda_flags = cudaEventDisableTiming;
        if (create_flags & NEED_TIMER)
            cuda_flags = 0;
        MGB_CUDA_CHECK(cudaEventCreateWithFlags(&m_cuda_event, cuda_flags));
        m_init_finished = true;
    }

    ~EventImpl() {
        if (m_init_finished) {
            MGB_TRY { MGB_CUDA_CHECK(cudaEventDestroy(m_cuda_event)); }
            MGB_CATCH(MegBrainError & exc, {
                mgb_log_error("failed to destroy cuda event: %s", exc.what());
            })
        }
    }
};

std::unique_ptr<CompNode::Event> CudaCompNodeImpl::create_event(size_t flags) {
    return std::make_unique<EventImpl>(this, flags);
}

void CudaCompNode::EventImpl::do_device_wait_by(Impl* cn_impl) {
    if (cn_impl->dyn_typeinfo() == CudaCompNodeImpl::typeinfo()) {
        auto imp = static_cast<CudaCompNodeImpl*>(cn_impl);
        auto stream = imp->m_env.cuda_env().stream;
        imp->activate();
        MGB_CUDA_CHECK(cudaStreamWaitEvent(stream, m_cuda_event, 0));
        return;
    }
    if (cn_impl->env().property().type == DeviceType::CPU) {
        auto waiter = [this]() { MGB_CUDA_CHECK(cudaEventSynchronize(m_cuda_event)); };
        cn_impl->add_callback(std::move(waiter));
        return;
    }
    mgb_throw(MegBrainError, "unimplemented event device_wait_by config");
}

/* ===================== CudaCompNode static methods ===================== */

namespace {

#ifndef __unix__
template <typename Func, typename... Args>
CUresult call_cuda_forksafe(Func func, Args... args) {
    cuInit(0);
    return func(args...);
}
#else
struct RAIICloseFD : NonCopyableObj {
    int m_fd = -1;

    RAIICloseFD(int fd) : m_fd(fd) {}
    ~RAIICloseFD() { close(); }
    void close() {
        if (m_fd != -1) {
            ::close(m_fd);
            m_fd = -1;
        }
    }
};
// an implementation that does not call cuInit
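// if the driver is not initialized yet, fork a child that calls cuInit and
// runs the query, then read the result back through a pipe; this avoids
// initializing the CUDA driver in the current process just to probe devices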
template <typename Func, typename Val, typename... Args>
CUresult call_cuda_forksafe(Func func, Val* val, Args... args) {
    auto err = func(val, args...);
    if (err != CUDA_ERROR_NOT_INITIALIZED)
        return err;
    // cuInit not called, call it in child process
    int fd[2];
    mgb_assert(pipe(fd) == 0, "pipe() failed");
    int fdr = fd[0], fdw = fd[1];
    RAIICloseFD fdr_guard(fdr);
    RAIICloseFD fdw_guard(fdw);
    auto cpid = fork();
    mgb_assert(cpid != -1, "fork() failed");
    if (cpid == 0) {
        fdr_guard.close();
        do {
            err = cuInit(0);
            if (err != CUDA_SUCCESS)
                break;
            err = func(val, args...);
        } while (0);
        auto sz = write(fdw, &err, sizeof(err));
        if (sz == sizeof(err) && err == CUDA_SUCCESS) {
            sz = write(fdw, val, sizeof(*val));
        }
        fdw_guard.close();
        std::quick_exit(0);
    }
    fdw_guard.close();
    auto sz = read(fdr, &err, sizeof(err));
    mgb_assert(sz == sizeof(err), "failed to read error code from child");
    if (err == CUDA_SUCCESS) {
        sz = read(fdr, val, sizeof(*val));
        mgb_assert(sz == sizeof(*val), "failed to read value from child");
        return err;
    }
    // try again; maybe another thread called cuInit while we were forking
    auto err2 = func(val, args...);
    if (err2 == CUDA_SUCCESS)
        return err2;
    if (err2 == CUDA_ERROR_NOT_INITIALIZED)
        return err;
    return err2;
}
#endif

const char* cu_get_error_string(CUresult err) {
    const char* ret = nullptr;
    cuGetErrorString(err, &ret);
    if (!ret) {
        //! happens when the cuda stub cannot find the driver
        ret = "invalid_stub_call";
    }
    return ret;
}

}  // namespace

bool CudaCompNode::available() {
    static int result = -1;
    static Spinlock mtx;
    MGB_LOCK_GUARD(mtx);
    if (result == -1) {
        int ndev = -1;
        auto err = call_cuda_forksafe(cuDeviceGetCount, &ndev);
        result = err == CUDA_SUCCESS && ndev > 0;
        auto err_s = cu_get_error_string(err);
        //! only show !CUDA_SUCCESS log when with valid stub call
        if (!result && (std::string(err_s) != "invalid_stub_call")) {
            mgb_log_warn(
                    "cuda unavailable: %s(%d) ndev=%d", err_s, static_cast<int>(err),
                    ndev);
        }
        if (err == CUDA_ERROR_NOT_INITIALIZED) {
            mgb_throw(std::runtime_error, "cuda initialization error.");
        }
    }
    return result;
}

void CudaCompNode::finalize() {
    if (CudaCompNodeImpl::sd) {
        sync_all();

        auto ptr = CudaCompNodeImpl::sd;
        CudaCompNodeImpl::sd = nullptr;
        ptr->~StaticData();
    }
}

#if MGB_CUDA && defined(WIN32)
//! FIXME: on windows the cuda driver may shut down before atexit callbacks run,
//! even when the callback is registered after the cuda driver is initialized;
//! as a workaround, let the OS reclaim the resources for now. This may be
//! removed after upgrading the cuda runtime.
bool CudaCompNode::is_into_atexit = false;
#endif
CompNode::Impl* CudaCompNode::load_cuda(
        const Locator& locator, const Locator& locator_logical) {
    int nr_gpu = get_device_count();
#if MGB_CUDA && defined(WIN32)
    //! FIXME: on windows the cuda driver may shut down before atexit callbacks
    //! run, even when the callback is registered after the cuda driver is
    //! initialized; as a workaround, let the OS reclaim the resources for now.
    //! This may be removed after upgrading the cuda runtime.
    if (!is_into_atexit) {
        auto err = atexit([] { is_into_atexit = true; });
        mgb_assert(!err, "failed to register atexit function");
    }
#endif
    mgb_assert(
            locator.device >= 0 && locator.device < nr_gpu,
            "request gpu%d out of valid range [0, %d)", locator.device, nr_gpu);

    auto&& sdptr = CudaCompNodeImpl::sd;
    {
        MGB_LOCK_GUARD(CudaCompNodeImpl::sd_mtx);
        if (!sdptr) {
            // use static storage so object can be safely accessed even after
            // global finalize
            using T = CudaCompNodeImpl::StaticData;
            static std::aligned_storage_t<sizeof(T), alignof(T)> storage;
            sdptr = new (&storage) T;
        }
    }
    auto&& sd = *sdptr;
    MGB_LOCK_GUARD(sd.mtx);

    CompNodeImpl* available_node = nullptr;
    for (int i = 0; i < sd.nr_node; ++i) {
        auto&& cur = sd.node[i];
        if (cur.m_initialized) {
            if (cur.m_locator == locator && cur.m_locator_logical == locator_logical) {
                return &cur;
            }
        } else {
            available_node = &cur;
        }
    }

    if (!available_node) {
        mgb_assert(sd.nr_node < sd.MAX_NR_COMP_NODE, "too many CompNode allocated");
        available_node = &sd.node[sd.nr_node++];
    }
    mgb_assert(locator.device < sd.MAX_NR_DEVICE, "device number too large");

    mgb_assert(!available_node->m_initialized);
    available_node->init(locator, locator_logical);

    return available_node;
}

void CudaCompNode::try_coalesce_all_free_memory() {
    // TODO: optimized implementation
    auto sd = CudaCompNodeImpl::sd;
    if (!sd)
        return;

    size_t size = 0;
    for (int i = 0; i < sd->nr_dev_used; ++i) {
        size += sd->dev_info[i].mem_alloc->gather_stream_free_blk_and_release_full();
    }
    if (size) {
        mgb_log_debug("%zu bytes freed by try_coalesce_all_free_memory()", size);
    }
}

void CudaCompNode::sync_all() {
    auto sd = CudaCompNodeImpl::sd;
    if (!sd)
        return;

    for (int i = 0;; ++i) {
        // ensure async init finished
        CompNodeEnv* env;
        {
            MGB_LOCK_GUARD(sd->mtx);
            if (i >= sd->nr_node) {
                break;
            }
            env = &sd->node[i].env();
        }
        env->cuda_env();
    }

    MGB_LOCK_GUARD(sd->mtx);
    for (int i = 0; i < sd->nr_dev_used; ++i) {
        MGB_CUDA_CHECK(cudaSetDevice(sd->dev_info[i].dev_num));
        MGB_CUDA_CHECK(cudaDeviceSynchronize());
    }
}

void CudaCompNode::foreach (thin_function<void(CompNode)> callback) {
    auto sd = CudaCompNodeImpl::sd;
    if (!sd)
        return;

    for (int i = 0;; ++i) {
        CompNode cur;
        {
            MGB_LOCK_GUARD(sd->mtx);
            if (i >= sd->nr_node)
                return;
            cur = make_comp_node_from_impl(&sd->node[i]);
        }
        callback(cur);
    }
}

size_t CudaCompNode::get_device_count(bool warn) {
    static int cnt = -1;
    static Spinlock mtx;
    MGB_LOCK_GUARD(mtx);
    if (cnt == -1) {
        auto err = call_cuda_forksafe(cuDeviceGetCount, &cnt);
        auto err_s = cu_get_error_string(err);
        if (err != CUDA_SUCCESS) {
            if (warn && (std::string(err_s) != "invalid_stub_call"))
                mgb_log_error(
                        "cuDeviceGetCount failed: %s (err %d)", err_s, int(err));
            cnt = 0;
        }
        mgb_assert(cnt >= 0);
    }
    return cnt;
}

void CudaCompNode::set_prealloc_config(
        size_t alignment, size_t min_req, size_t max_overhead, double growth_factor) {
    auto&& sdptr = CudaCompNodeImpl::sd;
    {
        MGB_LOCK_GUARD(CudaCompNodeImpl::sd_mtx);
        if (!sdptr) {
            using T = CudaCompNodeImpl::StaticData;
            static std::aligned_storage_t<sizeof(T), alignof(T)> storage;
            sdptr = new (&storage) T;
            sdptr->prealloc_config.alignment = alignment;
            sdptr->prealloc_config.min_req = min_req;
            sdptr->prealloc_config.growth_factor = growth_factor;
            sdptr->prealloc_config.max_overhead = max_overhead;
        } else {
            mgb_log_warn(
                    "invalid call to set_prealloc_config, will fallback to "
                    "default config; "
                    "prealloc_config should be specified before any CUDA "
                    "memory allocation");
        }
    }
}

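//! query the compute capability via the driver API (fork-safe); returns
//! major * 10 + minor, or 0 if the device index is invalid or the query fails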
size_t CudaCompNode::get_compute_capability(int dev) {
    size_t cnt = get_device_count();
    if (dev < 0 || dev >= static_cast<int>(cnt)) {
        mgb_log_error("request gpu %d out of valid range [0, %lu)", dev, cnt);
        return 0;
    }
    static Spinlock mtx_com;
    MGB_LOCK_GUARD(mtx_com);
    int pmajor;
    int pminor;
    auto err = call_cuda_forksafe(
            cuDeviceGetAttribute, &pmajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
            dev);
1039 1040 1041
    if (err != CUDA_SUCCESS) {
        return 0;
    }
    auto err2 = call_cuda_forksafe(
            cuDeviceGetAttribute, &pminor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
            dev);
    if (err2 != CUDA_SUCCESS) {
        return 0;
    }
    return pmajor * 10 + pminor;
}

#else

bool CudaCompNode::available() {
    return false;
}
void CudaCompNode::try_coalesce_all_free_memory() {}
void CudaCompNode::foreach (thin_function<void(CompNode)>) {}
void CudaCompNode::finalize() {}
size_t CudaCompNode::get_device_count(bool warn) {
    return 0;
}
CudaCompNode::Impl* CudaCompNode::load_cuda(const Locator&, const Locator&) {
    mgb_throw(MegBrainError, "cuda disabled at compile time");
}
void CudaCompNode::sync_all() {}

void CudaCompNode::set_prealloc_config(
        size_t alignment, size_t min_req, size_t max_overhead, double growth_factor) {}

size_t CudaCompNode::get_compute_capability(int dev) {
    return 0;
}

#undef err

#endif  // MGB_CUDA

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}