From 3bd8ef35890ab3e424ef4105b6fc1871365d1e99 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Tue, 31 Mar 2020 21:19:17 +0800
Subject: [PATCH] feat(mgb/compnode): add atlas compnode

GitOrigin-RevId: 19f3c330039c3d0accd9787446c391495f425b6e
---
 CMakeLists.txt                                   |  9 ++++++
 dnn/include/megcore.h                            |  1 +
 dnn/src/CMakeLists.txt                           |  1 +
 .../common/megcore/common/device_context.hpp     |  1 +
 dnn/src/common/megcore/public_api/device.cpp     |  7 +++++
 dnn/test/CMakeLists.txt                          |  1 -
 python_module/src/cpp/opr_defs.cpp               |  1 +
 python_module/src/cpp/opr_defs.h                 |  2 ++
 python_module/src/cpp/opr_helper.h               |  1 -
 src/CMakeLists.txt                               |  3 +-
 src/core/impl/comp_node_env.cpp                  |  4 +--
 src/core/impl/graph/var_node_mem_mgr.cpp         | 28 ++++++++++---------
 src/core/impl/graph/var_node_mem_mgr.h           |  8 +++---
 src/core/include/megbrain/comp_node_env.h        |  4 +--
 src/core/include/megbrain/exception.h            |  1 -
 src/core/test/comp_node.cpp                      |  3 --
 src/megbrain_build_config.h.in                   |  1 +
 test/CMakeLists.txt                              |  1 +
 test/src/helper.cpp                              | 26 +++++++++++++++++
 test/src/include/megbrain/test/helper.h          | 22 ++++++++++++++-
 tools/param_defs/mgb_opr_param_defs.py           |  4 +--
 21 files changed, 97 insertions(+), 32 deletions(-)
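Two themes run through this patch and are worth calling out before the hunks. First, the megcore device API gains a deactivate() entry point that mirrors megcoreActivate(), backed by a default no-op DeviceContext hook so existing backends are untouched. Second, the CUDA-only async var releaser in the graph core is renamed to a backend-neutral AsyncVarReleaser, paving the way for the Atlas comp node. A minimal sketch of the intended handle lifecycle, assuming the megcoreCreateDeviceHandle/megcoreDestroyDeviceHandle signatures from dnn/include/megcore.h (the platform constant, device id and allocation size below are illustrative only, not taken from this patch):

    megcoreDeviceHandle_t handle;
    megcoreCreateDeviceHandle(&handle, megcorePlatformCPU, /*deviceID=*/0, /*flags=*/0);
    megcoreActivate(handle);            // bind per-thread device state
    void* dev_ptr = nullptr;
    megcoreMalloc(handle, &dev_ptr, 256);
    megcoreFree(handle, dev_ptr);
    megcoreDeactivate(handle);          // new in this patch: undo activate()
    megcoreDestroyDeviceHandle(handle);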
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b753bd0a3..9c67ac733 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -143,6 +143,15 @@ if(CXX_SUPPORT_GOLD AND NOT ANDROID AND NOT APPLE AND NOT MSVC AND NOT WIN32)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fuse-ld=gold")
 endif()
 
+option(MGE_WITH_JIT "Build MegEngine with JIT." ON)
+option(MGE_WITH_HALIDE "Build MegEngine with Halide JIT" ON)
+option(MGE_DISABLE_FLOAT16 "Disable MegEngine float16 support." OFF)
+option(MGE_WITH_CUDA "Enable MegEngine CUDA support." ON)
+option(MGE_CUDA_USE_STATIC "Enable MegEngine CUDA static linking." ON)
+option(MGE_WITH_TRT "Build MegEngine with TensorRT." ON)
+option(MGE_USE_SYSTEM_LIB "Build MegEngine with system libraries." OFF)
+option(MGB_WITH_FLATBUFFERS "Build MegBrain with FlatBuffers serialization support." ON)
+
 if(NOT MGE_WITH_JIT)
     if(MGE_WITH_HALIDE)
         message(WARNING "MGE_WITH_HALIDE is set to OFF with MGE_WITH_JIT disabled")
diff --git a/dnn/include/megcore.h b/dnn/include/megcore.h
index fead54e47..a8effea2e 100644
--- a/dnn/include/megcore.h
+++ b/dnn/include/megcore.h
@@ -84,6 +84,7 @@ megcoreStatus_t megcoreGetDeviceFlags(
         unsigned int *flags);
 
 megcoreStatus_t megcoreActivate(megcoreDeviceHandle_t handle);
+megcoreStatus_t megcoreDeactivate(megcoreDeviceHandle_t handle);
 megcoreStatus_t megcoreMalloc(megcoreDeviceHandle_t handle,
         void **devPtr, size_t sizeInBytes);
 megcoreStatus_t megcoreFree(megcoreDeviceHandle_t handle,
diff --git a/dnn/src/CMakeLists.txt b/dnn/src/CMakeLists.txt
index 93411e168..ecafe71c1 100644
--- a/dnn/src/CMakeLists.txt
+++ b/dnn/src/CMakeLists.txt
@@ -86,6 +86,7 @@ if (BUILD_SHARED_LIBS)
 else()
     target_link_libraries(megdnn PRIVATE ${MGE_BLAS_LIBS})
 endif()
+
 if(CMAKE_THREAD_LIBS_INIT)
     target_link_libraries(megdnn PRIVATE Threads::Threads)
 endif()
diff --git a/dnn/src/common/megcore/common/device_context.hpp b/dnn/src/common/megcore/common/device_context.hpp
index 765132be9..c12f59c31 100644
--- a/dnn/src/common/megcore/common/device_context.hpp
+++ b/dnn/src/common/megcore/common/device_context.hpp
@@ -38,6 +38,7 @@ class DeviceContext {
         virtual size_t mem_alignment_in_bytes() const noexcept = 0;
 
         virtual void activate() = 0;
+        virtual void deactivate() {}
         virtual void *malloc(size_t size_in_bytes) = 0;
         virtual void free(void *ptr) = 0;
diff --git a/dnn/src/common/megcore/public_api/device.cpp b/dnn/src/common/megcore/public_api/device.cpp
index 96dfaa767..7dbe00685 100644
--- a/dnn/src/common/megcore/public_api/device.cpp
+++ b/dnn/src/common/megcore/public_api/device.cpp
@@ -74,6 +74,13 @@ megcoreStatus_t megcoreActivate(megcoreDeviceHandle_t handle)
     return megcoreSuccess;
 }
 
+megcoreStatus_t megcoreDeactivate(megcoreDeviceHandle_t handle)
+{
+    megdnn_assert(handle);
+    handle->content->deactivate();
+    return megcoreSuccess;
+}
+
 megcoreStatus_t megcoreMalloc(megcoreDeviceHandle_t handle,
         void **devPtr, size_t sizeInBytes)
 {
diff --git a/dnn/test/CMakeLists.txt b/dnn/test/CMakeLists.txt
index 07bd78038..2a50bdd89 100644
--- a/dnn/test/CMakeLists.txt
+++ b/dnn/test/CMakeLists.txt
@@ -27,7 +27,6 @@ endif()
 
-
 add_executable(megdnn_test ${SOURCES})
 target_link_libraries(megdnn_test gtest)
 target_link_libraries(megdnn_test megdnn ${MGE_BLAS_LIBS})
diff --git a/python_module/src/cpp/opr_defs.cpp b/python_module/src/cpp/opr_defs.cpp
index db1e8febb..1cbc5979d 100644
--- a/python_module/src/cpp/opr_defs.cpp
+++ b/python_module/src/cpp/opr_defs.cpp
@@ -246,6 +246,7 @@ SymbolVarArray _Opr::tensor_rt_runtime(const SymbolVarArray& inputs,
 }
 #endif
 
+
 SymbolVar _Opr::timestamp(SymbolVar input, PyObject* dest, size_t dest_off,
                           const OperatorNodeConfig& config) {
     auto tensor = std::make_shared(
diff --git a/python_module/src/cpp/opr_defs.h b/python_module/src/cpp/opr_defs.h
index 2998d545e..82ac9ceb7 100644
--- a/python_module/src/cpp/opr_defs.h
+++ b/python_module/src/cpp/opr_defs.h
@@ -118,6 +118,8 @@ static SymbolVarArray tensor_rt_runtime(const SymbolVarArray& inputs,
                                         PyObject* data_bytes,
                                         const OperatorNodeConfig& config);
 
+
+
 static SymbolVar timestamp(SymbolVar input, PyObject* dest, size_t dest_off,
                            const OperatorNodeConfig& config);
diff --git a/python_module/src/cpp/opr_helper.h b/python_module/src/cpp/opr_helper.h
index 27e7eeb4a..15b49d5ab 100644
--- a/python_module/src/cpp/opr_helper.h
+++ b/python_module/src/cpp/opr_helper.h
@@ -18,7 +18,6 @@
 #if MGB_ENABLE_OPR_MM
 #include "megbrain/opr/collective_comm.h"
 #endif
-
 using AxisIndexer = mgb::opr::indexing::AxisIndexer;
 
 /*!
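The deactivate() hook added above defaults to an empty body, so only backends that actually hold per-thread device state need to override it. A self-contained sketch of the activate/deactivate pairing (this mirrors the shape of the DeviceContext hooks rather than deriving from the real class, and the printf bodies stand in for real device calls):

    #include <cstdio>

    // Mirrors megcore's DeviceContext hooks: activate() is pure virtual,
    // deactivate() gets a default no-op so existing backends compile unchanged.
    struct DeviceHooks {
        virtual ~DeviceHooks() = default;
        virtual void activate() = 0;
        virtual void deactivate() {}
    };

    struct VerboseDevice final : DeviceHooks {
        void activate() override { std::printf("bind per-thread device state\n"); }
        void deactivate() override { std::printf("release per-thread device state\n"); }
    };

    int main() {
        VerboseDevice dev;
        dev.activate();    // what megcoreActivate() forwards to
        dev.deactivate();  // what the new megcoreDeactivate() forwards to
    }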
#include "megbrain/opr/collective_comm.h" #endif - using AxisIndexer = mgb::opr::indexing::AxisIndexer; /*! diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 13331421e..11a4cef9c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -88,7 +88,7 @@ if (MGB_WITH_FLATBUFFERS) ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/opr_param_defs.fbs COMMAND ${PYTHON_EXECUTABLE} ${GEN_FLATBUFFERS_SCHEMA_PY} ${OPR_PARAM_DEFS_PY} ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/opr_param_defs.fbs - DEPENDS ${GEN_FLATBUFFERS_SCHEMA_PY} ${OPR_PARAM_DEFS_PY} + DEPENDS ${GEN_FLATBUFFERS_SCHEMA_PY} ${OPR_PARAM_DEFS_PY} VERBATIM ) add_custom_command( @@ -124,7 +124,6 @@ if (MGB_WITH_FLATBUFFERS) target_include_directories(megbrain PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/serialization/include) target_compile_definitions(megbrain PUBLIC MGB_ENABLE_FBS_SERIALIZATION=1) target_link_libraries(megbrain PUBLIC flatbuffers) - set (GENERATED_FLATBUFFERS_CONVERTER_PATH ${CMAKE_CURRENT_BINARY_DIR}/genfiles) set (GEN_FLATBUFFERS_CONVERTER_PY ${PROJECT_SOURCE_DIR}/dnn/scripts/gen_flatbuffers_converter.py) file (MAKE_DIRECTORY ${GENERATED_FLATBUFFERS_CONVERTER_PATH}) diff --git a/src/core/impl/comp_node_env.cpp b/src/core/impl/comp_node_env.cpp index 9c8bfb899..b584868d7 100644 --- a/src/core/impl/comp_node_env.cpp +++ b/src/core/impl/comp_node_env.cpp @@ -96,7 +96,7 @@ megcore::AsyncErrorInfo* MegDNNHandle::make_async_error_info( cn.free_device(ptr); } }; - megcore::AsyncErrorInfo zero_info{0, nullptr, "", {0,0,0,0}}; + megcore::AsyncErrorInfo zero_info{0, nullptr, "", {0, 0, 0, 0}}; auto ptr = static_cast( env.comp_node().alloc_device(sizeof(zero_info))); cn.copy_to_device(ptr, &zero_info, sizeof(zero_info)); @@ -106,7 +106,7 @@ megcore::AsyncErrorInfo* MegDNNHandle::make_async_error_info( } #endif -/* =================== misc =================== */ + /* =================== misc =================== */ #if MGB_CUDA diff --git a/src/core/impl/graph/var_node_mem_mgr.cpp b/src/core/impl/graph/var_node_mem_mgr.cpp index 401f2390e..bea4b1d07 100644 --- a/src/core/impl/graph/var_node_mem_mgr.cpp +++ b/src/core/impl/graph/var_node_mem_mgr.cpp @@ -123,9 +123,9 @@ StaticDeviceMemoryManager::make_default_impl() { } #endif // MGB_THREAD_SAFE -/* ==================== CUDAAsyncVarReleaser ==================== */ -#if MGB_CUDA -class VarNodeMemManager::CUDAAsyncVarReleaser { +/* ==================== AsyncVarReleaser ==================== */ +#if MGB_CUDA +class VarNodeMemManager::AsyncVarReleaser { struct WaiterParam { CompNode cn; CompNode::Event *event; @@ -133,10 +133,10 @@ class VarNodeMemManager::CUDAAsyncVarReleaser { }; class Waiter final: public AsyncQueueSC { - CUDAAsyncVarReleaser *m_par_releaser; + AsyncVarReleaser *m_par_releaser; public: - Waiter(CUDAAsyncVarReleaser *releaser): + Waiter(AsyncVarReleaser *releaser): m_par_releaser(releaser) { } @@ -159,7 +159,7 @@ class VarNodeMemManager::CUDAAsyncVarReleaser { Spinlock m_event_pool_lock; public: - ~CUDAAsyncVarReleaser() { + ~AsyncVarReleaser() { wait_release_finish(); } @@ -247,15 +247,16 @@ bool VarNodeMemManager::ImpureMemPlanManager::check_need_realloc() { VarNodeMemManager::VarNodeMemManager(ComputingGraphImpl *graph): m_owner_graph(graph), m_seq_mem_opt(graph) -#if MGB_CUDA - ,m_cuda_asyn_var_releaser(new CUDAAsyncVarReleaser) +#if MGB_CUDA + ,m_asyn_var_releaser(new AsyncVarReleaser) #endif { auto on_comp_seq_finish = [this](const event::CompSeqExecFinished& ev) { + MGB_MARK_USED_VAR(ev); // async release is only used for sync between multiple comp nodes, 
@@ -247,15 +247,16 @@ bool VarNodeMemManager::ImpureMemPlanManager::check_need_realloc() {
 VarNodeMemManager::VarNodeMemManager(ComputingGraphImpl *graph):
     m_owner_graph(graph),
     m_seq_mem_opt(graph)
-#if MGB_CUDA
-    ,m_cuda_asyn_var_releaser(new CUDAAsyncVarReleaser)
+#if MGB_CUDA
+    ,m_asyn_var_releaser(new AsyncVarReleaser)
 #endif
 {
     auto on_comp_seq_finish = [this](const event::CompSeqExecFinished& ev) {
+        MGB_MARK_USED_VAR(ev);
         // async release is only used for sync between multiple comp nodes, and
         // does not wait for device to finish
-#if MGB_CUDA
-        m_cuda_asyn_var_releaser->wait_release_finish();
+#if MGB_CUDA
+        m_asyn_var_releaser->wait_release_finish();
 #endif
         m_cpu_async_release_barrier.wait_zero();
     };
@@ -295,9 +296,10 @@ VarNodeMemManager::VarNodeMemManager(ComputingGraphImpl *graph):
     graph->event().register_receiver_permanent<event::CompSeqExecError>(
             on_comp_seq_error);
 
-#if MGB_ENABLE_VAR_DEV_MEM_DEFRAGMENTER
+#if MGB_ENABLE_VAR_DEV_MEM_DEFRAGMENTER && (MGB_CUDA \
+        )
     auto on_mem_defrag_start = [this](const event::BeforeMemDefrag&) {
-        m_cuda_asyn_var_releaser->wait_release_finish();
+        m_asyn_var_releaser->wait_release_finish();
     };
     graph->event().register_receiver_permanent<event::BeforeMemDefrag>(
             on_mem_defrag_start);
@@ -1341,7 +1343,7 @@ void VarNodeMemManager::decr_var_mem_refcnt(
         }
 #if MGB_CUDA
         case DT::CUDA:
-            m_cuda_asyn_var_releaser->add(dispatch_cn, var);
+            m_asyn_var_releaser->add(dispatch_cn, var);
             break;
 #endif
         default:
diff --git a/src/core/impl/graph/var_node_mem_mgr.h b/src/core/impl/graph/var_node_mem_mgr.h
index 414c03708..2f2e99717 100644
--- a/src/core/impl/graph/var_node_mem_mgr.h
+++ b/src/core/impl/graph/var_node_mem_mgr.h
@@ -431,10 +431,10 @@ class VarNodeMemManager {
 
     SyncableCounter m_cpu_async_release_barrier;
 
-#if MGB_CUDA
-    //! release dynamic var on after cuda event finishes
-    class CUDAAsyncVarReleaser;
-    std::unique_ptr<CUDAAsyncVarReleaser> m_cuda_asyn_var_releaser;
+#if MGB_CUDA
+    //! release dynamic var after the compnode event finishes
+    class AsyncVarReleaser;
+    std::unique_ptr<AsyncVarReleaser> m_asyn_var_releaser;
 #endif
 
     VarDevMemDefragmenter m_var_dev_mem_defragmenter{this};
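One C++ subtlety in the header change above: AsyncVarReleaser is only forward-declared, and a std::unique_ptr of an incomplete type is fine at the point of declaration, but the enclosing class's destructor must be compiled where the type is complete, i.e. in var_node_mem_mgr.cpp. A minimal illustration of the idiom (Widget/Impl are placeholder names, not from this patch):

    // widget.h -- Impl stays incomplete, so the header leaks no details
    #include <memory>

    class Widget {
        class Impl;                    // forward declaration only
        std::unique_ptr<Impl> m_impl;  // ok with incomplete Impl
    public:
        Widget();
        ~Widget();  // declared here, must be defined where Impl is complete
    };

    // widget.cpp -- Impl is complete, so unique_ptr's deleter can instantiate;
    // a defaulted destructor in the header would not compile.
    class Widget::Impl {};
    Widget::Widget() : m_impl{new Impl} {}
    Widget::~Widget() = default;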
diff --git a/src/core/include/megbrain/comp_node_env.h b/src/core/include/megbrain/comp_node_env.h
index fd0e846c7..2abece19b 100644
--- a/src/core/include/megbrain/comp_node_env.h
+++ b/src/core/include/megbrain/comp_node_env.h
@@ -41,9 +41,9 @@
         } \
     } while (0)
 
-#endif // MGB_ENABLE_LOGGING
+#endif //MGB_ENABLE_LOGGING
+#endif //MGB_CUDA
 
-#endif
 
 //! whether to enable asynchronous initialization for CompNode and CompNodeEnv
 #define MGB_ENABLE_COMP_NODE_ASYNC_INIT (MGB_CUDA)
diff --git a/src/core/include/megbrain/exception.h b/src/core/include/megbrain/exception.h
index 9f5eb604c..ecdfdbf0c 100644
--- a/src/core/include/megbrain/exception.h
+++ b/src/core/include/megbrain/exception.h
@@ -136,7 +136,6 @@ public:
      * error message
      */
     static std::string get_cuda_extra_info();
-
     CudaError(const std::string& msg);
 };
diff --git a/src/core/test/comp_node.cpp b/src/core/test/comp_node.cpp
index d16a8f7e0..9731f8aca 100644
--- a/src/core/test/comp_node.cpp
+++ b/src/core/test/comp_node.cpp
@@ -59,9 +59,6 @@ TEST(TestCompNode, Parse) {
     ASSERT_THROW(L::parse("cpu0:"), MegBrainError);
     ASSERT_THROW(L::parse("cpu0:x"), MegBrainError);
     ASSERT_THROW(L::parse("cpu2:23x"), MegBrainError);
-    ASSERT_THROW(L::parse("heaxgon0"), MegBrainError);
-    ASSERT_THROW(L::parse("rcom0"), MegBrainError);
-    ASSERT_THROW(L::parse("cmabricon0"), MegBrainError);
     ASSERT_THROW(L::parse("multithread"), MegBrainError);
     ASSERT_THROW(L::parse("multithread1:"), MegBrainError);
     ASSERT_THROW(L::parse("multithread1:default"), MegBrainError);
diff --git a/src/megbrain_build_config.h.in b/src/megbrain_build_config.h.in
index 55afbe61c..d04fd787a 100644
--- a/src/megbrain_build_config.h.in
+++ b/src/megbrain_build_config.h.in
@@ -53,6 +53,7 @@
 #cmakedefine01 MEGDNN_THREADS_512
 #cmakedefine01 MEGDNN_ENABLE_MULTI_THREADS
 
+
 // whether cuda is available
 #ifndef MGB_CUDA
 #define MGB_CUDA 1
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 4559e9918..767715b60 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -15,6 +15,7 @@ if (MGE_WITH_CUDA AND MGE_WITH_TRT)
     list(APPEND SOURCES ${SOURCES_})
 endif()
 
+
 add_executable(megbrain_test ${SOURCES})
 target_link_libraries(megbrain_test gtest)
 target_link_libraries(megbrain_test megengine)
diff --git a/test/src/helper.cpp b/test/src/helper.cpp
index bedca8fb5..df4fe649a 100644
--- a/test/src/helper.cpp
+++ b/test/src/helper.cpp
@@ -98,22 +98,48 @@ dtype, RandomDistribution::UNIFORM>::operator ()(
     return ret;
 }
 
+template<class dtype>
+std::shared_ptr<HostTensorND> HostTensorGenerator<
+dtype, RandomDistribution::CONSTANT>::operator ()(
+        const TensorShape &shape, CompNode cn) {
+    if (!cn.valid())
+        cn = CompNode::load("xpu0");
+    std::shared_ptr<HostTensorND> ret =
+        std::make_shared<HostTensorND>(cn, shape, dtype());
+    auto ptr = ret->ptr<ctype>();
+    for (size_t i = 0, it = shape.total_nr_elems(); i < it; ++ i) {
+        ptr[i] = m_default_val;
+    }
+    return ret;
+}
+
+
 // explicit instantialization of HostTensorGenerator
 namespace mgb {
     template class HostTensorGenerator<
         dtype::Float32, RandomDistribution::GAUSSIAN>;
     template class HostTensorGenerator<
         dtype::Float32, RandomDistribution::UNIFORM>;
+    template class HostTensorGenerator<
+        dtype::Float32, RandomDistribution::CONSTANT>;
     template class HostTensorGenerator<
         dtype::Float16, RandomDistribution::GAUSSIAN>;
     template class HostTensorGenerator<
         dtype::Int8, RandomDistribution::UNIFORM>;
+    template class HostTensorGenerator<
+        dtype::Int8, RandomDistribution::CONSTANT>;
     template class HostTensorGenerator<
         dtype::Uint8, RandomDistribution::UNIFORM>;
+    template class HostTensorGenerator<
+        dtype::Uint8, RandomDistribution::CONSTANT>;
     template class HostTensorGenerator<
         dtype::Int16, RandomDistribution::UNIFORM>;
+    template class HostTensorGenerator<
+        dtype::Int16, RandomDistribution::CONSTANT>;
     template class HostTensorGenerator<
         dtype::Int32, RandomDistribution::UNIFORM>;
+    template class HostTensorGenerator<
+        dtype::Int32, RandomDistribution::CONSTANT>;
 
 std::shared_ptr HostTensorGenerator::
 operator()(const TensorShape& shape, CompNode cn) {
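With the CONSTANT distribution implemented and instantiated above, tests can build a tensor filled with one value in a single line. A usage sketch based on the declarations in this patch (the shape and fill value are arbitrary examples):

    #include "megbrain/test/helper.h"

    using namespace mgb;

    void example() {
        // generator whose every output element equals 1.2f
        HostTensorGenerator<dtype::Float32, RandomDistribution::CONSTANT> gen{1.2f};
        // (2, 3) tensor on "xpu0", the fallback comp node when cn is omitted
        auto host_x = gen({2, 3});
        // host_x->ptr<float>()[i] == 1.2f for all six elements
    }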
diff --git a/test/src/include/megbrain/test/helper.h b/test/src/include/megbrain/test/helper.h
index e4e9f51eb..065fade48 100644
--- a/test/src/include/megbrain/test/helper.h
+++ b/test/src/include/megbrain/test/helper.h
@@ -175,7 +175,7 @@ class RNGxorshf {
 };
 
 enum class RandomDistribution {
-    GAUSSIAN, UNIFORM
+    GAUSSIAN, UNIFORM, CONSTANT
 };
 
 template
@@ -322,6 +322,26 @@ class HostTensorGenerator final:
         ctype m_lo, m_hi;
 };
 
+//! const value
+template<class dtype>
+class HostTensorGenerator<dtype, RandomDistribution::CONSTANT> final:
+    public HostTensorGeneratorBase {
+
+    public:
+        using ctype = typename DTypeTrait<dtype>::ctype;
+
+        HostTensorGenerator(ctype default_val)
+                : HostTensorGeneratorBase{next_rand_seed()},
+                  m_default_val{default_val} {}
+
+        std::shared_ptr<HostTensorND> operator ()(
+                const TensorShape &shape, CompNode cn = {}) override;
+        using HostTensorGeneratorBase::operator();
+
+    private:
+        ctype m_default_val;
+};
+
 template <>
 class HostTensorGenerator final
         : public HostTensorGeneratorBase {
diff --git a/tools/param_defs/mgb_opr_param_defs.py b/tools/param_defs/mgb_opr_param_defs.py
index c1ad2fe65..6b64364a6 100644
--- a/tools/param_defs/mgb_opr_param_defs.py
+++ b/tools/param_defs/mgb_opr_param_defs.py
@@ -21,8 +21,8 @@ pdef('PersistentOutputStorage').add_fields(
 (pdef('ExecutionPolicy', 'specify how to select an algorithm for an operator').
  add_enum('Strategy',
           Doc('HEURISTIC', 'use heuristic to choose the fastest algorithm'),
-          Doc('HEURISTIC_REPRODUCIBLE', 'use heuristic to choose the fastest algorithm, '
-              'and the chosen algorithm is reproducible'),
+          Doc('HEURISTIC_REPRODUCIBLE', 'use heuristic to choose the fastest algorithm, '
+              'and the chosen algorithm is reproducible'),
           Doc('PROFILE',
               'run possible algorithms on real device to find the best'),
           Doc('PROFILE_REPRODUCIBLE',
-- 
GitLab