diff --git a/CMakeLists.txt b/CMakeLists.txt index b753bd0a3c54b79729422afaa6101b4629ea8d0b..9c67ac733bf3727a6d8ca72aa87a895bc76a56ff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -143,6 +143,15 @@ if(CXX_SUPPORT_GOLD AND NOT ANDROID AND NOT APPLE AND NOT MSVC AND NOT WIN32) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fuse-ld=gold") endif() +option(MGE_WITH_JIT "Build MegEngine with JIT." ON) +option(MGE_WITH_HALIDE "Build MegEngine with Halide JIT" ON) +option(MGE_DISABLE_FLOAT16 "Disable MegEngine float16 support." OFF) +option(MGE_WITH_CUDA "Enable MegEngine CUDA support." ON) +option(MGE_CUDA_USE_STATIC "Enable MegEngine CUDA static linking." ON) +option(MGE_WITH_TRT "Build MegEngine with TensorRT." ON) +option(MGE_USE_SYSTEM_LIB "Build MegEngine with system libraries." OFF) +option(MGB_WITH_FLATBUFFERS "Build MegBrain with FlatBuffers serialization support." ON) + if(NOT MGE_WITH_JIT) if(MGE_WITH_HALIDE) message(WARNING "MGE_WITH_HALIDE is set to OFF with MGE_WITH_JIT disabled") diff --git a/dnn/include/megcore.h b/dnn/include/megcore.h index fead54e47461e25f72272d4128da0d19f52fb5a1..a8effea2e122424cf91d8650028334d76b00bfe2 100644 --- a/dnn/include/megcore.h +++ b/dnn/include/megcore.h @@ -84,6 +84,7 @@ megcoreStatus_t megcoreGetDeviceFlags( unsigned int *flags); megcoreStatus_t megcoreActivate(megcoreDeviceHandle_t handle); +megcoreStatus_t megcoreDeactivate(megcoreDeviceHandle_t handle); megcoreStatus_t megcoreMalloc(megcoreDeviceHandle_t handle, void **devPtr, size_t sizeInBytes); megcoreStatus_t megcoreFree(megcoreDeviceHandle_t handle, diff --git a/dnn/src/CMakeLists.txt b/dnn/src/CMakeLists.txt index 93411e1686e6abc71fdee0b3b4ea8000cde2657d..ecafe71c164765a685318dc3cf1a702183a73102 100644 --- a/dnn/src/CMakeLists.txt +++ b/dnn/src/CMakeLists.txt @@ -86,6 +86,7 @@ if (BUILD_SHARED_LIBS) else() target_link_libraries(megdnn PRIVATE ${MGE_BLAS_LIBS}) endif() + if(CMAKE_THREAD_LIBS_INIT) target_link_libraries(megdnn PRIVATE Threads::Threads) endif() diff --git a/dnn/src/common/megcore/common/device_context.hpp b/dnn/src/common/megcore/common/device_context.hpp index 765132be978c4f966e8183bebfc76e79f7b47b72..c12f59c31a9eb00d144b9f310729ae57f01b15da 100644 --- a/dnn/src/common/megcore/common/device_context.hpp +++ b/dnn/src/common/megcore/common/device_context.hpp @@ -38,6 +38,7 @@ class DeviceContext { virtual size_t mem_alignment_in_bytes() const noexcept = 0; virtual void activate() = 0; + virtual void deactivate() {} virtual void *malloc(size_t size_in_bytes) = 0; virtual void free(void *ptr) = 0; diff --git a/dnn/src/common/megcore/public_api/device.cpp b/dnn/src/common/megcore/public_api/device.cpp index 96dfaa7673748efa81b8b7d4019c66f60560abaf..7dbe0068527b4130ee7badd2b6b4f8ce02b92162 100644 --- a/dnn/src/common/megcore/public_api/device.cpp +++ b/dnn/src/common/megcore/public_api/device.cpp @@ -74,6 +74,13 @@ megcoreStatus_t megcoreActivate(megcoreDeviceHandle_t handle) return megcoreSuccess; } +megcoreStatus_t megcoreDeactivate(megcoreDeviceHandle_t handle) +{ + megdnn_assert(handle); + handle->content->deactivate(); + return megcoreSuccess; +} + megcoreStatus_t megcoreMalloc(megcoreDeviceHandle_t handle, void **devPtr, size_t sizeInBytes) { diff --git a/dnn/test/CMakeLists.txt b/dnn/test/CMakeLists.txt index 07bd780381bc0856cf954a6d5efed5bd6d825eda..2a50bdd89dfeb2412b6c7271666c05b5b11df9ec 100644 --- a/dnn/test/CMakeLists.txt +++ b/dnn/test/CMakeLists.txt @@ -27,7 +27,6 @@ endif() - add_executable(megdnn_test ${SOURCES}) target_link_libraries(megdnn_test gtest) target_link_libraries(megdnn_test megdnn ${MGE_BLAS_LIBS}) diff --git a/python_module/src/cpp/opr_defs.cpp b/python_module/src/cpp/opr_defs.cpp index db1e8febbac034755ea757fbf7356c2a06906300..1cbc5979d993d860bcd1b0a7deae3ff6ee89798d 100644 --- a/python_module/src/cpp/opr_defs.cpp +++ b/python_module/src/cpp/opr_defs.cpp @@ -246,6 +246,7 @@ SymbolVarArray _Opr::tensor_rt_runtime(const SymbolVarArray& inputs, } #endif + SymbolVar _Opr::timestamp(SymbolVar input, PyObject* dest, size_t dest_off, const OperatorNodeConfig& config) { auto tensor = std::make_shared( diff --git a/python_module/src/cpp/opr_defs.h b/python_module/src/cpp/opr_defs.h index 2998d545ec17af802d6ac8c0dc0ee0ba8c9f2c1e..82ac9ceb78fa7a5f3afc81d65573ce6aa8365e19 100644 --- a/python_module/src/cpp/opr_defs.h +++ b/python_module/src/cpp/opr_defs.h @@ -118,6 +118,8 @@ static SymbolVarArray tensor_rt_runtime(const SymbolVarArray& inputs, PyObject* data_bytes, const OperatorNodeConfig& config); + + static SymbolVar timestamp(SymbolVar input, PyObject* dest, size_t dest_off, const OperatorNodeConfig& config); diff --git a/python_module/src/cpp/opr_helper.h b/python_module/src/cpp/opr_helper.h index 27e7eeb4a10ef9aeecb1721f89bd4d988be1fd81..15b49d5ab9f695b1e768f9456857d0f98018dc8e 100644 --- a/python_module/src/cpp/opr_helper.h +++ b/python_module/src/cpp/opr_helper.h @@ -18,7 +18,6 @@ #if MGB_ENABLE_OPR_MM #include "megbrain/opr/collective_comm.h" #endif - using AxisIndexer = mgb::opr::indexing::AxisIndexer; /*! diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 13331421e5de6f2a1bb5f1e820cf0a39e05da5bf..11a4cef9c192a794d26fe305c550dc7ea0ba787b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -88,7 +88,7 @@ if (MGB_WITH_FLATBUFFERS) ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/opr_param_defs.fbs COMMAND ${PYTHON_EXECUTABLE} ${GEN_FLATBUFFERS_SCHEMA_PY} ${OPR_PARAM_DEFS_PY} ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/opr_param_defs.fbs - DEPENDS ${GEN_FLATBUFFERS_SCHEMA_PY} ${OPR_PARAM_DEFS_PY} + DEPENDS ${GEN_FLATBUFFERS_SCHEMA_PY} ${OPR_PARAM_DEFS_PY} VERBATIM ) add_custom_command( @@ -124,7 +124,6 @@ if (MGB_WITH_FLATBUFFERS) target_include_directories(megbrain PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/serialization/include) target_compile_definitions(megbrain PUBLIC MGB_ENABLE_FBS_SERIALIZATION=1) target_link_libraries(megbrain PUBLIC flatbuffers) - set (GENERATED_FLATBUFFERS_CONVERTER_PATH ${CMAKE_CURRENT_BINARY_DIR}/genfiles) set (GEN_FLATBUFFERS_CONVERTER_PY ${PROJECT_SOURCE_DIR}/dnn/scripts/gen_flatbuffers_converter.py) file (MAKE_DIRECTORY ${GENERATED_FLATBUFFERS_CONVERTER_PATH}) diff --git a/src/core/impl/comp_node_env.cpp b/src/core/impl/comp_node_env.cpp index 9c8bfb899e46c2c266439c62e782f471608de012..b584868d7c9673a2f0c7556fbfb652c3097d9d9b 100644 --- a/src/core/impl/comp_node_env.cpp +++ b/src/core/impl/comp_node_env.cpp @@ -96,7 +96,7 @@ megcore::AsyncErrorInfo* MegDNNHandle::make_async_error_info( cn.free_device(ptr); } }; - megcore::AsyncErrorInfo zero_info{0, nullptr, "", {0,0,0,0}}; + megcore::AsyncErrorInfo zero_info{0, nullptr, "", {0, 0, 0, 0}}; auto ptr = static_cast( env.comp_node().alloc_device(sizeof(zero_info))); cn.copy_to_device(ptr, &zero_info, sizeof(zero_info)); @@ -106,7 +106,7 @@ megcore::AsyncErrorInfo* MegDNNHandle::make_async_error_info( } #endif -/* =================== misc =================== */ + /* =================== misc =================== */ #if MGB_CUDA diff --git a/src/core/impl/graph/var_node_mem_mgr.cpp b/src/core/impl/graph/var_node_mem_mgr.cpp index 401f2390ebd4200edfd144ab9cc7dcbb6fb5584d..bea4b1d070fb185aba384f930fb591456881b842 100644 --- a/src/core/impl/graph/var_node_mem_mgr.cpp +++ b/src/core/impl/graph/var_node_mem_mgr.cpp @@ -123,9 +123,9 @@ StaticDeviceMemoryManager::make_default_impl() { } #endif // MGB_THREAD_SAFE -/* ==================== CUDAAsyncVarReleaser ==================== */ -#if MGB_CUDA -class VarNodeMemManager::CUDAAsyncVarReleaser { +/* ==================== AsyncVarReleaser ==================== */ +#if MGB_CUDA +class VarNodeMemManager::AsyncVarReleaser { struct WaiterParam { CompNode cn; CompNode::Event *event; @@ -133,10 +133,10 @@ class VarNodeMemManager::CUDAAsyncVarReleaser { }; class Waiter final: public AsyncQueueSC { - CUDAAsyncVarReleaser *m_par_releaser; + AsyncVarReleaser *m_par_releaser; public: - Waiter(CUDAAsyncVarReleaser *releaser): + Waiter(AsyncVarReleaser *releaser): m_par_releaser(releaser) { } @@ -159,7 +159,7 @@ class VarNodeMemManager::CUDAAsyncVarReleaser { Spinlock m_event_pool_lock; public: - ~CUDAAsyncVarReleaser() { + ~AsyncVarReleaser() { wait_release_finish(); } @@ -247,15 +247,16 @@ bool VarNodeMemManager::ImpureMemPlanManager::check_need_realloc() { VarNodeMemManager::VarNodeMemManager(ComputingGraphImpl *graph): m_owner_graph(graph), m_seq_mem_opt(graph) -#if MGB_CUDA - ,m_cuda_asyn_var_releaser(new CUDAAsyncVarReleaser) +#if MGB_CUDA + ,m_asyn_var_releaser(new AsyncVarReleaser) #endif { auto on_comp_seq_finish = [this](const event::CompSeqExecFinished& ev) { + MGB_MARK_USED_VAR(ev); // async release is only used for sync between multiple comp nodes, and // does not wait for device to finish -#if MGB_CUDA - m_cuda_asyn_var_releaser->wait_release_finish(); +#if MGB_CUDA + m_asyn_var_releaser->wait_release_finish(); #endif m_cpu_async_release_barrier.wait_zero(); }; @@ -295,9 +296,10 @@ VarNodeMemManager::VarNodeMemManager(ComputingGraphImpl *graph): graph->event().register_receiver_permanent( on_comp_seq_error); -#if MGB_ENABLE_VAR_DEV_MEM_DEFRAGMENTER +#if MGB_ENABLE_VAR_DEV_MEM_DEFRAGMENTER && (MGB_CUDA \ + ) auto on_mem_defrag_start = [this](const event::BeforeMemDefrag&) { - m_cuda_asyn_var_releaser->wait_release_finish(); + m_asyn_var_releaser->wait_release_finish(); }; graph->event().register_receiver_permanent( on_mem_defrag_start); @@ -1341,7 +1343,7 @@ void VarNodeMemManager::decr_var_mem_refcnt( } #if MGB_CUDA case DT::CUDA: - m_cuda_asyn_var_releaser->add(dispatch_cn, var); + m_asyn_var_releaser->add(dispatch_cn, var); break; #endif default: diff --git a/src/core/impl/graph/var_node_mem_mgr.h b/src/core/impl/graph/var_node_mem_mgr.h index 414c03708048abb477446f4db180b6f7953b7aed..2f2e99717f288ee174e9853de2685ba361e1c33f 100644 --- a/src/core/impl/graph/var_node_mem_mgr.h +++ b/src/core/impl/graph/var_node_mem_mgr.h @@ -431,10 +431,10 @@ class VarNodeMemManager { SyncableCounter m_cpu_async_release_barrier; -#if MGB_CUDA - //! release dynamic var on after cuda event finishes - class CUDAAsyncVarReleaser; - std::unique_ptr m_cuda_asyn_var_releaser; +#if MGB_CUDA + //! release dynamic var on after compnode event finishes + class AsyncVarReleaser; + std::unique_ptr m_asyn_var_releaser; #endif VarDevMemDefragmenter m_var_dev_mem_defragmenter{this}; diff --git a/src/core/include/megbrain/comp_node_env.h b/src/core/include/megbrain/comp_node_env.h index fd0e846c7c560a38f140ece0b4fdd1e762a6c2b1..2abece19b723a01103feefbab878ecfab80caa77 100644 --- a/src/core/include/megbrain/comp_node_env.h +++ b/src/core/include/megbrain/comp_node_env.h @@ -41,9 +41,9 @@ } \ } while (0) -#endif // MGB_ENABLE_LOGGING +#endif //MGB_ENABLE_LOGGING +#endif //MGB_CUDA -#endif //! whether to enable asynchronous initialization for CompNode and CompNodeEnv #define MGB_ENABLE_COMP_NODE_ASYNC_INIT (MGB_CUDA) diff --git a/src/core/include/megbrain/exception.h b/src/core/include/megbrain/exception.h index 9f5eb604c4ec8a93a02ff100ff15290a844156d7..ecdfdbf0c97348f69712b9c3db0fcf7d9ea3418d 100644 --- a/src/core/include/megbrain/exception.h +++ b/src/core/include/megbrain/exception.h @@ -136,7 +136,6 @@ public: * error message */ static std::string get_cuda_extra_info(); - CudaError(const std::string& msg); }; diff --git a/src/core/test/comp_node.cpp b/src/core/test/comp_node.cpp index d16a8f7e08b7b8de8f03203d09c1047e1e25ef14..9731f8aca105cd6d444ae3a0bfa28f405c1c7a43 100644 --- a/src/core/test/comp_node.cpp +++ b/src/core/test/comp_node.cpp @@ -59,9 +59,6 @@ TEST(TestCompNode, Parse) { ASSERT_THROW(L::parse("cpu0:"), MegBrainError); ASSERT_THROW(L::parse("cpu0:x"), MegBrainError); ASSERT_THROW(L::parse("cpu2:23x"), MegBrainError); - ASSERT_THROW(L::parse("heaxgon0"), MegBrainError); - ASSERT_THROW(L::parse("rcom0"), MegBrainError); - ASSERT_THROW(L::parse("cmabricon0"), MegBrainError); ASSERT_THROW(L::parse("multithread"), MegBrainError); ASSERT_THROW(L::parse("multithread1:"), MegBrainError); ASSERT_THROW(L::parse("multithread1:default"), MegBrainError); diff --git a/src/megbrain_build_config.h.in b/src/megbrain_build_config.h.in index 55afbe61c37f3da9f2d39862b71b7dab78785ae2..d04fd787a5060a54ff8bedabcfbdc5b04882c8b6 100644 --- a/src/megbrain_build_config.h.in +++ b/src/megbrain_build_config.h.in @@ -53,6 +53,7 @@ #cmakedefine01 MEGDNN_THREADS_512 #cmakedefine01 MEGDNN_ENABLE_MULTI_THREADS + // whether cuda is available #ifndef MGB_CUDA #define MGB_CUDA 1 diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 4559e9918fd75340e9486d6cf1cc8f2da0708e9c..767715b6019ff02f73bb63e39159adcdc0820d63 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -15,6 +15,7 @@ if (MGE_WITH_CUDA AND MGE_WITH_TRT) list(APPEND SOURCES ${SOURCES_}) endif() + add_executable(megbrain_test ${SOURCES}) target_link_libraries(megbrain_test gtest) target_link_libraries(megbrain_test megengine) diff --git a/test/src/helper.cpp b/test/src/helper.cpp index bedca8fb53daa06b7c07fabcf873889ac2ff60a2..df4fe649ad14d97ec43940f796f7f212f2ae21bf 100644 --- a/test/src/helper.cpp +++ b/test/src/helper.cpp @@ -98,22 +98,48 @@ dtype, RandomDistribution::UNIFORM>::operator ()( return ret; } +template +std::shared_ptr HostTensorGenerator< +dtype, RandomDistribution::CONSTANT>::operator ()( + const TensorShape &shape, CompNode cn) { + if (!cn.valid()) + cn = CompNode::load("xpu0"); + std::shared_ptr ret = + std::make_shared(cn, shape, dtype()); + auto ptr = ret->ptr(); + for (size_t i = 0, it = shape.total_nr_elems(); i < it; ++ i) { + ptr[i] = m_default_val; + } + return ret; +} + + // explicit instantialization of HostTensorGenerator namespace mgb { template class HostTensorGenerator< dtype::Float32, RandomDistribution::GAUSSIAN>; template class HostTensorGenerator< dtype::Float32, RandomDistribution::UNIFORM>; + template class HostTensorGenerator< + dtype::Float32, RandomDistribution::CONSTANT>; template class HostTensorGenerator< dtype::Float16, RandomDistribution::GAUSSIAN>; template class HostTensorGenerator< dtype::Int8, RandomDistribution::UNIFORM>; + template class HostTensorGenerator< + dtype::Int8, RandomDistribution::CONSTANT>; template class HostTensorGenerator< dtype::Uint8, RandomDistribution::UNIFORM>; + template class HostTensorGenerator< + dtype::Uint8, RandomDistribution::CONSTANT>; template class HostTensorGenerator< dtype::Int16, RandomDistribution::UNIFORM>; + template class HostTensorGenerator< + dtype::Int16, RandomDistribution::CONSTANT>; template class HostTensorGenerator< dtype::Int32, RandomDistribution::UNIFORM>; + template class HostTensorGenerator< + dtype::Int32, RandomDistribution::CONSTANT>; std::shared_ptr HostTensorGenerator:: operator()(const TensorShape& shape, CompNode cn) { diff --git a/test/src/include/megbrain/test/helper.h b/test/src/include/megbrain/test/helper.h index e4e9f51eb25f0ddebdf97d0f14499ed0bcc47156..065fade48eadf43e7f5780a10b40030348fa05f7 100644 --- a/test/src/include/megbrain/test/helper.h +++ b/test/src/include/megbrain/test/helper.h @@ -175,7 +175,7 @@ class RNGxorshf { }; enum class RandomDistribution { - GAUSSIAN, UNIFORM + GAUSSIAN, UNIFORM, CONSTANT }; template @@ -322,6 +322,26 @@ class HostTensorGenerator final: ctype m_lo, m_hi; }; +//! const value +template +class HostTensorGenerator final: + public HostTensorGeneratorBase { + + public: + using ctype = typename DTypeTrait::ctype; + + HostTensorGenerator(ctype default_val) + : HostTensorGeneratorBase{next_rand_seed()}, + m_default_val{default_val} {} + + std::shared_ptr operator ()( + const TensorShape &shape, CompNode cn = {}) override; + using HostTensorGeneratorBase::operator(); + + private: + ctype m_default_val; +}; + template <> class HostTensorGenerator final : public HostTensorGeneratorBase { diff --git a/tools/param_defs/mgb_opr_param_defs.py b/tools/param_defs/mgb_opr_param_defs.py index c1ad2fe65fed28f8a19a47ef48804a1af833c4b7..6b64364a65d06ba4b677f68d92cfad5197d07b24 100644 --- a/tools/param_defs/mgb_opr_param_defs.py +++ b/tools/param_defs/mgb_opr_param_defs.py @@ -21,8 +21,8 @@ pdef('PersistentOutputStorage').add_fields( (pdef('ExecutionPolicy', 'specify how to select an algorithm for an operator'). add_enum('Strategy', Doc('HEURISTIC', 'use heuristic to choose the fastest algorithm'), - Doc('HEURISTIC_REPRODUCIBLE', 'use heuristic to choose the fastest algorithm, ' - 'and the chosen algorithm is reproducible'), + Doc('HEURISTIC_REPRODUCIBLE', 'use heuristic to choose the fastest algorithm, ' + 'and the chosen algorithm is reproducible'), Doc('PROFILE', 'run possible algorithms on real device to find the best'), Doc('PROFILE_REPRODUCIBLE',