From 1783b8977a7aa49c6023b3802a2e9e35087a80c4 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Tue, 12 Apr 2022 19:23:57 +0800 Subject: [PATCH] feat(profiler): integrate cupti backend GitOrigin-RevId: dec8be1908f12b25a71c541386387ee8b9da3a11 --- CMakeLists.txt | 11 + cmake/cudnn.cmake | 2 +- cmake/cupti.cmake | 85 ++++++ cmake/tensorrt.cmake | 2 +- imperative/CMakeLists.txt | 4 + imperative/python/megengine/utils/profiler.py | 23 +- imperative/python/src/tensor.cpp | 5 + imperative/src/impl/cpp_cupti.cpp | 273 ++++++++++++++++++ imperative/src/impl/profiler.cpp | 17 ++ .../src/impl/profiler/chrome_timeline.cpp | 92 +++++- imperative/src/impl/profiler/events.h | 55 ++++ imperative/src/impl/profiler/states.h | 59 +++- imperative/src/impl/utils/platform.cpp | 25 ++ .../include/megbrain/imperative/cpp_cupti.h | 86 ++++++ .../include/megbrain/imperative/profiler.h | 11 +- .../megbrain/imperative/utils/platform.h | 9 + imperative/test/CMakeLists.txt | 4 + .../whl/manylinux2014/build_wheel_common.sh | 28 +- src/megbrain_build_config.h.in | 1 + 19 files changed, 761 insertions(+), 31 deletions(-) create mode 100644 cmake/cupti.cmake create mode 100644 imperative/src/impl/cpp_cupti.cpp create mode 100644 imperative/src/impl/utils/platform.cpp create mode 100644 imperative/src/include/megbrain/imperative/cpp_cupti.h create mode 100644 imperative/src/include/megbrain/imperative/utils/platform.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 8af8648fc..1dd92aa80 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,7 @@ endif() include(GNUInstallDirs) include(CheckCXXCompilerFlag) include(CheckIPOSupported) +include(CMakeDependentOption) check_cxx_compiler_flag(-Wclass-memaccess CXX_SUPPORT_WCLASS_MEMACCESS) @@ -97,6 +98,12 @@ option(MGE_BUILD_WITH_ASAN "Enable build with ASAN, need compiler support" OFF) option(MGE_WITH_CUSTOM_OP "Build with Custom op" OFF) option(MGE_SYNC_THIRD_PARTY "help sync third_party submodule" OFF) +# TODO: add windows support 
+cmake_dependent_option(MGE_WITH_CUPTI "Build with CUPTI" ON
+                       "MGE_WITH_CUDA;MGE_BUILD_IMPERATIVE_RT;NOT MSVC;NOT WIN32" OFF)
+
+set(MGB_CUPTI ${MGE_WITH_CUPTI})
+
 if(MSVC OR WIN32)
   # FIXME: static link Windows vc runtime with some version from Visual Studio have some
   # runtime issue at some call PATH, for example: _imperative_rt.pyd -->
@@ -686,6 +693,10 @@ if(MGB_WITH_FLATBUFFERS)
   include(cmake/flatbuffers.cmake)
 endif()
 
+if(MGE_WITH_CUPTI)
+  include(cmake/cupti.cmake)
+endif()
+
 if(MGE_WITH_CUDA)
   include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
   foreach(path ${CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES})
diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake
index cb7c8031c..302b6f22e 100644
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -6,7 +6,7 @@ endif()
 if("${CUDNN_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{CUDNN_ROOT_DIR}" STREQUAL "")
   set(CUDNN_ROOT_DIR $ENV{CUDNN_ROOT_DIR})
 endif()
-message("CUDNN ROOT: " ${CUDNN_ROOT_DIR})
+message(STATUS "CUDNN ROOT: ${CUDNN_ROOT_DIR}")
 if(MGE_CUDA_USE_STATIC AND NOT MGE_WITH_CUDNN_SHARED)
   find_library(
     CUDNN_LIBRARY
diff --git a/cmake/cupti.cmake b/cmake/cupti.cmake
new file mode 100644
index 000000000..791a3cc88
--- /dev/null
+++ b/cmake/cupti.cmake
@@ -0,0 +1,85 @@
+# Locate CuPTI under the CUDA toolkit and expose it as the IMPORTED target libcupti.
+if("${CUDA_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{CUDA_ROOT_DIR}" STREQUAL "")
+  set(CUDA_ROOT_DIR "$ENV{CUDA_ROOT_DIR}")
+endif()
+if("${CUDA_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{CUDA_PATH}" STREQUAL "")
+  set(CUDA_ROOT_DIR "$ENV{CUDA_PATH}")
+endif()
+if("${CUDA_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{CUDA_BIN_PATH}" STREQUAL "")
+  set(CUDA_ROOT_DIR "$ENV{CUDA_BIN_PATH}")
+endif()
+if("${CUDA_ROOT_DIR}" STREQUAL "")
+  message(FATAL_ERROR "Can not find CUDA, please export cuda sdk path to "
+                      "CUDA_ROOT_DIR or CUDA_PATH or CUDA_BIN_PATH")
+endif()
+# TODO: find_library(CUDA_ROOT_DIR) in cmake/cuda.cmake
+
+set(MGE_CUPTI_USE_STATIC "${MGE_CUDA_USE_STATIC}")
+
+# relates https://stackoverflow.com/questions/67485114
+# Bare variable names: if() must still parse when CXX_SUPPORT_GOLD is empty/undefined.
+if(MGE_CUDA_USE_STATIC AND CXX_SUPPORT_GOLD)
+  message(WARNING "static linking CuPTI with gold may break exception handling, "
+                  "use shared one instead")
+  set(MGE_CUPTI_USE_STATIC OFF)
+endif()
+
+if(MGE_CUPTI_USE_STATIC)
+  find_library(
+    CUPTI_LIBRARY
+    NAMES libcupti_static.a
+    HINTS "${CUDA_ROOT_DIR}" "${CUDA_ROOT_DIR}/extras/CUPTI"
+    PATH_SUFFIXES lib lib64
+    DOC "CuPTI library.")
+
+  if(NOT CUPTI_LIBRARY) # true for "" and for CUPTI_LIBRARY-NOTFOUND
+    message(WARNING "Can not find static CuPTI Library, use shared one instead")
+    set(MGE_CUPTI_USE_STATIC OFF)
+  endif()
+endif()
+
+if(NOT MGE_CUPTI_USE_STATIC)
+  find_library(
+    CUPTI_LIBRARY
+    NAMES libcupti.so
+    HINTS "${CUDA_ROOT_DIR}" "${CUDA_ROOT_DIR}/extras/CUPTI"
+    PATH_SUFFIXES lib lib64
+    DOC "CuPTI library.")
+  set(CUPTI_LIBRARY_TYPE SHARED)
+else()
+  set(CUPTI_LIBRARY_TYPE STATIC)
+endif()
+
+if(NOT CUPTI_LIBRARY)
+  message(FATAL_ERROR "Can not find CuPTI Library")
+endif()
+
+find_path(
+  CUPTI_INCLUDE_DIR
+  NAMES cupti.h
+  HINTS "${CUDA_ROOT_DIR}" "${CUDA_ROOT_DIR}/extras/CUPTI"
+  PATH_SUFFIXES include
+  DOC "Path to CuPTI include directory.")
+
+if(NOT CUPTI_INCLUDE_DIR)
+  message(FATAL_ERROR "Can not find CuPTI INCLUDE")
+endif()
+
+if(EXISTS "${CUPTI_INCLUDE_DIR}/cupti_version.h")
+  file(READ "${CUPTI_INCLUDE_DIR}/cupti_version.h" CUPTI_VERSION_FILE_CONTENTS)
+else()
+  file(READ "${CUPTI_INCLUDE_DIR}/cupti.h" CUPTI_VERSION_FILE_CONTENTS)
+endif()
+
+string(REGEX MATCH "define CUPTI_API_VERSION * +([0-9]+)" CUPTI_API_VERSION
+             "${CUPTI_VERSION_FILE_CONTENTS}")
+string(REGEX REPLACE "define CUPTI_API_VERSION * +([0-9]+)" "\\1" CUPTI_API_VERSION
+                     "${CUPTI_API_VERSION}")
+
+add_library(libcupti ${CUPTI_LIBRARY_TYPE} IMPORTED)
+
+set_target_properties(
+  libcupti PROPERTIES IMPORTED_LOCATION "${CUPTI_LIBRARY}"
+                      INTERFACE_INCLUDE_DIRECTORIES "${CUPTI_INCLUDE_DIR}")
+
+message(STATUS "Found CuPTI: ${CUPTI_LIBRARY} (found version: ${CUPTI_API_VERSION})")
diff --git 
a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake index ecd742d02..787907d2e 100644 --- a/cmake/tensorrt.cmake +++ b/cmake/tensorrt.cmake @@ -36,7 +36,7 @@ else() PATH_SUFFIXES lib lib64 DOC "TRT plugin library.") endif() -message("TRT_LIBRARY" ${TRT_LIBRARY}) +message(STATUS "TRT_LIBRARY: ${TRT_LIBRARY}") if(TRT_LIBRARY STREQUAL "TRT_LIBRARY-NOTFOUND") message( FATAL_ERROR diff --git a/imperative/CMakeLists.txt b/imperative/CMakeLists.txt index 76d35e3a9..1e94ab207 100644 --- a/imperative/CMakeLists.txt +++ b/imperative/CMakeLists.txt @@ -51,6 +51,10 @@ if(ANDROID) target_link_libraries(${MODULE_NAME} PRIVATE ${PYTHON_LIBRARIES}) endif() +if(MGE_WITH_CUPTI) + target_link_libraries(${MODULE_NAME} PRIVATE libcupti) +endif() + add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/range-v3 ${PROJECT_BINARY_DIR}/third_party/range-v3) target_link_libraries(${MODULE_NAME} PRIVATE range-v3) diff --git a/imperative/python/megengine/utils/profiler.py b/imperative/python/megengine/utils/profiler.py index 702a8e374..ec6c1ece2 100644 --- a/imperative/python/megengine/utils/profiler.py +++ b/imperative/python/megengine/utils/profiler.py @@ -16,6 +16,10 @@ from weakref import WeakSet from .. 
import _atexit from ..core._imperative_rt.core2 import ( + cupti_available, + disable_cupti, + enable_cupti, + full_sync, pop_scope, push_scope, start_profile, @@ -50,13 +54,18 @@ class Profiler(ContextDecorator): with profiler: # your code here - + # Then open the profile file in chrome timeline window """ CHROME_TIMELINE = "chrome_timeline.json" - valid_options = {"sample_rate": 0, "profile_device": 1, "num_tensor_watch": 10} + valid_options = { + "sample_rate": 0, + "profile_device": 1, + "num_tensor_watch": 10, + "enable_cupti": 0, + } valid_formats = {"chrome_timeline.json", "memory_flow.svg"} def __init__( @@ -83,6 +92,11 @@ class Profiler(ContextDecorator): self._options[opt] = int(kwargs.pop(opt, optval)) self._pid = "" self._dump_callback = None + if self._options.get("enable_cupti", 0): + if cupti_available(): + enable_cupti() + else: + get_logger().warning("CuPTI unavailable") @property def path(self): @@ -116,7 +130,7 @@ class Profiler(ContextDecorator): assert _running_profiler is self _running_profiler = None - sync() + full_sync() self._dump_callback = stop_profile() self._pid = os.getpid() _living_profilers.add(self) @@ -160,6 +174,9 @@ class Profiler(ContextDecorator): return func def __del__(self): + if self._options.get("enable_cupti", 0): + if cupti_available(): + disable_cupti() self.dump() diff --git a/imperative/python/src/tensor.cpp b/imperative/python/src/tensor.cpp index c7b83ea1b..747249942 100644 --- a/imperative/python/src/tensor.cpp +++ b/imperative/python/src/tensor.cpp @@ -11,6 +11,7 @@ #include "megbrain/common.h" #include "megbrain/dtype.h" +#include "megbrain/imperative/cpp_cupti.h" #include "megbrain/imperative/ops/autogen.h" #include "megbrain/imperative/ops/backward_graph.h" #include "megbrain/imperative/ops/utility.h" @@ -982,6 +983,7 @@ void init_tensor(py::module m) { m.def("stop_profile", [channel]() -> std::function { channel->stop_profile(); channel->sync(); + CompNode::sync_all(); imperative::Profiler::stop_profile(); 
auto results = std::make_shared( imperative::Profiler::collect()); @@ -990,6 +992,9 @@ void init_tensor(py::module m) { results = nullptr; }; }); + m.def("enable_cupti", &cupti::enable); + m.def("disable_cupti", &cupti::disable); + m.def("cupti_available", &cupti::available); m.def("sync", [channel]() { if (channel->check_available()) { channel->sync(); diff --git a/imperative/src/impl/cpp_cupti.cpp b/imperative/src/impl/cpp_cupti.cpp new file mode 100644 index 000000000..92f87a194 --- /dev/null +++ b/imperative/src/impl/cpp_cupti.cpp @@ -0,0 +1,273 @@ +#include "megbrain/imperative/cpp_cupti.h" + +#include +#include +#include + +#include "megbrain/exception.h" +#include "megbrain/imperative/profiler.h" +#include "megbrain/imperative/utils/platform.h" + +#include "./profiler/events.h" + +#if MGB_CUPTI +#include "cupti.h" + +#define CUPTI_CALL(call) \ + do { \ + CUptiResult _status = call; \ + if (_status != CUPTI_SUCCESS) { \ + const char* errstr; \ + cuptiGetResultString(_status, &errstr); \ + mgb_assert(_status == CUPTI_SUCCESS, "cupti error: %s", errstr); \ + } \ + } while (0) +#endif + +namespace mgb::imperative::cupti { + +#if MGB_CUPTI +namespace { +CUpti_SubscriberHandle cuptiSubscriber; + +void cuptiSubscriberCallback( + void* userdata, CUpti_CallbackDomain domain, CUpti_CallbackId cb_id, + const void* cb_info) { + using namespace profiler; + switch (domain) { + case CUPTI_CB_DOMAIN_DRIVER_API: { + auto cb_data = (const CUpti_CallbackData*)cb_info; + switch (cb_id) { + case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: { + if (cb_data->callbackSite == CUPTI_API_ENTER) { + MGB_RECORD_EVENT( + CUPTIKernelLaunchEvent, cb_data->correlationId, + cb_data->symbolName); + } else if (cb_data->callbackSite == CUPTI_API_EXIT) { + MGB_RECORD_EVENT( + CUPTIKernelLaunchFinishEvent, cb_data->correlationId, + cb_data->symbolName); + } + break; + } + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA: { + } + case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: { + if (cb_data->callbackSite == 
CUPTI_API_ENTER) { + MGB_RECORD_EVENT( + CUPTIMemcpyLaunchEvent, cb_data->correlationId); + } else if (cb_data->callbackSite == CUPTI_API_EXIT) { + MGB_RECORD_EVENT( + CUPTIMemcpyLaunchFinishEvent, cb_data->correlationId); + } + break; + } + default: { + if (cb_data->callbackSite == CUPTI_API_ENTER) { + MGB_RECORD_EVENT( + CUPTIDriverEvent, cb_data->correlationId, + cb_data->functionName); + } else if (cb_data->callbackSite == CUPTI_API_EXIT) { + MGB_RECORD_EVENT( + CUPTIDriverFinishEvent, cb_data->correlationId, + cb_data->functionName); + } + } + } + break; + } + case CUPTI_CB_DOMAIN_RUNTIME_API: { + auto cb_data = (const CUpti_CallbackData*)cb_info; + if (cb_data->callbackSite == CUPTI_API_ENTER) { + MGB_RECORD_EVENT( + CUPTIRuntimeEvent, cb_data->correlationId, + cb_data->functionName); + } else if (cb_data->callbackSite == CUPTI_API_EXIT) { + MGB_RECORD_EVENT( + CUPTIRuntimeFinishEvent, cb_data->correlationId, + cb_data->functionName); + } + break; + } + } +} + +void handleActivity(CUpti_Activity* record) { + using namespace std::chrono_literals; + auto delta = 16ns; + switch (record->kind) { + case CUPTI_ACTIVITY_KIND_KERNEL: + case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: { + auto kernel = cupti::activity(record); + MGB_RECORD_EVENT( + profiler::CUPTIKernelExecuteEvent, kernel->correlationId, + kernel->name, kernel.stream(), kernel.start(), + kernel.end() - delta); + break; + } + case CUPTI_ACTIVITY_KIND_MEMCPY: { + auto memcpy = cupti::activity(record); + MGB_RECORD_EVENT( + profiler::CUPTIMemcpyEvent, memcpy->correlationId, memcpy->srcKind, + memcpy->dstKind, memcpy->bytes, memcpy.stream(), memcpy.start(), + memcpy.end()); + break; + } + case CUPTI_ACTIVITY_KIND_MEMSET: { + auto memset = cupti::activity(record); + MGB_RECORD_EVENT( + profiler::CUPTIMemsetEvent, memset->correlationId, memset->value, + memset->bytes, memset.stream(), memset.start(), + memset.end() - delta); + break; + } + default: + break; + } +} + +using activity_buffer_t = + 
std::aligned_storage_t<8 * 1024 * 1024, ACTIVITY_RECORD_ALIGNMENT>;
+
+void bufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords) {
+    *buffer = reinterpret_cast<uint8_t*>(new activity_buffer_t());
+    *size = sizeof(activity_buffer_t);
+    *maxNumRecords = 0;
+}
+
+void bufferCompleted(
+        CUcontext ctx, uint32_t streamId, uint8_t* buffer, size_t size,
+        size_t validSize) {
+    CUptiResult status;
+    CUpti_Activity* record = NULL;
+
+    if (validSize > 0) {
+        do {
+            status = cuptiActivityGetNextRecord(buffer, validSize, &record);
+            if (status == CUPTI_SUCCESS) {
+                handleActivity(record);
+            } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED)
+                break;
+            else {
+                CUPTI_CALL(status);
+            }
+        } while (1);
+
+        size_t dropped;
+        CUPTI_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
+        mgb_assert(dropped == 0, "%zu records dropped", dropped);
+    }
+
+    delete reinterpret_cast<activity_buffer_t*>(buffer);  // pairs with the new in bufferRequested
+}
+
+static bool initialized = false;
+}  // namespace
+
+bool available() {
+    uint32_t compiletime_version = (CUPTI_API_VERSION);
+    uint32_t runtime_version;
+    CUPTI_CALL(cuptiGetVersion(&runtime_version));
+    if (compiletime_version != runtime_version) {
+        static std::once_flag once;  // warn only once per process
+        std::call_once(once, [&] {
+            mgb_log_warn(
+                    "CuPTI version %d mismatch against compiletime version %d. "
+                    "This may be caused by user config LD_LIBRARY_PATH "
+                    "at unix-like env or config PATH at Windows env",
+                    (int)runtime_version, (int)compiletime_version);  // order follows the %d placeholders
+        });
+        return false;
+    }
+    return true;
+}
+
+void enable() {
+    // not thread safe
+    mgb_assert(!initialized, "cupti already initialized");
+    // callback
+    CUPTI_CALL(cuptiSubscribe(
+            &cuptiSubscriber, (CUpti_CallbackFunc)cuptiSubscriberCallback,
+            (void*)nullptr));
+    CUPTI_CALL(cuptiEnableDomain(1, cuptiSubscriber, CUPTI_CB_DOMAIN_DRIVER_API));
+    CUPTI_CALL(cuptiEnableDomain(1, cuptiSubscriber, CUPTI_CB_DOMAIN_RUNTIME_API));
+
+    // activity
+    CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
+    CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT));
+    CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
+    CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
+    CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
+    CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
+    CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_NAME));
+    CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MARKER));
+    CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
+    CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
+    CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted));
+    initialized = true;
+}
+
+void disable() {
+    mgb_assert(initialized, "cupti not initialized yet");
+    flush();  // flush pending activity records before finalizing
+    CUPTI_CALL(cuptiFinalize());
+    initialized = false;
+}
+
+void flush() {
+    if (initialized) {
+        CUPTI_CALL(cuptiActivityFlushAll(1));
+    }
+}
+
+bool enabled() {
+    return initialized;
+}
+
+time_point clock::now() {
+    uint64_t timestamp;
+    CUPTI_CALL(cuptiGetTimestamp(&timestamp));
+    using namespace std::chrono;
+    // overflow? 
+ return time_point(duration((int64_t)timestamp)); +} + +#else + +class CuPTIUnavailableError : public MegBrainError { +public: + CuPTIUnavailableError() + : MegBrainError( +#if MGB_CUDA + "CuPTI disabled at compile time" +#else + "CuPTI unsupported on non cuda platform" +#endif + ) { + } +}; + +bool available() { + return false; +} + +void enable() { + throw CuPTIUnavailableError(); +} + +void disable() { + throw CuPTIUnavailableError(); +} + +void flush() {} + +bool enabled() { + return false; +} + +time_point clock::now() { + throw CuPTIUnavailableError(); +} + +#endif +} // namespace mgb::imperative::cupti diff --git a/imperative/src/impl/profiler.cpp b/imperative/src/impl/profiler.cpp index d540cafd9..12204d5a1 100644 --- a/imperative/src/impl/profiler.cpp +++ b/imperative/src/impl/profiler.cpp @@ -12,7 +12,9 @@ #include "megbrain/imperative/profiler.h" #include +#include +#include "megbrain/imperative/cpp_cupti.h" #include "megbrain/imperative/ops/opr_attr.h" #include "megbrain/imperative/physical_tensor.h" @@ -48,6 +50,21 @@ bool Profiler::sm_profiling = false; thread_local Profiler* Profiler::tm_profiler = nullptr; std::atomic_size_t Profiler::sm_preferred_capacity; +void Profiler::start_profile() { + mgb_assert(!sm_profiling); + sm_start_at = Timer::record_host(); + sm_profiling = true; + if (cupti::enabled()) { + MGB_RECORD_EVENT(profiler::CUPTITimestampEvent, cupti::clock::now()); + } +} + +void Profiler::stop_profile() { + mgb_assert(sm_profiling); + cupti::flush(); + sm_profiling = false; +} + auto Profiler::get_thread_dict() -> thread_dict_t { thread_dict_t thread_dict; for (auto&& [tid, profiler] : sm_profilers) { diff --git a/imperative/src/impl/profiler/chrome_timeline.cpp b/imperative/src/impl/profiler/chrome_timeline.cpp index e2d3548aa..2715a0755 100644 --- a/imperative/src/impl/profiler/chrome_timeline.cpp +++ b/imperative/src/impl/profiler/chrome_timeline.cpp @@ -19,6 +19,7 @@ #include "nlohmann/json.hpp" +#include 
"megbrain/imperative/utils/platform.h" #include "megbrain/utils/debug.h" #include "./formats.h" @@ -198,6 +199,8 @@ struct ChromeTimelineEventVisitor : EventVisitor { decltype(getpid()) pid = getpid(); std::string pid_str = std::to_string(pid); + ChromeTimelineEventVisitor() {} + ChromeTraceEvent& new_event( std::string name, char ph, size_t tid, profiler::HostTime time) { return trace_events.new_event().name(name).ph(ph).pid(pid).tid(tid).ts( @@ -213,8 +216,13 @@ struct ChromeTimelineEventVisitor : EventVisitor { .ts(since_start(current->time)); } + ChromeTraceEvent& new_cupti_event( + std::string name, char ph, cupti::stream_t stream, + cupti::time_point timestamp) { + return new_event(name, ph, to_tid(stream), time_from_cupti(timestamp)); + } + ChromeTraceEvent& new_device_event(std::string name, char ph, CompNode device) { - using namespace std::literals::chrono_literals; auto time = since_start(to_device_time(current->time, device)); return trace_events.new_event() .name(name) @@ -391,6 +399,80 @@ struct ChromeTimelineEventVisitor : EventVisitor { auto device_ahead = std::chrono::duration_cast( current_device_time - current_host_time); new_host_event("device_ahead_ms", 'C').arg("value", device_ahead.count()); + } else if constexpr (std::is_same_v) { + new_host_event(demangle(event.name), 'B'); + new_host_event(pid_str, 's') + .id(event.correlation_id) + .cat("KernelLink") + .scope(pid_str); + } else if constexpr (std::is_same_v) { + new_host_event(demangle(event.name), 'E'); + } else if constexpr (std::is_same_v) { + new_cupti_event(demangle(event.name), 'B', event.stream, event.start) + .arg("execution_time", (event.end - event.start).count()); + new_cupti_event(pid_str, 'f', event.stream, event.end) + .id(event.correlation_id) + .bp('e') + .cat("KernelLink") + .scope(pid_str); + new_cupti_event(demangle(event.name), 'E', event.stream, event.end) + .arg("execution_time", (event.end - event.start).count()); + } else if constexpr (std::is_same_v) { + 
+            new_host_event("Memcpy", 'B');
+            new_host_event(pid_str, 's')
+                    .id(event.correlation_id)
+                    .cat("CUPTILink")
+                    .scope(pid_str);
+        } else if constexpr (std::is_same_v) {
+            new_host_event("Memcpy", 'E');
+        } else if constexpr (std::is_same_v) {
+            auto memkind2str = [](uint8_t kind) {
+                const char* const valid_kinds[] = {
+                        "CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN",
+                        "CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE",
+                        "CUPTI_ACTIVITY_MEMORY_KIND_PINNED",
+                        "CUPTI_ACTIVITY_MEMORY_KIND_DEVICE",
+                        "CUPTI_ACTIVITY_MEMORY_KIND_ARRAY",
+                        "CUPTI_ACTIVITY_MEMORY_KIND_MANAGED",
+                        "CUPTI_ACTIVITY_MEMORY_KIND_DEVICE_STATIC",
+                        "CUPTI_ACTIVITY_MEMORY_KIND_MANAGED_STATIC"};
+                if (kind >= (sizeof(valid_kinds) / sizeof(const char*))) {  // kind == count is out of range too
+                    return "invalid";
+                }
+                return valid_kinds[kind];
+            };
+            new_cupti_event("Memcpy", 'B', event.stream, event.start)
+                    .arg("bytes", imperative::to_string(event.bytes))
+                    .arg("src_kind", memkind2str(event.src_kind))
+                    .arg("dst_kind", memkind2str(event.dst_kind));
+            new_cupti_event(pid_str, 'f', event.stream, event.start)
+                    .id(event.correlation_id)
+                    .bp('e')
+                    .cat("CUPTILink")
+                    .scope(pid_str);
+            new_cupti_event("Memcpy", 'E', event.stream, event.end)
+                    .arg("bytes", imperative::to_string(event.bytes))
+                    .arg("src_kind", memkind2str(event.src_kind))
+                    .arg("dst_kind", memkind2str(event.dst_kind));
+        } else if constexpr (std::is_same_v) {
+            new_cupti_event("Memset", 'B', event.stream, event.start)
+                    .arg("value", imperative::to_string(event.value))
+                    .arg("bytes", imperative::to_string(event.bytes));
+            new_cupti_event("Memset", 'E', event.stream, event.end)  // close the slice at the end timestamp
+                    .arg("value", imperative::to_string(event.value))
+                    .arg("bytes", imperative::to_string(event.bytes));
+        } else if constexpr (std::is_same_v) {
+            new_host_event(event.name, 'B');
+        } else if constexpr (std::is_same_v) {
+            new_host_event(event.name, 'E');
+        } else if constexpr (std::is_same_v) {
+            new_host_event(event.name, 'B');
+            new_host_event(pid_str, 's')
+                    .id(event.correlation_id) 
.cat("CUPTILink") + .scope(pid_str); + } else if constexpr (std::is_same_v) { + new_host_event(event.name, 'E'); } } @@ -403,7 +485,8 @@ struct ChromeTimelineEventVisitor : EventVisitor { if (thread_dict.count(host)) { trace_events.new_event() .name("thread_name") - .pid('M') + .ph('M') + .pid(pid) .tid(to_tid(host)) .arg("name", thread_dict.at(host)); } @@ -411,7 +494,8 @@ struct ChromeTimelineEventVisitor : EventVisitor { for (auto&& device : devices()) { trace_events.new_event() .name("thread_name") - .pid('M') + .ph('M') + .pid(pid) .tid(to_tid(device)) .arg("name", device.to_string_logical()); } @@ -419,7 +503,7 @@ struct ChromeTimelineEventVisitor : EventVisitor { }; void dump_chrome_timeline(std::string filename, Profiler::bundle_t result) { - ChromeTimelineEventVisitor visitor; + ChromeTimelineEventVisitor visitor{}; visitor.process_events(result); visitor.name_threads(result.thread_dict); auto trace_events = std::move(visitor.trace_events); diff --git a/imperative/src/impl/profiler/events.h b/imperative/src/impl/profiler/events.h index 9bba25cc0..e882ff0aa 100644 --- a/imperative/src/impl/profiler/events.h +++ b/imperative/src/impl/profiler/events.h @@ -16,6 +16,7 @@ #include "../interpreter/stack_manager.h" #include "../op_trait.h" +#include "megbrain/imperative/cpp_cupti.h" namespace mgb::imperative::profiler { @@ -181,6 +182,60 @@ DEF_DUR_EVENT(HostToDevice, { void* device_ptr; }); +// cupti events +DEF_EVENT(CUPTITimestamp, { cupti::clock::time_point timestamp; }); + +DEF_DUR_EVENT(CUPTIKernelLaunch, { + uint32_t correlation_id; + const char* name; +}); + +DEF_EVENT(CUPTIKernelExecute, { + uint32_t correlation_id; + const char* name; + cupti::stream_t stream; + cupti::time_point start; + cupti::time_point end; +}); + +DEF_DUR_EVENT(CUPTIMemcpyLaunch, { uint32_t correlation_id; }); + +DEF_EVENT(CUPTIMemcpy, { + uint32_t correlation_id; + uint8_t src_kind; + uint8_t dst_kind; + uint64_t bytes; + cupti::stream_t stream; + cupti::time_point start; + 
cupti::time_point end; +}); + +DEF_EVENT(CUPTIMemset, { + uint32_t correlation_id; + uint32_t value; + uint64_t bytes; + cupti::stream_t stream; + cupti::time_point start; + cupti::time_point end; +}); + +DEF_EVENT(CUPTIUnknownDevice, {}); + +DEF_DUR_EVENT(CUPTIRuntime, { + uint32_t correlation_id; + const char* name; +}); + +DEF_DUR_EVENT(CUPTIDriver, { + uint32_t correlation_id; + const char* name; +}); + +DEF_EVENT(CUPTIIdentifyStream, { + cupti::stream_t stream; + CompNode device; +}); + #undef DEF_EVENT #undef DEF_DUR_EVENT diff --git a/imperative/src/impl/profiler/states.h b/imperative/src/impl/profiler/states.h index 1f452cf39..33cebd070 100644 --- a/imperative/src/impl/profiler/states.h +++ b/imperative/src/impl/profiler/states.h @@ -180,10 +180,13 @@ private: HostTime m_start_time; CompNode::UnorderedMap m_device_tid_table; std::unordered_map m_host_tid_table; + std::unordered_map m_cupti_tid_table; CompNode::UnorderedMap> m_device_timeline; std::unordered_map> m_trace_stack; std::unordered_map m_counter_table; + std::optional> m_cupti_timestamp = + {}; protected: Profiler::Record* current; @@ -191,6 +194,11 @@ protected: ProfileTensorState* current_tensor; protected: + size_t next_tid() { + return m_host_tid_table.size() + m_device_tid_table.size() + + m_cupti_tid_table.size(); + } + profiler::Duration since_start(profiler::HostTime time) { return time - m_start_time; } @@ -229,6 +237,10 @@ protected: size_t to_tid(CompNode device) { return m_device_tid_table.at(device); } + size_t to_tid(cupti::stream_t cupti_stream) { + return m_cupti_tid_table.at(cupti_stream); + } + SmallVector host_threads() { SmallVector host_threads; for (auto&& [host, _] : m_host_tid_table) { @@ -254,6 +266,13 @@ protected: value += delta; } + profiler::HostTime time_from_cupti(cupti::time_point timestamp) { + mgb_assert(m_cupti_timestamp.has_value()); + return m_cupti_timestamp->first + + std::chrono::duration_cast( + timestamp - m_cupti_timestamp->second); + } + public: void 
process_events(Profiler::bundle_t& bundle) { m_start_time = bundle.start_at; @@ -272,7 +291,11 @@ public: TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent, AutoEvictFinishEvent, CustomEvent, CustomFinishEvent, RecordDeviceEvent, ScopeEvent, ScopeFinishEvent, HostToDeviceEvent, - HostToDeviceFinishEvent> + HostToDeviceFinishEvent, CUPTITimestampEvent, CUPTIKernelLaunchEvent, + CUPTIKernelLaunchFinishEvent, CUPTIKernelExecuteEvent, + CUPTIMemcpyLaunchEvent, CUPTIMemcpyLaunchFinishEvent, CUPTIMemcpyEvent, + CUPTIRuntimeEvent, CUPTIRuntimeFinishEvent, CUPTIDriverEvent, + CUPTIDriverFinishEvent, CUPTIMemsetEvent> converter; auto for_each_entry = [&](auto&& handler) { @@ -289,7 +312,9 @@ public: std::shared_ptr device; }; CompNode::UnorderedMap device_start_table; + std::unordered_map cupti_stream_table; + // record device time for_each_entry([&](auto&& event) { using T = std::decay_t; if constexpr (std::is_same_v) { @@ -313,8 +338,7 @@ public: // register host threads for_each_entry([&](auto&& event) { if (!m_host_tid_table.count(current->tid)) { - m_host_tid_table[current->tid] = { - m_device_tid_table.size() + m_host_tid_table.size()}; + m_host_tid_table[current->tid] = next_tid(); } }); @@ -340,14 +364,39 @@ public: } else if constexpr (std::is_same_v) { auto& tensor = m_tensors[event.tensor_id]; if (!m_device_tid_table.count(event.device)) { - m_device_tid_table[event.device] = { - m_device_tid_table.size() + m_host_tid_table.size()}; + m_device_tid_table[event.device] = next_tid(); } tensor.device = event.device; tensor.layout = event.layout; } }); + for_each_entry([&](auto&& event) { + using T = std::decay_t; + if constexpr (std::is_same_v) { + if (!m_cupti_tid_table.count(event.stream)) { + m_cupti_tid_table[event.stream] = + m_device_tid_table.at(event.device); + } + } + }); + + // record cupti streams + for_each_entry([&](auto&& event) { + using T = std::decay_t; + if constexpr ( + std::is_same_v || + std::is_same_v || + std::is_same_v) { + if 
(!m_cupti_tid_table.count(event.stream)) { + m_cupti_tid_table[event.stream] = next_tid(); + } + } else if constexpr (std::is_same_v) { + mgb_assert(!m_cupti_timestamp.has_value()); + m_cupti_timestamp.emplace(current->time, event.timestamp); + } + }); + // replay execution using namespace std::placeholders; for_each_entry([&](auto&& event) { diff --git a/imperative/src/impl/utils/platform.cpp b/imperative/src/impl/utils/platform.cpp new file mode 100644 index 000000000..482acbb49 --- /dev/null +++ b/imperative/src/impl/utils/platform.cpp @@ -0,0 +1,25 @@ +#include "megbrain/imperative/utils/platform.h" + +#ifdef __GNUG__ +#include +#include +#include +#endif + +using namespace mgb; +using namespace imperative; + +/* + * demangle typeid, see + * http://stackoverflow.com/questions/281818/unmangling-the-result-of-stdtype-infoname + */ +std::string mgb::imperative::demangle(std::string mangled) { +#ifdef __GNUG__ + int status = -1; + std::unique_ptr res{ + abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &status), std::free}; + return (status == 0) ? 
res.get() : mangled; +#else + return mangled; +#endif +} diff --git a/imperative/src/include/megbrain/imperative/cpp_cupti.h b/imperative/src/include/megbrain/imperative/cpp_cupti.h new file mode 100644 index 000000000..bf9030ae4 --- /dev/null +++ b/imperative/src/include/megbrain/imperative/cpp_cupti.h @@ -0,0 +1,86 @@ +#pragma once + +#include +#include + +#include "megbrain/common.h" +#include "megbrain/imperative/utils/to_string.h" + +namespace mgb::imperative::cupti { + +struct clock { + typedef std::chrono::nanoseconds duration; + typedef duration::rep rep; + typedef duration::period period; + typedef std::chrono::time_point time_point; + static const bool is_steady = false; + + static time_point now() /* noexcept */; +}; + +using time_point = clock::time_point; + +using duration = clock::duration; + +struct device_t { + uint32_t device_id; + + bool operator==(const device_t& rhs) const { return device_id == rhs.device_id; } +}; + +struct context_t : device_t { + uint32_t context_id; + + bool operator==(const context_t& rhs) const { + return device_t::operator==(rhs) && context_id == rhs.context_id; + } +}; + +struct stream_t : context_t { + uint32_t stream_id; + + bool operator==(const stream_t& rhs) const { + return context_t::operator==(rhs) && stream_id == rhs.stream_id; + } +}; + +bool available(); + +void enable(); + +void disable(); + +void flush(); + +bool enabled(); + +template +struct activity { +private: + TActivity* m_ptr; + +public: + activity(void* ptr) : m_ptr((TActivity*)ptr) {} + + time_point start() const { return time_point(duration(m_ptr->start)); } + + time_point end() const { return time_point(duration(m_ptr->end)); } + + device_t device() const { return {m_ptr->deviceId}; } + + context_t context() const { return {device(), m_ptr->contextId}; } + + stream_t stream() const { return {context(), m_ptr->streamId}; } + + TActivity* operator->() const { return m_ptr; } +}; + +} // namespace mgb::imperative::cupti + +template <> +class 
std::hash { +public: + size_t operator()(const mgb::imperative::cupti::stream_t& value) const { + return value.stream_id; + } +}; diff --git a/imperative/src/include/megbrain/imperative/profiler.h b/imperative/src/include/megbrain/imperative/profiler.h index da50ad449..ad76b2b46 100644 --- a/imperative/src/include/megbrain/imperative/profiler.h +++ b/imperative/src/include/megbrain/imperative/profiler.h @@ -194,16 +194,9 @@ public: static bool is_profiling() { return sm_profiling; } - static void start_profile() { - mgb_assert(!sm_profiling); - sm_start_at = Timer::record_host(); - sm_profiling = true; - } + static void start_profile(); - static void stop_profile() { - mgb_assert(sm_profiling); - sm_profiling = false; - } + static void stop_profile(); static thread_dict_t get_thread_dict(); diff --git a/imperative/src/include/megbrain/imperative/utils/platform.h b/imperative/src/include/megbrain/imperative/utils/platform.h new file mode 100644 index 000000000..89685b5b1 --- /dev/null +++ b/imperative/src/include/megbrain/imperative/utils/platform.h @@ -0,0 +1,9 @@ +#pragma once + +#include + +namespace mgb::imperative { + +std::string demangle(std::string mangled); + +} diff --git a/imperative/test/CMakeLists.txt b/imperative/test/CMakeLists.txt index 68fc59f1f..e2e8314a6 100644 --- a/imperative/test/CMakeLists.txt +++ b/imperative/test/CMakeLists.txt @@ -37,6 +37,10 @@ if(MGE_WITH_CUDA) list(APPEND LINK_LIBS cudart) endif() +if(MGE_WITH_CUPTI) + list(APPEND LINK_LIBS libcupti) +endif() + if(MGE_WITH_DISTRIBUTED) list(APPEND LINK_LIBS megray) endif() diff --git a/scripts/whl/manylinux2014/build_wheel_common.sh b/scripts/whl/manylinux2014/build_wheel_common.sh index 5cd80d3d7..2ea03aaf5 100755 --- a/scripts/whl/manylinux2014/build_wheel_common.sh +++ b/scripts/whl/manylinux2014/build_wheel_common.sh @@ -61,11 +61,11 @@ echo "Build with ${SDK_NAME}" if [ $SDK_NAME == "cu101" ];then CUDA_COPY_LIB_LIST="${CUDA_LIB_DIR}/libnvrtc.so.10.1" - EXTRA_CMAKE_FLAG=" 
-DMGE_WITH_CUDNN_SHARED=OFF -DMGE_WITH_CUBLAS_SHARED=OFF" - BUILD_GCC8="ON" - REQUIR_CUDA_VERSION="10010" - REQUIR_CUDNN_VERSION="7.6.3" - REQUIR_TENSORRT_VERSION="6.0.1.5" + EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF -DMGE_WITH_CUBLAS_SHARED=OFF" + BUILD_GCC8="ON" + REQUIR_CUDA_VERSION="10010" + REQUIR_CUDNN_VERSION="7.6.3" + REQUIR_TENSORRT_VERSION="6.0.1.5" REQUIR_CUBLAS_VERSION="10.2.1.243" elif [ $SDK_NAME == "cu102_JetsonNano" ];then @@ -87,6 +87,12 @@ elif [ $SDK_NAME == "cu102_JetsonNano" ];then ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ ${CUDNN_LIB_DIR}/libcudnn.so.8" + if [ ${machine} == "aarch64" ];then + CUDA_COPY_LIB_LIST="\ + ${CUDA_LIB_DIR}/libcupti.so.10.2:\ + ${CUDA_COPY_LIB_LIST}" + fi + EXTRA_CMAKE_FLAG="-DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_53,code=sm_53\" " elif [ $SDK_NAME == "cu111" ];then @@ -118,6 +124,12 @@ elif [ $SDK_NAME == "cu111" ];then ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ ${CUDNN_LIB_DIR}/libcudnn.so.8" + if [ ${machine} == "aarch64" ];then + CUDA_COPY_LIB_LIST="\ + ${CUDA_LIB_DIR}/libcupti.so.11.1:\ + ${CUDA_COPY_LIB_LIST}" + fi + if [ ${IN_CI} = "true" ] && [ ${machine} == "aarch64" ]; then EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_75,code=sm_75\" " else @@ -152,9 +164,9 @@ elif [ $SDK_NAME == "cu112" ];then -gencode arch=compute_86,code=sm_86 \ -gencode arch=compute_86,code=compute_86\" " - REQUIR_CUDA_VERSION="11020" - REQUIR_CUDNN_VERSION="8.0.4" - REQUIR_TENSORRT_VERSION="7.2.2.3" + REQUIR_CUDA_VERSION="11020" + REQUIR_CUDNN_VERSION="8.0.4" + REQUIR_TENSORRT_VERSION="7.2.2.3" REQUIR_CUBLAS_VERSION="11.3.1.68" elif [ $SDK_NAME == "cpu" ];then diff --git a/src/megbrain_build_config.h.in b/src/megbrain_build_config.h.in index 802d316ea..dd7c66858 100644 --- a/src/megbrain_build_config.h.in +++ b/src/megbrain_build_config.h.in @@ -35,6 +35,7 @@ #cmakedefine01 MGB_ENABLE_FBS_SERIALIZATION 
#cmakedefine01 MGB_IS_DEV #cmakedefine01 MGB_CUSTOM_OP +#cmakedefine01 MGB_CUPTI // DNN related flags // Platform macro's #cmakedefine01 MEGDNN_WITH_CUDA -- GitLab