提交 1783b897 编写于 作者: M Megvii Engine Team

feat(profiler): integrate cupti backend

GitOrigin-RevId: dec8be1908f12b25a71c541386387ee8b9da3a11
上级 e98049d7
......@@ -29,6 +29,7 @@ endif()
include(GNUInstallDirs)
include(CheckCXXCompilerFlag)
include(CheckIPOSupported)
include(CMakeDependentOption)
check_cxx_compiler_flag(-Wclass-memaccess CXX_SUPPORT_WCLASS_MEMACCESS)
......@@ -97,6 +98,12 @@ option(MGE_BUILD_WITH_ASAN "Enable build with ASAN, need compiler support" OFF)
option(MGE_WITH_CUSTOM_OP "Build with Custom op" OFF)
option(MGE_SYNC_THIRD_PARTY "help sync third_party submodule" OFF)
# TODO: add windows support
# CUPTI profiling backend. The option defaults to ON, but it is only offered when
# MGE_WITH_CUDA and MGE_BUILD_IMPERATIVE_RT are true and the build targets neither
# MSVC nor WIN32; in every other configuration it is forced to OFF.
cmake_dependent_option(MGE_WITH_CUPTI "Build with CUPTI" ON
"MGE_WITH_CUDA;MGE_BUILD_IMPERATIVE_RT;NOT MSVC;NOT WIN32" OFF)
# MGB_CUPTI is the name used by `#cmakedefine01 MGB_CUPTI` in the config header
# template and tested with `#if MGB_CUPTI` in the C++ sources.
set(MGB_CUPTI ${MGE_WITH_CUPTI})
if(MSVC OR WIN32)
# FIXME: static link Windows vc runtime with some version from Visual Studio have some
# runtime issue at some call PATH, for example: _imperative_rt.pyd -->
......@@ -686,6 +693,10 @@ if(MGB_WITH_FLATBUFFERS)
include(cmake/flatbuffers.cmake)
endif()
if(MGE_WITH_CUPTI)
include(cmake/cupti.cmake)
endif()
if(MGE_WITH_CUDA)
include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
foreach(path ${CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES})
......
......@@ -6,7 +6,7 @@ endif()
if("${CUDNN_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{CUDNN_ROOT_DIR}" STREQUAL "")
set(CUDNN_ROOT_DIR $ENV{CUDNN_ROOT_DIR})
endif()
message("CUDNN ROOT: " ${CUDNN_ROOT_DIR})
message(STATUS "CUDNN ROOT: ${CUDNN_ROOT_DIR}")
if(MGE_CUDA_USE_STATIC AND NOT MGE_WITH_CUDNN_SHARED)
find_library(
CUDNN_LIBRARY
......
# Locate the CUDA Profiling Tools Interface (CuPTI) inside the CUDA SDK and expose it
# as the imported target `libcupti`.
#
# Inputs:
#   CUDA_ROOT_DIR        CUDA SDK root. When empty, the environment variables
#                        CUDA_ROOT_DIR, CUDA_PATH and CUDA_BIN_PATH are tried in order.
#   MGE_CUDA_USE_STATIC  Prefer libcupti_static.a when true.
#   CXX_SUPPORT_GOLD     When true together with MGE_CUDA_USE_STATIC, the shared
#                        library is used instead of the static one.
# Outputs:
#   CUPTI_LIBRARY, CUPTI_INCLUDE_DIR (cache entries), MGE_CUPTI_USE_STATIC,
#   CUPTI_LIBRARY_TYPE, CUPTI_API_VERSION and the imported target `libcupti`.

if("${CUDA_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{CUDA_ROOT_DIR}" STREQUAL "")
  set(CUDA_ROOT_DIR $ENV{CUDA_ROOT_DIR})
endif()
if("${CUDA_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{CUDA_PATH}" STREQUAL "")
  set(CUDA_ROOT_DIR $ENV{CUDA_PATH})
endif()
if("${CUDA_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{CUDA_BIN_PATH}" STREQUAL "")
  set(CUDA_ROOT_DIR $ENV{CUDA_BIN_PATH})
endif()
if("${CUDA_ROOT_DIR}" STREQUAL "")
  message(
    FATAL_ERROR
      "Can not find CUDA, please export cuda sdk path to CUDA_ROOT_DIR or CUDA_PATH or CUDA_BIN_PATH"
  )
endif()

# TODO: find_library(CUDA_ROOT_DIR) in cmake/cuda.cmake
set(MGE_CUPTI_USE_STATIC ${MGE_CUDA_USE_STATIC})

# relates https://stackoverflow.com/questions/67485114
# Variable names are passed to if() unexpanded: `if(${A} AND ${B})` degenerates to
# `if( AND )` (a configure error) whenever A or B is empty or undefined.
if(MGE_CUDA_USE_STATIC AND CXX_SUPPORT_GOLD)
  message(WARNING "static linking CuPTI with gold may break exception handling,\
use shared one instead")
  set(MGE_CUPTI_USE_STATIC OFF)
endif()

if(MGE_CUPTI_USE_STATIC)
  find_library(
    CUPTI_LIBRARY
    NAMES libcupti_static.a
    HINTS "${CUDA_ROOT_DIR}" "${CUDA_ROOT_DIR}/extras/CUPTI"
    PATH_SUFFIXES lib lib64
    DOC "CuPTI library.")
  if(NOT CUPTI_LIBRARY)
    message(WARNING "Can not find static CuPTI Library, use shared one instead")
    set(MGE_CUPTI_USE_STATIC OFF)
  endif()
endif()

if(NOT MGE_CUPTI_USE_STATIC)
  # A *-NOTFOUND cache value left by the static lookup makes find_library search again.
  find_library(
    CUPTI_LIBRARY
    NAMES libcupti.so
    HINTS "${CUDA_ROOT_DIR}" "${CUDA_ROOT_DIR}/extras/CUPTI"
    PATH_SUFFIXES lib lib64
    DOC "CuPTI library.")
endif()

if(NOT CUPTI_LIBRARY)
  message(FATAL_ERROR "Can not find CuPTI Library")
endif()

# CUPTI_LIBRARY is a cache entry, so on a re-configure find_library may return the
# path chosen by an earlier run instead of searching again. Derive the imported
# library type from the file that was actually selected so that the two always agree.
if("${CUPTI_LIBRARY}" MATCHES "\\.a$")
  set(CUPTI_LIBRARY_TYPE STATIC)
else()
  set(CUPTI_LIBRARY_TYPE SHARED)
endif()

find_path(
  CUPTI_INCLUDE_DIR
  NAMES cupti.h
  HINTS "${CUDA_ROOT_DIR}" "${CUDA_ROOT_DIR}/extras/CUPTI"
  PATH_SUFFIXES include
  DOC "Path to CuPTI include directory.")
if(NOT CUPTI_INCLUDE_DIR)
  message(FATAL_ERROR "Can not find CuPTI INCLUDE")
endif()

# CUPTI_API_VERSION is read from cupti_version.h when that header exists, otherwise
# from cupti.h. It stays empty when the #define cannot be found.
if(EXISTS "${CUPTI_INCLUDE_DIR}/cupti_version.h")
  file(READ "${CUPTI_INCLUDE_DIR}/cupti_version.h" CUPTI_VERSION_FILE_CONTENTS)
else()
  file(READ "${CUPTI_INCLUDE_DIR}/cupti.h" CUPTI_VERSION_FILE_CONTENTS)
endif()
string(REGEX MATCH "define CUPTI_API_VERSION * +([0-9]+)" CUPTI_API_VERSION
             "${CUPTI_VERSION_FILE_CONTENTS}")
string(REGEX REPLACE "define CUPTI_API_VERSION * +([0-9]+)" "\\1" CUPTI_API_VERSION
                     "${CUPTI_API_VERSION}")

# Guarded so that including this file a second time does not redefine the target.
if(NOT TARGET libcupti)
  add_library(libcupti ${CUPTI_LIBRARY_TYPE} IMPORTED)
  set_target_properties(
    libcupti PROPERTIES IMPORTED_LOCATION "${CUPTI_LIBRARY}"
                        INTERFACE_INCLUDE_DIRECTORIES "${CUPTI_INCLUDE_DIR}")
endif()

message(STATUS "Found CuPTI: ${CUPTI_LIBRARY} (found version: ${CUPTI_API_VERSION})")
......@@ -36,7 +36,7 @@ else()
PATH_SUFFIXES lib lib64
DOC "TRT plugin library.")
endif()
message("TRT_LIBRARY" ${TRT_LIBRARY})
message(STATUS "TRT_LIBRARY: ${TRT_LIBRARY}")
if(TRT_LIBRARY STREQUAL "TRT_LIBRARY-NOTFOUND")
message(
FATAL_ERROR
......
......@@ -51,6 +51,10 @@ if(ANDROID)
target_link_libraries(${MODULE_NAME} PRIVATE ${PYTHON_LIBRARIES})
endif()
if(MGE_WITH_CUPTI)
target_link_libraries(${MODULE_NAME} PRIVATE libcupti)
endif()
add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/range-v3
${PROJECT_BINARY_DIR}/third_party/range-v3)
target_link_libraries(${MODULE_NAME} PRIVATE range-v3)
......
......@@ -16,6 +16,10 @@ from weakref import WeakSet
from .. import _atexit
from ..core._imperative_rt.core2 import (
cupti_available,
disable_cupti,
enable_cupti,
full_sync,
pop_scope,
push_scope,
start_profile,
......@@ -50,13 +54,18 @@ class Profiler(ContextDecorator):
with profiler:
# your code here
# Then open the profile file in chrome timeline window
"""
CHROME_TIMELINE = "chrome_timeline.json"
valid_options = {"sample_rate": 0, "profile_device": 1, "num_tensor_watch": 10}
valid_options = {
"sample_rate": 0,
"profile_device": 1,
"num_tensor_watch": 10,
"enable_cupti": 0,
}
valid_formats = {"chrome_timeline.json", "memory_flow.svg"}
def __init__(
......@@ -83,6 +92,11 @@ class Profiler(ContextDecorator):
self._options[opt] = int(kwargs.pop(opt, optval))
self._pid = "<PID>"
self._dump_callback = None
if self._options.get("enable_cupti", 0):
if cupti_available():
enable_cupti()
else:
get_logger().warning("CuPTI unavailable")
@property
def path(self):
......@@ -116,7 +130,7 @@ class Profiler(ContextDecorator):
assert _running_profiler is self
_running_profiler = None
sync()
full_sync()
self._dump_callback = stop_profile()
self._pid = os.getpid()
_living_profilers.add(self)
......@@ -160,6 +174,9 @@ class Profiler(ContextDecorator):
return func
def __del__(self):
if self._options.get("enable_cupti", 0):
if cupti_available():
disable_cupti()
self.dump()
......
......@@ -11,6 +11,7 @@
#include "megbrain/common.h"
#include "megbrain/dtype.h"
#include "megbrain/imperative/cpp_cupti.h"
#include "megbrain/imperative/ops/autogen.h"
#include "megbrain/imperative/ops/backward_graph.h"
#include "megbrain/imperative/ops/utility.h"
......@@ -982,6 +983,7 @@ void init_tensor(py::module m) {
m.def("stop_profile", [channel]() -> std::function<void(std::string, std::string)> {
channel->stop_profile();
channel->sync();
CompNode::sync_all();
imperative::Profiler::stop_profile();
auto results = std::make_shared<imperative::Profiler::bundle_t>(
imperative::Profiler::collect());
......@@ -990,6 +992,9 @@ void init_tensor(py::module m) {
results = nullptr;
};
});
m.def("enable_cupti", &cupti::enable);
m.def("disable_cupti", &cupti::disable);
m.def("cupti_available", &cupti::available);
m.def("sync", [channel]() {
if (channel->check_available()) {
channel->sync();
......
#include "megbrain/imperative/cpp_cupti.h"
#include <cinttypes>
#include <cstddef>
#include <cstdlib>
#include "megbrain/exception.h"
#include "megbrain/imperative/profiler.h"
#include "megbrain/imperative/utils/platform.h"
#include "./profiler/events.h"
#if MGB_CUPTI
#include "cupti.h"
// Evaluate a CUPTI API call exactly once and, when it does not return
// CUPTI_SUCCESS, fail through mgb_assert with CUPTI's textual description of the
// error. The argument is parenthesized so that any expression can be passed.
#define CUPTI_CALL(call)                                                         \
    do {                                                                         \
        CUptiResult _status = (call);                                            \
        if (_status != CUPTI_SUCCESS) {                                          \
            /* keep a printable string even if the lookup itself fails */        \
            const char* errstr = nullptr;                                        \
            if (cuptiGetResultString(_status, &errstr) != CUPTI_SUCCESS ||       \
                errstr == nullptr) {                                             \
                errstr = "<unknown cupti error>";                                \
            }                                                                    \
            mgb_assert(_status == CUPTI_SUCCESS, "cupti error: %s", errstr);     \
        }                                                                        \
    } while (0)
#endif
namespace mgb::imperative::cupti {
#if MGB_CUPTI
namespace {
CUpti_SubscriberHandle cuptiSubscriber;
// Callback handed to cuptiSubscribe. Turns the enter/exit notifications of the
// driver-API and runtime-API callback domains into profiler events; callbacks
// from any other domain are ignored.
//   - cuLaunchKernel          -> CUPTIKernelLaunch{,Finish}Event (symbolName)
//   - cuMemcpyHtoA / Async    -> CUPTIMemcpyLaunch{,Finish}Event
//   - any other driver call   -> CUPTIDriver{,Finish}Event (functionName)
//   - any runtime call        -> CUPTIRuntime{,Finish}Event (functionName)
void cuptiSubscriberCallback(
        void* userdata, CUpti_CallbackDomain domain, CUpti_CallbackId cb_id,
        const void* cb_info) {
    using namespace profiler;
    const bool is_driver = (domain == CUPTI_CB_DOMAIN_DRIVER_API);
    const bool is_runtime = (domain == CUPTI_CB_DOMAIN_RUNTIME_API);
    if (!is_driver && !is_runtime) {
        return;
    }

    auto info = (const CUpti_CallbackData*)cb_info;
    const bool on_enter = (info->callbackSite == CUPTI_API_ENTER);
    const bool on_exit = (info->callbackSite == CUPTI_API_EXIT);

    if (is_runtime) {
        if (on_enter) {
            MGB_RECORD_EVENT(
                    CUPTIRuntimeEvent, info->correlationId, info->functionName);
        } else if (on_exit) {
            MGB_RECORD_EVENT(
                    CUPTIRuntimeFinishEvent, info->correlationId,
                    info->functionName);
        }
        return;
    }

    // driver API domain
    switch (cb_id) {
        case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel:
            if (on_enter) {
                MGB_RECORD_EVENT(
                        CUPTIKernelLaunchEvent, info->correlationId,
                        info->symbolName);
            } else if (on_exit) {
                MGB_RECORD_EVENT(
                        CUPTIKernelLaunchFinishEvent, info->correlationId,
                        info->symbolName);
            }
            break;
        // both memcpy entry points are reported the same way
        case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA:
        case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync:
            if (on_enter) {
                MGB_RECORD_EVENT(CUPTIMemcpyLaunchEvent, info->correlationId);
            } else if (on_exit) {
                MGB_RECORD_EVENT(CUPTIMemcpyLaunchFinishEvent, info->correlationId);
            }
            break;
        default:
            if (on_enter) {
                MGB_RECORD_EVENT(
                        CUPTIDriverEvent, info->correlationId, info->functionName);
            } else if (on_exit) {
                MGB_RECORD_EVENT(
                        CUPTIDriverFinishEvent, info->correlationId,
                        info->functionName);
            }
            break;
    }
}
// Convert one CUPTI activity record into a profiler event. Kernel, memcpy and
// memset records are handled; every other record kind is dropped. The end
// timestamp of kernel and memset records is moved 16ns earlier, while the memcpy
// end timestamp is forwarded unchanged.
void handleActivity(CUpti_Activity* record) {
    const std::chrono::nanoseconds delta{16};
    const auto kind = record->kind;
    if (kind == CUPTI_ACTIVITY_KIND_KERNEL ||
        kind == CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) {
        auto kernel_rec = cupti::activity<CUpti_ActivityKernel4>(record);
        MGB_RECORD_EVENT(
                profiler::CUPTIKernelExecuteEvent, kernel_rec->correlationId,
                kernel_rec->name, kernel_rec.stream(), kernel_rec.start(),
                kernel_rec.end() - delta);
    } else if (kind == CUPTI_ACTIVITY_KIND_MEMCPY) {
        auto copy_rec = cupti::activity<CUpti_ActivityMemcpy>(record);
        MGB_RECORD_EVENT(
                profiler::CUPTIMemcpyEvent, copy_rec->correlationId,
                copy_rec->srcKind, copy_rec->dstKind, copy_rec->bytes,
                copy_rec.stream(), copy_rec.start(), copy_rec.end());
    } else if (kind == CUPTI_ACTIVITY_KIND_MEMSET) {
        auto fill_rec = cupti::activity<CUpti_ActivityMemset>(record);
        MGB_RECORD_EVENT(
                profiler::CUPTIMemsetEvent, fill_rec->correlationId,
                fill_rec->value, fill_rec->bytes, fill_rec.stream(),
                fill_rec.start(), fill_rec.end() - delta);
    }
}
// Storage type of one CUPTI activity buffer: 8 MiB, aligned to
// ACTIVITY_RECORD_ALIGNMENT.
using activity_buffer_t =
std::aligned_storage_t<8 * 1024 * 1024, ACTIVITY_RECORD_ALIGNMENT>;
// Buffer-request callback passed to cuptiActivityRegisterCallbacks. Hands CUPTI a
// freshly heap-allocated activity_buffer_t and reports its size; the matching
// delete is performed in bufferCompleted.
void bufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords) {
*buffer = reinterpret_cast<uint8_t*>(new activity_buffer_t());
*size = sizeof(activity_buffer_t);
// NOTE(review): 0 presumably means "no limit on the number of records per
// buffer" -- confirm against the CUPTI documentation.
*maxNumRecords = 0;
}
// Buffer-completed callback passed to cuptiActivityRegisterCallbacks. Walks all
// valid records of `buffer`, forwards each one to handleActivity, asserts that
// CUPTI dropped no records for (ctx, streamId), and finally releases the buffer
// that bufferRequested allocated.
//
// ctx/streamId: passed through to cuptiActivityGetNumDroppedRecords.
// buffer:       storage obtained from bufferRequested; owned by this function.
// size:         total buffer size (unused).
// validSize:    number of bytes of `buffer` that hold records.
void bufferCompleted(
        CUcontext ctx, uint32_t streamId, uint8_t* buffer, size_t size,
        size_t validSize) {
    // Free the buffer on every exit path. Without this the buffer is leaked
    // whenever CUPTI_CALL / mgb_assert below leaves the function early.
    struct BufferReleaser {
        uint8_t* raw;
        ~BufferReleaser() { delete reinterpret_cast<activity_buffer_t*>(raw); }
    } releaser{buffer};

    if (validSize == 0) {
        return;
    }

    CUpti_Activity* record = nullptr;
    for (;;) {
        CUptiResult status = cuptiActivityGetNextRecord(buffer, validSize, &record);
        if (status == CUPTI_SUCCESS) {
            handleActivity(record);
        } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
            // no more records in this buffer
            break;
        } else {
            CUPTI_CALL(status);
        }
    }

    size_t dropped = 0;
    CUPTI_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
    mgb_assert(dropped == 0, "%zu records dropped", dropped);
}
static bool initialized = false;
} // namespace
// Report whether CUPTI can be used: true only when the CUPTI version found at
// run time (cuptiGetVersion) equals the CUPTI_API_VERSION this file was compiled
// against. On a mismatch a warning is logged once per process and false is
// returned.
bool available() {
    uint32_t compiletime_version = (CUPTI_API_VERSION);
    uint32_t runtime_version = 0;
    CUPTI_CALL(cuptiGetVersion(&runtime_version));
    if (compiletime_version != runtime_version) {
        static std::once_flag once;
        std::call_once(once, [&] {
            // Argument order follows the text: the first %d is the version of
            // the library loaded at run time, the second the compile-time one.
            mgb_log_warn(
                    "CuPTI version %d mismatch against compiletime version %d. "
                    "This may caused by user config LD_LIBRARY_PATH "
                    "at unix-like env or config PATH at Windows env",
                    (int)runtime_version, (int)compiletime_version);
        });
        return false;
    }
    return true;
}
void enable() {
// not thread safe
mgb_assert(!initialized, "cupti already initialized");
// callback
CUPTI_CALL(cuptiSubscribe(
&cuptiSubscriber, (CUpti_CallbackFunc)cuptiSubscriberCallback,
(void*)nullptr));
CUPTI_CALL(cuptiEnableDomain(1, cuptiSubscriber, CUPTI_CB_DOMAIN_DRIVER_API));
CUPTI_CALL(cuptiEnableDomain(1, cuptiSubscriber, CUPTI_CB_DOMAIN_RUNTIME_API));
// activity
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT));
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_NAME));
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MARKER));
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted));
initialized = true;
}
// Stop CUPTI collection. Asserts that enable() succeeded before, flushes the
// pending activity buffers, calls cuptiFinalize and clears the `initialized` flag.
void disable() {
mgb_assert(initialized, "cupti not initialized yet");
// flush first: flush() is a no-op once `initialized` is false
flush();
CUPTI_CALL(cuptiFinalize());
initialized = false;
}
// Ask CUPTI to flush all activity buffers (cuptiActivityFlushAll with flag 1).
// Does nothing while CUPTI is not enabled.
void flush() {
    if (!initialized) {
        return;
    }
    CUPTI_CALL(cuptiActivityFlushAll(1));
}
// True between a successful enable() and the following disable().
bool enabled() {
return initialized;
}
// Current CUPTI timestamp (cuptiGetTimestamp) wrapped as a cupti::clock time
// point. The unsigned 64-bit value is reinterpreted as a signed tick count.
time_point clock::now() {
    uint64_t raw_timestamp;
    CUPTI_CALL(cuptiGetTimestamp(&raw_timestamp));
    // overflow?
    const clock::duration since_epoch(static_cast<int64_t>(raw_timestamp));
    return clock::time_point(since_epoch);
}
#else
// Error thrown by the stub API below when the build has no CUPTI support
// (MGB_CUPTI is false). The message distinguishes a CUDA build with CUPTI
// switched off from a build without CUDA.
class CuPTIUnavailableError : public MegBrainError {
public:
CuPTIUnavailableError()
: MegBrainError(
#if MGB_CUDA
"CuPTI disabled at compile time"
#else
"CuPTI unsupported on non cuda platform"
#endif
) {
}
};
// Stub implementations used when MGB_CUPTI is false. Queries report "not
// available / not enabled", flush() is a no-op, and every operation that would
// need CUPTI throws CuPTIUnavailableError.

// Always false: CUPTI is not part of this build.
bool available() {
return false;
}
// Always throws CuPTIUnavailableError.
void enable() {
throw CuPTIUnavailableError();
}
// Always throws CuPTIUnavailableError.
void disable() {
throw CuPTIUnavailableError();
}
// No-op, so callers may flush unconditionally.
void flush() {}
// Always false.
bool enabled() {
return false;
}
// Always throws CuPTIUnavailableError.
time_point clock::now() {
throw CuPTIUnavailableError();
}
#endif
} // namespace mgb::imperative::cupti
......@@ -12,7 +12,9 @@
#include "megbrain/imperative/profiler.h"
#include <chrono>
#include <unordered_map>
#include "megbrain/imperative/cpp_cupti.h"
#include "megbrain/imperative/ops/opr_attr.h"
#include "megbrain/imperative/physical_tensor.h"
......@@ -48,6 +50,21 @@ bool Profiler::sm_profiling = false;
thread_local Profiler* Profiler::tm_profiler = nullptr;
std::atomic_size_t Profiler::sm_preferred_capacity;
// Begin a profiling session: asserts that none is running, stores the host start
// time in sm_start_at and sets sm_profiling. When CUPTI is enabled, a
// CUPTITimestampEvent carrying the current CUPTI clock value is recorded.
void Profiler::start_profile() {
mgb_assert(!sm_profiling);
sm_start_at = Timer::record_host();
sm_profiling = true;
// recorded after sm_profiling is set
// NOTE(review): presumably MGB_RECORD_EVENT only records while profiling is
// on -- confirm against the macro definition.
if (cupti::enabled()) {
MGB_RECORD_EVENT(profiler::CUPTITimestampEvent, cupti::clock::now());
}
}
// End the running profiling session: asserts that one is running, flushes the
// CUPTI activity buffers and then clears sm_profiling.
void Profiler::stop_profile() {
mgb_assert(sm_profiling);
// flushed while sm_profiling is still true
// NOTE(review): presumably so that records delivered by the flush are still
// accepted by MGB_RECORD_EVENT -- confirm against the macro definition.
cupti::flush();
sm_profiling = false;
}
auto Profiler::get_thread_dict() -> thread_dict_t {
thread_dict_t thread_dict;
for (auto&& [tid, profiler] : sm_profilers) {
......
......@@ -19,6 +19,7 @@
#include "nlohmann/json.hpp"
#include "megbrain/imperative/utils/platform.h"
#include "megbrain/utils/debug.h"
#include "./formats.h"
......@@ -198,6 +199,8 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> {
decltype(getpid()) pid = getpid();
std::string pid_str = std::to_string(pid);
ChromeTimelineEventVisitor() {}
ChromeTraceEvent& new_event(
std::string name, char ph, size_t tid, profiler::HostTime time) {
return trace_events.new_event().name(name).ph(ph).pid(pid).tid(tid).ts(
......@@ -213,8 +216,13 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> {
.ts(since_start(current->time));
}
// Create a trace event for an activity that CUPTI reported on `stream`.
// The tid is the one assigned to the stream (to_tid) and the CUPTI timestamp is
// converted to host time with time_from_cupti before being handed to new_event.
ChromeTraceEvent& new_cupti_event(
std::string name, char ph, cupti::stream_t stream,
cupti::time_point timestamp) {
return new_event(name, ph, to_tid(stream), time_from_cupti(timestamp));
}
ChromeTraceEvent& new_device_event(std::string name, char ph, CompNode device) {
using namespace std::literals::chrono_literals;
auto time = since_start(to_device_time(current->time, device));
return trace_events.new_event()
.name(name)
......@@ -391,6 +399,80 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> {
auto device_ahead = std::chrono::duration_cast<std::chrono::milliseconds>(
current_device_time - current_host_time);
new_host_event("device_ahead_ms", 'C').arg("value", device_ahead.count());
} else if constexpr (std::is_same_v<TEvent, CUPTIKernelLaunchEvent>) {
new_host_event(demangle(event.name), 'B');
new_host_event(pid_str, 's')
.id(event.correlation_id)
.cat("KernelLink")
.scope(pid_str);
} else if constexpr (std::is_same_v<TEvent, CUPTIKernelLaunchFinishEvent>) {
new_host_event(demangle(event.name), 'E');
} else if constexpr (std::is_same_v<TEvent, CUPTIKernelExecuteEvent>) {
new_cupti_event(demangle(event.name), 'B', event.stream, event.start)
.arg("execution_time", (event.end - event.start).count());
new_cupti_event(pid_str, 'f', event.stream, event.end)
.id(event.correlation_id)
.bp('e')
.cat("KernelLink")
.scope(pid_str);
new_cupti_event(demangle(event.name), 'E', event.stream, event.end)
.arg("execution_time", (event.end - event.start).count());
} else if constexpr (std::is_same_v<TEvent, CUPTIMemcpyLaunchEvent>) {
new_host_event("Memcpy", 'B');
new_host_event(pid_str, 's')
.id(event.correlation_id)
.cat("CUPTILink")
.scope(pid_str);
} else if constexpr (std::is_same_v<TEvent, CUPTIMemcpyLaunchFinishEvent>) {
new_host_event("Memcpy", 'E');
} else if constexpr (std::is_same_v<TEvent, CUPTIMemcpyEvent>) {
// Map a CUPTI memory-kind code to its enumerator name. Codes outside the table
// yield "invalid"; the bound is checked with >= so that a code equal to the
// table size is rejected instead of indexing one past the end.
auto memkind2str = [](uint8_t kind) {
    const char* const valid_kinds[] = {
            "CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN",
            "CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE",
            "CUPTI_ACTIVITY_MEMORY_KIND_PINNED",
            "CUPTI_ACTIVITY_MEMORY_KIND_DEVICE",
            "CUPTI_ACTIVITY_MEMORY_KIND_ARRAY",
            "CUPTI_ACTIVITY_MEMORY_KIND_MANAGED",
            "CUPTI_ACTIVITY_MEMORY_KIND_DEVICE_STATIC",
            "CUPTI_ACTIVITY_MEMORY_KIND_MANAGED_STATIC"};
    if (kind >= sizeof(valid_kinds) / sizeof(valid_kinds[0])) {
        return "invalid";
    }
    return valid_kinds[kind];
};
new_cupti_event("Memcpy", 'B', event.stream, event.start)
.arg("bytes", imperative::to_string(event.bytes))
.arg("src_kind", memkind2str(event.src_kind))
.arg("dst_kind", memkind2str(event.dst_kind));
new_cupti_event(pid_str, 'f', event.stream, event.start)
.id(event.correlation_id)
.bp('e')
.cat("CUPTILink")
.scope(pid_str);
new_cupti_event("Memcpy", 'E', event.stream, event.end)
.arg("bytes", imperative::to_string(event.bytes))
.arg("src_kind", memkind2str(event.src_kind))
.arg("dst_kind", memkind2str(event.dst_kind));
} else if constexpr (std::is_same_v<TEvent, CUPTIMemsetEvent>) {
// Begin/end pair of the memset slice on the stream's timeline. The 'E' event
// is stamped with event.end (it used event.start, which collapsed every memset
// into a zero-length slice), matching the kernel and memcpy branches.
new_cupti_event("Memset", 'B', event.stream, event.start)
        .arg("value", imperative::to_string(event.value))
        .arg("bytes", imperative::to_string(event.bytes));
new_cupti_event("Memset", 'E', event.stream, event.end)
        .arg("value", imperative::to_string(event.value))
        .arg("bytes", imperative::to_string(event.bytes));
} else if constexpr (std::is_same_v<TEvent, CUPTIRuntimeEvent>) {
new_host_event(event.name, 'B');
} else if constexpr (std::is_same_v<TEvent, CUPTIRuntimeFinishEvent>) {
new_host_event(event.name, 'E');
} else if constexpr (std::is_same_v<TEvent, CUPTIDriverEvent>) {
new_host_event(event.name, 'B');
new_host_event(pid_str, 's')
.id(event.correlation_id)
.cat("CUPTILink")
.scope(pid_str);
} else if constexpr (std::is_same_v<TEvent, CUPTIDriverFinishEvent>) {
new_host_event(event.name, 'E');
}
}
......@@ -403,7 +485,8 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> {
if (thread_dict.count(host)) {
trace_events.new_event()
.name("thread_name")
.pid('M')
.ph('M')
.pid(pid)
.tid(to_tid(host))
.arg("name", thread_dict.at(host));
}
......@@ -411,7 +494,8 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> {
for (auto&& device : devices()) {
trace_events.new_event()
.name("thread_name")
.pid('M')
.ph('M')
.pid(pid)
.tid(to_tid(device))
.arg("name", device.to_string_logical());
}
......@@ -419,7 +503,7 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> {
};
void dump_chrome_timeline(std::string filename, Profiler::bundle_t result) {
ChromeTimelineEventVisitor visitor;
ChromeTimelineEventVisitor visitor{};
visitor.process_events(result);
visitor.name_threads(result.thread_dict);
auto trace_events = std::move(visitor.trace_events);
......
......@@ -16,6 +16,7 @@
#include "../interpreter/stack_manager.h"
#include "../op_trait.h"
#include "megbrain/imperative/cpp_cupti.h"
namespace mgb::imperative::profiler {
......@@ -181,6 +182,60 @@ DEF_DUR_EVENT(HostToDevice, {
void* device_ptr;
});
// cupti events
// NOTE(review): DEF_EVENT / DEF_DUR_EVENT are defined outside this view. Judging
// by the names used where events are recorded, DEF_EVENT(X, ...) presumably
// declares XEvent and DEF_DUR_EVENT(X, ...) declares XEvent plus XFinishEvent --
// confirm against the macro definitions.

// Pairs a CUPTI clock reading with the host time of the record that carries it.
DEF_EVENT(CUPTITimestamp, { cupti::clock::time_point timestamp; });
// Host-side enter/exit of a kernel launch call; `name` is the symbol name
// reported by the CUPTI callback.
// NOTE(review): only the pointer is stored -- confirm that the string outlives
// the callback.
DEF_DUR_EVENT(CUPTIKernelLaunch, {
uint32_t correlation_id;
const char* name;
});
// Device-side execution of a kernel, built from a CUPTI kernel activity record.
DEF_EVENT(CUPTIKernelExecute, {
uint32_t correlation_id;
const char* name;
cupti::stream_t stream;
cupti::time_point start;
cupti::time_point end;
});
// Host-side enter/exit of a memcpy driver call.
DEF_DUR_EVENT(CUPTIMemcpyLaunch, { uint32_t correlation_id; });
// Device-side memcpy, built from a CUPTI memcpy activity record; src_kind and
// dst_kind hold the record's srcKind / dstKind codes.
DEF_EVENT(CUPTIMemcpy, {
uint32_t correlation_id;
uint8_t src_kind;
uint8_t dst_kind;
uint64_t bytes;
cupti::stream_t stream;
cupti::time_point start;
cupti::time_point end;
});
// Device-side memset, built from a CUPTI memset activity record.
DEF_EVENT(CUPTIMemset, {
uint32_t correlation_id;
uint32_t value;
uint64_t bytes;
cupti::stream_t stream;
cupti::time_point start;
cupti::time_point end;
});
// NOTE(review): not recorded by any code visible in this change.
DEF_EVENT(CUPTIUnknownDevice, {});
// Enter/exit of a CUDA runtime API call; `name` is the function name reported by
// the CUPTI callback.
DEF_DUR_EVENT(CUPTIRuntime, {
uint32_t correlation_id;
const char* name;
});
// Enter/exit of a CUDA driver API call other than kernel launch / memcpy.
DEF_DUR_EVENT(CUPTIDriver, {
uint32_t correlation_id;
const char* name;
});
// Associates a CUPTI stream with a CompNode.
// NOTE(review): not recorded by any code visible in this change.
DEF_EVENT(CUPTIIdentifyStream, {
cupti::stream_t stream;
CompNode device;
});
#undef DEF_EVENT
#undef DEF_DUR_EVENT
......
......@@ -180,10 +180,13 @@ private:
HostTime m_start_time;
CompNode::UnorderedMap<size_t> m_device_tid_table;
std::unordered_map<std::thread::id, size_t> m_host_tid_table;
std::unordered_map<cupti::stream_t, size_t> m_cupti_tid_table;
CompNode::UnorderedMap<std::map<profiler::HostTime, profiler::RealDuration>>
m_device_timeline;
std::unordered_map<std::thread::id, std::vector<Trace>> m_trace_stack;
std::unordered_map<std::string, int64_t> m_counter_table;
std::optional<std::pair<profiler::HostTime, cupti::time_point>> m_cupti_timestamp =
{};
protected:
Profiler::Record* current;
......@@ -191,6 +194,11 @@ protected:
ProfileTensorState* current_tensor;
protected:
// Value to use as the tid of the next registered timeline: the total number of
// entries currently held by the host, device and CUPTI tid tables.
size_t next_tid() {
return m_host_tid_table.size() + m_device_tid_table.size() +
m_cupti_tid_table.size();
}
profiler::Duration since_start(profiler::HostTime time) {
return time - m_start_time;
}
......@@ -229,6 +237,10 @@ protected:
size_t to_tid(CompNode device) { return m_device_tid_table.at(device); }
// tid assigned to a CUPTI stream. Uses .at(), so a stream that was never
// registered in m_cupti_tid_table raises std::out_of_range.
size_t to_tid(cupti::stream_t cupti_stream) {
return m_cupti_tid_table.at(cupti_stream);
}
SmallVector<std::thread::id> host_threads() {
SmallVector<std::thread::id> host_threads;
for (auto&& [host, _] : m_host_tid_table) {
......@@ -254,6 +266,13 @@ protected:
value += delta;
}
// Convert a CUPTI timestamp to host time. m_cupti_timestamp holds a (host time,
// CUPTI time) pair captured at one instant; the result is that host time plus
// the distance of `timestamp` from the paired CUPTI time. Asserts that the
// reference pair has been set.
profiler::HostTime time_from_cupti(cupti::time_point timestamp) {
mgb_assert(m_cupti_timestamp.has_value());
return m_cupti_timestamp->first +
std::chrono::duration_cast<profiler::HostTime::duration>(
timestamp - m_cupti_timestamp->second);
}
public:
void process_events(Profiler::bundle_t& bundle) {
m_start_time = bundle.start_at;
......@@ -272,7 +291,11 @@ public:
TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent,
AutoEvictFinishEvent, CustomEvent, CustomFinishEvent, RecordDeviceEvent,
ScopeEvent, ScopeFinishEvent, HostToDeviceEvent,
HostToDeviceFinishEvent>
HostToDeviceFinishEvent, CUPTITimestampEvent, CUPTIKernelLaunchEvent,
CUPTIKernelLaunchFinishEvent, CUPTIKernelExecuteEvent,
CUPTIMemcpyLaunchEvent, CUPTIMemcpyLaunchFinishEvent, CUPTIMemcpyEvent,
CUPTIRuntimeEvent, CUPTIRuntimeFinishEvent, CUPTIDriverEvent,
CUPTIDriverFinishEvent, CUPTIMemsetEvent>
converter;
auto for_each_entry = [&](auto&& handler) {
......@@ -289,7 +312,9 @@ public:
std::shared_ptr<CompNode::Event> device;
};
CompNode::UnorderedMap<DeviceStartPair> device_start_table;
std::unordered_map<cupti::stream_t, CompNode> cupti_stream_table;
// record device time
for_each_entry([&](auto&& event) {
using T = std::decay_t<decltype(event)>;
if constexpr (std::is_same_v<T, RecordDeviceEvent>) {
......@@ -313,8 +338,7 @@ public:
// register host threads
for_each_entry([&](auto&& event) {
if (!m_host_tid_table.count(current->tid)) {
m_host_tid_table[current->tid] = {
m_device_tid_table.size() + m_host_tid_table.size()};
m_host_tid_table[current->tid] = next_tid();
}
});
......@@ -340,14 +364,39 @@ public:
} else if constexpr (std::is_same_v<T, TensorProduceEvent>) {
auto& tensor = m_tensors[event.tensor_id];
if (!m_device_tid_table.count(event.device)) {
m_device_tid_table[event.device] = {
m_device_tid_table.size() + m_host_tid_table.size()};
m_device_tid_table[event.device] = next_tid();
}
tensor.device = event.device;
tensor.layout = event.layout;
}
});
for_each_entry([&](auto&& event) {
using T = std::decay_t<decltype(event)>;
if constexpr (std::is_same_v<T, CUPTIIdentifyStreamEvent>) {
if (!m_cupti_tid_table.count(event.stream)) {
m_cupti_tid_table[event.stream] =
m_device_tid_table.at(event.device);
}
}
});
// record cupti streams
for_each_entry([&](auto&& event) {
using T = std::decay_t<decltype(event)>;
if constexpr (
std::is_same_v<T, CUPTIKernelExecuteEvent> ||
std::is_same_v<T, CUPTIMemcpyEvent> ||
std::is_same_v<T, CUPTIMemsetEvent>) {
if (!m_cupti_tid_table.count(event.stream)) {
m_cupti_tid_table[event.stream] = next_tid();
}
} else if constexpr (std::is_same_v<T, CUPTITimestampEvent>) {
mgb_assert(!m_cupti_timestamp.has_value());
m_cupti_timestamp.emplace(current->time, event.timestamp);
}
});
// replay execution
using namespace std::placeholders;
for_each_entry([&](auto&& event) {
......
#include "megbrain/imperative/utils/platform.h"
#ifdef __GNUG__
#include <cxxabi.h>
#include <cstdlib>
#include <memory>
#endif
using namespace mgb;
using namespace imperative;
/*
 * Turn a mangled C++ symbol name into its human readable form. With GNU
 * compatible compilers abi::__cxa_demangle is used and the input is returned
 * unchanged when demangling fails; with other compilers the input is always
 * returned unchanged. Background:
 * http://stackoverflow.com/questions/281818/unmangling-the-result-of-stdtype-infoname
 */
std::string mgb::imperative::demangle(std::string mangled) {
#ifdef __GNUG__
    int rc = -1;
    char* raw = abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &rc);
    // __cxa_demangle returns malloc'ed storage; release it with free()
    std::unique_ptr<char, void (*)(void*)> owner{raw, std::free};
    if (rc != 0) {
        return mangled;
    }
    return owner.get();
#else
    return mangled;
#endif
}
#pragma once
#include <chrono>
#include <ctime>
#include "megbrain/common.h"
#include "megbrain/imperative/utils/to_string.h"
namespace mgb::imperative::cupti {
// Clock type for CUPTI timestamps: nanosecond ticks, declared non-steady.
// now() is implemented in the CUPTI backend source.
struct clock {
    using duration = std::chrono::nanoseconds;
    using rep = duration::rep;
    using period = duration::period;
    using time_point = std::chrono::time_point<clock>;
    static const bool is_steady = false;
    static time_point now() /* noexcept */;
};
using time_point = clock::time_point;
using duration = clock::duration;
// Identifier of a device as reported by CUPTI.
struct device_t {
    uint32_t device_id;
    bool operator==(const device_t& other) const {
        return other.device_id == device_id;
    }
};
// Identifier of a context: the owning device plus a context id.
struct context_t : device_t {
    uint32_t context_id;
    bool operator==(const context_t& other) const {
        if (!device_t::operator==(other)) {
            return false;
        }
        return other.context_id == context_id;
    }
};
// Identifier of a stream: the owning context plus a stream id.
struct stream_t : context_t {
    uint32_t stream_id;
    bool operator==(const stream_t& other) const {
        if (!context_t::operator==(other)) {
            return false;
        }
        return other.stream_id == stream_id;
    }
};
// True when CUPTI can be used: the build has CUPTI support and the CUPTI version
// found at run time equals the one compiled against.
bool available();
// Start CUPTI collection. Asserts that it is not already enabled; throws
// CuPTIUnavailableError in builds without CUPTI support.
void enable();
// Stop CUPTI collection after flushing pending activity buffers. Asserts that
// enable() was called; throws CuPTIUnavailableError in builds without CUPTI.
void disable();
// Flush pending CUPTI activity buffers; no-op while CUPTI is not enabled.
void flush();
// True between enable() and disable().
bool enabled();
// Non-owning typed view over a CUPTI activity record. TActivity is the concrete
// record struct; the accessors wrap its start/end/deviceId/contextId/streamId
// members into the cupti:: value types, and operator-> exposes the raw record.
template <typename TActivity>
struct activity {
private:
    TActivity* m_ptr;

public:
    activity(void* ptr) : m_ptr(static_cast<TActivity*>(ptr)) {}

    time_point start() const {
        const duration since_epoch(m_ptr->start);
        return time_point(since_epoch);
    }
    time_point end() const {
        const duration since_epoch(m_ptr->end);
        return time_point(since_epoch);
    }
    device_t device() const {
        device_t dev{m_ptr->deviceId};
        return dev;
    }
    context_t context() const {
        context_t ctx{device(), m_ptr->contextId};
        return ctx;
    }
    stream_t stream() const {
        stream_t strm{context(), m_ptr->streamId};
        return strm;
    }
    TActivity* operator->() const { return m_ptr; }
};
} // namespace mgb::imperative::cupti
template <>
class std::hash<mgb::imperative::cupti::stream_t> {
public:
size_t operator()(const mgb::imperative::cupti::stream_t& value) const {
return value.stream_id;
}
};
......@@ -194,16 +194,9 @@ public:
static bool is_profiling() { return sm_profiling; }
static void start_profile() {
mgb_assert(!sm_profiling);
sm_start_at = Timer::record_host();
sm_profiling = true;
}
static void start_profile();
static void stop_profile() {
mgb_assert(sm_profiling);
sm_profiling = false;
}
static void stop_profile();
static thread_dict_t get_thread_dict();
......
#pragma once
#include <string>
namespace mgb::imperative {
// Return the demangled form of a C++ symbol name. The input is returned
// unchanged when it cannot be demangled or when the compiler is not GNU
// compatible.
std::string demangle(std::string mangled);
}
......@@ -37,6 +37,10 @@ if(MGE_WITH_CUDA)
list(APPEND LINK_LIBS cudart)
endif()
if(MGE_WITH_CUPTI)
list(APPEND LINK_LIBS libcupti)
endif()
if(MGE_WITH_DISTRIBUTED)
list(APPEND LINK_LIBS megray)
endif()
......
......@@ -61,11 +61,11 @@ echo "Build with ${SDK_NAME}"
if [ $SDK_NAME == "cu101" ];then
CUDA_COPY_LIB_LIST="${CUDA_LIB_DIR}/libnvrtc.so.10.1"
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF -DMGE_WITH_CUBLAS_SHARED=OFF"
BUILD_GCC8="ON"
REQUIR_CUDA_VERSION="10010"
REQUIR_CUDNN_VERSION="7.6.3"
REQUIR_TENSORRT_VERSION="6.0.1.5"
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF -DMGE_WITH_CUBLAS_SHARED=OFF"
BUILD_GCC8="ON"
REQUIR_CUDA_VERSION="10010"
REQUIR_CUDNN_VERSION="7.6.3"
REQUIR_TENSORRT_VERSION="6.0.1.5"
REQUIR_CUBLAS_VERSION="10.2.1.243"
elif [ $SDK_NAME == "cu102_JetsonNano" ];then
......@@ -87,6 +87,12 @@ elif [ $SDK_NAME == "cu102_JetsonNano" ];then
${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\
${CUDNN_LIB_DIR}/libcudnn.so.8"
if [ ${machine} == "aarch64" ];then
CUDA_COPY_LIB_LIST="\
${CUDA_LIB_DIR}/libcupti.so.10.2:\
${CUDA_COPY_LIB_LIST}"
fi
EXTRA_CMAKE_FLAG="-DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_53,code=sm_53\" "
elif [ $SDK_NAME == "cu111" ];then
......@@ -118,6 +124,12 @@ elif [ $SDK_NAME == "cu111" ];then
${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\
${CUDNN_LIB_DIR}/libcudnn.so.8"
if [ ${machine} == "aarch64" ];then
CUDA_COPY_LIB_LIST="\
${CUDA_LIB_DIR}/libcupti.so.11.1:\
${CUDA_COPY_LIB_LIST}"
fi
if [ ${IN_CI} = "true" ] && [ ${machine} == "aarch64" ]; then
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_75,code=sm_75\" "
else
......@@ -152,9 +164,9 @@ elif [ $SDK_NAME == "cu112" ];then
-gencode arch=compute_86,code=sm_86 \
-gencode arch=compute_86,code=compute_86\" "
REQUIR_CUDA_VERSION="11020"
REQUIR_CUDNN_VERSION="8.0.4"
REQUIR_TENSORRT_VERSION="7.2.2.3"
REQUIR_CUDA_VERSION="11020"
REQUIR_CUDNN_VERSION="8.0.4"
REQUIR_TENSORRT_VERSION="7.2.2.3"
REQUIR_CUBLAS_VERSION="11.3.1.68"
elif [ $SDK_NAME == "cpu" ];then
......
......@@ -35,6 +35,7 @@
#cmakedefine01 MGB_ENABLE_FBS_SERIALIZATION
#cmakedefine01 MGB_IS_DEV
#cmakedefine01 MGB_CUSTOM_OP
#cmakedefine01 MGB_CUPTI
// DNN related flags
// Platform macro's
#cmakedefine01 MEGDNN_WITH_CUDA
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册