提交 1d64792b 编写于 作者: M Megvii Engine Team 提交者: huangxinda

refactor(profiler): detach profiler from interpreter

GitOrigin-RevId: f3954728d1dd8e93e2eb5a94ee5f3a030a54fb5a
上级 f2027b8d
......@@ -7,9 +7,14 @@
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import json
from contextlib import contextmanager
import os
import re
from contextlib import ContextDecorator, contextmanager
from functools import wraps
from typing import List
from weakref import WeakSet
from .. import _atexit
from ..core._imperative_rt.core2 import (
pop_scope,
push_scope,
......@@ -17,9 +22,13 @@ from ..core._imperative_rt.core2 import (
stop_profile,
sync,
)
from ..logger import get_logger
_running_profiler = None
_living_profilers = WeakSet()
class Profiler:
class Profiler(ContextDecorator):
r"""
Profile graph execution in imperative mode.
......@@ -35,9 +44,10 @@ class Profiler:
from megengine.utils.profiler import Profiler
# With Learnable Parameters
profiler = Profiler()
for iter in range(0, 10):
# Only profile record of last iter would be saved
with Profiler("profile"):
with profiler:
# your code here
# Then open the profile file in chrome timeline window
......@@ -45,46 +55,105 @@ class Profiler:
CHROME_TIMELINE = "chrome_timeline.json"
COMMAND = 1 << 0
OPERATOR = 1 << 1
TENSOR_LIFETIME = 1 << 2
TENSOR_PROP = 1 << 3
SYNC = 1 << 4
SCOPE = 1 << 5
ALL = (1 << 6) - 1
valid_options = {"sample_rate": 0, "profile_device": 1, "num_tensor_watch": 10}
valid_formats = {"chrome_timeline.json", "memory_flow.svg"}
def __init__(
self,
path: str = "profile",
format: str = CHROME_TIMELINE,
*,
topic=OPERATOR | SCOPE,
align_time=True,
show_operator_name=True
format: str = "chrome_timeline.json",
formats: List[str] = None,
**kwargs
) -> None:
self._path = path
self._format = format
self._options = {
"topic": int(topic),
"align_time": int(align_time),
"show_operator_name": int(show_operator_name),
}
if not formats:
formats = [format]
def __enter__(self):
assert not isinstance(formats, str), "formats excepts list, got str"
for format in formats:
assert format in Profiler.valid_formats, "unsupported format {}".format(
format
)
self._path = path
self._formats = formats
self._options = {}
for opt, optval in Profiler.valid_options.items():
self._options[opt] = int(kwargs.pop(opt, optval))
self._pid = "<PID>"
@property
def path(self):
if len(self._formats) == 0:
format = "<FORMAT>"
elif len(self._formats) == 1:
format = self._formats[0]
else:
format = "{" + ",".join(self._formats) + "}"
return self.format_path(self._path, self._pid, format)
@property
def directory(self):
return self._path
@property
def formats(self):
return list(self._formats)
def start(self):
global _running_profiler
assert _running_profiler is None
_running_profiler = self
self._pid = os.getpid()
start_profile(self._options)
return self
def __exit__(self, val, tp, trace):
stop_profile(self._path, self._format)
# dump is async, so it's necessary to sync interpreter
def stop(self):
global _running_profiler
assert _running_profiler is self
_running_profiler = None
sync()
self._dump_callback = stop_profile()
self._pid = os.getpid()
_living_profilers.add(self)
def dump(self):
if self._dump_callback is not None:
if not os.path.exists(self._path):
os.makedirs(self._path)
if not os.path.isdir(self._path):
get_logger().warning(
"{} is not a directory, cannot write profiling results".format(
self._path
)
)
return
for format in self._formats:
path = self.format_path(self._path, self._pid, format)
get_logger().info("process {} generating {}".format(self._pid, format))
self._dump_callback(path, format)
get_logger().info("profiling results written to {}".format(path))
self._dump_callback = None
_living_profilers.remove(self)
def format_path(self, path, pid, format):
return os.path.join(path, "{}.{}".format(pid, format))
def __enter__(self):
self.start()
def __exit__(self, val, tp, trace):
self.stop()
def __call__(self, func):
def wrapper(*args, **kwargs):
with self:
return func(*args, **kwargs)
func = super().__call__(func)
func.__profiler__ = self
return func
return wrapper
def __del__(self):
self.dump()
@contextmanager
......@@ -94,16 +163,77 @@ def scope(name):
pop_scope(name)
profile = Profiler
def profile(*args, **kwargs):
if len(args) == 1 and len(kwargs) == 0 and callable(args[0]):
return Profiler()(args[0])
return Profiler(*args, **kwargs)
def merge_trace_events(directory: str):
names = filter(
lambda x: re.match(r"\d+\.chrome_timeline\.json", x), os.listdir(directory)
)
def load_trace_events(name):
with open(os.path.join(directory, name), "r", encoding="utf-8") as f:
return json.load(f)
def find_metadata(content):
if isinstance(content, dict):
assert "traceEvents" in content
content = content["traceEvents"]
if len(content) == 0:
return None
assert content[0]["name"] == "Metadata"
return content[0]["args"]
contents = list(map(load_trace_events, names))
metadata_list = list(map(find_metadata, contents))
min_local_time = min(
map(lambda x: x["localTime"], filter(lambda x: x is not None, metadata_list))
)
events = []
for content, metadata in zip(contents, metadata_list):
local_events = content["traceEvents"]
if len(local_events) == 0:
continue
local_time = metadata["localTime"]
time_shift = local_time - min_local_time
for event in local_events:
if "ts" in event:
event["ts"] = int(event["ts"] + time_shift)
events.extend(filter(lambda x: x["name"] != "Metadata", local_events))
result = {
"traceEvents": events,
}
path = os.path.join(directory, "merge.chrome_timeline.json")
with open(path, "w") as f:
json.dump(result, f, ensure_ascii=False, separators=(",", ":"))
get_logger().info("profiling results written to {}".format(path))
def is_profiling():
return _running_profiler is not None
def _stop_current_profiler():
global _running_profiler
if _running_profiler is not None:
_running_profiler.stop()
living_profilers = [*_living_profilers]
for profiler in living_profilers:
profiler.dump()
def merge_trace_events(sources: List[str], target: str):
names = list(map(lambda x: x + ".chrome_timeline.json", sources))
result = []
for name in names:
with open(name, "r", encoding="utf-8") as f:
content = json.load(f)
for entry in content:
result.append(entry)
with open(target + ".chrome_timeline.json", "w") as f:
json.dump(result, f, ensure_ascii=False, indent=4)
_atexit(_stop_current_profiler)
......@@ -13,6 +13,7 @@
#include "megbrain/common.h"
#include "megbrain/imperative/ops/utility.h"
#include "megbrain/imperative/ops/backward_graph.h"
#include "megbrain/imperative/profiler.h"
#include "megbrain/opr/io.h"
#include "./tensor.h"
......@@ -927,9 +928,23 @@ void init_tensor(py::module m) {
m.def("pop_scope",
[](std::string name) { interpreter_for_py->pop_scope(name); });
m.def("start_profile",
[](std::unordered_map<std::string, int> option) { return interpreter_for_py->start_profile(option); });
[](imperative::Profiler::options_t options) {
interpreter_for_py->sync();
imperative::Profiler::load_options(std::move(options));
imperative::Profiler::start_profile();
interpreter_for_py->start_profile();
});
m.def("stop_profile",
[](std::string basename, std::string format) { interpreter_for_py->stop_profile(basename, format); });
[]() -> std::function<void(std::string, std::string)> {
interpreter_for_py->stop_profile();
interpreter_for_py->sync();
imperative::Profiler::stop_profile();
auto results = imperative::Profiler::collect();
auto options = imperative::Profiler::get_options();
return [results=std::move(results), options=std::move(options)](std::string basename, std::string format){
imperative::Profiler::dump_profile(basename, format, results, options);
};
});
m.def("sync",
[]() {
interpreter_for_py->sync();
......
......@@ -8,6 +8,7 @@
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied
import json
import os
import tempfile
import pytest
......@@ -28,15 +29,18 @@ class Simple(Module):
def test_profiler():
profile_prefix = "pytest_profile"
tempdir = tempfile.NamedTemporaryFile()
profile_prefix = tempdir.name
profile_format = "chrome_timeline.json"
profile_path = "{}.{}".format(profile_prefix, profile_format)
with Profiler(profile_prefix, format=profile_format):
with scope("my_scope"):
oup = Simple()(tensor([1.23], dtype="float32"))
profile_path = os.path.join(
profile_prefix, "{}.{}".format(os.getpid(), profile_format)
)
with option("enable_host_compute", 0):
with Profiler(profile_prefix, format=profile_format):
with scope("my_scope"):
oup = Simple()(tensor([1.23], dtype="float32"))
with open(profile_path, "r") as f:
events = json.load(f)
os.remove(profile_path)
prev_ts = {}
scope_count = 0
for event in events:
......
......@@ -13,11 +13,14 @@
#include <string>
#include <variant>
#include <unordered_set>
#include "megbrain/tensor.h"
#include "megbrain/imperative/op_def.h"
#include "megbrain/imperative/utils/to_string.h"
#include "./tensor_info.h"
namespace mgb::imperative {
namespace interpreter::intl {
......@@ -43,7 +46,7 @@ struct Put {
};
struct ApplyOp {
uint64_t id;
uint64_t id; //used by profiler to identify unique apply
std::shared_ptr<OpDef> op;
SmallVector<TensorInfo*> inputs;
SmallVector<TensorInfo*> outputs;
......@@ -143,7 +146,7 @@ struct SetOption {
};
struct StartProfile {
InterpreterProfiler* profiler;
std::unordered_set<TensorInfo*> capture_tensors;
template <typename TFunctor>
void get_props(TFunctor&& functor) const {}
......@@ -154,14 +157,10 @@ struct StartProfile {
};
struct StopProfile {
std::string basename;
std::string format;
std::unordered_set<TensorInfo*> escape_tensors;
template <typename TFunctor>
void get_props(TFunctor&& functor) const {
functor("basename", basename);
functor("format", format);
}
void get_props(TFunctor&& functor) const {}
const char* get_name() const {
return "StopProfile";
......
......@@ -24,10 +24,10 @@
#include "megbrain/imperative/profiler.h"
#include "./commands.h"
#include "./events.h"
#include "./tensor_info.h"
#include "./option_manager.h"
#include "./profiler.h"
#include "../profiler/events.h"
namespace mgb::imperative::interpreter::intl {
......@@ -37,7 +37,6 @@ struct InterpreterImpl : Interpreter {
std::unique_ptr<Channel> create_channel() override;
};
struct ChannelImpl : Interpreter::Channel {
ChannelImpl();
~ChannelImpl() override;
......@@ -67,19 +66,27 @@ struct ChannelImpl : Interpreter::Channel {
size_t get_option(std::string name) override;
void set_option(std::string name, size_t value) override;
void start_profile(std::unordered_map<std::string, int> option) override;
void stop_profile(std::string basename, std::string format) override;
void start_profile() override;
void stop_profile() override;
void push_scope(std::string) override;
void pop_scope(std::string) override;
private:
struct WorkQueue;
struct State;
TensorInfo* alloc();
void init(TensorInfo*, LogicalTensorDesc desc);
void free(TensorInfo*);
void real_free(TensorInfo*);
void recursive_free(TensorInfo*);
void do_drop(TensorInfo*, bool);
void detach_users(TensorInfo*);
TensorInfo* put_impl(const HostTensorND& value, bool no_cache);
TensorPtr wait_tensor(TensorInfo* info, profiler::TensorProp prop);
void notify_tensor_unsafe(TensorInfo* info);
void process_one_task(IdentifiedCommand&);
void check_worker_exc_unsafe();
......@@ -105,24 +112,31 @@ private:
bool check_available();
void push_scope(std::string, State&);
void pop_scope(std::string, State&);
void assert_in_channel();
void assert_in_worker();
std::thread::id get_worker_tid();
void sync_device_scope(CompNode device);
template <typename TCommand>
void enqueue_command(TCommand&& cmd) {
m_buffer.enqueue(Command{std::forward<TCommand>(cmd)});
}
void sample_on_device(CompNode device, bool force);
// valid => status != Deleted
std::unordered_set<TensorInfo*> collect_valid_tensors();
std::mutex m_mutex;
std::condition_variable m_cv;
MemPool<TensorInfo> m_pool;
std::unordered_set<Handle> m_valid_handle;
TensorInfo* m_waitee = nullptr;
uint64_t m_waitee_id = 0;
std::exception_ptr m_worker_exc;
std::atomic_uint64_t m_last_id = 0;
std::function<void(std::string, std::string)> m_profile_dump_callback;
bool m_closed = false;
......@@ -191,27 +205,98 @@ private:
//! level 0: both sync.
int m_async_level = 2;
struct State {
OptionManager options;
std::vector<std::string> scopes;
std::unique_ptr<InterpreterProfiler> profiler;
struct Scope {
std::string name;
std::unordered_map<std::string, std::unique_ptr<Scope>> children;
size_t version = 0;
size_t parent_version = 0;
size_t tensor_count = 0;
Scope* active_child = nullptr;
Scope* parent = nullptr;
Scope* enter(std::string name) {
auto& child = children[name];
if (!child) {
child = std::make_unique<Scope>();
child->name = name;
child->parent = this;
}
if (version != child->parent_version) {
child->version = 0;
child->parent_version = version;
} else {
child->version++;
}
child->tensor_count = 0;
return active_child = child.get();
}
State() {
profiler = std::make_unique<InterpreterProfiler>();
Scope* exit(std::string name) {
mgb_assert(this->name == name, "scope name mismatch");
parent->active_child = nullptr;
return parent;
}
};
struct ChannelState: State {};
class ScopeManager {
private:
Scope m_root;
Scope* m_current_scope = &m_root;
public:
class ScopeGuard{
private:
ScopeManager* m_manager;
std::string m_name;
public:
ScopeGuard(ScopeManager* manager, std::string name): m_manager{manager}, m_name{name} {
m_manager->push(m_name);
}
~ScopeGuard() {
m_manager->pop(m_name);
}
};
void push(std::string name) {
m_current_scope = m_current_scope->enter(name);
}
void pop(std::string name) {
m_current_scope = m_current_scope->exit(name);
}
std::string next_tensor_name() {
std::string builder;
Scope* scope = &m_root;
while (true) {
builder.append(scope->name);
if (scope->version != 0) {
builder.append(ssprintf("(%ld)", scope->version));
}
if (scope != &m_root) {
builder.append(".");
}
if (scope->active_child == nullptr) {
builder.append(ssprintf(":%%%ld", scope->tensor_count++));
break;
} else {
scope = scope->active_child;
}
}
return builder;
}
};
struct WorkerState: State {
struct State {
std::thread::id tid;
CompNode::UnorderedMap<std::vector<std::string>> device_scope_map;
OptionManager options;
};
struct ChannelState: State {
ScopeManager scopes;
};
struct WorkerState: State {};
ChannelState m_channel_state;
WorkerState m_worker_state;
/*!
* \brief A framework of dynamic sublienar memory optimization
*
......@@ -327,7 +412,6 @@ private:
// assert thread id when call get_xxx_state to avoid misuse
ChannelState& get_channel_state();
WorkerState& get_worker_state();
};
} // namespace mgb::imperative::interpreter::intl
/**
* \file imperative/src/impl/interpreter/profiler.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "megbrain/imperative/profiler.h"
#include "./commands.h"
#include "./events.h"
#include "./option_manager.h"
namespace mgb::imperative::interpreter::intl {
class InterpreterProfiler: public Profiler<
CommandEnqueueEvent, CommandExecuteEvent, CommandFinishEvent,
OpExecuteEvent, OpExecuteFinishEvent,
KernelExecuteEvent, KernelExecuteFinishEvent,
TensorDeclareEvent, TensorProduceEvent, TensorEraseEvent,
TensorGetPropEvent, TensorWaitPropEvent, TensorNotifyPropEvent, TensorWaitPropFinishEvent,
SyncEvent, SyncFinishEvent,
ScopeEvent, ScopeFinishEvent,
DeviceScopeEvent, DeviceScopeFinishEvent> {
public:
enum Topic {
Command = 0b000001,
Operator = 0b000010,
TensorLifetime = 0b000100,
TensorProp = 0b001000,
Sync = 0b010000,
Scope = 0b100000,
};
struct Option {
Topic topic;
bool align_time;
bool show_operator_name;
static Option from_dict(std::unordered_map<std::string, int> dict) {
Option option;
option.topic = Topic(dict.at("topic"));
option.align_time = bool(dict.at("align_time"));
option.show_operator_name = bool(dict.at("show_operator_name"));
return option;
}
};
Option get_option() const {
return m_option;
}
void set_option(const Option& option) {
m_option = option;
}
static Mask topic_to_mask(Topic topic) {
Mask result;
if (topic & Command) {
result |= mask_of<CommandEnqueueEvent, CommandExecuteEvent, CommandFinishEvent>();
}
if (topic & Operator) {
result |= mask_of<OpExecuteEvent, OpExecuteFinishEvent>();
result |= mask_of<KernelExecuteEvent, KernelExecuteFinishEvent>();
}
if (topic & TensorLifetime) {
result |= mask_of<TensorDeclareEvent, TensorProduceEvent, TensorEraseEvent>();
}
if (topic & TensorProp) {
result |= mask_of<TensorGetPropEvent, TensorWaitPropEvent, TensorNotifyPropEvent, TensorWaitPropFinishEvent>();
}
if (topic & Sync) {
result |= mask_of<SyncEvent, SyncFinishEvent>();
}
if (topic & Scope) {
result |= mask_of<ScopeEvent, ScopeFinishEvent>();
result |= mask_of<DeviceScopeEvent, DeviceScopeFinishEvent>();
}
return result;
}
private:
Option m_option;
};
}
......@@ -27,19 +27,19 @@ enum EvictType {
/*!
* \brief an identifier to specify a component of evicted tensors
*
*
* Each component tracks the sum of the compute costs of its elements, with the
* union of two components having the sum of each constituent cost.
*/
struct DsuNode {
DsuNode(double _t): t(_t) {}
std::shared_ptr<DsuNode> parent;
bool is_root() {
return !bool(parent);
}
double t;
};
......@@ -47,25 +47,33 @@ struct TensorInfo;
using TensorInfoPtr = std::shared_ptr<TensorInfo>;
struct TensorInfo {
enum Prop {
Device, Shape, DType, DevValue, HostValue
enum Status {
InvalidStatus, Allocated, Produced, Swapped, Dropped, Deleted,
};
uint64_t id;
uint64_t id = -1;
std::string name;
// Most attrs of TensorInfo, except `ptr` and `h_value`,
// were visited read and written in main thread.
// Lock interpreter when visiting `ptr`.
TensorPtr ptr;
LogicalTensorDesc desc;
double compute_time;
size_t memory;
double last_used_time;
// FIXME: broken by drop
bool value_fetched = false;
bool invalid = false;
bool allow_delete = false;
EvictType evict_type = NONE;
// Status should be only modified in worker thread
Status status = InvalidStatus;
// Used by HostCompute and Memory Swap.
// HostCompute and Swap does not happen in one thread.
// Maybe a barrier is needed.
HostTensorND h_value;
// reserved for auto drop
......@@ -74,6 +82,10 @@ struct TensorInfo {
size_t ref_cnt = 0;
std::shared_ptr<DsuNode> dsu_ptr;
// Not reference count, inc when used as input
size_t ptr_use_count = 0;
// Used by `Drop` action
struct ComputePath {
uint64_t id;
std::shared_ptr<OpDef> op;
......@@ -111,7 +123,7 @@ struct TensorInfo {
return path;
}
}* producer = nullptr;
double eval_func(double cost, double free_mem, double cur_time,
double param_cost, double param_mem, double param_time, double param_recompute_times) {
return pow(cost + 1e-3, param_cost) * pow(param_recompute_times, (double)recompute_times)
......@@ -126,20 +138,24 @@ struct TensorInfo {
--pinned;
}
void detach_producer() {
// returns true if producer is deleted
bool detach_producer() {
if (!producer) {
return;
return false;
}
auto output = std::find(producer->outputs.begin(), producer->outputs.end(), this);
mgb_assert(output != producer->outputs.end());
*output = nullptr;
bool deleted = false;
if (producer->ref_cnt() == 0) {
for (auto* input: producer->unique_inputs) {
input->users.erase(std::find(input->users.begin(), input->users.end(), producer));
}
delete producer;
deleted = true;
}
producer = nullptr;
return deleted;
}
bool size_exceeds_thd(size_t thd) {
......@@ -150,26 +166,4 @@ struct TensorInfo {
};
}
template <>
struct ToStringTrait<interpreter::intl::TensorInfo::Prop>{
using TensorInfo = interpreter::intl::TensorInfo;
std::string operator()(TensorInfo::Prop prop) const {
switch(prop) {
case TensorInfo::DType:
return "dtype";
case TensorInfo::DevValue:
return "dev_value";
case TensorInfo::Device:
return "device";
case TensorInfo::HostValue:
return "host_value";
case TensorInfo::Shape:
return "shape";
default:
return "unknown";
}
}
};
}
......@@ -22,47 +22,58 @@
#include "./event_pool.h"
#include "./op_trait.h"
#include "./profiler/formats.h"
namespace mgb {
namespace imperative {
namespace {
DeviceTimer::SharedEvent alloc_recorded_event(CompNode device) {
auto event = EventPool::with_timer().alloc_shared(device);
event->record();
return event;
uint64_t Timer::get_nsecs() {
using namespace std::chrono;
auto finish = steady_clock::now();
auto duration = duration_cast<nanoseconds>(finish - m_start);
return duration.count();
}
} // namespace
DeviceTimer::SharedEvent DeviceTimer::get_device_time(CompNode device) {
return alloc_recorded_event(device);
uint64_t Timer::get_started_at() {
return m_started_at;
}
SmallVector<DeviceTimer::SharedEvent> DeviceTimer::get_all(SmallVector<CompNode> device_list) {
SmallVector<DeviceTimer::SharedEvent> results;
for (auto&& device: device_list) {
results.push_back(alloc_recorded_event(device));
}
return results;
void Timer::reset() {
using namespace std::chrono;
m_start = steady_clock::now();
auto now_ns = duration_cast<nanoseconds>(std::chrono::system_clock::now().time_since_epoch());
m_started_at = now_ns.count();
}
double HostTimer::get_msecs() {
using namespace std::chrono;
auto finish = steady_clock::now();
auto duration = duration_cast<microseconds>(finish - m_start);
return (double)duration.count() / 1e3;
std::shared_ptr<CompNode::Event> Timer::record_event(CompNode device) {
auto event = EventPool::with_timer().alloc_shared(device);
event->record();
return event;
}
double HostTimer::get_started_at() {
return m_started_at;
Profiler::options_t Profiler::sm_profile_options;
std::mutex Profiler::sm_mutex;
std::unordered_map<std::thread::id, Profiler*> Profiler::sm_profilers;
Timer Profiler::sm_timer;
std::atomic_uint64_t Profiler::sm_last_id = 0;
bool Profiler::sm_profiling = false;
thread_local std::unique_ptr<Profiler> Profiler::tm_profiler = std::make_unique<Profiler>();
std::atomic_size_t Profiler::sm_preferred_capacity;
auto Profiler::get_thread_dict() -> thread_dict_t {
MGB_LOCK_GUARD(sm_mutex);
thread_dict_t thread_dict;
for (auto&& [tid, profiler]: sm_profilers) {
thread_dict[tid] = profiler->m_thread_name;
}
return thread_dict;
}
void HostTimer::reset() {
using namespace std::chrono;
m_start = steady_clock::now();
auto now_us = duration_cast<microseconds>(std::chrono::system_clock::now().time_since_epoch());
m_started_at = (double)(now_us.count()) / 1e3;
void Profiler::dump_profile(std::string basename, std::string format, results_t results, options_t options) {
auto thread_dict = get_thread_dict();
{
mgb_log_error("unsupported profiling format %s", format.c_str());
}
}
} // namespace imperative
......
#include <string>
#include <memory>
#include "megbrain/utils/json.h"
namespace mgb {
namespace imperative {
class ChromeTraceEvent {
public:
ChromeTraceEvent& name(std::string name) {
m_name = std::move(name);
return *this;
}
ChromeTraceEvent& tid(uint64_t tid) {
m_tid = std::move(tid);
return *this;
}
ChromeTraceEvent& cat(std::string cat) {
m_cat = std::move(cat);
return *this;
}
ChromeTraceEvent& pid(uint64_t pid) {
m_pid = pid;
return *this;
}
ChromeTraceEvent& id(uint64_t id) {
m_id = id;
return *this;
}
ChromeTraceEvent& idx(uint64_t idx) {
m_idx = idx;
return *this;
}
ChromeTraceEvent& ts(double ts) {
m_ts = ts;
return *this;
}
ChromeTraceEvent& dur(double dur) {
m_dur = dur;
return *this;
}
ChromeTraceEvent& ph(char ph) {
m_ph = ph;
return *this;
}
ChromeTraceEvent& bp(char bp) {
m_bp = bp;
return *this;
}
ChromeTraceEvent& args(std::shared_ptr<json::Object> args) {
m_args = std::move(args);
return *this;
}
ChromeTraceEvent& arg(std::string key, std::string value) {
if (!m_args) {
m_args = json::Object::make();
}
(*m_args)[key] = json::String::make(value);
return *this;
}
ChromeTraceEvent& arg(std::string key, double value) {
if (!m_args) {
m_args = json::Object::make();
}
(*m_args)[key] = json::Number::make(value);
return *this;
}
ChromeTraceEvent& arg(std::string key, std::shared_ptr<json::Value> value) {
if (!m_args) {
m_args = json::Object::make();
}
(*m_args)[key] = value;
return *this;
}
std::shared_ptr<json::Object> to_json() const {
auto result = json::Object::make();
auto prop_str = [&](auto key, auto value) {
if (value.empty()) {
return;
}
(*result)[key] = json::String::make(value);
};
auto prop_num = [&](auto key, auto value) {
if (!value) {
return;
}
(*result)[key] = json::Number::make(value.value());
};
auto prop_char = [&](auto key, auto value) {
if (!value) {
return;
}
(*result)[key] = json::String::make(std::string{} + value.value());
};
prop_str("name", m_name);
prop_num("tid", m_tid);
prop_str("cat", m_cat);
prop_num("pid", m_pid);
prop_num("id", m_id);
prop_num("idx", m_idx);
prop_num("ts", m_ts);
prop_num("dur", m_dur);
prop_char("ph", m_ph);
prop_char("bp", m_bp);
if (m_args) {
(*result)["args"] = m_args;
}
return result;
}
private:
std::string m_name;
std::string m_cat;
std::optional<uint64_t> m_tid;
std::optional<uint64_t> m_pid;
std::optional<uint64_t> m_id;
std::optional<uint64_t> m_idx;
std::optional<double> m_ts;
std::optional<double> m_dur;
std::optional<char> m_ph;
std::optional<char> m_bp;
std::shared_ptr<json::Object> m_args;
};
class ChromeTraceEventList {
public:
ChromeTraceEvent& new_event() {
m_content.emplace_back();
return m_content.back();
}
std::shared_ptr<json::Array> to_json() const {
auto result = json::Array::make();
for (auto&& event: m_content) {
result->add(event.to_json());
}
return result;
}
private:
std::vector<ChromeTraceEvent> m_content;
};
} // namespace imperative
} // namespace mgb
......@@ -11,65 +11,176 @@
#pragma once
#include "./commands.h"
#include "./tensor_info.h"
#include "megbrain/utils/small_vector.h"
namespace mgb::imperative::interpreter::intl {
#include "../op_trait.h"
namespace mgb::imperative::profiler {
enum class TensorProp {
InvalidProp, Device, Shape, DType, DevValue, HostValue,
};
using OpParams = std::unordered_map<std::string, std::string>;
}
namespace mgb::imperative {
template <>
struct ToStringTrait<profiler::TensorProp>{
using TensorProp = profiler::TensorProp;
std::string operator()(TensorProp prop) const {
switch(prop) {
case TensorProp::DType:
return "dtype";
case TensorProp::DevValue:
return "dev_value";
case TensorProp::Device:
return "device";
case TensorProp::HostValue:
return "host_value";
case TensorProp::Shape:
return "shape";
default:
return "unknown";
}
}
};
}
namespace mgb::imperative::profiler {
#define DEF_EVENT(X, ...) struct X##Event __VA_ARGS__;
#define DEF_DUR_EVENT(X, ...) struct X##Event __VA_ARGS__; struct X##FinishEvent __VA_ARGS__;
DEF_EVENT(Command, {
IdentifiedCommand icmd;
DEF_EVENT(OpDispatch, {
uint64_t op_id;
std::string op_name;
std::function<OpParams()> op_params;
SmallVector<uint64_t> inputs;
SmallVector<uint64_t> outputs;
});
DEF_DUR_EVENT(OpInput, {
uint64_t tensor_id;
TensorShape shape;
});
DEF_DUR_EVENT(OpDel, {
uint64_t tensor_id;
TensorShape shape;
});
DEF_DUR_EVENT(OpOutput, {
uint64_t tensor_id;
TensorShape shape;
});
DEF_EVENT(CommandEnqueue, :CommandEvent {});
DEF_EVENT(CommandExecute, :CommandEvent {});
DEF_EVENT(CommandFinish, :CommandEvent {});
DEF_DUR_EVENT(OpExecute, {
uint64_t id;
std::shared_ptr<OpDef> op;
SmallVector<uint64_t> inputs;
SmallVector<uint64_t> outputs;
uint64_t op_id;
});
DEF_DUR_EVENT(OpPostExecute, {
uint64_t op_id;
});
DEF_DUR_EVENT(KernelExecute, {
uint64_t id;
std::shared_ptr<OpDef> op;
SmallVector<uint64_t> inputs;
SmallVector<uint64_t> outputs;
uint64_t op_id;
uint64_t kernel_id;
std::shared_ptr<CompNode::Event> event;
});
DEF_EVENT(TensorDeclare, {
uint64_t tensor_id;
std::string name;
});
DEF_EVENT(TensorProduce, {
uint64_t tensor_id;
TensorLayout layout;
CompNode device;
void* ptr;
});
DEF_EVENT(TensorUsage, {
uint64_t tensor_id;
});
DEF_EVENT(TensorRelease, {
uint64_t tensor_id;
});
DEF_EVENT(TensorErase, {
uint64_t tensor_id;
size_t use_count;
});
DEF_EVENT(TensorGetProp, {
uint64_t tensor_id;
TensorInfo::Prop prop;
std::string prop_desc;
TensorProp prop;
});
DEF_EVENT(TensorNotifyProp, {
uint64_t tensor_id;
uint64_t wait_id;
TensorProp prop;
});
DEF_DUR_EVENT(TensorWaitProp, {
DEF_EVENT(TensorWaitProp, {
uint64_t tensor_id;
TensorInfo::Prop prop;
std::string prop_desc;
uint64_t wait_id;
TensorProp prop;
});
DEF_EVENT(TensorNotifyProp, {
DEF_EVENT(TensorWaitPropFinish, {
uint64_t tensor_id;
TensorInfo::Prop prop;
std::string prop_desc;
uint64_t wait_id;
TensorProp prop;
bool notified;
});
DEF_DUR_EVENT(Sync, {});
DEF_DUR_EVENT(SampleDevice, {
CompNode device;
size_t total_memory;
size_t free_memory;
});
DEF_EVENT(WorkerException, {});
DEF_EVENT(ShapeInfer, {
bool success;
});
DEF_DUR_EVENT(Scope, {
std::string name;
});
DEF_DUR_EVENT(DeviceScope, {
std::string name;
std::shared_ptr<CompNode::Event> event;
});
DEF_DUR_EVENT(Sync, {});
DEF_DUR_EVENT(StartProfile, {
size_t capture_count;
});
DEF_DUR_EVENT(StopProfile, {
size_t escape_count;
});
DEF_DUR_EVENT(TensorCommand, {
enum Kind {
Put, Del, SwapIn, SwapOut, Drop, ReGen, RecFree, GetValue
};
uint64_t tensor_id;
Kind kind;
});
#undef DEF_EVENT
#undef DEF_DUR_EVENT
}
/**
* \file imperative/src/impl/interpreter/profiler.cpp
* \file imperative/src/impl/interpreter/profiler.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
......@@ -9,22 +9,12 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "./profiler.h"
#pragma once
#include <sstream>
#include <cinttypes>
#include <unordered_set>
#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
#include <unistd.h>
#elif defined(_WIN32)
#include <process.h>
#else
#error Unsupported platform
#endif
#include "../op_trait.h"
namespace mgb::imperative::interpreter::intl {
#include "megbrain/imperative/profiler.h"
namespace mgb::imperative::profiler {
}
......@@ -6,6 +6,8 @@
#include "megbrain/tensor.h"
#include "./events.h"
namespace mgb::imperative::profiler {
struct ProfileDeviceState {
......@@ -53,6 +55,7 @@ struct ProfileStaticsState {
struct ProfileOperatorState {
uint64_t id;
std::string name;
OpParams params;
SmallVector<uint64_t> inputs;
SmallVector<uint64_t> outputs;
CompNode device;
......
......@@ -47,8 +47,8 @@ struct Interpreter {
virtual size_t get_option(std::string name) = 0;
virtual void set_option(std::string name, size_t value) = 0;
virtual void start_profile(std::unordered_map<std::string, int> option) = 0;
virtual void stop_profile(std::string basename, std::string format) = 0;
virtual void start_profile() = 0;
virtual void stop_profile() = 0;
virtual void push_scope(std::string name) = 0;
virtual void pop_scope(std::string name) = 0;
......
......@@ -17,6 +17,9 @@
#include <fstream>
#include <chrono>
#include <bitset>
#include <deque>
#include <any>
#include <typeindex>
#include "megbrain/comp_node.h"
#include "megbrain/graph/event.h"
......@@ -29,165 +32,188 @@
namespace mgb {
namespace imperative {
class DeviceTimer {
public:
using SharedEvent = std::shared_ptr<CompNode::Event>;
DeviceTimer() = default;
SharedEvent get_device_time(CompNode device);
SmallVector<SharedEvent> get_all(SmallVector<CompNode> device_list);
};
class HostTimer {
class Timer {
public:
void reset();
double get_msecs();
double get_started_at();
uint64_t get_nsecs();
uint64_t get_started_at();
static std::shared_ptr<CompNode::Event> record_event(CompNode device);
private:
decltype(std::chrono::steady_clock::now()) m_start;
double m_started_at;
uint64_t m_started_at;
};
class ProfilerBase {
class Profiler {
public:
using Host = std::thread::id;
using Device = CompNode;
struct HostInstant {
Host tid;
double time;
void wait() const {}
struct Record {
uint64_t id;
uint64_t time; //in ns
std::any data;
};
struct DeviceInstant {
double before;
std::shared_ptr<CompNode::Event> event;
double after;
void wait() const {
event->host_wait();
}
enum Status: uint8_t {
Running = 0,
Recording = 1,
Collecting = 2,
};
using ProfileCollector = std::function<void(std::thread::id, Record)>;
using option_t = uint64_t;
using options_t = std::unordered_map<std::string, option_t>;
using result_t = std::pair<std::thread::id, Record>;
using results_t = std::vector<result_t>;
using thread_dict_t = std::unordered_map<std::thread::id, std::string>;
private:
std::thread::id m_thread_id;
std::vector<Record> m_records;
std::atomic<Status> m_status = Running;
uint64_t m_last_time = 0;
std::string m_thread_name;
static options_t sm_profile_options;
static std::mutex sm_mutex;
static std::unordered_map<std::thread::id, Profiler*> sm_profilers;
static Timer sm_timer;
static std::atomic_uint64_t sm_last_id;
static std::atomic_size_t sm_preferred_capacity;
static bool sm_profiling;
static constexpr bool sm_debug = false;
thread_local static std::unique_ptr<Profiler> tm_profiler;
public:
Profiler() {
m_thread_id = std::this_thread::get_id();
MGB_LOCK_GUARD(sm_mutex);
if (sm_profilers.size() == 0) {
reset();
}
mgb_assert(sm_profilers.count(m_thread_id) == 0);
sm_profilers[m_thread_id] = this;
}
~Profiler() {
MGB_LOCK_GUARD(sm_mutex);
mgb_assert(sm_profilers.count(m_thread_id) == 1);
sm_profilers.erase(m_thread_id);
}
public:
static Profiler& get_instance() {
return *tm_profiler;
}
using Instant = std::variant<HostInstant, DeviceInstant>;
static void reset() {
mgb_assert(sm_profilers.size() == 0, "profiler already running");
sm_timer.reset();
}
template <typename TEvent>
struct EventRecord {
Instant instant;
TEvent data;
static uint64_t next_id() {
return sm_last_id++;
}
const HostInstant& host() const {
return std::get<HostInstant>(instant);
template <typename T, typename... TArgs>
static uint64_t record(TArgs&&... args) {
auto& profiler = get_instance();
auto last_time = profiler.m_last_time;
if constexpr (sm_debug) {
Status expected = Running;
mgb_assert(profiler.m_status.compare_exchange_strong(expected, Recording));
}
const DeviceInstant& device() const {
return std::get<DeviceInstant>(instant);
uint64_t id = next_id();
uint64_t time = sm_timer.get_nsecs();
time = std::max(time, last_time + 2000);
profiler.m_last_time = time;
profiler.m_records.push_back({id, time, T{std::forward<TArgs>(args)...}});
if constexpr (sm_debug) {
Status expected = Recording;
mgb_assert(profiler.m_status.compare_exchange_strong(expected, Running));
}
return id;
}
void wait() const {
std::visit([&](const auto& instant){ instant.wait(); }, instant);
static results_t collect() {
MGB_LOCK_GUARD(sm_mutex);
if constexpr (sm_debug) {
for (auto&& [tid, profiler]: sm_profilers) {
Status expected = Running;
mgb_assert(profiler->m_status.compare_exchange_strong(expected, Collecting));
}
}
};
protected:
HostInstant record_host() {
return {std::this_thread::get_id(), m_host_timer.get_msecs()};
std::vector<std::pair<std::thread::id, Record>> profile_data;
for (auto&& [tid, profiler]: sm_profilers) {
sm_preferred_capacity = std::max(sm_preferred_capacity.load(), profiler->m_records.size());
for (auto& record: profiler->m_records) {
profile_data.push_back({tid, std::move(record)});
}
profiler->m_records.clear();
profiler->m_records.reserve(sm_preferred_capacity);
}
std::sort(profile_data.begin(), profile_data.end(), [](auto& lhs, auto& rhs){
return lhs.second.id < rhs.second.id;
});
if constexpr (sm_debug) {
for (auto&& [tid, profiler]: sm_profilers) {
Status expected = Collecting;
mgb_assert(profiler->m_status.compare_exchange_strong(expected, Running));
}
}
return profile_data;
}
DeviceInstant record_device(Device device) {
auto before = m_host_timer.get_msecs();
auto event = m_device_timer.get_device_time(device);
auto after = m_host_timer.get_msecs();
return {before, event, after};
static option_t get_option(std::string key, option_t default_val) {
if (!sm_profile_options.count(key)) {
return default_val;
}
return sm_profile_options.at(key);
}
protected:
std::atomic_int64_t m_last_id = 0;
HostTimer m_host_timer;
DeviceTimer m_device_timer;
Spinlock m_lock;
};
static void load_options(options_t options) {
sm_profile_options = std::move(options);
}
template <typename... TEvents>
class Profiler: public ProfilerBase {
public:
using Record = std::variant<EventRecord<TEvents>...>;
using Mask = std::bitset<sizeof...(TEvents)>;
static options_t get_options() {
return sm_profile_options;
}
struct Data {
std::vector<Record> records;
double started_at;
};
static bool is_profiling() {
return sm_profiling;
}
template <typename TEvent, size_t index = 0>
static constexpr size_t index_of() {
if constexpr (index == std::variant_size_v<Record>) {
return index;
} else if constexpr (std::is_same_v<EventRecord<TEvent>, std::variant_alternative_t<index, Record>>) {
return index;
} else {
return index_of<TEvent, index+1>();
}
};
static void start_profile() {
mgb_assert(!sm_profiling);
sm_profiling = true;
}
template <typename... TEvents2>
static Mask mask_of() {
return Mask{} | (Mask{}.set(index_of<TEvents2>()) |...);
static void stop_profile() {
mgb_assert(sm_profiling);
sm_profiling = false;
}
enum Status {
NotStarted, Profiling, Stopped
};
static thread_dict_t get_thread_dict();
static void dump_profile(std::string basename, std::string format, results_t results, options_t options);
};
class ProfileDataCollector {
public:
template <typename TEvent, typename... TArgs>
void record_host(TArgs&&... args) {
MGB_LOCK_GUARD(m_lock);
if (!m_event_mask.test(index_of<TEvent>())) {
return;
}
mgb_assert(m_status != Stopped, "record after stop");
auto instant = HostInstant{std::this_thread::get_id(), m_host_timer.get_msecs()};
m_record_list.emplace_back(EventRecord<TEvent>{std::move(instant), {std::forward<TArgs>(args)...}});
template <typename T>
using SubCollector = std::function<void(uint64_t, std::thread::id, uint64_t, T)>;
private:
std::unordered_map<std::type_index, SubCollector<std::any>> m_collectors;
public:
template <typename T>
ProfileDataCollector& handle(SubCollector<T> collector) {
auto erased = [collector](uint64_t id, std::thread::id tid, uint64_t time, std::any data){
collector(id, tid, time, std::any_cast<T>(std::move(data)));
};
m_collectors[typeid(T)] = erased;
return *this;
}
template <typename TEvent, typename... TArgs>
void record_device(Device device, TArgs&&... args) {
MGB_LOCK_GUARD(m_lock);
if (!m_event_mask.test(index_of<TEvent>())) {
void operator()(uint64_t id, std::thread::id tid, uint64_t time, std::any event) {
std::type_index type = event.type();
if (m_collectors.count(type) == 0) {
return;
}
mgb_assert(m_status != Stopped, "record after stop");
auto before = m_host_timer.get_msecs();
auto event = m_device_timer.get_device_time(device);
auto after = m_host_timer.get_msecs();
auto instant = DeviceInstant{before, event, after};
m_record_list.emplace_back(EventRecord<TEvent>{std::move(instant), {std::forward<TArgs>(args)...}});
}
// unsafe
bool is_profiling() {
return m_status == Profiling;
}
void start(Mask mask) {
MGB_LOCK_GUARD(m_lock);
mgb_assert(m_status == NotStarted, "profiler already started");
m_status = Profiling;
m_event_mask = mask;
m_host_timer.reset();
}
Data stop() {
MGB_LOCK_GUARD(m_lock);
mgb_assert(m_status == Profiling, "profiler not active");
m_status = Stopped;
for (auto&& record: m_record_list) {
std::visit([&](const auto& record){
record.wait();
}, record);
}
auto records = std::move(m_record_list);
return { records, m_host_timer.get_started_at() };
auto& handler = m_collectors.at(type);
handler(id, tid, time, std::move(event));
}
protected:
std::vector<Record> m_record_list;
Mask m_event_mask;
std::atomic<Status> m_status = NotStarted;
};
} // namespace imperative
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册