提交 d4c71f92 编写于 作者: M Megvii Engine Team

perf(profiler): reduce profiler host overhead

GitOrigin-RevId: 92cea560f55c23a7de7bbef254cde935a1e91735
上级 8fddd808
......@@ -1012,9 +1012,10 @@ void init_tensor(py::module m) {
interpreter_for_py->stop_profile();
interpreter_for_py->sync();
imperative::Profiler::stop_profile();
auto results = imperative::Profiler::collect();
return [results=std::move(results)](std::string basename, std::string format){
imperative::Profiler::dump_profile(basename, format, results);
auto results = std::make_shared<imperative::Profiler::bundle_t>(imperative::Profiler::collect());
return [results=results](std::string basename, std::string format) mutable {
imperative::Profiler::dump_profile(basename, format, std::move(*results));
results = nullptr;
};
}, py::call_guard<py::gil_scoped_release>());
m.def("sync",
......
......@@ -1133,9 +1133,7 @@ void ChannelImpl::process_one_task(Command& icmd) {
}
}
CompNode::foreach([&](CompNode device){
if (Profiler::get_option("sample_rate", 0)) {
sample_on_device(device, true);
}
sample_on_device(device, true);
MGB_RECORD_EVENT_IF((Profiler::get_option("profile_device", 0)), RecordDeviceEvent, Timer::record_device(device));
});
MGB_RECORD_EVENT(StartProfileFinishEvent);
......@@ -1149,9 +1147,7 @@ void ChannelImpl::process_one_task(Command& icmd) {
MGB_RECORD_EVENT(TensorEraseEvent, info->id);
}
CompNode::foreach([&](CompNode device){
if (Profiler::get_option("sample_rate", 0)) {
sample_on_device(device, true);
}
sample_on_device(device, true);
});
MGB_RECORD_EVENT(StopProfileFinishEvent);
} else if constexpr (std::is_same_v<T, PushScope>) {
......
......@@ -28,7 +28,7 @@ namespace mgb {
namespace imperative {
profiler::Time Timer::record_host() {
return std::chrono::high_resolution_clock::now();
return std::chrono::system_clock::now();
}
std::shared_ptr<CompNode::Event> Timer::record_device(CompNode device) {
......@@ -40,12 +40,12 @@ std::shared_ptr<CompNode::Event> Timer::record_device(CompNode device) {
std::vector<Profiler::entry_t> Profiler::sm_records;
Profiler::options_t Profiler::sm_profile_options;
std::mutex Profiler::sm_mutex;
std::unordered_map<std::thread::id, Profiler*> Profiler::sm_profilers;
std::unordered_map<std::thread::id, std::unique_ptr<Profiler>> Profiler::sm_profilers;
Timer Profiler::sm_timer;
profiler::HostTime Profiler::sm_start_at = profiler::HostTime::min();
std::atomic_uint64_t Profiler::sm_last_id = 0;
bool Profiler::sm_profiling = false;
thread_local std::unique_ptr<Profiler> Profiler::tm_profiler = std::make_unique<Profiler>();
thread_local Profiler* Profiler::tm_profiler = nullptr;
std::atomic_size_t Profiler::sm_preferred_capacity;
auto Profiler::get_thread_dict() -> thread_dict_t {
......@@ -65,7 +65,7 @@ void Profiler::dump_profile(std::string basename, std::string format, bundle_t r
if (iter == format_table.end()) {
mgb_log_error("unsupported profiling format %s", format.c_str());
}
return (iter->second)(basename, result);
return (iter->second)(basename, std::move(result));
}
} // namespace imperative
......
......@@ -268,7 +268,7 @@ struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> {
.cat("Kernel")
.args(current_op->detail());
} else if constexpr (std::is_same_v<TEvent, TensorProduceEvent>) {
if (current_tensor->living_time != profiler::Duration::zero()) {
if (current_tensor->living_time == profiler::Duration::zero()) {
new_host_event(pid_str, 's')
.id(event.tensor_id)
.cat("TensorLink")
......@@ -319,8 +319,8 @@ struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> {
.scope(pid_str);
} else if constexpr (std::is_same_v<TEvent, SampleDeviceFinishEvent>) {
std::string device_name = event.device.locator().to_string();
new_host_event("memory", 'C')
.arg(ssprintf("%s_alloc_mem", device_name.c_str()), event.total_memory - event.free_memory);
new_host_event(ssprintf("%s_alloc_mem", device_name.c_str()), 'C')
.arg("value", event.total_memory - event.free_memory);
} else if constexpr (std::is_same_v<TEvent, TensorCommandEvent>) {
new_host_event(ssprintf("%s %zu", to_cstr(event.kind), event.tensor_id), 'B');
} else if constexpr (std::is_same_v<TEvent, TensorCommandFinishEvent>) {
......@@ -366,6 +366,12 @@ struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> {
.arg("dtype", event.layout.dtype.name())
.arg("nr_elements", event.layout.total_nr_elems())
.arg("device", event.device.to_string());
} else if constexpr (std::is_same_v<TEvent, RecordDeviceEvent>) {
auto current_host_time = current->time;
auto current_device_time = to_device_time(current->time, event.event->comp_node());
auto device_ahead = std::chrono::duration_cast<std::chrono::milliseconds>(current_device_time-current_host_time);
new_host_event("device_ahead_ms", 'C')
.arg("value", device_ahead.count());
}
}
......
......@@ -261,7 +261,7 @@ struct MemoryFlowVisitor: EventVisitor<MemoryFlowVisitor> {
void dump_memory_flow(std::string filename, Profiler::bundle_t result) {
MemoryFlowVisitor visitor;
visitor.process_events(std::move(result));
visitor.process_events(result);
debug::write_to_file(filename.c_str(), visitor.memory_flow.to_svg().to_string());
}
......
......@@ -139,22 +139,22 @@ struct is_trace_event<T, decltype(std::declval<T>().trace, void())> : std::true_
template <typename... TItems>
class AnyToVariantConverter {
public:
using any_t = std::any;
using any_t = AnyPtr;
using variant_t = std::variant<TItems...>;
private:
std::unordered_map<std::type_index, std::function<variant_t(any_t)>> m_table;
std::unordered_map<std::type_index, std::function<variant_t(const any_t&)>> m_table;
template <typename TItem>
void register_converter() {
m_table[typeid(TItem)] = [](any_t input) {
return variant_t(std::any_cast<TItem>(std::move(input)));
m_table[typeid(TItem)] = [](const any_t& input) {
return variant_t(*input.as<TItem>());
};
}
public:
AnyToVariantConverter() {
(register_converter<TItems>(), ...);
}
variant_t operator()(any_t input) {
variant_t operator()(const any_t& input) {
return m_table[input.type()](std::move(input));
}
};
......@@ -222,7 +222,7 @@ protected:
value += delta;
}
public:
void process_events(Profiler::bundle_t bundle) {
void process_events(Profiler::bundle_t& bundle) {
m_start_time = bundle.start_at;
auto& self = static_cast<TSelf&>(*this);
......@@ -231,7 +231,7 @@ public:
OpInputEvent, OpInputFinishEvent, OpOutputEvent, OpOutputFinishEvent,
TensorDeclareEvent, TensorProduceEvent, TensorUsageEvent, TensorReleaseEvent, TensorEraseEvent,
TensorGetPropEvent, TensorNotifyPropEvent, TensorWaitPropEvent, TensorWaitPropFinishEvent,
SampleDeviceEvent, WorkerExceptionEvent, ShapeInferEvent, SyncEvent, SyncFinishEvent,
SampleDeviceEvent, SampleDeviceFinishEvent, WorkerExceptionEvent, ShapeInferEvent, SyncEvent, SyncFinishEvent,
StartProfileEvent, StartProfileFinishEvent, StopProfileEvent, StopProfileFinishEvent,
TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent, AutoEvictFinishEvent,
CustomEvent, CustomFinishEvent, RecordDeviceEvent, ScopeEvent, ScopeFinishEvent,
......@@ -298,6 +298,7 @@ public:
m_device_tid_table[event.device] = {m_device_tid_table.size() + m_host_tid_table.size()};
}
tensor.device = event.device;
tensor.layout = event.layout;
}
});
......
......@@ -34,9 +34,10 @@ namespace imperative {
namespace profiler {
using HostTime = std::chrono::time_point<std::chrono::high_resolution_clock>;
using HostTime = std::chrono::time_point<std::chrono::system_clock>;
using Duration = std::chrono::nanoseconds;
using RealDuration = std::chrono::duration<double, std::nano>;
using Time = HostTime;
......@@ -50,6 +51,52 @@ public:
static std::shared_ptr<CompNode::Event> record_device(CompNode device);
};
/**
 * Type-erased owning pointer.
 *
 * Holds a `void*` together with the `std::type_info` of the pointee and a
 * user-supplied deleter. The class is move-only because the pointer is kept
 * in a `std::unique_ptr`. A default-constructed instance is empty: it
 * compares equal to `nullptr`, matches no type, and reports `typeid(void)`
 * from `type()`.
 */
class AnyPtr {
public:
    /// Deleter invoked as `method(object, ptr)` when the held pointer is released.
    struct Deleter {
        void* object;                  ///< opaque context forwarded as the first argument
        void (*method)(void*, void*);  ///< release routine; receives (object, ptr)
        void operator()(void* ptr) {
            method(object, ptr);
        }
    };

private:
    using holder_t = std::unique_ptr<void, Deleter>;
    const std::type_info* m_type = nullptr;  // stays null until a typed pointer is stored
    holder_t m_holder = nullptr;

public:
    AnyPtr() = default;

    /// Takes ownership of `value`; `deleter` is called with it on destruction.
    template <typename T, typename = std::enable_if_t<!std::is_same_v<std::decay_t<T>, AnyPtr>>>
    explicit AnyPtr(T* value, Deleter deleter)
            : m_type{&typeid(T)}, m_holder(value, deleter) {}

    /// Returns the held pointer as `T*`; asserts that the stored type is exactly `T`.
    template <typename T>
    T* as() {
        mgb_assert(is_exactly<T>(), "type mismatch");
        // void* -> T* is a plain static_cast; reinterpret_cast is not needed.
        return static_cast<T*>(m_holder.get());
    }

    /// Const overload of as().
    template <typename T>
    const T* as() const {
        mgb_assert(is_exactly<T>(), "type mismatch");
        return static_cast<const T*>(m_holder.get());
    }

    /// True iff a type was recorded at construction and it is exactly `T`.
    template <typename T>
    bool is_exactly() const {
        // A default-constructed AnyPtr has no type; never dereference the null m_type.
        return m_type != nullptr && *m_type == typeid(T);
    }

    /// Recorded pointee type, or `typeid(void)` when nothing was ever stored.
    const std::type_info& type() const {
        return m_type != nullptr ? *m_type : typeid(void);
    }

    bool operator==(std::nullptr_t) const {
        return m_holder == nullptr;
    }

    operator bool() const {
        return m_holder != nullptr;
    }
};
class Profiler {
public:
......@@ -57,7 +104,10 @@ public:
uint64_t id;
std::thread::id tid;
profiler::Time time;
std::any data;
AnyPtr data;
Record() = default;
Record(uint64_t id, std::thread::id tid, profiler::Time time, AnyPtr data):
id{id}, tid{tid}, time{time}, data{std::move(data)} {};
};
enum Status: uint8_t {
Running = 0,
......@@ -82,36 +132,52 @@ private:
std::thread::id m_thread_id;
std::vector<Record> m_records;
std::atomic<Status> m_status = Running;
std::unordered_map<std::type_index, AnyPtr> m_mem_pools;
static std::vector<entry_t> sm_records;
static options_t sm_profile_options;
static std::mutex sm_mutex;
static std::unordered_map<std::thread::id, Profiler*> sm_profilers;
// assume std::thread::id is unique
static std::unordered_map<std::thread::id, std::unique_ptr<Profiler>> sm_profilers;
static Timer sm_timer;
static profiler::HostTime sm_start_at;
static std::atomic_uint64_t sm_last_id;
static std::atomic_size_t sm_preferred_capacity;
static bool sm_profiling;
static constexpr bool sm_debug = false;
thread_local static std::unique_ptr<Profiler> tm_profiler;
thread_local static Profiler* tm_profiler;
public:
Profiler() {
m_thread_id = std::this_thread::get_id();
MGB_LOCK_GUARD(sm_mutex);
mgb_assert(sm_profilers.count(m_thread_id) == 0);
sm_profilers[m_thread_id] = this;
}
~Profiler() {
MGB_LOCK_GUARD(sm_mutex);
mgb_assert(sm_profilers.count(m_thread_id) == 1);
sm_profilers.erase(m_thread_id);
sm_records.insert(sm_records.end(), m_records.begin(), m_records.end());
explicit Profiler(std::thread::id tid): m_thread_id{tid} {
mgb_assert(tid == std::this_thread::get_id(), "thread id mismatch");
}
public:
/// Returns the calling thread's Profiler. The pointer is cached in the
/// thread-local `tm_profiler`; on the first call from a thread the instance
/// is looked up in (or added to) `sm_profilers` under `MGB_LOCK_GUARD(sm_mutex)`.
static Profiler& get_instance() {
    if (tm_profiler) {
        // Fast path: this thread has already resolved its profiler.
        return *tm_profiler;
    }
    MGB_LOCK_GUARD(sm_mutex);
    auto& slot = sm_profilers[std::this_thread::get_id()];
    if (!slot) {
        slot = std::make_unique<Profiler>(std::this_thread::get_id());
    }
    tm_profiler = slot.get();
    return *tm_profiler;
}
/// Returns the `MemPool<T>` stored in the calling thread's Profiler, creating
/// it on first use. The resolved pointer is cached in a function-local
/// `thread_local`, so the map lookup runs only on the first call per thread and `T`.
template <typename T>
static MemPool<T>& get_mem_pool() {
    thread_local MemPool<T>* t_pool = nullptr;
    if (t_pool != nullptr) {
        return *t_pool;
    }
    auto& slot = get_instance().m_mem_pools[typeid(MemPool<T>)];
    if (slot == nullptr) {
        // The pool is owned by the AnyPtr; its deleter ignores the context argument.
        auto destroy_pool = [](void*, void* ptr) {
            delete static_cast<MemPool<T>*>(ptr);
        };
        slot = AnyPtr(new MemPool<T>(), {nullptr, destroy_pool});
    }
    t_pool = slot.as<MemPool<T>>();
    return *t_pool;
}
/// Returns the current value of the shared atomic counter `sm_last_id`
/// and advances it by one.
static uint64_t next_id() {
    return sm_last_id.fetch_add(1);
}
......@@ -119,13 +185,19 @@ public:
template <typename T, typename... TArgs>
static uint64_t record(TArgs&&... args) {
auto& profiler = get_instance();
auto& mem_pool = get_mem_pool<T>();
if constexpr (sm_debug) {
Status expected = Running;
mgb_assert(profiler.m_status.compare_exchange_strong(expected, Recording));
}
uint64_t id = next_id();
profiler::Time time = sm_timer.record_host();
profiler.m_records.push_back({id, std::this_thread::get_id(), time, T{std::forward<TArgs>(args)...}});
auto deleter = [](void* obj, void* ptr){
reinterpret_cast<MemPool<T>*>(obj)->free(reinterpret_cast<T*>(ptr));
};
profiler.m_records.emplace_back(id, profiler.m_thread_id, time, AnyPtr{
mem_pool.alloc(T{std::forward<TArgs>(args)...}), {&mem_pool, deleter}
});
if constexpr (sm_debug) {
Status expected = Recording;
mgb_assert(profiler.m_status.compare_exchange_strong(expected, Running));
......@@ -146,7 +218,9 @@ public:
std::vector<entry_t> profile_data = std::move(sm_records);
for (auto&& [tid, profiler]: sm_profilers) {
sm_preferred_capacity = std::max(sm_preferred_capacity.load(), profiler->m_records.size());
profile_data.insert(profile_data.end(), profiler->m_records.begin(), profiler->m_records.end());
profile_data.insert(profile_data.end(),
std::make_move_iterator(profiler->m_records.begin()),
std::make_move_iterator(profiler->m_records.end()));
profiler->m_records.clear();
profiler->m_records.reserve(sm_preferred_capacity);
}
......@@ -160,11 +234,11 @@ public:
mgb_assert(profiler->m_status.compare_exchange_strong(expected, Running));
}
}
bundle.entries = profile_data;
bundle.entries = std::move(profile_data);
bundle.options = get_options();
bundle.start_at = sm_start_at;
bundle.thread_dict = get_thread_dict();
return bundle;
return std::move(bundle);
}
static option_t get_option(std::string key, option_t default_val) {
......@@ -203,31 +277,6 @@ public:
};
/**
 * Dispatches type-erased profile events to handlers registered per payload type.
 *
 * Handlers are keyed by the `std::type_index` of the payload; events whose
 * payload type has no registered handler are silently ignored.
 */
class ProfileDataCollector {
public:
    /// Callback signature: (id, thread id, time, payload).
    template <typename T>
    using SubCollector = std::function<void(uint64_t, std::thread::id, uint64_t, T)>;

private:
    std::unordered_map<std::type_index, SubCollector<std::any>> m_collectors;

public:
    /// Registers (or replaces) the handler for payloads of type `T`.
    /// Returns `*this` so registrations can be chained.
    template <typename T>
    ProfileDataCollector& handle(SubCollector<T> collector) {
        // Move the callback into the closure instead of copying the std::function.
        m_collectors[typeid(T)] =
                [collector = std::move(collector)](
                        uint64_t id, std::thread::id tid, uint64_t time, std::any data) {
                    collector(id, tid, time, std::any_cast<T>(std::move(data)));
                };
        return *this;
    }

    /// Forwards the event to the handler registered for `event.type()`, if any.
    void operator()(uint64_t id, std::thread::id tid, uint64_t time, std::any event) {
        // Single lookup instead of count() followed by at().
        auto iter = m_collectors.find(std::type_index{event.type()});
        if (iter == m_collectors.end()) {
            return;
        }
        iter->second(id, tid, time, std::move(event));
    }
};
#define MGB_RECORD_EVENT(type, ...) \
if (mgb::imperative::Profiler::is_profiling()) { \
mgb::imperative::Profiler::record<type>(type{__VA_ARGS__}); \
......
......@@ -25,11 +25,11 @@ TEST(TestProfiler, ImperativeLogProfile) {
imperative_log_profile("XXX");
auto results = imperative::Profiler::collect();
imperative::Profiler::stop_profile();
mgb_assert(results.size() == 2);
auto* event_start = std::any_cast<profiler::CustomEvent>(&results[0].second.data);
auto* event_finish = std::any_cast<profiler::CustomFinishEvent>(&results[1].second.data);
mgb_assert(results.entries.size() == 2);
auto* event_start = results.entries[0].data.as<profiler::CustomEvent>();
auto* event_finish = results.entries[1].data.as<profiler::CustomFinishEvent>();
mgb_assert(event_start && event_start->title == "XXX");
mgb_assert(event_finish && event_finish->title == "XXX");
mgb_assert(results[0].second.time < results[1].second.time);
mgb_assert(results[0].second.id < results[1].second.id);
mgb_assert(results.entries[0].time < results.entries[1].time);
mgb_assert(results.entries[0].id < results.entries[1].id);
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册