states.h 15.4 KB
Newer Older
1 2 3
#pragma once

#include <any>
#include <chrono>
#include <functional>
#include <map>
#include <optional>
#include <set>
#include <sstream>
#include <string>
#include <thread>
#include <type_traits>
#include <typeindex>
#include <unordered_map>
#include <utility>
#include <variant>
#include <vector>

#include "nlohmann/json.hpp"

#include "megbrain/tensor.h"

#include "./events.h"

14 15
namespace mgb::imperative::profiler {

16
// Makes interpreter::intl::StackManager available unqualified inside this namespace.
using StackManager = interpreter::intl::StackManager;
17 18

struct ProfileTensorState {
19 20
    uint64_t id = 0;
    std::optional<uint64_t> source;
21 22 23
    TensorLayout layout;
    CompNode device;
    std::string name;
24 25
    profiler::HostTime produced = profiler::HostTime::min();
    profiler::Duration living_time = profiler::Duration::zero();
26 27 28 29 30 31 32 33

    size_t size_in_bytes() const {
        if (!layout.dtype.valid()) {
            return 0;
        }
        return layout.dtype.size(layout.total_nr_elems());
    }

34 35 36
    std::string info(HostTime current_time) {
        std::string shape = layout.TensorShape::to_string();
        std::string dtype = layout.dtype.name();
M
Megvii Engine Team 已提交
37 38 39
        return ssprintf(
                "%s(%s:%s:%s)", name.c_str(), shape.c_str(), dtype.c_str(),
                device.to_string().c_str());
40 41 42 43 44 45 46 47 48 49 50
    }

    nlohmann::json detail(HostTime current_time) {
        nlohmann::json args;
        args["id"] = id;
        args["name"] = name;
        args["shape"] = layout.TensorShape::to_string();
        args["dtype"] = layout.dtype.name();
        args["nr_elements"] = layout.total_nr_elems();
        args["device"] = device.to_string();
        if (produced != produced.min()) {
M
Megvii Engine Team 已提交
51 52 53 54
            double ms_count = std::chrono::duration_cast<
                                      std::chrono::duration<double, std::micro>>(
                                      current_time - produced + living_time)
                                      .count();
55 56 57 58
            args["living_time"] = ssprintf("%lf ms", ms_count);
        }
        return args;
    }
59 60 61
};

struct ProfileOperatorState {
62
    uint64_t id = 0;
63
    std::string name;
64
    OpParams params;
65 66 67
    SmallVector<uint64_t> inputs;
    SmallVector<uint64_t> outputs;
    CompNode device;
68
    Trace trace;
69

70 71 72 73 74 75 76
    struct Execution {
        std::string reason;
        profiler::HostTime begin;
        profiler::HostTime end;
    };

    SmallVector<Execution> executions;
77

78 79
    nlohmann::json detail() {
        nlohmann::json args;
M
Megvii Engine Team 已提交
80
        for (auto&& [name, value] : params) {
81 82 83 84 85 86 87
            args[name] = value;
        }
        args["__id__"] = id;
        args["__name__"] = name;
        args["__device__"] = device.to_string();
        return args;
    }
88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
};

// Couples a tensor id with one property value so that tensors can be kept in
// an ordered container sorted by that property; the id breaks ties between
// equal property values.
template <typename TProp>
struct ProfileTensorPropPair {
    uint64_t id;
    TProp value;

    // Orders by value first, then by id.
    bool operator<(const ProfileTensorPropPair& other) const {
        if (value == other.value) {
            return id < other.id;
        }
        return value < other.value;
    }

    // Equal only when both the id and the value match.
    bool operator==(const ProfileTensorPropPair& other) const {
        if (!(id == other.id)) {
            return false;
        }
        return value == other.value;
    }

    // Mirror image of operator<: value first, then id.
    bool operator>(const ProfileTensorPropPair& other) const {
        if (value == other.value) {
            return id > other.id;
        }
        return value > other.value;
    }
};

using ProfileTensorSizePair = ProfileTensorPropPair<size_t>;
using ProfileTensorProducedPair = ProfileTensorPropPair<uint64_t>;

struct ProfileState {
    std::unordered_map<uint64_t, ProfileTensorState> tensors;
    std::unordered_map<uint64_t, ProfileOperatorState> operators;
    std::unordered_map<std::string, uint64_t> tensor_name_counter;
    std::set<ProfileTensorSizePair> tensors_by_size;
    std::set<ProfileTensorSizePair> tensors_by_produced;

    std::vector<uint64_t> top_k_tensor_in_device(CompNode device, size_t k) {
        std::vector<uint64_t> results;
M
Megvii Engine Team 已提交
120 121
        for (auto iter = tensors_by_size.rbegin(); iter != tensors_by_size.rend();
             ++iter) {
122 123 124 125 126 127 128 129 130 131
            if (!k) {
                break;
            }
            if (tensors[iter->id].device == device) {
                results.push_back(iter->id);
                --k;
            }
        }
        return results;
    }
132
};
133

M
Megvii Engine Team 已提交
134 135
// Trait: true when an object of type T exposes an accessible data member named
// `op_id`, false otherwise.
template <typename T, typename = void>
struct is_op_event : std::false_type {};

// Chosen only when the expression `std::declval<T>().op_id` is well-formed.
template <typename T>
struct is_op_event<T, std::void_t<decltype(std::declval<T>().op_id)>>
        : std::true_type {};
139

M
Megvii Engine Team 已提交
140 141
// Trait: true when an object of type T exposes an accessible data member named
// `tensor_id`, false otherwise.
template <typename T, typename = void>
struct is_tensor_event : std::false_type {};

// Chosen only when the expression `std::declval<T>().tensor_id` is well-formed.
template <typename T>
struct is_tensor_event<T, std::void_t<decltype(std::declval<T>().tensor_id)>>
        : std::true_type {};
// Trait: true when an object of type T exposes an accessible data member named
// `trace`, false otherwise.
template <typename T, typename = void>
struct is_trace_event : std::false_type {};

// Chosen only when the expression `std::declval<T>().trace` is well-formed.
template <typename T>
struct is_trace_event<T, std::void_t<decltype(std::declval<T>().trace)>>
        : std::true_type {};
150 151 152 153

template <typename... TItems>
class AnyToVariantConverter {
public:
154
    using any_t = AnyPtr;
155
    using variant_t = std::variant<TItems...>;
M
Megvii Engine Team 已提交
156

157
private:
158
    std::unordered_map<std::type_index, std::function<variant_t(const any_t&)>> m_table;
159 160 161

    template <typename TItem>
    void register_converter() {
162
        m_table[typeid(TItem)] = [](const any_t& input) {
163
            return variant_t(input.cast<TItem>());
164 165
        };
    }
M
Megvii Engine Team 已提交
166

167
public:
M
Megvii Engine Team 已提交
168
    AnyToVariantConverter() { (register_converter<TItems>(), ...); }
169
    variant_t operator()(const any_t& input) {
170 171 172 173 174 175 176 177 178 179 180 181 182
        return m_table[input.type()](std::move(input));
    }
};

template <typename TSelf>
class EventVisitor {
private:
    std::unordered_map<size_t, ProfileOperatorState> m_operators;
    std::unordered_map<size_t, ProfileTensorState> m_tensors;
    std::unordered_map<size_t, std::vector<Profiler::Record>> m_duration_stack;
    HostTime m_start_time;
    CompNode::UnorderedMap<size_t> m_device_tid_table;
    std::unordered_map<std::thread::id, size_t> m_host_tid_table;
M
Megvii Engine Team 已提交
183 184
    CompNode::UnorderedMap<std::map<profiler::HostTime, profiler::RealDuration>>
            m_device_timeline;
185 186
    std::unordered_map<std::thread::id, std::vector<Trace>> m_trace_stack;
    std::unordered_map<std::string, int64_t> m_counter_table;
M
Megvii Engine Team 已提交
187

188 189 190 191
protected:
    Profiler::Record* current;
    ProfileOperatorState* current_op;
    ProfileTensorState* current_tensor;
M
Megvii Engine Team 已提交
192

193 194 195 196 197 198 199 200 201 202 203 204 205
protected:
    profiler::Duration since_start(profiler::HostTime time) {
        return time - m_start_time;
    }

    profiler::HostTime to_device_time(profiler::HostTime time, CompNode device) {
        auto& device_timeline = m_device_timeline[device];
        auto upper = device_timeline.lower_bound(time);
        if (upper == device_timeline.end()) {
            if (upper == device_timeline.begin()) {
                return time;
            } else {
                --upper;
M
Megvii Engine Team 已提交
206 207
                return time +
                       std::chrono::duration_cast<profiler::Duration>(upper->second);
208 209 210 211 212
            }
        } else if (upper->first == time) {
            return time + std::chrono::duration_cast<profiler::Duration>(upper->second);
        } else if (upper == device_timeline.begin()) {
            return time + std::chrono::duration_cast<profiler::Duration>(upper->second);
213
        }
214
        auto lower = upper;
M
Megvii Engine Team 已提交
215 216 217 218
        --lower;
        double ratio =
                ((double)(time - lower->first).count() /
                 (double)(upper->first - lower->first).count());
219
        mgb_assert(ratio > 0 && ratio < 1, "invalid ratio");
M
Megvii Engine Team 已提交
220 221 222
        mgb_assert(
                lower->first + lower->second <= upper->first + upper->second,
                "device time corr");
223 224
        auto shift = lower->second + ratio * (upper->second - lower->second);
        auto result = time + std::chrono::duration_cast<profiler::Duration>(shift);
225 226
        return result;
    }
227

M
Megvii Engine Team 已提交
228
    size_t to_tid(std::thread::id host_tid) { return m_host_tid_table.at(host_tid); }
229

M
Megvii Engine Team 已提交
230
    size_t to_tid(CompNode device) { return m_device_tid_table.at(device); }
231

232 233
    SmallVector<std::thread::id> host_threads() {
        SmallVector<std::thread::id> host_threads;
M
Megvii Engine Team 已提交
234
        for (auto&& [host, _] : m_host_tid_table) {
235 236 237 238 239 240 241
            host_threads.push_back(host);
        }
        return host_threads;
    }

    SmallVector<CompNode> devices() {
        SmallVector<CompNode> devices;
M
Megvii Engine Team 已提交
242
        for (auto&& [device, _] : m_device_tid_table) {
243 244 245 246 247
            devices.push_back(device);
        }
        return devices;
    }

248 249 250 251 252 253 254 255
    void inc_counter(const char* key, int64_t delta) {
        if (!m_counter_table.count(key)) {
            m_counter_table[key] = 0;
        }
        auto& value = m_counter_table[key];
        static_cast<TSelf&>(*this).notify_counter(key, value, value + delta);
        value += delta;
    }
M
Megvii Engine Team 已提交
256

257
public:
258
    void process_events(Profiler::bundle_t& bundle) {
259 260 261
        m_start_time = bundle.start_at;

        auto& self = static_cast<TSelf&>(*this);
M
Megvii Engine Team 已提交
262 263 264 265 266 267 268 269 270 271 272 273 274 275 276
        AnyToVariantConverter<
                OpDispatchEvent, OpExecuteEvent, OpExecuteFinishEvent,
                KernelLaunchEvent, KernelLaunchFinishEvent, OpInputEvent,
                OpInputFinishEvent, OpOutputEvent, OpOutputFinishEvent,
                TensorDeclareEvent, TensorProduceEvent, TensorUsageEvent,
                TensorReleaseEvent, TensorEraseEvent, TensorGetPropEvent,
                TensorNotifyPropEvent, TensorWaitPropEvent, TensorWaitPropFinishEvent,
                SampleDeviceEvent, SampleDeviceFinishEvent, WorkerExceptionEvent,
                ShapeInferEvent, SyncEvent, SyncFinishEvent, StartProfileEvent,
                StartProfileFinishEvent, StopProfileEvent, StopProfileFinishEvent,
                TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent,
                AutoEvictFinishEvent, CustomEvent, CustomFinishEvent, RecordDeviceEvent,
                ScopeEvent, ScopeFinishEvent, HostToDeviceEvent,
                HostToDeviceFinishEvent>
                converter;
277 278

        auto for_each_entry = [&](auto&& handler) {
M
Megvii Engine Team 已提交
279
            for (auto& entry : bundle.entries) {
280 281 282 283 284 285 286 287 288 289 290 291 292
                current = &entry;
                std::visit(handler, converter(entry.data));
            }
            current = nullptr;
        };

        // build device timeline
        struct DeviceStartPair {
            profiler::HostTime host;
            std::shared_ptr<CompNode::Event> device;
        };
        CompNode::UnorderedMap<DeviceStartPair> device_start_table;

M
Megvii Engine Team 已提交
293
        for_each_entry([&](auto&& event) {
294 295 296
            using T = std::decay_t<decltype(event)>;
            if constexpr (std::is_same_v<T, RecordDeviceEvent>) {
                using namespace std::chrono_literals;
M
Megvii Engine Team 已提交
297 298
                DeviceStartPair& device_start =
                        device_start_table[event.event->comp_node()];
299
                if (!device_start.device) {
M
Megvii Engine Team 已提交
300
                    device_start = {current->time, event.event};
301 302
                }
                event.event->host_wait();
M
Megvii Engine Team 已提交
303 304 305 306 307 308 309
                auto device_time =
                        (device_start.host - current->time) +
                        std::chrono::duration_cast<profiler::RealDuration>(
                                device_start.device->elapsed_time_until(*event.event) *
                                1s);
                m_device_timeline[event.event->comp_node()][current->time] =
                        device_time;
310 311 312 313
            }
        });

        // register host threads
M
Megvii Engine Team 已提交
314
        for_each_entry([&](auto&& event) {
315
            if (!m_host_tid_table.count(current->tid)) {
M
Megvii Engine Team 已提交
316 317
                m_host_tid_table[current->tid] = {
                        m_device_tid_table.size() + m_host_tid_table.size()};
318 319 320
            }
        });

M
Megvii Engine Team 已提交
321
        for_each_entry([&](auto&& event) {
322 323 324 325 326 327 328 329 330 331
            using T = std::decay_t<decltype(event)>;
            if constexpr (std::is_same_v<T, OpDispatchEvent>) {
                auto& op = m_operators[event.op_id];
                mgb_assert(op.id == 0, "duplicate operator id");
                op.id = event.op_id;
                op.name = event.op_name;
                op.params = event.op_params();
                op.inputs = event.inputs;
                op.outputs = event.outputs;
                op.trace = event.trace;
M
Megvii Engine Team 已提交
332
                for (auto&& output : event.outputs) {
333
                    m_tensors[output].source = op.id;
334 335 336 337 338 339 340
                }
            } else if constexpr (std::is_same_v<T, TensorDeclareEvent>) {
                auto& tensor = m_tensors[event.tensor_id];
                mgb_assert(tensor.id == 0, "duplicated tensor id");
                tensor.id = event.tensor_id;
                tensor.name = event.name;
            } else if constexpr (std::is_same_v<T, TensorProduceEvent>) {
341
                auto& tensor = m_tensors[event.tensor_id];
342
                if (!m_device_tid_table.count(event.device)) {
M
Megvii Engine Team 已提交
343 344
                    m_device_tid_table[event.device] = {
                            m_device_tid_table.size() + m_host_tid_table.size()};
345 346
                }
                tensor.device = event.device;
347
                tensor.layout = event.layout;
348 349 350 351 352
            }
        });

        // replay execution
        using namespace std::placeholders;
M
Megvii Engine Team 已提交
353
        for_each_entry([&](auto&& event) {
354 355 356
            using T = std::decay_t<decltype(event)>;
            // update current_op/tensor
            if constexpr (is_op_event<T>::value) {
357 358 359 360 361
                current_op = &m_operators[event.op_id];
                if (current_op->id == 0) {
                    current_op->id = event.op_id;
                    current_op->name = "UnknownOperator";
                }
362
            } else if constexpr (is_tensor_event<T>::value) {
363 364 365 366 367
                current_tensor = &m_tensors[event.tensor_id];
                if (current_tensor->id == 0) {
                    current_tensor->id = event.tensor_id;
                    current_tensor->name = "UnknownTensor";
                }
368 369
            }
            if constexpr (std::is_same_v<T, OpExecuteEvent>) {
370 371 372
                current_op->executions.emplace_back();
                current_op->executions.back().reason = event.reason;
                current_op->executions.back().begin = current->time;
373
            } else if constexpr (std::is_same_v<T, OpExecuteFinishEvent>) {
374
                current_op->executions.back().end = current->time;
375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394
            }
            // update counters
            if constexpr (std::is_same_v<T, OpDispatchEvent>) {
                inc_counter("nr_op_pending", 1);
            } else if constexpr (std::is_same_v<T, OpExecuteEvent>) {
                inc_counter("nr_op_pending", -1);
            } else if constexpr (std::is_same_v<T, TensorProduceEvent>) {
                inc_counter("nr_alive_tensor", 1);
            } else if constexpr (std::is_same_v<T, TensorReleaseEvent>) {
                inc_counter("nr_alive_tensor", -1);
            } else if constexpr (std::is_same_v<T, TensorEraseEvent>) {
                if (event.use_count == 0) {
                    inc_counter("nr_redunant_tensor", 1);
                }
            } else if constexpr (std::is_same_v<T, ShapeInferEvent>) {
                if (!event.success) {
                    inc_counter("nr_shape_infer_failure", 1);
                }
            } else if constexpr (std::is_same_v<T, WorkerExceptionEvent>) {
                inc_counter("nr_exception", 1);
395 396 397
            } else if constexpr (std::is_same_v<T, KernelLaunchFinishEvent>) {
                auto& execution = current_op->executions.back();
                if (execution.reason == "dtr") {
M
Megvii Engine Team 已提交
398 399 400 401 402 403 404
                    auto overhead = to_device_time(current->time, event.device) -
                                    to_device_time(execution.begin, event.device);
                    inc_counter(
                            "dtr_overhead_us",
                            std::chrono::duration_cast<std::chrono::microseconds>(
                                    overhead)
                                    .count());
405
                }
406 407 408 409 410 411 412 413 414 415 416
            }
            // visit_event_impl
            self.visit_event(event);
            // reset current_op/tensor
            if constexpr (is_op_event<T>::value) {
                current_op = nullptr;
            } else if constexpr (is_tensor_event<T>::value) {
                current_tensor = nullptr;
            }
        });
    }
417 418
};

M
Megvii Engine Team 已提交
419
}  // namespace mgb::imperative::profiler