profiler.cpp 8.0 KB
Newer Older
1
/**
M
Megvii Engine Team 已提交
2 3
 * \file imperative/src/impl/profiler.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
4
 *
M
Megvii Engine Team 已提交
5
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
6
 *
M
Megvii Engine Team 已提交
7 8 9
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 11 12 13
 */

#include "megbrain/imperative/profiler.h"

14
#include "./function_hook.h"
15 16 17
#include "megbrain/imperative/ops/opr_attr.h"
#include "megbrain/imperative/physical_tensor.h"

18 19
#include "megbrain/plugin/opr_footprint.h"

20
#include "./event_pool.h"
21 22 23 24 25 26
#include "./op_trait.h"

namespace mgb {
namespace imperative {

namespace {
27

28 29 30
CompNode::UnorderedSet collect_comp_nodes(
        const OpDef& def, const SmallVector<TensorPtr>& inputs) {
    CompNode::UnorderedSet comp_nodes;
31
    SmallVector<LogicalTensorDesc> inp_descs;
32 33
    for (auto&& i : inputs) {
        comp_nodes.insert(i->comp_node());
34
        inp_descs.push_back({i->layout(), i->comp_node(), {}});
35
    }
36 37
    SmallVector<LogicalTensorDesc> oup_descs = std::get<0>(def.infer_output_attrs_fallible(def, inp_descs));
    for (auto&& output_attr : oup_descs) {
38 39 40 41 42
        comp_nodes.insert(output_attr.comp_node);
    }
    return comp_nodes;
}

43 44 45 46 47 48 49 50
DeviceTimer::SharedEvent alloc_recorded_event(CompNode device) {
    auto event = EventPool::with_timer().alloc_shared(device);
    event->record();
    return event;
}

OprFootprint footprint{};

51
}  // namespace
52

53 54
void DeviceTimer::reset(thin_function<double()> host_timer) {
    CompNode::foreach ([this, host_timer](CompNode device) {
55
        m_base_event_table[device] = {alloc_recorded_event(device), host_timer()};
56
    });
57
    m_host_timer = host_timer;
58 59
}

60 61 62
thin_function<double()> DeviceTimer::get_device_time(CompNode device) {
    auto event = EventPool::with_timer().alloc_shared(device);
    event->record();
63 64 65
    if(m_base_event_table.count(device) == 0) {
        m_base_event_table[device] = {alloc_recorded_event(device), m_host_timer()};
    }
66 67 68
    auto base = m_base_event_table[device];
    return [base, event] {
        auto [base_event, host_time] = base;
69
        // TODO: sync once for each compnode
70 71 72
        event->host_wait();
        return base_event->elapsed_time_until(*event) * 1000 + host_time;
    };
73 74
}

75 76 77 78 79 80 81 82 83 84 85 86 87 88
void DeviceTimer::clear() {
    m_base_event_table.clear();
}

size_t TensorRecorder::record_tensor(const TensorPtr& tensor) {
    if (m_tensor_map.count(tensor.get()) > 0) {
        auto& [prev, id] = m_tensor_map[tensor.get()];
        if (prev.lock() != tensor) {
            prev = tensor;
            id = m_next_id++;
        }
        return id;
    } else {
        auto id = m_next_id++;
89 90
        m_tensor_map.insert(
                {tensor.get(), {std::weak_ptr<Tensor>{tensor}, id}});
91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
        return id;
    }
}

void TensorRecorder::clear() {
    m_next_id = 0;
    m_tensor_map.clear();
}

Profile& Profiler::get_profile() {
    for (auto& entry : m_profile) {
        for (auto& [device, device_begin, device_end] : entry.device_list) {
            MGB_MARK_USED_VAR(device);
            device_begin = [value = device_begin()] { return value; };
            device_end = [value = device_end()] { return value; };
        }
    }
    return m_profile;
}

void Profiler::start(uint32_t flags) {
112
    m_host_timer.reset();
113 114 115 116 117 118 119
    m_device_timer.reset([&] { return m_host_timer.get_msecs(); });
    OpTrait::for_each_trait([this, flags](OpTrait& trait) {
        auto hook_apply_on_physical_tensor =
                make_shared_hook(&trait.apply_on_physical_tensor);
        auto hook_apply_on_var_node =
                make_shared_hook(&trait.apply_on_var_node);
        hook_apply_on_physical_tensor->apply_hook([this, flags]
120
                (auto&& apply, const OpDef& def, SmallVector<TensorPtr> inputs) {
121 122 123 124 125 126 127
            auto shape2vector = [](const TensorShape& shape) {
                std::vector<size_t> vector_shape;
                for (size_t i = 0; i < shape.ndim; i++) {
                    vector_shape.push_back(shape[i]);
                }
                return vector_shape;
            };
128
            ProfileEntry entry;
129 130 131 132
            entry.id = m_entry_count++;
            // TODO: assign parent
            entry.parent = 0;
            // Record apply context and save to m_profile
133
            entry.op = const_cast<OpDef&>(def).shared_from_this();
134 135 136 137 138
            for (auto&& input : inputs) {
                entry.inputs.push_back({m_tensor_recorder.record_tensor(input),
                                        shape2vector(input->layout()),
                                        input->comp_node()});
            }
139 140 141 142 143 144 145
            double host_begin = m_host_timer.get_msecs();
            auto&& comp_nodes = collect_comp_nodes(def, inputs);
            for (auto&& comp_node : comp_nodes) {
                entry.device_list.push_back(
                        {comp_node,
                         m_device_timer.get_device_time(comp_node),
                         {}});
146
            }
147 148 149 150 151
            if (flags & PROFILE_FOOTPRINT) {
                MGB_LOCK_GUARD(m_lock);
                m_entry_stack.push({&def, &entry, std::this_thread::get_id()});
            }
            // Do real apply
152 153 154 155 156
            auto outputs = apply(def, inputs);
            for (auto& [cn, dev_begin, dev_end] : entry.device_list) {
                MGB_MARK_USED_VAR(cn);
                MGB_MARK_USED_VAR(dev_begin);
                dev_end = m_device_timer.get_device_time(cn);
157
            }
158
            entry.host = {host_begin, m_host_timer.get_msecs()};
159 160 161 162 163 164 165 166 167 168 169
            for (auto&& output : outputs) {
                entry.outputs.push_back(
                        {m_tensor_recorder.record_tensor(output),
                         shape2vector(output->layout()), output->comp_node()});
            }
            if (flags & PROFILE_FOOTPRINT) {
                mgb_assert(std::get<1>(m_entry_stack.top()) == &entry);
                MGB_LOCK_GUARD(m_lock);
                m_entry_stack.pop();
            }
            m_profile.push_back(std::move(entry));
170 171
            return outputs;
        });
172 173 174
        if (flags & PROFILE_FOOTPRINT) {
            hook_apply_on_var_node->apply_hook(
                    [this](auto&& apply, const OpDef& def,
175 176
                           VarNodeArray inputs) -> VarNodeArray {
                        auto vars = apply(def, std::move(inputs));
177 178 179 180 181
                        std::remove_reference_t<decltype(m_entry_stack.top())>
                                top;
                        {
                            MGB_LOCK_GUARD(m_lock);
                            if (m_entry_stack.empty()) {
182
                                return vars;
183 184 185 186 187 188
                            }
                            top = m_entry_stack.top();
                        }
                        auto [current_op, current_entry, thread_id] = top;
                        if (current_op != &def ||
                            thread_id != std::this_thread::get_id()) {
189
                            return vars;
190 191
                        }
                        auto&& footprint_result =
192
                                footprint.calc_footprint(vars[0]->owner_opr());
193 194 195 196 197 198
                        current_entry->memory = footprint_result.memory;
                        current_entry->computation =
                                footprint_result.computation;
#if MGB_ENABLE_JSON
                        current_entry->param = footprint_result.param;
#endif
199
                        return vars;
200 201 202 203
                    });
        }
        m_hooker_list.push_back(std::move(hook_apply_on_physical_tensor));
        m_hooker_list.push_back(std::move(hook_apply_on_var_node));
204
    });
205 206
}

207 208
void Profiler::stop() {
    m_hooker_list.clear();
209
    for (auto& entry : m_profile) {
210
        entry.wait_device();
211 212 213
    }
}

214 215 216 217 218 219 220 221 222 223
void Profiler::clear() {
    mgb_assert(m_entry_stack.empty(),
               "entry_stack should be empty after profile");
    mgb_assert(m_hooker_list.empty(), "hooks should be released");
    m_profile.clear();
    m_entry_count = 0;
    m_device_timer.clear();
    m_tensor_recorder.clear();
}

224 225 226
}  // namespace imperative

}  // namespace mgb