/**
 * \file imperative/src/impl/interpreter/interpreter_impl.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#pragma once

#include <deque>
#include <future>
#include <list>
#include <stack>
#include <thread>
#include <unordered_set>
#include <variant>
#include "megbrain/comp_node.h"
#include "megbrain/imperative/interpreter.h"
#include "megbrain/imperative/profiler.h"
#include "megbrain/utils/mempool.h"

#include "./commands.h"
#include "./option_manager.h"
#include "./stack_manager.h"
#include "./tensor_info.h"

#include "../profiler/events.h"

namespace mgb::imperative::interpreter::intl {

using Handle = Interpreter::Handle;

/*!
 * \brief Interpreter subclass that overrides create_channel().
 *
 * Only the declaration is given in this header; the override returns a
 * std::unique_ptr<Channel>.
 * NOTE(review): presumably the returned channel is a ChannelImpl -- confirm
 * against the definition.
 */
struct InterpreterImpl : Interpreter {
    std::unique_ptr<Channel> create_channel() override;
};

struct ChannelImpl : Interpreter::Channel {
    ChannelImpl();
    ~ChannelImpl() override;

    Handle put(const HostTensorND& value, bool no_cache) override;
46
    Handle put(const DeviceTensorND& value, const HostTensorND& hvalue) override;
47 48 49 50 51

    void del(Handle) override;
    void drop(Handle) override;

    SmallVector<Handle> apply_op(
M
Megvii Engine Team 已提交
52
            std::shared_ptr<OpDef> op, const SmallVector<Handle>& inputs) override;
53 54 55 56 57 58 59 60

    HostTensorND get_value(Handle) override;
    TensorShape get_shape(Handle) override;
    DType get_dtype(Handle) override;
    CompNode get_device(Handle) override;

    DeviceTensorND get_dev_tensor(Handle) override;

61
    bool check_available() override;
62 63 64
    void sync() override;
    void close() override;

65 66
    size_t get_option(std::string name) override;
    void set_option(std::string name, size_t value) override;
67

68 69
    void start_profile() override;
    void stop_profile() override;
70 71 72

    void push_scope(std::string) override;
    void pop_scope(std::string) override;
M
Megvii Engine Team 已提交
73

74
private:
75 76 77
    struct WorkQueue;
    struct State;

78
    TensorInfo* alloc();
79
    void init(TensorInfo*, LogicalTensorDesc desc);
80
    void free(TensorInfo*);
81 82 83
    void real_free(TensorInfo*);
    void recursive_free(TensorInfo*);
    void do_drop(TensorInfo*, bool);
84 85
    void detach_users(TensorInfo*);

86
    TensorInfo* put_impl(const HostTensorND& value, bool no_cache);
87
    TensorInfo* put_impl(const DeviceTensorND& value, const HostTensorND& hvalue);
88 89
    void del_impl(Handle);
    void sync_impl();
90
    SmallVector<Handle> apply_op_impl(
M
Megvii Engine Team 已提交
91
            std::shared_ptr<OpDef> op, const SmallVector<Handle>& inputs);
92 93 94
    TensorPtr wait_tensor(TensorInfo* info, profiler::TensorProp prop);
    void notify_tensor_unsafe(TensorInfo* info);

95
    void process_one_task(Command&);
96 97 98

    void check_worker_exc_unsafe();

99
    void produce_tensor(TensorInfo* dest, TensorPtr ptr);
100 101 102 103

    void release_tensor(TensorInfo* dest);

    void regenerate(TensorInfo* dest);
104
    void flush_apply_stack();
105
    void do_apply_op(const ApplyOp& cmd, std::string reason);
M
Megvii Engine Team 已提交
106 107 108 109 110

    std::tuple<SmallVector<MemoryDesc>, SmallVector<TensorPtr>, SmallVector<TensorPtr>>
    init_output_and_workspace(
            const OpDef& def, SmallVector<TensorPtr> inputs,
            SmallVector<MemoryDesc> inputs_mem_desc);
111 112

    void dispatch_default_cpu(
M
Megvii Engine Team 已提交
113 114 115
            std::shared_ptr<OpDef> op, const SmallVector<TensorInfo*>& input_infos,
            const SmallVector<LogicalTensorDesc>& input_descs,
            SmallVector<Handle>* outputs);
116
    void dispatch_kernel(
M
Megvii Engine Team 已提交
117 118 119
            std::shared_ptr<OpDef> op, const SmallVector<TensorInfo*>& input_infos,
            const SmallVector<LogicalTensorDesc>& input_descs,
            SmallVector<Handle>* outputs);
120

121 122 123
    void push_scope(std::string, State&);
    void pop_scope(std::string, State&);

124 125 126 127
    void assert_in_channel();
    void assert_in_worker();
    std::thread::id get_worker_tid();

128 129 130 131
    // template <typename TCommand>
    // void enqueue_command(TCommand&& cmd) {
    //     m_buffer.enqueue(Command{std::forward<TCommand>(cmd)});
    // }
132

133 134 135 136 137
    void sample_on_device(CompNode device, bool force);

    // valid => status != Deleted
    std::unordered_set<TensorInfo*> collect_valid_tensors();

138
    std::mutex m_mutex;
139
    Spinlock m_spin;
140 141 142 143
    std::condition_variable m_cv;
    MemPool<TensorInfo> m_pool;
    std::unordered_set<Handle> m_valid_handle;
    TensorInfo* m_waitee = nullptr;
144
    uint64_t m_waitee_id = 0;
145
    std::exception_ptr m_worker_exc;
146
    std::function<void(std::string, std::string)> m_profile_dump_callback;
147
    size_t m_storage_id = 0;
148 149
    // TODO: use explicit struct
    std::stack<std::tuple<ApplyOp, size_t, TensorInfo*, std::string>> m_apply_stack;
150
    bool m_applying = false;
151 152
    bool m_closed = false;

153
    struct WorkQueue : AsyncQueueSC<Command, WorkQueue> {
154 155
        // set max_spin=0 to prevent Queue fetch task in busy wait manner.
        // this won't affect throughput when python interpreter is sending enough task,
M
Megvii Engine Team 已提交
156 157
        // but will significantly save CPU time when waiting for task, e.g. wait for
        // data input limit pending tasks to 10000
158
        WorkQueue(ChannelImpl* owner)
159
                : AsyncQueueSC<Command, WorkQueue>(0, 10000), m_owner(owner) {
160
            sys::set_thread_name("interpreter");
161 162
            if (const char* env_val = MGB_GETENV("MEGENGINE_ASYNC_QUEUE_SIZE")) {
                int len = strlen(env_val);
M
Megvii Engine Team 已提交
163 164 165 166
                for (int i = 0; i < len; i++) {
                    mgb_assert(
                            env_val[i] >= '0' && env_val[i] <= '9',
                            "async queue size should be an integer");
167 168 169 170 171
                }
                size_t val;
                sscanf(env_val, "%zu", &val);
                update_max_items(val);
            }
172
        }
M
Megvii Engine Team 已提交
173
        void process_one_task(Command& icmd) { m_owner->process_one_task(icmd); }
174
        void on_async_queue_worker_thread_start() override;
M
Megvii Engine Team 已提交
175

176 177 178 179 180 181 182 183 184 185 186 187 188 189
    private:
        ChannelImpl* m_owner;
    } m_worker;

    /**
     * Buf a command window for following fuse
     * example:
     *     ---------------------------------------------------------------------
     *     | ..., Apply{in: (i0, i1), out: (o0, o1)}, ... + Del{i0} + Del{i1}  |
     *     ---------------------------------------------------------------------
     *     | ..., Apply{in: (i0, i1), out: (o0, o1), del: (i0)}, ... + Del{i1} |
     *     ---------------------------------------------------------------------
     *     | ..., Apply{in: (i0, i1), out: (o0, o1), del: (i0, i1)}, ...       |
     *     ---------------------------------------------------------------------
M
Megvii Engine Team 已提交
190 191
     *     Then the fused Apply may be invoked inplace. see:
     * ChannelImpl::process_one_task
192 193 194
     */
    struct CommandBuffer {
        CommandBuffer(ChannelImpl* owner) : m_owner(owner) {}
195
        void enqueue(CommandData cmd);
M
Megvii Engine Team 已提交
196
        bool empty() const { return m_commands.empty(); }
197
        void flush();
M
Megvii Engine Team 已提交
198

199 200 201 202 203 204 205 206 207 208 209 210 211 212
    private:
        ChannelImpl* m_owner;
        std::deque<Command> m_commands;

        using Handle = decltype(m_commands)::iterator;
        // [begin, end)
        using Range = std::array<Handle, 2>;

        // Launch commands in range [m_commands.begin(), pos)
        void flush(Handle pos);
        // Select flush position for incoming cmd
        Handle flush_pos_for(const Command& cmd);
        // Fuse del command into suitable ApplyOp
        bool fuse_del(const Del& cmd);
M
Megvii Engine Team 已提交
213 214
        // Returns the last handle that dest is used within range. If dest is not used,
        // returns range[1]
215 216 217 218 219 220 221 222 223 224 225
        Handle find_last_usage(TensorInfo* dest, Range range);
        // Returns the produce position of dest. If not found, returns range[1]
        Handle find_produce(TensorInfo* dest, Range range);
    } m_buffer;

    //! config whether raise error exactly when invoking op.
    //! level 2: both device and user side errors are async;
    //! level 1: user side errors are sync;
    //! level 0: both sync.
    int m_async_level = 2;

226
    struct State {
227
        std::thread::id tid;
228
        OptionManager options;
229 230
    };

M
Megvii Engine Team 已提交
231
    struct ChannelState : State {
232
        StackManager stack_manager;
233 234
    };

M
Megvii Engine Team 已提交
235
    struct WorkerState : State {};
236

237 238
    ChannelState m_channel_state;
    WorkerState m_worker_state;
239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254

    /*!
     * \brief A framework of dynamic sublienar memory optimization
     *
     * Note: The main idea is that during the training process, if the memory
     * usage exceeds the threshold, select some tensors to evict until the
     * memory usage is below the threshold.
     */
    struct DynamicSublinear {
        /*!
         * \brief find an available tensor with the largest evaluation function
         *
         * Note: An available tensor must satisfy: (1) has computing path,
         * (2) is in memory, (3) is not pinned. Evaluation function refers to:
         * @see: TensorInfo::eval_func.
         *
M
Megvii Engine Team 已提交
255
         * \return the pointer of the best tensor; nullptr is returned if no
256 257
         * available tensor is found
         */
258
        TensorInfo* find_best_tensor(bool);
259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276

        /*!
         * \brief estimate the cost of recomputing tensor ptr
         *
         * Note: We define the cost as the sum of the costs of each evicted
         * components where all the neighbors of ptr are located.
         */
        double estimate_neighbor_cost(TensorInfo* ptr);

        /*!
         * \brief update the last used time of the tensor ptr
         */
        void update_used_time(TensorInfo* ptr);

        /*!
         * \brief merge the two specified sets (the set in which the element x
         * is located, and the set in which the element y is located)
         */
M
Megvii Engine Team 已提交
277
        void merge(std::shared_ptr<DsuNode>& x, std::shared_ptr<DsuNode>& y);
278 279 280 281 282

        /*!
         * \brief return the representative of the set that contains the
         * element x
         */
M
Megvii Engine Team 已提交
283
        std::shared_ptr<DsuNode> find_father(std::shared_ptr<DsuNode>& x);
284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335

        /*!
         * \brief update DSU after recomputing tensor ptr
         *
         * Delete ptr from the set where ptr is located. Since DSU does not
         * support this operation, instead, we reset the DSU father of ptr, and
         * subtract the recomputation cost of ptr from the cost of the original
         * set.
         */
        void update_dsu_after_recompute(TensorInfo* ptr);

        /*!
         * \brief update DSU after evicting tensor ptr
         *
         * Check the neighbors of x, that is, the input and output tensors, and
         * if they are evicted, merge their respective sets.
         */
        void update_dsu_after_evict(TensorInfo* ptr);

        /*!
         * \brief pin the tensors in vec
         */
        void pin(const SmallVector<TensorInfo*>& vec);

        /*!
         * \brief unpin the tensors in vec
         */
        void unpin(const SmallVector<TensorInfo*>& vec);

        /*!
         * \brief add the tensor to the candidate set
         *
         * If the size of the tensor does not exceed the minimum threshold,
         * it will do nothing.
         */
        void insert_candidate(TensorInfo* ptr);

        /*!
         * \brief erase the tensor from the candidate set
         *
         * If the size of the tensor does not exceed the minimum threshold,
         * it will do nothing.
         */
        void erase_candidate(TensorInfo* ptr);

        //! estimate the current time, in order to reduce the overhead of timer
        double estimate_timestamp = 0;

        //! the comp node where dynamic sublinear memory optimization works
        CompNode comp_node;

        //! store all tensors that may be evicted
336
        SmallVector<TensorInfo*> candidates;
337

338
        bool is_bad_op(std::string op_name) {
M
Megvii Engine Team 已提交
339 340
            return std::find(op_blacklist.begin(), op_blacklist.end(), op_name) !=
                   op_blacklist.end();
341 342
        }

M
Megvii Engine Team 已提交
343 344 345 346
        std::vector<std::string> op_blacklist = {
                "CollectiveComm", "InplaceAdd", "ParamPackSplit", "ParamPackConcat",
                "GaussianRNG",    "UniformRNG", "GammaRNG",       "PermutationRNG",
                "PoissonRNG",     "BetaRNG"};
347 348 349
    } m_dtr;

    //! automatically evict an optimal tensor
350 351
    bool auto_evict(size_t);

352
    void alloc_tensor_with_evict(Blob*);
353 354 355 356

    // assert thread id when call get_xxx_state to avoid misuse
    ChannelState& get_channel_state();
    WorkerState& get_worker_state();
357 358
};

}  // namespace mgb::imperative::interpreter::intl