interpreter_impl.h 12.3 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/**
 * \file imperative/src/impl/interpreter/interpreter_impl.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#pragma once

#include <deque>
#include <future>
#include <list>
17
#include <stack>
18 19 20
#include <thread>
#include <unordered_set>
#include <variant>
21
#include "megbrain/comp_node.h"
22 23 24 25 26 27 28
#include "megbrain/utils/mempool.h"
#include "megbrain/imperative/interpreter.h"
#include "megbrain/imperative/profiler.h"

#include "./commands.h"
#include "./tensor_info.h"
#include "./option_manager.h"
29
#include "./stack_manager.h"
30 31

#include "../profiler/events.h"
32 33 34 35 36 37 38 39 40 41 42 43 44 45

namespace mgb::imperative::interpreter::intl {

using Handle = Interpreter::Handle;

/*!
 * \brief Interpreter subclass declared by this header
 *
 * It only overrides create_channel(); the definition is not part of this
 * header (presumably it hands out a ChannelImpl -- confirm in the .cpp).
 */
struct InterpreterImpl : Interpreter {
    //! create a new Channel; the caller owns it through the returned unique_ptr
    std::unique_ptr<Channel> create_channel() override;
};

struct ChannelImpl : Interpreter::Channel {
    ChannelImpl();
    ~ChannelImpl() override;

    Handle put(const HostTensorND& value, bool no_cache) override;
46
    Handle put(const DeviceTensorND& value, const HostTensorND& hvalue) override;
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63

    void del(Handle) override;
    void swap_in(Handle) override;
    void swap_out(Handle) override;
    void drop(Handle) override;

    SmallVector<Handle> apply_op(
            std::shared_ptr<OpDef> op,
            const SmallVector<Handle>& inputs) override;

    HostTensorND get_value(Handle) override;
    TensorShape get_shape(Handle) override;
    DType get_dtype(Handle) override;
    CompNode get_device(Handle) override;

    DeviceTensorND get_dev_tensor(Handle) override;

64
    bool check_available() override;
65 66 67
    void sync() override;
    void close() override;

68 69
    size_t get_option(std::string name) override;
    void set_option(std::string name, size_t value) override;
70

71 72
    void start_profile() override;
    void stop_profile() override;
73 74 75 76

    void push_scope(std::string) override;
    void pop_scope(std::string) override;
private:
77 78 79
    struct WorkQueue;
    struct State;

80
    TensorInfo* alloc();
81
    void init(TensorInfo*, LogicalTensorDesc desc);
82
    void free(TensorInfo*);
83 84 85
    void real_free(TensorInfo*);
    void recursive_free(TensorInfo*);
    void do_drop(TensorInfo*, bool);
86 87
    void detach_users(TensorInfo*);

88
    TensorInfo* put_impl(const HostTensorND& value, bool no_cache);
89
    TensorInfo* put_impl(const DeviceTensorND& value, const HostTensorND& hvalue);
90 91
    void del_impl(Handle);
    void sync_impl();
92 93 94
    SmallVector<Handle> apply_op_impl(
            std::shared_ptr<OpDef> op,
            const SmallVector<Handle>& inputs);
95 96 97
    TensorPtr wait_tensor(TensorInfo* info, profiler::TensorProp prop);
    void notify_tensor_unsafe(TensorInfo* info);

98
    void process_one_task(Command&);
99 100 101

    void check_worker_exc_unsafe();

102
    void produce_tensor(TensorInfo* dest, TensorPtr ptr);
103 104 105 106

    void release_tensor(TensorInfo* dest);

    void regenerate(TensorInfo* dest);
107
    void flush_apply_stack();
108
    void do_apply_op(const ApplyOp& cmd, std::string reason);
109 110 111 112 113
    
    std::tuple<SmallVector<MemoryDesc>, SmallVector<TensorPtr>, SmallVector<TensorPtr>> init_output_and_workspace(
        const OpDef& def,
        SmallVector<TensorPtr> inputs,
        SmallVector<MemoryDesc> inputs_mem_desc);
114 115 116 117 118 119 120 121 122 123 124 125

    void dispatch_default_cpu(
        std::shared_ptr<OpDef> op,
        const SmallVector<TensorInfo*>& input_infos,
        const SmallVector<LogicalTensorDesc>& input_descs,
        SmallVector<Handle>* outputs);
    void dispatch_kernel(
        std::shared_ptr<OpDef> op,
        const SmallVector<TensorInfo*>& input_infos,
        const SmallVector<LogicalTensorDesc>& input_descs,
        SmallVector<Handle>* outputs);

126 127 128
    void push_scope(std::string, State&);
    void pop_scope(std::string, State&);

129 130 131 132
    void assert_in_channel();
    void assert_in_worker();
    std::thread::id get_worker_tid();

133 134 135 136
    // template <typename TCommand>
    // void enqueue_command(TCommand&& cmd) {
    //     m_buffer.enqueue(Command{std::forward<TCommand>(cmd)});
    // }
137

138 139 140 141 142
    void sample_on_device(CompNode device, bool force);

    // valid => status != Deleted
    std::unordered_set<TensorInfo*> collect_valid_tensors();

143
    std::mutex m_mutex;
144
    Spinlock m_spin;
145 146 147 148
    std::condition_variable m_cv;
    MemPool<TensorInfo> m_pool;
    std::unordered_set<Handle> m_valid_handle;
    TensorInfo* m_waitee = nullptr;
149
    uint64_t m_waitee_id = 0;
150
    std::exception_ptr m_worker_exc;
151
    std::function<void(std::string, std::string)> m_profile_dump_callback;
152
    size_t m_storage_id = 0;
153 154
    // TODO: use explicit struct
    std::stack<std::tuple<ApplyOp, size_t, TensorInfo*, std::string>> m_apply_stack;
155
    bool m_applying = false;
156 157
    bool m_closed = false;

158
    /*!
     * \brief command queue built on AsyncQueueSC that hands every Command it
     * is asked to process to the owning ChannelImpl
     *
     * The maximum number of pending items defaults to 10000 and can be
     * overridden with the environment variable MEGENGINE_ASYNC_QUEUE_SIZE,
     * which must be a non-empty string of decimal digits.
     */
    struct WorkQueue : AsyncQueueSC<Command, WorkQueue> {
        // set max_spin=0 to prevent the queue from fetching tasks in a busy-wait manner.
        // this won't affect throughput when the python interpreter is sending enough tasks,
        // but will significantly save CPU time when waiting for tasks, e.g. waiting for data input
        // limit pending tasks to 10000
        WorkQueue(ChannelImpl* owner)
                : AsyncQueueSC<Command, WorkQueue>(0, 10000), m_owner(owner) {
            sys::set_thread_name("interpreter");
            if (const char* env_val = MGB_GETENV("MEGENGINE_ASYNC_QUEUE_SIZE")) {
                size_t len = strlen(env_val);
                // an empty value would pass the per-character check below and
                // make sscanf match nothing, so reject it explicitly
                mgb_assert(len > 0, "async queue size should be an integer");
                for (size_t i = 0; i < len; ++i) {
                    mgb_assert(env_val[i] >= '0' && env_val[i] <= '9', "async queue size should be an integer");
                }
                // initialize val and check the conversion count so that an
                // indeterminate value can never reach update_max_items
                size_t val = 0;
                int nr_parsed = sscanf(env_val, "%zu", &val);
                mgb_assert(nr_parsed == 1, "async queue size should be an integer");
                update_max_items(val);
            }
        }
        //! forward the command to ChannelImpl::process_one_task of the owner
        void process_one_task(Command& icmd) {
            m_owner->process_one_task(icmd);
        }
        void on_async_queue_worker_thread_start() override;
    private:
        //! non-owning pointer to the channel this queue belongs to
        ChannelImpl* m_owner;
    } m_worker;

    /**
     * Buffer a window of commands so that later commands can be fused into earlier ones
     * example:
     *     ---------------------------------------------------------------------
     *     | ..., Apply{in: (i0, i1), out: (o0, o1)}, ... + Del{i0} + Del{i1}  |
     *     ---------------------------------------------------------------------
     *     | ..., Apply{in: (i0, i1), out: (o0, o1), del: (i0)}, ... + Del{i1} |
     *     ---------------------------------------------------------------------
     *     | ..., Apply{in: (i0, i1), out: (o0, o1), del: (i0, i1)}, ...       |
     *     ---------------------------------------------------------------------
     *     Then the fused Apply may be invoked inplace. see: ChannelImpl::process_one_task
     */
    struct CommandBuffer {
        CommandBuffer(ChannelImpl* owner) : m_owner(owner) {}
198
        void enqueue(CommandData cmd);
199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228
        bool empty() const {
            return m_commands.empty();
        }
        void flush();
    private:
        ChannelImpl* m_owner;
        std::deque<Command> m_commands;

        using Handle = decltype(m_commands)::iterator;
        // [begin, end)
        using Range = std::array<Handle, 2>;

        // Launch commands in range [m_commands.begin(), pos)
        void flush(Handle pos);
        // Select flush position for incoming cmd
        Handle flush_pos_for(const Command& cmd);
        // Fuse del command into suitable ApplyOp
        bool fuse_del(const Del& cmd);
        // Returns the last handle that dest is used within range. If dest is not used, returns range[1]
        Handle find_last_usage(TensorInfo* dest, Range range);
        // Returns the produce position of dest. If not found, returns range[1]
        Handle find_produce(TensorInfo* dest, Range range);
    } m_buffer;

    //! configures whether errors are raised exactly when the op is invoked.
    //! level 2: both device and user side errors are async;
    //! level 1: user side errors are sync;
    //! level 0: both sync.
    int m_async_level = 2;

229
    struct State {
230
        std::thread::id tid;
231
        OptionManager options;
232 233
    };

234
    struct ChannelState: State {
235
        StackManager stack_manager;
236 237 238 239
    };

    //! State used for m_worker_state; adds no members on top of State
    struct WorkerState: State {};

240 241
    ChannelState m_channel_state;
    WorkerState m_worker_state;
242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260

    /*!
     * \brief A framework of dynamic sublinear memory optimization
     *
     * Note: The main idea is that during the training process, if the memory
     * usage exceeds the threshold, select some tensors to evict until the
     * memory usage is below the threshold.
     */
    struct DynamicSublinear {
        /*!
         * \brief find an available tensor with the largest evaluation function
         *
         * Note: An available tensor must satisfy: (1) has computing path,
         * (2) is in memory, (3) is not pinned. Evaluation function refers to:
         * @see: TensorInfo::eval_func.
         *
         * \return the pointer of the best tensor; nullptr is returned if no 
         * available tensor is found
         */
261
        TensorInfo* find_best_tensor(bool);
262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340

        /*!
         * \brief estimate the cost of recomputing tensor ptr
         *
         * Note: We define the cost as the sum of the costs of each evicted
         * components where all the neighbors of ptr are located.
         */
        double estimate_neighbor_cost(TensorInfo* ptr);

        /*!
         * \brief update the last used time of the tensor ptr
         */
        void update_used_time(TensorInfo* ptr);

        /*!
         * \brief merge the two specified sets (the set in which the element x
         * is located, and the set in which the element y is located)
         */
        void merge(std::shared_ptr<DsuNode> &x, std::shared_ptr<DsuNode> &y);

        /*!
         * \brief return the representative of the set that contains the
         * element x
         */
        std::shared_ptr<DsuNode> find_father(std::shared_ptr<DsuNode> &x);

        /*!
         * \brief update DSU after recomputing tensor ptr
         *
         * Delete ptr from the set where ptr is located. Since DSU does not
         * support this operation, instead, we reset the DSU father of ptr, and
         * subtract the recomputation cost of ptr from the cost of the original
         * set.
         */
        void update_dsu_after_recompute(TensorInfo* ptr);

        /*!
         * \brief update DSU after evicting tensor ptr
         *
         * Check the neighbors of x, that is, the input and output tensors, and
         * if they are evicted, merge their respective sets.
         */
        void update_dsu_after_evict(TensorInfo* ptr);

        /*!
         * \brief pin the tensors in vec
         */
        void pin(const SmallVector<TensorInfo*>& vec);

        /*!
         * \brief unpin the tensors in vec
         */
        void unpin(const SmallVector<TensorInfo*>& vec);

        /*!
         * \brief add the tensor to the candidate set
         *
         * If the size of the tensor does not exceed the minimum threshold,
         * it will do nothing.
         */
        void insert_candidate(TensorInfo* ptr);

        /*!
         * \brief erase the tensor from the candidate set
         *
         * If the size of the tensor does not exceed the minimum threshold,
         * it will do nothing.
         */
        void erase_candidate(TensorInfo* ptr);

        //! estimate the current time, in order to reduce the overhead of timer
        double estimate_timestamp = 0;

        //! the comp node where dynamic sublinear memory optimization works
        CompNode comp_node;

        //! store all tensors that may be evicted
        std::unordered_set<TensorInfo*> candidates;

341 342 343 344 345
        bool is_bad_op(std::string op_name) {
            return std::find(op_blacklist.begin(), op_blacklist.end(), op_name) != op_blacklist.end();
        }

        std::vector<std::string> op_blacklist = {"CollectiveComm", "InplaceAdd",
346 347
                                "ParamPackSplit", "ParamPackConcat", "GaussianRNG", "UniformRNG",
                                "GammaRNG", "PermutationRNG", "PoissonRNG", "BetaRNG"};
348 349 350
    } m_dtr;

    //! automatically evict an optimal tensor
351 352
    bool auto_evict(size_t);

353
    void alloc_tensor_with_evict(Blob*);
354 355 356 357

    // the thread id is asserted when get_xxx_state is called, to avoid misuse
    ChannelState& get_channel_state();
    WorkerState& get_worker_state();
358 359 360
};

} // namespace mgb::imperative::interpreter::intl