io.cpp

/**
 * \file src/opr/impl/io.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#include "megbrain/opr/io.h"
#include "megbrain/comp_node_env.h"
#include "megbrain/graph/event.h"
#include "megbrain/graph/grad_impl.h"
#include "megbrain/graph/exc_extra_info.h"
#include "megbrain/opr/internal/megdnn_opr_wrapper.h"
#include "megbrain/serialization/opr_load_dump.h"

using namespace mgb;
using namespace opr;

namespace {
//! helper for implementing oprs that hold a device tensor value
namespace dv_helper {
//! add an output var of the given dtype (optionally named) to \p opr; the var
//! is marked with the flags used for persistent device values
void add_output(cg::OperatorNodeBase& opr, DType dtype,
                const Maybe<std::string>& name = None);
//! initialize the mem plan of output \p ovar_idx of \p opr from \p val;
//! \p dynamic must be false
void init_output_mem_plan(const DeviceTensorND& val, cg::OperatorNodeBase& opr,
                          bool dynamic, size_t ovar_idx = 0);
//! record an async error on the owner graph if the device tensor of \p var no
//! longer matches \p val (mem node, pointer, layout or dtype)
void check_in_exec(const DeviceTensorND& val, VarNode* var);
}  // namespace dv_helper
}  // anonymous namespace

/* ===================== dv_helper ===================== */

/*!
 * \brief append an output var to \p opr that holds a persistent device value
 *
 * The new var is flagged NO_MEM_RECLAIM, PERSISTENT_DEVICE_VALUE and
 * DISALLOW_RT_FORCE_DYNAMIC_MEM_ALLOC, and its dtype is set to \p dtype
 * (which must be valid).
 */
void dv_helper::add_output(cg::OperatorNodeBase& opr, DType dtype,
                           const Maybe<std::string>& name) {
    mgb_assert(dtype.valid());
    using Flag = VarNode::Flag;
    auto&& new_var = *opr.add_output(name);
    new_var.add_flag(Flag::NO_MEM_RECLAIM);
    new_var.add_flag(Flag::PERSISTENT_DEVICE_VALUE);
    new_var.add_flag(Flag::DISALLOW_RT_FORCE_DYNAMIC_MEM_ALLOC);
    new_var.dtype(dtype);
}

/*!
 * \brief initialize the mem plan of an output var from a held device tensor
 * \param val the device tensor passed to VarNode::init_mem_plan
 * \param opr operator owning the output var
 * \param dynamic must be false (asserted)
 * \param ovar_idx index of the output var in \p opr
 */
void dv_helper::init_output_mem_plan(const DeviceTensorND& val,
                                     cg::OperatorNodeBase& opr, bool dynamic,
                                     size_t ovar_idx) {
    mgb_assert(!dynamic);
    auto ovar = opr.output(ovar_idx);
    // the dtype of the var was fixed when the output was added; the held value
    // must still agree with it
    mgb_assert(val.dtype() == ovar->dtype(),
               "dtype mismatch: get=%s expect=%s opr=%s{%s}",
               val.dtype().name(), ovar->dtype().name(), opr.cname(),
               opr.dyn_typeinfo()->name);
    ovar->init_mem_plan(&val);
}

void dv_helper::check_in_exec(const DeviceTensorND& val, VarNode* var) {
    auto&& oval = var->dev_tensor();
    if(!(val.comp_node().mem_node() == oval.comp_node().mem_node() &&
         val.raw_ptr() == oval.raw_ptr() && val.layout().eq_layout(oval.layout())
         && val.dtype() == var->dtype())) {
        var->owner_opr()->owner_graph()->record_async_error(
            cg::OperatorNodeExcExtraInfo::ExcMaker{var->owner_opr()}
            .make_unique<MegBrainError>(ssprintf(
                "value changed in DeviceTensorHolder: cn=(%s,%s), ptr=(%p,%p), "
                "layout=(%s,%s), dtype=(%s,%s)",
                val.comp_node().to_string().c_str(),
                oval.comp_node().to_string().c_str(), val.raw_ptr(),
                oval.raw_ptr(), val.layout().to_string().c_str(),
                oval.layout().to_string().c_str(),
                val.dtype().name(), var->dtype().name())));
    }
}

/* ===================== HostIONodeBase ===================== */

/*!
 * \brief register static shape (and optionally value) inference for output 0
 *
 * Shape inference uses get_output_shape(). Its source type is the one
 * returned by static_infer_src_type(), unless the graph carries an
 * OprLoadContext whose config has const_var_shape set, in which case it is
 * CONSTANT. Value inference is registered only if fill_in_static_infer()
 * reports support when queried with nullptr.
 */
void intl::HostIONodeBase::init_output_static_infer_desc() {
    using namespace cg::static_infer;
    auto&& infer_mgr = owner_graph()->static_infer_manager();

    auto shape_infer = [this](TensorShape& dest, const InpVal&) -> bool {
        dest = get_output_shape();
        // a shape with ndim == 0 is reported as "not inferred"
        return dest.ndim;
    };

    auto shape_src = static_infer_src_type();
    auto opr_load_ctx = owner_graph()->options().user_data.get_user_data<
            serialization::OprLoadContext>();
    if (opr_load_ctx.second) {
        mgb_assert(opr_load_ctx.second == 1);
        if (opr_load_ctx.first[0]->config().const_var_shape) {
            shape_src = cg::static_infer::SourceType::CONSTANT;
        }
    }
    infer_mgr.register_shape_infer(output(0), {shape_src, {}, shape_infer});

    if (!fill_in_static_infer(nullptr)) {
        return;
    }

    auto value_infer = [this](DeviceTensorND& dest, const InpVal&) -> bool {
        return fill_in_static_infer(&dest) && !dest.empty();
    };
    infer_mgr.register_value_infer(
            output(0), {static_infer_src_type(), {}, value_infer});
}

//! static infer source type used by this base class: MUTABLE
cg::static_infer::SourceType
intl::HostIONodeBase::static_infer_src_type() const {
    using SrcType = cg::static_infer::SourceType;
    return SrcType::MUTABLE;
}

/* ===================== DeviceTensorHolder ===================== */

//! exec dependency that holds a copy of a DeviceTensorStorage handle for as
//! long as the dependency object exists
class intl::DeviceTensorHolder::DevValueExecDep final : public ExecDependency {
public:
    explicit DevValueExecDep(DeviceTensorStorage val) : m_val{std::move(val)} {}

private:
    DeviceTensorStorage m_val;
};


//! assert that the held device tensor uses the default tensor format
void intl::DeviceTensorHolder::init_output_format() {
    auto format = get_dev_tensor().format();
    mgb_assert(format.is_default(), "non-default tensor format: %s",
               format.to_string().c_str());
    // no need to set output format since it is initialized as default
}

void intl::DeviceTensorHolder::init_output_mem_plan(bool dynamic) {
    dv_helper::init_output_mem_plan(get_dev_tensor(), *this, dynamic);
}

void intl::DeviceTensorHolder::scn_do_execute() {
    dv_helper::check_in_exec(get_dev_tensor(), output(0));
}

//! add the single output var of this holder with the given dtype; may only be
//! called while the operator has no output yet
void intl::DeviceTensorHolder::add_output(DType dtype) {
    mgb_assert(output().empty());
    dv_helper::add_output(*this, dtype);
}

//! record the storage of the held tensor as an exec dependency, unless
//! output 0 is flagged MEMORY_NO_NEED
void intl::DeviceTensorHolder::record_execute_deps(ExecDependencyArray& deps) {
    if (output(0)->contain_flag(VarNode::Flag::MEMORY_NO_NEED)) {
        return;
    }
    auto storage_dep =
            std::make_unique<DevValueExecDep>(get_dev_tensor().storage());
    deps.emplace_back(std::move(storage_dep));
}

/* ===================== Host2DeviceCopy ===================== */

/*!
 * \brief exec dependency that keeps the host tensor alive and checks at
 *      runtime that its pointer and shape are unchanged
 *
 * The raw pointer and shape are snapshotted at construction time and
 * compared against the current state of the tensor in do_runtime_check().
 */
class Host2DeviceCopy::HostValueExecDep final : public ExecDependency {
public:
    explicit HostValueExecDep(std::shared_ptr<HostTensorND> hv)
            : m_hv{hv}, m_ptr{hv->raw_ptr()}, m_shape{hv->shape()} {}

    bool has_runtime_check() const override { return true; }

    void do_runtime_check() override {
        mgb_assert(m_hv->raw_ptr() == m_ptr && m_hv->shape().eq_shape(m_shape),
                   "host tensor changed: %p(%s) vs %p(%s)", m_hv->raw_ptr(),
                   m_hv->shape().to_string().c_str(), m_ptr,
                   m_shape.to_string().c_str());
    }

private:
    // declaration order matters: members are initialized in this order
    std::shared_ptr<HostTensorND> m_hv;
    void* m_ptr;
    TensorShape m_shape;
};

MGB_DYN_TYPE_OBJ_FINAL_IMPL(Host2DeviceCopy);
/*!
 * \brief construct a host-to-device copy operator
 * \param graph owner graph
 * \param host_data host tensor that is read on each execution
 * \param param operator param; fields read in this file are
 *      allow_cpu_mem_fwd and enable_value_infer
 * \param config operator config; if a comp node is set there it overrides the
 *      comp node of \p host_data
 */
Host2DeviceCopy::Host2DeviceCopy(ComputingGraph &graph,
        const std::shared_ptr<HostTensorND> &host_data,
        const Param &param,
        const OperatorNodeConfig &config):
    Super{&graph, config, "h2d", {}},
    m_param{param},
    m_host_data{host_data}
{
    // output comp node: config takes precedence over the host tensor
    auto out_cn = m_host_data->comp_node();
    if (config.has_comp_node_set())
        out_cn = config.get_single_comp_node();
    mgb_assert(out_cn.valid(), "can not get output comp node");

    // host memory is forwarded only if allowed by param and both the output
    // comp node and the host tensor live on the default CPU mem node
    if (param.allow_cpu_mem_fwd &&
            out_cn.mem_node() == CompNode::default_cpu().mem_node() &&
            host_data->comp_node().mem_node() == out_cn.mem_node()) {
        m_fwd_host_mem = true;
        dv_helper::add_output(*this, host_data->dtype());
    } else {
        m_fwd_host_mem = false;
        add_output(None)->dtype(host_data->dtype());
    }
    // operators are distinguished by the host tensor address and the param
    add_equivalence_component<ScalarHash<void*>>(host_data.get());
    add_equivalence_component<PODHash<Param>>(&m_param);

    this->comp_node(out_cn);

    output(0)->add_flag(VarNode::Flag::ALLOW_EMPTY_SHAPE);
}

//! output shape is the current shape of the host tensor
const TensorShape& Host2DeviceCopy::get_output_shape() {
    const auto& host_tensor = *m_host_data;
    return host_tensor.shape();
}

/*!
 * \brief provide the host value for static value inference
 * \param dest tensor to fill, or nullptr to only query whether static value
 *      inference is supported
 * \return false if value inference is disabled in the param or the host
 *      tensor has no real storage; true otherwise
 */
bool Host2DeviceCopy::fill_in_static_infer(DeviceTensorND* dest) {
    if (!m_param.enable_value_infer) {
        return false;
    }
    if (dest == nullptr) {
        // query whether static infer is supported
        return true;
    }
    const bool has_real_data = !m_host_data->storage().has_no_real_storage();
    if (has_real_data) {
        dest->copy_from(*m_host_data);
    }
    return has_real_data;
}

/*!
 * \brief execute the copy
 *
 * Without memory forwarding the host tensor is copied into output 0. With
 * forwarding, the contiguous staging tensor is refreshed if needed and the
 * forwarded tensor is checked against output 0.
 */
void Host2DeviceCopy::scn_do_execute() {
    if (!m_fwd_host_mem) {
        output(0)->dev_tensor().copy_from_fixlayout(*m_host_data);
        return;
    }

    mgb_assert(m_host_data->comp_node().mem_node() ==
               comp_node().mem_node());
    if (m_host_data_dev_cont_need_sync) {
        m_host_data_dev_cont.copy_from_fixlayout(*m_host_data);
    }
    dv_helper::check_in_exec(get_dev_tensor_in_mem_fwd(), output(0));
}

//! with memory forwarding the mem plan comes from the forwarded tensor;
//! otherwise defer to the base class
void Host2DeviceCopy::init_output_mem_plan(bool dynamic) {
    if (!m_fwd_host_mem) {
        Super::init_output_mem_plan(dynamic);
        return;
    }
    dv_helper::init_output_mem_plan(get_dev_tensor_in_mem_fwd(), *this,
                                    dynamic);
}

//! intentionally empty: the constructor already assigns the comp node via
//! comp_node(out_cn)
void Host2DeviceCopy::init_output_comp_node() {
}

/*!
 * \brief get the device tensor used as output value in memory-forward mode
 *
 * For a contiguous host tensor a proxy of the host tensor is returned. For a
 * non-contiguous one, a separate tensor with the same dtype and shape is
 * resized and returned, and m_host_data_dev_cont_need_sync is set so that
 * scn_do_execute() copies the data into it.
 */
const DeviceTensorND& Host2DeviceCopy::get_dev_tensor_in_mem_fwd() const {
    mgb_assert(m_fwd_host_mem);
    const bool contiguous = m_host_data->layout().is_contiguous();
    m_host_data_dev_cont_need_sync = !contiguous;

    if (contiguous) {
        m_host_data_dev_proxy = DeviceTensorND::make_proxy(*m_host_data);
        return m_host_data_dev_proxy;
    }

    m_host_data_dev_cont.comp_node(comp_node())
            .dtype(m_host_data->dtype())
            .resize(m_host_data->shape());
    return m_host_data_dev_cont;
}

//! base node prop, plus IMPURE_OUTPUT_MEM_PLAN when host memory is forwarded
cg::OperatorNodeBase::NodeProp* Host2DeviceCopy::do_make_node_prop() const {
    auto prop = Super::do_make_node_prop();
    if (m_fwd_host_mem) {
        prop->add_flag(NodeProp::Flag::IMPURE_OUTPUT_MEM_PLAN);
    }
    return prop;
}

//! create a Host2DeviceCopy operator, insert it into \p graph and return its
//! output var
SymbolVar Host2DeviceCopy::make(ComputingGraph &graph,
        const std::shared_ptr<HostTensorND> &host_data,
        const Param &param,
        const OperatorNodeConfig &config) {
    auto new_opr = std::make_unique<Host2DeviceCopy>(
            graph, host_data, param, config);
    auto inserted = graph.insert_opr(std::move(new_opr));
    return inserted->output(0);
}

/*!
 * \brief record the host tensor as an exec dependency
 *
 * The dependency shares ownership of the host tensor and checks at runtime
 * that its pointer and shape are unchanged (see HostValueExecDep).
 *
 * The shared_ptr is copied rather than moved out of m_host_data: other
 * methods of this operator (e.g. scn_do_execute(), get_output_shape())
 * dereference m_host_data unconditionally, so the member must stay valid
 * after this call.
 */
void Host2DeviceCopy::record_execute_deps(ExecDependencyArray& deps) {
    deps.emplace_back(std::make_unique<HostValueExecDep>(m_host_data));
}

/* ===================== SharedDeviceTensor related ===================== */

/*!
 * \brief construct an operator whose output is a shared device tensor
 * \param graph owner graph
 * \param dev_data the shared device tensor
 * \param const_value stored in m_const_value
 * \param config operator config; a comp node set there must equal the comp
 *      node of \p dev_data
 */
intl::SharedDeviceTensorBase::SharedDeviceTensorBase(
        ComputingGraph& graph, const std::shared_ptr<DeviceTensorND>& dev_data,
        bool const_value, const OperatorNodeConfig& config)
        : Super{&graph, config, "shared", {}},
          m_dev_data{dev_data},
          m_const_value(const_value) {
    if (config.has_comp_node_set()) {
        mgb_assert(config.get_single_comp_node() == dev_data->comp_node());
    }
    add_output(dev_data->dtype());
    // operators are distinguished by the address of the shared tensor
    add_equivalence_component<ScalarHash<void*>>(dev_data.get());
}

//! output shape is the current shape of the shared device tensor
const TensorShape& intl::SharedDeviceTensorBase::get_output_shape() {
    const auto& shared_tensor = *m_dev_data;
    return shared_tensor.shape();
}

//! use the comp node of the shared tensor; throws GraphError if the config
//! requests a different comp node
void intl::SharedDeviceTensorBase::init_output_comp_node() {
    if (config().has_comp_node_set()) {
        mgb_throw_if(config().get_single_comp_node() != m_dev_data->comp_node(),
                GraphError,
                "SharedDeviceTensor: comp node in config differs from that in"
                " dev_data");
    }
    const auto data_cn = m_dev_data->comp_node();
    comp_node(data_cn);
}

//! static infer source type of SharedDeviceTensor: CONSTANT
cg::static_infer::SourceType SharedDeviceTensor::static_infer_src_type() const {
    using SrcType = cg::static_infer::SourceType;
    return SrcType::CONSTANT;
}

//! create a SharedDeviceTensor from an existing device tensor, insert it into
//! \p graph and return its output var
SymbolVar SharedDeviceTensor::make(ComputingGraph &graph,
        const std::shared_ptr<DeviceTensorND> &dev_data,
        bool const_value,
        const OperatorNodeConfig &config) {
    auto new_opr = std::make_unique<SharedDeviceTensor>(
            graph, dev_data, const_value, config);
    auto inserted = graph.insert_opr(std::move(new_opr));
    return inserted->output(0);
}

/*!
 * \brief create a SharedDeviceTensor from a host value
 *
 * The value is copied synchronously to a new device tensor on the comp node
 * given in \p config, or on the comp node of \p value if none is set.
 */
SymbolVar SharedDeviceTensor::make(ComputingGraph &graph,
        const HostTensorND &value,
        bool const_value,
        const OperatorNodeConfig &config) {
    auto target_cn = value.comp_node();
    if (config.has_comp_node_set()) {
        target_cn = config.get_single_comp_node();
    }

    auto dev_copy = std::make_shared<DeviceTensorND>();
    dev_copy->comp_node(target_cn);
    dev_copy->copy_from(value);
    dev_copy->sync();
    return make(graph, dev_copy, const_value, config);
}

MGB_DYN_TYPE_OBJ_FINAL_IMPL(SharedDeviceTensor);

//! base node prop with IMPURE_OUTPUT_MEM_PLAN always added
cg::OperatorNodeBase::NodeProp*
VolatileSharedDeviceTensor::do_make_node_prop() const {
    auto prop = Super::do_make_node_prop();
    prop->add_flag(NodeProp::Flag::IMPURE_OUTPUT_MEM_PLAN);
    return prop;
}

//! create a VolatileSharedDeviceTensor (const_value is always false), insert
//! it into \p graph and return its output var
SymbolVar VolatileSharedDeviceTensor::make(ComputingGraph &graph,
        const std::shared_ptr<DeviceTensorND> &dev_data,
        const OperatorNodeConfig &config) {
    auto new_opr = std::make_unique<VolatileSharedDeviceTensor>(
            graph, dev_data, false, config);
    auto inserted = graph.insert_opr(std::move(new_opr));
    return inserted->output(0);
}

MGB_DYN_TYPE_OBJ_FINAL_IMPL(VolatileSharedDeviceTensor);

/* ============== SharedDeviceTensorWithFormat =============== */
//! propagate the tensor format of the held device tensor to output 0
void SharedDeviceTensorWithFormat::init_output_format() {
    auto&& dev_fmt = get_dev_tensor().format();
    output(0)->format(dev_fmt);
}

//! create a SharedDeviceTensorWithFormat, insert it into \p graph and return
//! the output var of the inserted operator
SymbolVar SharedDeviceTensorWithFormat::make(
        ComputingGraph& graph, const std::shared_ptr<DeviceTensorND>& dev_data,
        bool const_value, const OperatorNodeConfig& config) {
    auto new_opr = std::make_unique<SharedDeviceTensorWithFormat>(
            graph, dev_data, const_value, config);
    auto inserted = graph.insert_opr(std::move(new_opr));
    return inserted->cast_final_safe<SharedDeviceTensorWithFormat>().output(0);
}

//! static infer source type of SharedDeviceTensorWithFormat: CONSTANT
cg::static_infer::SourceType
SharedDeviceTensorWithFormat::static_infer_src_type() const {
    using SrcType = cg::static_infer::SourceType;
    return SrcType::CONSTANT;
}

MGB_DYN_TYPE_OBJ_FINAL_IMPL(SharedDeviceTensorWithFormat);

/* ===================== ImmutableTensor ===================== */

MGB_DYN_TYPE_OBJ_FINAL_IMPL(ImmutableTensor);

/*!
 * \brief value of an ImmutableTensor
 *
 * Holds the value on the target comp node, a copy on the default CPU comp
 * node for static inference that is created on first request, and a short
 * string summary of the value.
 */
class ImmutableTensor::Value {
    //! guards the creation of m_static_infer in static_infer()
    std::mutex m_mtx;
    DeviceTensorND m_dev, m_static_infer;
    std::string m_summary;

    public:
        //! copy \p val to comp node \p cn and build the summary; must be
        //! called on a Value that has not been set up yet
        void setup(CompNode cn, const HostTensorND &val);

        //! whether setup() has given the device value a valid shape
        bool initialized() const {
            return m_dev.shape_valid();
        }

        //! value on comp node
        const DeviceTensorND& dev() const {
            return m_dev;
        }

        //! get value on static infer CPU node
        DeviceTensorND& static_infer();

        //! string summary of the value
        const std::string& summary() const {
            return m_summary;
        }
};

/*!
 * \brief initialize the value from a host tensor
 *
 * Copies \p val to comp node \p cn (synchronously) and builds the summary:
 * "const<v>" (followed by the shape unless it is 1-dimensional) when every
 * dimension of the shape is 1, otherwise "const" followed by the shape.
 */
void ImmutableTensor::Value::setup(CompNode cn, const HostTensorND &val) {
    mgb_assert(m_dev.empty() && !m_dev.shape_valid());
    m_dev.comp_node(cn).copy_from(val).sync();
    mgb_assert(val.empty() == m_dev.empty());

    // true iff every dimension of the shape equals 1
    auto one_elem = [](const TensorShape& shape) {
        size_t dim = 0;
        while (dim < shape.ndim && shape[dim] == 1) {
            ++dim;
        }
        return dim == shape.ndim;
    };

    if (!one_elem(val.shape())) {
        m_summary = ssprintf("const%s", val.shape().to_string().c_str());
        return;
    }

    float v;
    static_cast_dtype(&v, val.dtype(), val.raw_ptr());
    m_summary = ssprintf("const<%.3g>", v);
    if (val.shape().ndim != 1) {
        m_summary += val.shape().to_string();
    }
}

//! get the value on the default CPU comp node; it is copied from the device
//! value on first use, under m_mtx
DeviceTensorND& ImmutableTensor::Value::static_infer() {
    MGB_LOCK_GUARD(m_mtx);
    if (!m_static_infer.empty()) {
        return m_static_infer;
    }
    mgb_assert(!m_dev.empty());
    m_static_infer.comp_node(CompNode::default_cpu()).copy_from(m_dev);
    return m_static_infer;
}

/*!
 * \brief cache of ImmutableTensor values for one comp node
 *
 * Values are looked up either by scalar content (ScalarKey) or by layout and
 * raw bytes (TensorKey), so that equal small constants share a single Value.
 * The cache instance is stored as user data of the CompNodeEnv, see inst().
 */
class ImmutableTensor::DevValueCache final: public UserDataContainer::UserData {
    MGB_TYPEINFO_OBJ_DECL;
    CompNode m_comp_node;

    //! map key consisting of a tensor layout and the tensor bytes
    class TensorKey {
        //! plain data compared with memcmp in operator ==
        struct Trait {
            size_t hash = 0, size_bytes = 0;
            TensorLayout layout;
        };
        Trait m_trait;
        //! owned copy of the bytes, filled by copy_val_permanent()
        std::vector<dt_byte> m_val;
        //! tensor whose bytes are used while m_val is empty
        HostTensorND m_val_ref;

        //! pointer to the bytes: the owned copy if present, otherwise the
        //! referenced tensor
        const dt_byte* val_ptr() const {
            mgb_assert(m_trait.size_bytes);
            return m_val.empty() ? m_val_ref.raw_ptr() : m_val.data();
        }

        public:
            TensorKey() = default;
            //! build a key that references \p v, which must be contiguous or
            //! empty
            TensorKey(const HostTensorND &v):
                m_val_ref{v}
            {
                mgb_assert(v.layout().is_contiguous() || v.layout().is_empty());
                m_trait.size_bytes = v.layout().span().high_byte;

                auto &&layout = m_trait.layout;
                // zero to enable byte-comparison
                memset(&layout, 0, sizeof(layout));
                layout.ndim = v.layout().ndim;
                layout.dtype = v.layout().dtype;
                for (size_t i = 0; i < layout.ndim; ++ i) {
                    layout.shape[i] = v.layout().shape[i];
                    layout.stride[i] = v.layout().stride[i];
                }
                // hash covers the bytes (if any) followed by the layout
                XXHash hasher;
                if (!v.empty()) {
                    hasher.update(v.raw_ptr(), m_trait.size_bytes);
                }
                hasher.update(&m_trait.layout, sizeof(m_trait.layout));
                m_trait.hash = hasher.digest();
            }

            //! keys are equal if their traits are bytewise equal and, for
            //! non-empty tensors, their bytes are equal
            bool operator == (const TensorKey &rhs) const {
                return !memcmp(&m_trait, &rhs.m_trait, sizeof(Trait)) &&
                       ((m_trait.size_bytes == 0 &&
                         rhs.m_trait.size_bytes == 0) ||
                        !memcmp(val_ptr(), rhs.val_ptr(), m_trait.size_bytes));
            }

            size_t hash() const {
                return m_trait.hash;
            }

            //! copy from m_val_ref to m_val, to avoid refed value being
            //! modified
            void copy_val_permanent() {
                if (m_trait.size_bytes == 0) return;
                mgb_assert(m_val.empty());
                m_val.resize(m_trait.size_bytes);
                memcpy(m_val.data(), m_val_ref.raw_ptr(), m_trait.size_bytes);
                m_val_ref = {};
            }
    };
    //! map key for a single scalar value
    struct ScalarKey {
        size_t hash = 0;
        DTypeScalar val;

        ScalarKey() = default;
        ScalarKey(const DTypeScalar &v):
            val{v}
        {
            hash = PODHash<DTypeScalar>::perform(&val, 1);
        }

        bool operator == (const ScalarKey &rhs) const {
            return val == rhs.val;
        }
    };
    //! hash functor returning the hash precomputed in the key
    struct Hash {
        size_t operator() (const TensorKey &key) const {
            return key.hash();
        }
        size_t operator() (const ScalarKey &key) const {
            return key.hash;
        }
    };

    std::unordered_map<TensorKey, Value, Hash> m_tensor2val;
    std::unordered_map<ScalarKey, Value, Hash> m_scalar2val;

    //! locked by both get() overloads while accessing the maps
    std::mutex m_mtx;

    //! set up \p dest from \p val on the comp node of this cache
    void setup_value(Value &dest, const HostTensorND &val) {
        dest.setup(m_comp_node, val);
    }

    public:
        //! max number of elements for a tensor to be stored in this cache
        static constexpr size_t MAX_SIZE = TensorLayout::MAX_NDIM * 4;

        struct VarNodeCache;

        DevValueCache(const CompNodeEnv &env):
            m_comp_node{env.comp_node()}
        {
        }

        //! get the cache stored in the user data of the env of \p cn, using
        //! a maker that constructs a new cache for that env
        static DevValueCache& inst(CompNode cn) {
            auto &&env = CompNodeEnv::from_comp_node(cn);
            auto maker = [&]() {
                return std::make_shared<DevValueCache>(env);
            };
            return env.get_user_data<DevValueCache>(maker);
        }

        //! get the cached value equal to \p tensor, creating it if needed
        const Value& get(const HostTensorND &tensor) {
            // scalars are handled by the DTypeScalar overload
            if (tensor.shape().is_scalar()) {
                return get(DTypeScalar::make_from_raw(
                            tensor.dtype(), tensor.raw_ptr()));
            }

            MGB_LOCK_GUARD(m_mtx);
            TensorKey key{tensor};
            Value &item = m_tensor2val[key];
            if (!item.initialized()) {
                setup_value(item, tensor);
                // the stored key still references the caller's tensor; make
                // it own a copy of the bytes (const_cast is needed because
                // map keys are const)
                const_cast<TensorKey&>(m_tensor2val.find(key)->first).
                    copy_val_permanent();
            }
            return item;
        }

        //! get the cached value equal to \p scalar, creating it if needed
        const Value& get(const DTypeScalar &scalar) {
            MGB_LOCK_GUARD(m_mtx);

            ScalarKey key{scalar};
            Value &item = m_scalar2val[key];
            if (!item.initialized()) {
                // materialize the scalar as a host tensor of shape {1}
                HostTensorND hv{m_comp_node, scalar.dtype()};
                hv.resize({1});
                memcpy(hv.raw_ptr(), scalar.storage(), scalar.dtype().size(1));
                setup_value(item, hv);
            }
            return item;
        }
};
MGB_TYPEINFO_OBJ_IMPL(ImmutableTensor::DevValueCache);
using ImmutableTensorDevValueCache = ImmutableTensor::DevValueCache;

//! user data mapping a Value address to the var created for it; an instance
//! is obtained from the graph options' user data in
//! ImmutableTensor::make_from_value()
struct ImmutableTensor::DevValueCache::VarNodeCache final:
        public UserDataContainer::UserData {
    ThinHashMap<const Value*, SymbolVar> val2var;

    MGB_TYPEINFO_OBJ_DECL;
};
MGB_TYPEINFO_OBJ_IMPL(ImmutableTensor::DevValueCache::VarNodeCache);

/*!
 * \brief construct an ImmutableTensor operator for an initialized value
 * \param graph owner graph
 * \param value the value; only a reference is stored in m_value
 * \param config operator config
 *
 * The summary of \p value is passed to the base class as default name.
 */
ImmutableTensor::ImmutableTensor(ComputingGraph &graph,
        const Value &value, const OperatorNodeConfig &config):
    Super{&graph, config, value.summary(), {}},
    m_value{value}
{
    mgb_assert(value.initialized());

    add_output(value.dev().dtype());
    // operators are distinguished by the address of the value
    add_equivalence_component<ScalarHash<const void*>>(&value);
    output(0)->add_flag(VarNode::Flag::ALLOW_EMPTY_SHAPE);
}

ImmutableTensor::~ImmutableTensor() noexcept = default;

/*!
 * \brief create an ImmutableTensor from a host value
 *
 * The comp node comes from \p config if set there, otherwise from \p val.
 * Tensors with at most DevValueCache::MAX_SIZE elements go through the
 * per-comp-node value cache; larger ones get a private Value that is kept
 * alive by the operator.
 */
SymbolVar ImmutableTensor::make(ComputingGraph &graph, const HostTensorND &val,
        const OperatorNodeConfig &config) {

    auto cn = val.comp_node();
    if (config.has_comp_node_set()) {
        cn = config.get_single_comp_node();
    }

    const bool cacheable =
            val.shape().total_nr_elems() <= DevValueCache::MAX_SIZE;
    if (cacheable) {
        auto &&cache = DevValueCache::inst(cn);
        return make_from_value(graph, cache.get(val), {}, config);
    }

    // tensor too large, do not dedup
    auto private_value = std::make_shared<Value>();
    private_value->setup(cn, val);
    return make_from_value(graph, *private_value, private_value, config);
}

//! create an ImmutableTensor from a scalar through the value cache of the
//! comp node given in \p config (which must be set)
SymbolVar ImmutableTensor::make(ComputingGraph &graph, const DTypeScalar &val,
        const OperatorNodeConfig &config) {
    mgb_assert(config.has_comp_node_set(),
            "comp node must be set for constructing ImmutableTensor from "
            "DTypeScalar");

    auto &&cache = DevValueCache::inst(config.get_single_comp_node());
    return make_from_value(graph, cache.get(val), {}, config);
}

//! the value on the comp node of this operator
const DeviceTensorND& ImmutableTensor::value() const {
    const DeviceTensorND& dev_val = m_value.dev();
    return dev_val;
}
//! the value on the default CPU comp node (see Value::static_infer)
const DeviceTensorND& ImmutableTensor::host_value()  {
    // static_infer() is non-const because it fills its cache on first use
    auto& mutable_value = const_cast<Value&>(m_value);
    return mutable_value.static_infer();
}

/*!
 * \brief get or create the var for a value in the given graph
 * \param graph owner graph
 * \param val the value; vars are deduplicated per graph by its address
 * \param val_refkeep if non-null, must point to \p val; it is stored in the
 *      newly created operator to keep the value alive
 * \param config operator config, used only when a new operator is created
 */
SymbolVar ImmutableTensor::make_from_value(
        ComputingGraph &graph,
        const Value &val, const std::shared_ptr<Value> &val_refkeep,
        const OperatorNodeConfig &config) {

    // per-graph map from value address to the var already created for it
    auto ud = graph.options().user_data.get_user_data_or_create
        <DevValueCache::VarNodeCache>(
                std::make_shared<DevValueCache::VarNodeCache>);
    SymbolVar &var = ud->val2var[&val];

    if (!var.node()) {
        // cache miss: create the operator and remember its output
        var = graph.insert_opr(std::make_unique<ImmutableTensor>(
                graph, val, config))->output(0);
        if (val_refkeep) {
            auto &&opr = var.node()->owner_opr()->cast_final<ImmutableTensor>();
            mgb_assert(&opr.m_value == val_refkeep.get() &&
                    !opr.m_value_refkeep);
            opr.m_value_refkeep = val_refkeep;
        }
    }
#if !MGB_BUILD_SLIM_SERVING
    // FIXME: make() of immutable tensor would return immediately instead of
    // calling insert_opr() when hitting cache, so we need call it manually.
    // see MGE-81
    else {
        if (graph.options().eager_evaluation) {
            auto &&opr = var.node()->owner_opr();
            // NOTE(review): this wraps an operator the graph already holds in
            // a new unique_ptr; presumably insert_opr() does not take a second
            // ownership of it in eager mode -- confirm against insert_opr()
            graph.insert_opr(std::unique_ptr<OperatorNodeBase>(opr));
        }
    }
#endif
    return var;
}

void ImmutableTensor::init_output_comp_node() {
    comp_node(m_value.dev().comp_node());
}

//! output shape is the shape of the device value
const TensorShape& ImmutableTensor::get_output_shape() {
    const DeviceTensorND& dev_val = m_value.dev();
    return dev_val.shape();
}

//! static value inference is always supported; if \p dest is given it is
//! assigned the CPU copy of the value
bool ImmutableTensor::fill_in_static_infer(DeviceTensorND *dest) {
    if (dest != nullptr) {
        auto& mutable_value = const_cast<Value&>(m_value);
        *dest = mutable_value.static_infer();
    }
    return true;
}

//! the device tensor held by this operator is the device value
const DeviceTensorND& ImmutableTensor::get_dev_tensor() const {
    const DeviceTensorND& dev_val = m_value.dev();
    return dev_val;
}

//! static infer source type of ImmutableTensor: CONSTANT
cg::static_infer::SourceType ImmutableTensor::static_infer_src_type() const {
    using SrcType = cg::static_infer::SourceType;
    return SrcType::CONSTANT;
}

/* ===================== Copy ===================== */

MGB_DYN_TYPE_OBJ_FINAL_IMPL(Copy);

//! construct a copy operator with \p inp as single input and one unnamed
//! output
Copy::Copy(VarNode *inp, const OperatorNodeConfig &config):
    Super{inp->owner_graph(), config, "copy", {inp}}
{
    add_input({inp});
    add_output(None);
}

//! insert a Copy of \p inp into its graph and return the output var
SymbolVar Copy::make(SymbolVar inp, const OperatorNodeConfig &config) {
    auto src_var = inp.node();
    return inp.insert_single_output_opr<Copy>(src_var, config);
}

/*!
 * \brief try to forward the input memory to the output in readonly mode
 *
 * Forwarding is attempted only if the graph does not force dynamic
 * allocation and input and output are on the same mem node; the outcome is
 * stored in m_mem_fwd_success.
 */
void Copy::mem_plan_fwd_in2out_readonly() {
    // copy on same CN in force_dynamic_alloc graphs usually used for
    // resolving dependency
    // TODO: add an option disable_auto_memfwd for Copy
    const bool forced_dynamic = owner_graph()->options().force_dynamic_alloc;
    const bool can_try_fwd =
            !forced_dynamic && output(0)->comp_node().mem_node() ==
                                       input(0)->comp_node().mem_node();
    if (!can_try_fwd) {
        m_mem_fwd_success = false;
        return;
    }

    m_mem_fwd_success = output(0)->set_fwd_in2out_readonly(
            input(0), SubTensorSpec::make_from_layout(input(0)->layout()));
}

//! after the base class setup, register the output as a WEAK COPY-stream var
//! when input and output are on different mem nodes
void Copy::init_output_comp_node() {
    Super::init_output_comp_node();
    const bool cross_mem_node =
            output(0)->comp_node().mem_node() != input(0)->comp_node().mem_node();
    if (!cross_mem_node) {
        return;
    }
    owner_graph()->seq_comp_node_optimizer().register_stream_var(
            output(0), {CompNode::Stream::COPY,
                        cg::SeqCompNodeOptimizer::StreamPropType::WEAK});
}

void Copy::init_rt_force_dynamic_mem_alloc_imply_chain() {
    auto ivar = input(0), ovar = output(0);
    auto cn0 = ivar->comp_node(), cn1 = ovar->comp_node();
    if (cn0 != cn1 && cn0.mem_node() == cn1.mem_node()) {
        // make it possible to forward memory between comp nodes on the same mem
        // node
        ivar->add_rt_force_dynamic_mem_alloc_imply_chain(ovar);
        ovar->add_rt_force_dynamic_mem_alloc_imply_chain(ivar);
    }
}

//! copy input to output, or, if memory was forwarded, only assert that both
//! tensors share pointer and layout
void Copy::scn_do_execute() {
    auto &&od = output(0)->dev_tensor(),
         &&id = input(0)->dev_tensor();
    if (!m_mem_fwd_success) {
        od.copy_from_fixlayout(id);
        return;
    }
    mgb_assert(od.raw_ptr() == id.raw_ptr() &&
            od.layout().eq_layout(id.layout()));
}

//! base node prop with CROSS_COMP_NODE_MEMORY and NO_AUTOMATIC_DUP added
Copy::NodeProp* Copy::do_make_node_prop() const {
    auto prop = Super::do_make_node_prop();
    prop->add_flag(NodeProp::Flag::CROSS_COMP_NODE_MEMORY);
    prop->add_flag(NodeProp::Flag::NO_AUTOMATIC_DUP);
    return prop;
}

#if MGB_ENABLE_GRAD
//! grad of Copy w.r.t. its only input: a Copy of the output grad, configured
//! to follow the comp node of the original input
MGB_IMPL_OPR_GRAD(Copy) {
    mgb_assert(wrt_idx == 0);
    return Copy::make(out_grad[0],
            OperatorNodeConfig{}.follow_comp_node(opr.input(0))).node();
}
#endif

//! for copies between different comp nodes, constrain the input layout by
//! the megdnn handle's check_cross_dev_copy_constraint()
void Copy::add_input_layout_constraint() {
    const bool cross_comp_node =
            input(0)->comp_node() != output(0)->comp_node();
    if (!cross_comp_node) {
        return;
    }
    input(0)->add_layout_constraint([this](const TensorLayout& layout) {
        auto handle = intl::get_megdnn_handle(this->comp_node());
        return handle->check_cross_dev_copy_constraint(layout);
    });
}

//! on top of the base class registration, the output value is inferred as
//! identity of the input
void Copy::init_output_static_infer_desc() {
    using namespace cg::static_infer;
    Super::init_output_static_infer_desc();
    auto&& infer_mgr = owner_graph()->static_infer_manager();
    infer_mgr.register_value_infer(
            output(0), ValueInferDesc::make_identity(input(0)));
}

/* ===================== MultipleDeviceTensorHolderBase ===================== */

/*!
 * \brief exec dependency holding the storages of the device values of a
 *      MultipleDeviceTensorHolderBase
 *
 * Storages of values whose output var is flagged MEMORY_NO_NEED are not
 * retained.
 */
class intl::MultipleDeviceTensorHolderBase::DevValuesExecDep final
        : public ExecDependency {
    SmallVector<DeviceTensorStorage> m_vals;

public:
    /*!
     * \param vals device values of \p opr, one per output var
     * \param opr the operator owning the output vars
     */
    explicit DevValuesExecDep(const ValueArray& vals,
                              MultipleDeviceTensorHolderBase* opr) {
        mgb_assert(vals.size() == opr->output().size(),
                   "the output value size is diff from output var size");
        for (size_t index = 0; index < vals.size(); index++) {
            if (!opr->output(index)->contain_flag(
                        VarNode::Flag::MEMORY_NO_NEED)) {
                // the tensors are shared with the operator, so their storage
                // must be copied and never moved out of them
                m_vals.emplace_back(vals[index]->storage());
            }
        }
    }
};

/*!
 * \brief construct an operator holding multiple device tensors
 * \param graph owner graph
 * \param values the device tensors; output i corresponds to values[i]
 * \param config operator config; must not have a comp node set
 */
intl::MultipleDeviceTensorHolderBase::MultipleDeviceTensorHolderBase(
        ComputingGraph& graph, ValueArray values,
        const OperatorNodeConfig& config)
        : Super(&graph, config, "multi_dv", {}), m_values{std::move(values)} {
    mgb_assert(
            !config.has_comp_node_set(),
            "comp node should not be set for MultipleDeviceTensorHolderBase");
    for (size_t i = 0; i < m_values.size(); ++i) {
        // one output named "o<i>" per value; operators are distinguished by
        // the addresses of the held tensors
        dv_helper::add_output(*this, m_values[i]->dtype(), ssprintf("o%zu", i));
        add_equivalence_component<ScalarHash<void*>>(m_values[i].get());
    }
}

/*!
 * \brief check all held values and signal kernel events
 *
 * The value checks are dispatched on the comp node of output 0; then a
 * BeforeKernel/AfterKernel event pair is dispatched on every comp node
 * returned by cg::get_opr_comp_node_set().
 */
void intl::MultipleDeviceTensorHolderBase::do_execute(ExecEnv& env) {
    // only dispatch to first comp node since all device values should be ready
    // due to PERSISTENT_DEVICE_VALUE
    auto check_values = [this]() {
        auto&& out_vars = output();
        const size_t nr_value = m_values.size();
        for (size_t idx = 0; idx != nr_value; ++idx) {
            dv_helper::check_in_exec(*m_values[idx], out_vars[idx]);
        }
    };
    env.dispatch_on_comp_node(output(0)->comp_node(), check_values);

    // Send BeforeKernel/AfterKernel event on every different comp_node
    ThinHashSet<mgb::CompNode> used_cns = cg::get_opr_comp_node_set(this);
    for (auto cn : used_cns) {
        auto send_event = [this, cn]() {
            auto&& graph_event = this->owner_graph()->event();
            graph_event.signal_inplace<cg::event::BeforeKernel>(this, cn);
            graph_event.signal_inplace<cg::event::AfterKernel>(this, cn);
        };
        env.dispatch_on_comp_node(cn, send_event);
    }
}

//! initialize the mem plan of output i from value i
void intl::MultipleDeviceTensorHolderBase::init_output_mem_plan(bool dynamic) {
    const size_t nr_value = m_values.size();
    for (size_t idx = 0; idx != nr_value; ++idx) {
        dv_helper::init_output_mem_plan(*m_values[idx], *this, dynamic, idx);
    }
}

//! always throws SystemError: this operator does not support a change of the
//! comp node stream of its outputs
void intl::MultipleDeviceTensorHolderBase::on_output_comp_node_stream_changed() {
    mgb_throw(SystemError, "comp node of device tensor should not change");
}

//! output i is placed on the comp node of value i
void intl::MultipleDeviceTensorHolderBase::init_output_comp_node() {
    const size_t nr_value = m_values.size();
    for (size_t idx = 0; idx != nr_value; ++idx) {
        output(idx)->comp_node(m_values[idx]->comp_node());
    }
}

//! register a CONSTANT shape infer for every output, reading the shape of
//! the corresponding held value
void intl::MultipleDeviceTensorHolderBase::init_output_static_infer_desc() {
    using namespace cg::static_infer;
    auto&& infer_mgr = owner_graph()->static_infer_manager();
    const size_t nr_value = m_values.size();
    for (size_t idx = 0; idx != nr_value; ++idx) {
        // capture the raw tensor pointer rather than `this`
        auto held = m_values[idx].get();
        auto shape_infer = [held](TensorShape& dest, const InpVal&) -> bool {
            dest = held->shape();
            return dest.ndim;
        };
        infer_mgr.register_shape_infer(
                output(idx), {SourceType::CONSTANT, {}, shape_infer});
    }
}

//! base node prop with CROSS_COMP_NODE_MEMORY added
intl::MultipleDeviceTensorHolderBase::NodeProp*
intl::MultipleDeviceTensorHolderBase::do_make_node_prop() const {
    auto prop = Super::do_make_node_prop();
    prop->add_flag(NodeProp::Flag::CROSS_COMP_NODE_MEMORY);
    return prop;
}

//! record the storages of the held values as a single exec dependency
void intl::MultipleDeviceTensorHolderBase::record_execute_deps(
        ExecDependencyArray& deps) {
    auto values_dep = std::make_unique<DevValuesExecDep>(values(), this);
    deps.emplace_back(std::move(values_dep));
}

/* ===================== MultipleDeviceTensorHolder ===================== */

MGB_DYN_TYPE_OBJ_FINAL_IMPL(MultipleDeviceTensorHolder);

//! create a MultipleDeviceTensorHolder for \p values, insert it into \p graph
//! and return all of its output vars
SymbolVarArray MultipleDeviceTensorHolder::make(
        ComputingGraph& graph, ValueArray values,
        const OperatorNodeConfig& config) {
    auto new_opr = std::make_unique<MultipleDeviceTensorHolder>(
            graph, std::move(values), config);
    auto inserted = graph.insert_opr(std::move(new_opr));
    return cg::to_symbol_var_array(inserted->output());
}

/* ================== MultipleDeviceTensorWithFormatHolder ================== */

MGB_DYN_TYPE_OBJ_FINAL_IMPL(MultipleDeviceTensorWithFormatHolder);

//! create a MultipleDeviceTensorWithFormatHolder for \p values, insert it
//! into \p graph and return all of its output vars
SymbolVarArray MultipleDeviceTensorWithFormatHolder::make(
        ComputingGraph& graph, ValueArray values,
        const OperatorNodeConfig& config) {
    auto new_opr = std::make_unique<MultipleDeviceTensorWithFormatHolder>(
            graph, std::move(values), config);
    auto inserted = graph.insert_opr(std::move(new_opr));
    return cg::to_symbol_var_array(inserted->output());
}

//! output i takes the tensor format of value i
void MultipleDeviceTensorWithFormatHolder::init_output_format() {
    const size_t nr_value = m_values.size();
    for (size_t idx = 0; idx != nr_value; ++idx) {
        output(idx)->format(m_values[idx]->format());
    }
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}