/**
 * \file src/opr/impl/mc20_runtime_op.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

#include "megbrain/opr/mc20_runtime_op.h"
#include "megbrain/common.h"
#include "megbrain/graph/event.h"
#include "megdnn/dtype.h"

#include <memory>

#if MGB_MC20
using namespace mgb;
using namespace opr;

namespace {
TensorShape mc20_shape_to_mgb_shape(AX_NPU_SDK_EX_TENSOR_META_T tensor_meta) {
    TensorShape ret;
    ret.ndim = tensor_meta.nShapeNDim;
    for (size_t i = 0; i < ret.ndim; ++i) {
        ret[i] = tensor_meta.pShape[i];
    }
    return ret;
}

DType mc20_dtype_to_mgb_dtype(AX_NPU_SDK_EX_ADV_TENSOR_DTYPE data_type) {
    switch (data_type) {
        case AX_NPU_TDT_UINT8:
            return dtype::Uint8();
        case AX_NPU_TDT_FLOAT32:
            return dtype::Float32();
        case AX_NPU_TDT_INT16:
            return dtype::Int16();
        case AX_NPU_TDT_INT32:
            return dtype::Int32();
        default:
            mgb_throw(
                    MegBrainError, "MC20DataType %d is not supported by MegBrain.",
                    static_cast<int>(data_type));
    }
}
}  // namespace

constexpr AX_NPU_SDK_EX_HANDLE_T MC20RuntimeOpr::INVALID_MODEL_HANDLE;

/* ====================== MC20RuntimeOpr ==================== */
MGB_DYN_TYPE_OBJ_FINAL_IMPL(MC20RuntimeOpr);

MC20RuntimeOpr::MC20RuntimeOpr(
        SharedBuffer buf, AX_NPU_SDK_EX_HANDLE_T model_handle,
        const VarNodeArray& inputs, const OperatorNodeConfig& config)
        : Super(inputs[0]->owner_graph(), config, "mc20_runtime", inputs),
          m_buffer{std::move(buf)},
          m_model_handle(model_handle) {
    mgb_assert(
            inputs[0]->comp_node().device_type() == CompNode::DeviceType::MC20,
            "MC20RuntimeOpr can only be used on mc20 comp node; got %s",
            inputs[0]->comp_node().to_string().c_str());
    for (auto i : inputs) {
        add_input({i});
    }
    if (m_model_handle == INVALID_MODEL_HANDLE) {
        MGB_MC20_CHECK(AX_NPU_SDK_EX_Create_handle(
                &m_model_handle, m_buffer.data(), m_buffer.size()));
        m_is_model_holder = true;
    }

    const AX_NPU_SDK_EX_ADV_IO_INFO_T* io_info =
            AX_NPU_SDK_EX_ADV_Get_io_info(m_model_handle);
    size_t nr_outputs = io_info->nOutputSize;
    bool has_workspace = false;
    if (nr_outputs == 1) {
        const auto& tensor_meta = *(io_info->pOutputs[0].pTensorMeta);
        add_output(std::string(reinterpret_cast<const char*>(tensor_meta.pName)));
        if (tensor_meta.eMemoryType == AX_NPU_MT_VIRTUAL) {
            mgb_assert(tensor_meta.nInnerSize > 0);
            has_workspace = true;
        }
    } else {
        for (size_t i = 0; i < nr_outputs; ++i) {
            const auto& tensor_meta = *(io_info->pOutputs[i].pTensorMeta);
            add_output(std::string(reinterpret_cast<const char*>(tensor_meta.pName)));
            if (tensor_meta.eMemoryType == AX_NPU_MT_VIRTUAL) {
                mgb_assert(tensor_meta.nInnerSize > 0);
                has_workspace = true;
            }
        }
    }
    mgb_assert(has_workspace, "Currently only support model with cpu tail");

    //! \warning There is no interface in MC20 to get the batch size of the
    //! model. MC20 supports multi-batch by changing an n-batch input into n
    //! 1-batch inputs.
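    //! For illustration only (hypothetical numbers, not from a real .neu
    //! model): a model compiled for 4-batch execution exposes 4 single-batch
    //! copies of each logical input, so with a single megbrain input var
    //! io_info->nInputSize == 4 and m_model_batch below is 4 / 1 = 4.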
    mgb_assert(
            io_info->nInputSize % inputs.size() == 0,
            "The number of inputs in the neu model should be a multiple of "
            "the number of inputs in megbrain, but got %zu (neu model) vs "
            "%zu (mgb model)",
            io_info->nInputSize, inputs.size());
    m_model_batch = io_info->nInputSize / inputs.size();

    add_equivalence_component<mgb::ScalarHash<const void*>>(m_buffer.data());
    cg::add_workspace_output(this);
}

MC20RuntimeOpr::~MC20RuntimeOpr() {
    if (m_is_model_holder) {
        MGB_MC20_CHECK(AX_NPU_SDK_EX_Destroy_handle(m_model_handle));
    }
}

void MC20RuntimeOpr::execute_mc20() {
    auto&& mc20_env =
            CompNodeEnv::from_comp_node(input(0)->comp_node()).mc20_env();
    mc20_env.activate();

    const AX_NPU_SDK_EX_ADV_IO_INFO_T* io_info =
            AX_NPU_SDK_EX_ADV_Get_io_info(m_model_handle);
    AX_NPU_SDK_EX_IO_T npu_io;
    memset(&npu_io, 0, sizeof(npu_io));

    size_t batch_size = input(0)->dev_tensor().layout().shape[0];
    //! the megbrain batch is split into batch_size / m_model_batch sequential
    //! model runs
    for (size_t batch_idx = 0; batch_idx < batch_size; batch_idx += m_model_batch) {
        //! prepare input
        npu_io.nInputSize = io_info->nInputSize;
        auto inputs = std::make_unique<AX_NPU_SDK_EX_BUF_T[]>(npu_io.nInputSize);
        npu_io.pInputs = inputs.get();
        for (size_t i = 0; i < npu_io.nInputSize; i++) {
            // get input addr info
            size_t inp_idx = i / m_model_batch;
            AX_VOID* p_virtual_addr = input(inp_idx)->dev_tensor().raw_ptr();
            AX_U64 phy_addr =
                    MC20MemoryManager::Instance().get_phyaddr(p_virtual_addr);
            auto nr_bytes_per_batch =
                    input(inp_idx)->layout().span().dist_byte() / batch_size;
            // add batch offset
            p_virtual_addr = reinterpret_cast<AX_VOID*>(
                    reinterpret_cast<uintptr_t>(p_virtual_addr) +
                    nr_bytes_per_batch * (batch_idx + i % m_model_batch));
            phy_addr += nr_bytes_per_batch * (batch_idx + i % m_model_batch);
            MGB_MC20_CHECK(AX_NPU_SDK_EX_ADV_Make_io_buffer(
                    phy_addr, p_virtual_addr, nr_bytes_per_batch, phy_addr,
                    p_virtual_addr, nr_bytes_per_batch, &npu_io.pInputs[i]));
        }

        //! prepare output
        npu_io.nOutputSize = io_info->nOutputSize;
        auto outputs = std::make_unique<AX_NPU_SDK_EX_BUF_T[]>(npu_io.nOutputSize);
        npu_io.pOutputs = outputs.get();

        AX_U32 offset = 0;
        AX_VOID* inner_virtual_addr_start = nullptr;
        AX_U64 inner_phy_addr_start = 0;
        // get inner addr from workspace
        inner_virtual_addr_start = output(npu_io.nOutputSize)->dev_tensor().raw_ptr();
        inner_phy_addr_start =
                MC20MemoryManager::Instance().get_phyaddr(inner_virtual_addr_start);
        for (size_t i = 0; i < npu_io.nOutputSize; i++) {
            // get output addr info
            AX_VOID* p_virtual_addr = output(i)->dev_tensor().raw_ptr();
            AX_U64 phy_addr = 0;
            auto nr_bytes_per_batch =
                    output(i)->layout().span().dist_byte() / batch_size;
            // add batch offset
            p_virtual_addr = reinterpret_cast<AX_VOID*>(
                    reinterpret_cast<uintptr_t>(p_virtual_addr) +
                    nr_bytes_per_batch * batch_idx);
            phy_addr += nr_bytes_per_batch * batch_idx;
            const auto& tensor_meta = *(io_info->pOutputs[i].pTensorMeta);
            if (tensor_meta.eMemoryType == AX_NPU_MT_PHYSICAL) {
                MGB_MC20_CHECK(AX_NPU_SDK_EX_ADV_Make_io_buffer(
                        phy_addr, p_virtual_addr, nr_bytes_per_batch, phy_addr,
                        p_virtual_addr, nr_bytes_per_batch, &npu_io.pOutputs[i]));
            } else if (tensor_meta.eMemoryType == AX_NPU_MT_VIRTUAL) {
                // outputs produced by the cpu tail live in the workspace
                auto p_inner_virtual_addr = reinterpret_cast<AX_VOID*>(
                        reinterpret_cast<uintptr_t>(inner_virtual_addr_start) +
                        offset);
                auto inner_phy_addr = inner_phy_addr_start + offset;
                MGB_MC20_CHECK(AX_NPU_SDK_EX_ADV_Make_io_buffer(
                        phy_addr, p_virtual_addr, nr_bytes_per_batch, inner_phy_addr,
                        p_inner_virtual_addr, tensor_meta.nInnerSize,
                        &npu_io.pOutputs[i]));
                offset += tensor_meta.nInnerSize;
            }
        }
        MGB_MC20_CHECK(AX_NPU_SDK_EX_Run_task_sync(m_model_handle, &npu_io));
    }
}
void MC20RuntimeOpr::init_output_comp_node() {
    //! set outputs to the cpu comp node if the model has a cpu tail
    const AX_NPU_SDK_EX_ADV_IO_INFO_T* io_info =
            AX_NPU_SDK_EX_ADV_Get_io_info(m_model_handle);
    CompNode input_cn;
    for (auto&& i : input()) {
        if (!input_cn.valid()) {
            input_cn = i->comp_node();
        } else {
            mgb_assert(
                    input_cn.mem_node() == i->comp_node().mem_node(),
                    "opr %s{%s} requires all inputs to be on the same memory "
                    "node expect=%s cur_var=%s cur_cn=%s",
                    this->cname(), this->dyn_typeinfo()->name,
                    input_cn.to_string().c_str(), i->cname(),
                    i->comp_node().to_string().c_str());
        }
    }
    for (size_t i = 0; i < io_info->nOutputSize; i++) {
        //! comp node of the var should be default_cpu as the output will be
        //! proxied to the user
        output(i)->comp_node(CompNode::default_cpu());
    }
    //! the last output is the workspace, which should be on the same comp node
    //! as the inputs
    output(io_info->nOutputSize)->comp_node(input_cn);
}

MC20RuntimeOpr::NodeProp* MC20RuntimeOpr::do_make_node_prop() const {
    auto ret = Super::do_make_node_prop();
    ret->add_flag(NodeProp::Flag::CROSS_COMP_NODE_MEMORY);
    return ret;
}

void MC20RuntimeOpr::do_execute(ExecEnv& env) {
    CompNode cn = output(0)->comp_node();
    auto runner = [this, cn]() {
        this->owner_graph()->event().signal_inplace<cg::event::BeforeKernel>(this, cn);
        cn.activate();
        execute_mc20();
        this->owner_graph()->event().signal_inplace<cg::event::AfterKernel>(this, cn);
    };
    env.dispatch_on_comp_node(cn, runner);

    // send BeforeKernel/AfterKernel events on every different comp node
    ThinHashSet<CompNode> st = cg::get_opr_comp_node_set(this);
    for (auto cn : st) {
        auto send_event = [this, cn]() {
            this->owner_graph()->event().signal_inplace<cg::event::BeforeKernel>(
                    this, cn);
            this->owner_graph()->event().signal_inplace<cg::event::AfterKernel>(
                    this, cn);
        };
        env.dispatch_on_comp_node(cn, send_event);
    }
}

void MC20RuntimeOpr::on_output_comp_node_stream_changed() {
    mgb_throw(SystemError, "comp node of output should not change");
}

void MC20RuntimeOpr::get_output_var_shape(
        const TensorShapeArray& inp_shape, TensorShapeArray& out_shape) const {
    const AX_NPU_SDK_EX_ADV_IO_INFO_T* io_info =
            AX_NPU_SDK_EX_ADV_Get_io_info(m_model_handle);
    size_t nr_inputs = io_info->nInputSize;
    for (size_t i = 0; i < nr_inputs; ++i) {
        const auto& tensor_meta = *(io_info->pInputs[i].pTensorMeta);
        auto model_shape = mc20_shape_to_mgb_shape(tensor_meta);
        size_t inp_idx = i / m_model_batch;
        // enable multi-batch
        mgb_assert(
                inp_shape[inp_idx][0] % model_shape[0] == 0 &&
                        (inp_shape[inp_idx][0] / model_shape[0]) % m_model_batch == 0,
                "input %zu batch is %zu, while model's input batch is %zu", i,
                inp_shape[inp_idx][0], model_shape[0]);
        model_shape[0] = inp_shape[inp_idx][0];
        mgb_assert(
                model_shape.eq_shape(inp_shape[inp_idx]),
                "shape mismatch of input %zu, expected: %s got: %s", i,
                model_shape.to_string().c_str(),
                inp_shape[inp_idx].to_string().c_str());
    }

    size_t input_batch = (io_info->pInputs[0].pTensorMeta)->pShape[0];
    //! \warning the mc20 sdk implements multi-batch by breaking an n-batch
    //! input up into n 1-batch inputs
    mgb_assert(
            input_batch == 1, "input batch: %zu, net's input batch: 1", input_batch);
    AX_U32 workspace_size = 0;
    for (size_t i = 0; i < io_info->nOutputSize; ++i) {
        const auto& tensor_meta = *(io_info->pOutputs[i].pTensorMeta);
        out_shape[i] = mc20_shape_to_mgb_shape(tensor_meta);
        // enable multi-batch
        out_shape[i][0] =
                out_shape[i][0] * inp_shape[0][0] / input_batch / m_model_batch;
        if (tensor_meta.eMemoryType == AX_NPU_MT_VIRTUAL) {
            workspace_size += tensor_meta.nInnerSize;
        }
    }
    out_shape.back() = {workspace_size};
}
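// A hypothetical walk-through of the multi-batch shape arithmetic above
// (numbers are illustrative only, not taken from a real .neu model): assume
// m_model_batch == 2, the model input meta has batch 1 (so input_batch == 1)
// and megbrain feeds a (6, 3, 224, 224) input, i.e. inp_shape[0][0] == 6. If
// one model run yields an output meta of shape (2, 1000), the output var is
// scaled to out_shape[i][0] = 2 * 6 / 1 / 2 = 6, i.e. (6, 1000): one row per
// megbrain batch, produced by the 6 / 2 = 3 sequential runs in execute_mc20().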
void MC20RuntimeOpr::add_input_layout_constraint() {
    //! default contiguous
    for (auto i : input()) {
        i->add_layout_constraint_contiguous();
    }
}

void MC20RuntimeOpr::init_output_dtype() {
    DType dt_mc20, dt_input;
    const AX_NPU_SDK_EX_ADV_IO_INFO_T* io_info =
            AX_NPU_SDK_EX_ADV_Get_io_info(m_model_handle);
    for (size_t i = 0; i < io_info->nInputSize; ++i) {
        dt_mc20 = mc20_dtype_to_mgb_dtype(io_info->pInputs[i].eDType);
        size_t inp_idx = i / m_model_batch;
        dt_input = input(inp_idx)->dtype();
        mgb_assert(
                dt_mc20.valid() && dt_input.valid() &&
                        dt_mc20.enumv() == dt_input.enumv(),
                "dtype mismatch of input %zu: expected %s, got %s", i,
                dt_mc20.name(), dt_input.name());
    }

    for (size_t i = 0; i < io_info->nOutputSize; ++i) {
        dt_mc20 = mc20_dtype_to_mgb_dtype(io_info->pOutputs[i].eDType);
        mgb_assert(
                dt_mc20.valid(),
                "output dtype checking failed: invalid dtype returned.");
        if (!output(i)->dtype().valid())
            output(i)->dtype(dt_mc20);
    }
}

SymbolVarArray MC20RuntimeOpr::make(
        SharedBuffer buf, const SymbolVarArray& src,
        const OperatorNodeConfig& config) {
    VarNodeArray var_node_array = cg::to_var_node_array(src);
    auto mc20_runtime_opr = std::make_unique<MC20RuntimeOpr>(
            std::move(buf), INVALID_MODEL_HANDLE, var_node_array, config);
    auto ret = cg::to_symbol_var_array(src[0].node()
                                               ->owner_graph()
                                               ->insert_opr(std::move(mc20_runtime_opr))
                                               ->output());
    ret.pop_back();  // remove workspace
    return ret;
}

SymbolVarArray MC20RuntimeOpr::make(
        const void* buf, size_t size, const SymbolVarArray& src,
        const OperatorNodeConfig& config) {
    mgb_throw_if(
            !CompNode::get_device_count(CompNode::DeviceType::MC20), SystemError,
            "can not create MC20RuntimeOpr when mc20 is not available");
    std::shared_ptr<uint8_t> shptr{new uint8_t[size], [](uint8_t* p) { delete[] p; }};
    memcpy(shptr.get(), buf, size);
    SharedBuffer buffer{std::move(shptr), size};
    return make(std::move(buffer), src, config);
}

SymbolVarArray MC20RuntimeOpr::make(
        SharedBuffer buf, AX_NPU_SDK_EX_HANDLE_T model_handle,
        const SymbolVarArray& src, const OperatorNodeConfig& config) {
    VarNodeArray var_node_array = cg::to_var_node_array(src);
    auto mc20_runtime_opr = std::make_unique<MC20RuntimeOpr>(
            std::move(buf), model_handle, var_node_array, config);
    auto ret = cg::to_symbol_var_array(src[0].node()
                                               ->owner_graph()
                                               ->insert_opr(std::move(mc20_runtime_opr))
                                               ->output());
    ret.pop_back();  // remove workspace
    return ret;
}

#endif  // MGB_MC20

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}