/**
 * \file src/tensorrt/impl/tensorrt_runtime_opr.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#include "megbrain/tensorrt/tensorrt_runtime_opr.h"
#include "megbrain/common.h"
#include "megbrain/plugin/profiler.h"
#include "megbrain/serialization/opr_load_dump.h"
#include "megbrain/version_symbol.h"
#include "megdnn/basic_types.h"

#include <cinttypes>

#if MGB_ENABLE_TENSOR_RT

using namespace mgb;
using namespace opr;
using TensorRTManager = intl::TensorRTManager;

namespace {

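// Map a TensorRT binding DataType to the corresponding MegBrain DType.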
DType get_dtype_from_trt(nvinfer1::DataType trt_dtype) {
    switch (trt_dtype) {
        case nvinfer1::DataType::kFLOAT:
            return dtype::Float32();
        case nvinfer1::DataType::kHALF:
#if !MEGDNN_DISABLE_FLOAT16
            return dtype::Float16();
#else
            mgb_throw(MegBrainError, "Float16 support is disabled.");
#endif
        // We cannot get the scale of a tensor from a TensorRT engine, so the
        // scale here is a placeholder. When researchers build a TensorRT
        // engine, they should make sure the scales of quantized int8 tensors
        // in MegBrain match the dynamic ranges of the corresponding TensorRT
        // tensors.
        case nvinfer1::DataType::kINT8:
            return dtype::QuantizedS8(1.f);
        case nvinfer1::DataType::kINT32:
            return dtype::Int32();
        default:
            mgb_assert("DataType of trt engine is unknown.");
    }
    return DType();
}

}  // anonymous namespace

/* ========================== TensorRTRuntimeOpr ========================== */

MGB_DYN_TYPE_OBJ_FINAL_IMPL(TensorRTRuntimeOpr);
TensorRTRuntimeOpr::TensorRTRuntimeOpr(
        std::shared_ptr<nvinfer1::ICudaEngine> engine,
        std::shared_ptr<GpuAllocator> gpu_allocator, const VarNodeArray& inputs,
        const OperatorNodeConfig& config)
        : Super(inputs.at(0)->owner_graph(), config, "tensor_rt", {inputs.at(0)}),
          m_gpu_allocator{std::move(gpu_allocator)},
          m_engine{std::move(engine)},
          m_trt_engine_has_batch{false} {
    mgb_assert(
            inputs[0]->comp_node().device_type() == CompNode::DeviceType::CUDA,
            "TensorRTRuntimeOpr can only be used on cuda comp nodes; got %s",
            inputs[0]->comp_node().to_string().c_str());
    size_t nr_input = 0;
    bool is_input = true;
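    // count the input bindings; the engine is expected to expose all input
    // bindings before any output binding (checked by the assert below)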
    for (int i = 0; i < m_engine->getNbBindings(); ++i) {
        if (m_engine->bindingIsInput(i)) {
            mgb_assert(is_input, "mixed input/output bindings");
            // nbDims == 3 means CHW, i.e. the engine was built without an
            // explicit batch dimension
            if (m_engine->getBindingDimensions(i).nbDims != 3)
                m_trt_engine_has_batch = true;
            ++nr_input;
        } else {
            is_input = false;
        }
    }
    size_t nr_output = m_engine->getNbBindings() - nr_input;
    mgb_assert(
            nr_input == inputs.size(), "input count mismatch: expect=%zu got=%zu",
            nr_input, inputs.size());
    for (auto i : inputs) {
        add_input({i});
    }
    if (nr_output == 1) {
        add_output(None);
    } else {
        for (size_t i = 0; i < nr_output; ++i)
            add_output(ssprintf("o%zu", i));
    }
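    // append an extra output var that serves as the TensorRT device
    // workspace; its size is filled in by get_output_var_shape()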
    cg::add_workspace_output(this);
    add_equivalence_component<mgb::ScalarHash<void*>>(m_engine.get());
}

void TensorRTRuntimeOpr::get_output_var_shape(
        const TensorShapeArray& inp_shape, TensorShapeArray& out_shape) const {
    auto batch = inp_shape.at(0)[0];
    auto get_mgb_shape = [this, batch](int binding_idx) -> TensorShape {
        auto dims = m_engine->getBindingDimensions(binding_idx);
#if NV_TENSOR_RT_VERSION >= 6001
        auto format = m_engine->getBindingFormat(binding_idx);
        // converting dims to nchw4 format
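        // e.g. a CHW binding (64, 56, 56) becomes (16, 56, 56, 4), and an
        // NCHW binding (n, 64, 56, 56) becomes (n, 16, 56, 56, 4)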
        if (format == nvinfer1::TensorFormat::kCHW4) {
            mgb_assert(
                    dims.nbDims == 3 || dims.nbDims == 4,
                    "Tensor with NCHW4 format should have dimensions of "
                    "3/4.(got: %d)",
                    dims.nbDims);
            int chan_pos = 0;
            if (dims.nbDims == 4) {
                chan_pos = 1;
            }
            dims.nbDims = dims.nbDims + 1;
            dims.d[chan_pos] = (dims.d[chan_pos] + 3) / 4;
            dims.d[dims.nbDims - 1] = 4;
        }
#endif
        return m_trt_engine_has_batch ? TensorRTOpr::dims2shape(dims)
                                      : TensorRTOpr::dims2shape(dims, batch);
    };
    for (size_t i = 0; i < inp_shape.size(); ++i) {
        mgb_assert(batch == inp_shape[i][0], "input batch sizes do not match");
        TensorShape shp = get_mgb_shape(i);
        mgb_assert(
                shp.eq_shape(inp_shape[i]), "input shape mismatch: expect=%s got=%s",
                shp.to_string().c_str(), inp_shape[i].to_string().c_str());
    }
    for (size_t i = 0; i < out_shape.size() - 1; ++i) {
        out_shape[i] = get_mgb_shape(i + input().size());
    }
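    // the trailing output is the workspace var appended in the constructor;
    // its size is queried from the engine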
    out_shape.back() = {intl::workspace_size(m_engine.get())};
}

void TensorRTRuntimeOpr::add_input_layout_constraint() {
    for (auto i : input()) {
        i->add_layout_constraint_contiguous();
    }
}

void TensorRTRuntimeOpr::scn_do_execute() {
    auto batch = this->input(0)->shape()[0];
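    // an engine built with an explicit batch dimension already encodes the
    // batch size in its bindings; otherwise forward the batch taken from
    // input(0) to the manager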
    if (m_trt_engine_has_batch)
        m_manager.exec(
                this, m_gpu_allocator ? m_gpu_allocator->comp_node() : CompNode{},
                m_engine.get());
    else
        m_manager.exec(
                this, m_gpu_allocator ? m_gpu_allocator->comp_node() : CompNode{},
                m_engine.get(), batch);
}

void TensorRTRuntimeOpr::init_output_dtype() {
    DType dt_trt, dt_input;
    int idx = 0;
    for (auto inp : input()) {
        dt_trt = get_dtype_from_trt(m_engine->getBindingDataType(idx));
        dt_input = inp->dtype();
        mgb_assert(
                dt_trt.valid() && dt_input.valid() &&
                        dt_trt.enumv() == dt_input.enumv(),
                "Input %d Dtype is not expected in trt engine: expected %s, "
                "got %s",
                idx, dt_trt.name(), dt_input.name());
        idx++;
    }

    for (size_t i = 0; i < output().size(); ++i) {
        dt_trt = get_dtype_from_trt(m_engine->getBindingDataType(idx));
        mgb_assert(
                dt_trt.valid(),
                "output dtype checking failed: invalid dtype returned.");
        if (dt_trt.enumv() == DTypeEnum::QuantizedS8) {
            mgb_assert(
                    output(i)->dtype().valid(),
                    "user should specify scale of output tensor of "
                    "TensorRTRuntimeOpr.");
        }
        if (!output(i)->dtype().valid())
            output(i)->dtype(dt_trt);
        idx++;
    }
}

SymbolVarArray TensorRTRuntimeOpr::make(
        std::shared_ptr<nvinfer1::ICudaEngine> engine,
        std::shared_ptr<GpuAllocator> gpu_allocator, const SymbolVarArray& src,
        const OperatorNodeConfig& config) {
    VarNodeArray var_node_array = cg::to_var_node_array(src);
    auto tensor_rt_opr = std::make_unique<TensorRTRuntimeOpr>(
            std::move(engine), std::move(gpu_allocator), var_node_array, config);
    auto ret = cg::to_symbol_var_array(src[0].node()
                                               ->owner_graph()
                                               ->insert_opr(std::move(tensor_rt_opr))
                                               ->output());
    ret.pop_back();  // remove workspace
    return ret;
}

SymbolVarArray TensorRTRuntimeOpr::make(
        const void* buf, size_t buf_size, const SymbolVarArray& src,
        const OperatorNodeConfig& config) {
    mgb_throw_if(
            !CompNode::get_device_count(CompNode::DeviceType::CUDA), SystemError,
            "can not create TensorRTRuntimeOpr when CUDA is not available");
    mgb_assert(!src.empty(), "no inputs provided");
    TensorRTUniquePtr<nvinfer1::IRuntime> runtime{
            nvinfer1::createInferRuntime(TensorRTOpr::Logger::instance()), {}};
    auto gpu_allocator = std::make_shared<GpuAllocator>(src[0].node()->comp_node());
    runtime->setGpuAllocator(gpu_allocator.get());
    auto engine = runtime->deserializeCudaEngine(buf, buf_size, nullptr);
    mgb_assert(engine, "failed to deserialize ICudaEngine");
    return make(to_shared_ptr_engine(engine), gpu_allocator, src, config);
}
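
// Minimal usage sketch for building the opr from a serialized engine. The
// buffer `plan` and the CUDA-placed SymbolVar `x` below are hypothetical
// names, not part of this file:
//
//     std::vector<uint8_t> plan = /* bytes of a serialized ICudaEngine */;
//     SymbolVarArray outs =
//             TensorRTRuntimeOpr::make(plan.data(), plan.size(), {x}, {});
//     // one SymbolVar per engine output; the trailing workspace var has
//     // already been dropped by make()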

void TensorRTRuntimeOpr::LoadDumpImpl::dump(
        serialization::OprDumpContext& ctx, const cg::OperatorNodeBase& opr) {
    TensorRTUniquePtr<nvinfer1::IHostMemory> buf{
            opr.cast_final_safe<Opr>().trt_cuda_engine()->serialize(), {}};
    mgb_assert(buf, "failed to serialize ICudaEngine");
    ctx.dump_buf_with_len(buf->data(), buf->size());
}

cg::OperatorNodeBase* TensorRTRuntimeOpr::LoadDumpImpl::load(
        serialization::OprLoadContext& ctx, const cg::VarNodeArray& inputs,
        const OperatorNodeConfig& config) {
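    // ensure the CUDA device of the first input is active before the engine
    // is deserialized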
    inputs.at(0)->comp_node().activate();
    auto buf = ctx.load_shared_buf_with_len();
    return Opr::make(buf.data(), buf.size(), cg::to_symbol_var_array(inputs), config)
            .at(0)
            .node()
            ->owner_opr();
}

#endif  // MGB_ENABLE_TENSOR_RT

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}