/**
 * \file src/tensorrt/impl/tensorrt_runtime_opr.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#include "megbrain/tensorrt/tensorrt_runtime_opr.h"
#include "megbrain/common.h"
#include "megbrain/plugin/profiler.h"
#include "megbrain/serialization/opr_load_dump.h"
#include "megbrain/version_symbol.h"
#include "megdnn/basic_types.h"

#include <cinttypes>

#if MGB_ENABLE_TENSOR_RT
#include <NvInferPlugin.h>

using namespace mgb;
using namespace opr;
using TensorRTManager = intl::TensorRTManager;

namespace {

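// map a TensorRT DataType to the corresponding MegBrain DType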
DType get_dtype_from_trt(nvinfer1::DataType trt_dtype) {
    switch (trt_dtype) {
        case nvinfer1::DataType::kFLOAT:
            return dtype::Float32();
        case nvinfer1::DataType::kHALF:
#if !MEGDNN_DISABLE_FLOAT16
            return dtype::Float16();
#else
            mgb_throw(MegBrainError, "Float16 support is disabled.");
#endif
        // We cannot get the scale of a tensor from the TensorRT engine, so the
        // scale here is not correct. When researchers build a TensorRT engine,
        // they should make sure the scales of quantized int8 tensors in
        // MegBrain match the dynamic ranges of the TensorRT tensors.
        case nvinfer1::DataType::kINT8:
            return dtype::QuantizedS8(1.f);
        case nvinfer1::DataType::kINT32:
            return dtype::Int32();
        default:
            mgb_assert("DataType of trt engine is unknown.");
    }
    return DType();
}

}  // anonymous namespace

/* ========================== TensorRTRuntimeOpr ========================== */

MGB_DYN_TYPE_OBJ_FINAL_IMPL(TensorRTRuntimeOpr);
TensorRTRuntimeOpr::TensorRTRuntimeOpr(
        std::shared_ptr<nvinfer1::ICudaEngine> engine,
        std::shared_ptr<GpuAllocator> gpu_allocator, const VarNodeArray& inputs,
        const OperatorNodeConfig& config)
        : Super(inputs.at(0)->owner_graph(), config, "tensor_rt", {inputs.at(0)}),
          m_gpu_allocator{std::move(gpu_allocator)},
          m_engine{std::move(engine)},
          m_trt_engine_has_batch{false} {
    mgb_assert(
            inputs[0]->comp_node().device_type() == CompNode::DeviceType::CUDA,
            "TensorRTRuntimeOpr can only be used on cuda comp nodes; got %s",
            inputs[0]->comp_node().to_string().c_str());
    size_t nr_input = 0;
    bool is_input = true;
#if NV_TENSOR_RT_VERSION >= 6001
    auto profile_num = m_engine->getNbOptimizationProfiles();
#else
    int profile_num = 1;
#endif
    auto bindings_per_profile = m_engine->getNbBindings() / profile_num;
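    // this opr assumes the engine lists all input bindings of a profile before
    // its output bindings (checked by the assert below); scan them to count the
    // inputs and to detect whether the engine keeps an explicit batch dimension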
    for (int i = 0; i < bindings_per_profile; ++i) {
        if (m_engine->bindingIsInput(nr_input)) {
            mgb_assert(is_input, "mixed input/output bindings");
            // nbDims == 3 means CHW, i.e. without a batch dimension
            if (m_engine->getBindingDimensions(nr_input).nbDims != 3)
                m_trt_engine_has_batch = true;
            ++nr_input;
        } else {
            is_input = false;
        }
    }
    size_t nr_output = bindings_per_profile - nr_input;
    mgb_assert(
            nr_input == inputs.size(), "input count mismatch: expect=%zu got=%zu",
            nr_input, inputs.size());
    for (auto i : inputs) {
        add_input({i});
    }
    if (nr_output == 1) {
        add_output(None);
    } else {
        for (size_t i = 0; i < nr_output; ++i)
            add_output(ssprintf("o%zu", i));
    }
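    // append an extra output var used as the TensorRT execution workspace; its
    // size is filled in get_output_var_shape() and its dtype is set to Byte in
    // init_output_dtype()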
    cg::add_workspace_output(this);
    add_equivalence_component<mgb::ScalarHash<void*>>(m_engine.get());
}

void TensorRTRuntimeOpr::get_output_var_shape(
        const TensorShapeArray& inp_shape, TensorShapeArray& out_shape) const {
    auto batch = inp_shape.at(0)[0];
    m_manager.create_trt_context(this->comp_node(), inp_shape, m_engine.get());
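    // translate a TensorRT binding into a MegBrain TensorShape: undo the NCHW4
    // vectorized layout if present, prepend the batch for implicit-batch engines,
    // and resolve dynamic (-1) dimensions from the input shapes or from the
    // inference context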
    auto get_mgb_shape = [&](int binding_idx) -> TensorShape {
        auto dims = m_engine->getBindingDimensions(binding_idx);
#if NV_TENSOR_RT_VERSION >= 6001
        auto format = m_engine->getBindingFormat(binding_idx);
        // convert dims to NCHW4 format
        if (format == nvinfer1::TensorFormat::kCHW4) {
            mgb_assert(
                    dims.nbDims == 3 || dims.nbDims == 4,
                    "Tensor with NCHW4 format should have dimensions of "
                    "3/4.(got: %d)",
                    dims.nbDims);
            int chan_pos = 0;
            if (dims.nbDims == 4) {
                chan_pos = 1;
            }
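            // e.g. CHW dims (64, 56, 56) become (16, 56, 56, 4) and
            // NCHW dims (1, 64, 56, 56) become (1, 16, 56, 56, 4)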
            dims.nbDims = dims.nbDims + 1;
            dims.d[chan_pos] = (dims.d[chan_pos] + 3) / 4;
            dims.d[dims.nbDims - 1] = 4;
        }
#endif
        auto shape = m_trt_engine_has_batch ? TensorRTOpr::dims2shape(dims)
                                            : TensorRTOpr::dims2shape(dims, batch);
#if NV_TENSOR_RT_VERSION >= 6001
        if (static_cast<size_t>(binding_idx) < inp_shape.size()) {
            for (int i = 0; i < dims.nbDims; i++) {
                if (dims.d[i] == -1) {
                    shape[i] = inp_shape.at(binding_idx)[i];
                }
            }
        } else {
            auto trt_infer_dims = m_manager.get_binding_dimensions(binding_idx);
            for (int i = 0; i < dims.nbDims; i++) {
                if (dims.d[i] == -1) {
                    shape[i] = trt_infer_dims.d[i];
                }
            }
        }
#endif
        return shape;
    };
    for (size_t i = 0; i < inp_shape.size(); ++i) {
        mgb_assert(batch == inp_shape[i][0], "input batch sizes do not match");
        TensorShape shp = get_mgb_shape(i);
        mgb_assert(
                shp.eq_shape(inp_shape[i]), "input shape mismatch: expect=%s got=%s",
                shp.to_string().c_str(), inp_shape[i].to_string().c_str());
    }
    for (size_t i = 0; i < out_shape.size() - 1; ++i) {
        out_shape[i] = get_mgb_shape(i + input().size());
    }
    out_shape.back() = {intl::workspace_size(m_engine.get())};
}

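// TensorRT bindings are raw device pointers, so every input tensor has to be
// contiguous.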
void TensorRTRuntimeOpr::add_input_layout_constraint() {
    for (auto i : input()) {
        i->add_layout_constraint_contiguous();
    }
}

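// Run the TensorRT engine. Engines built with an explicit batch dimension carry
// the batch in their bindings; otherwise the batch size of input(0) is passed to
// the execution context.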
void TensorRTRuntimeOpr::scn_do_execute() {
    auto batch = this->input(0)->shape()[0];
    if (m_trt_engine_has_batch)
        m_manager.exec(
                this, m_gpu_allocator ? m_gpu_allocator->comp_node() : CompNode{},
                m_engine.get());
    else
        m_manager.exec(
                this, m_gpu_allocator ? m_gpu_allocator->comp_node() : CompNode{},
                m_engine.get(), batch);
}

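// Propagate the dtypes recorded in the engine bindings to the output vars.
// Quantized int8 outputs must already carry a user-specified dtype, since the
// engine does not store the scale (see get_dtype_from_trt above).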
void TensorRTRuntimeOpr::init_output_dtype() {
    DType dt_trt, dt_input;
    int idx = 0;
    for (auto inp : input()) {
        dt_trt = get_dtype_from_trt(m_engine->getBindingDataType(idx));
        dt_input = inp->dtype();
        mgb_assert(
                dt_trt.valid() && dt_input.valid() &&
                        dt_trt.enumv() == dt_input.enumv(),
                "Input %d Dtype is not expected in trt engine: expected %s, "
                "got %s",
                idx, dt_trt.name(), dt_input.name());
        idx++;
    }

    size_t out = 0;
    for (; out < output().size() - 1; ++out) {
        dt_trt = get_dtype_from_trt(m_engine->getBindingDataType(idx));
        mgb_assert(
                dt_trt.valid(),
                "output dtype checking failed: invalid dtype returned.");
        if (dt_trt.enumv() == DTypeEnum::QuantizedS8) {
            mgb_assert(
                    output(out)->dtype().valid(),
                    "user should specify scale of output tensor of "
                    "TensorRTRuntimeOpr.");
        }
        if (!output(out)->dtype().valid())
            output(out)->dtype(dt_trt);
        idx++;
    }
    //! workspace
    if (!output(out)->dtype().valid())
        output(out)->dtype(dtype::Byte());
}

SymbolVarArray TensorRTRuntimeOpr::make(
        std::shared_ptr<nvinfer1::ICudaEngine> engine,
        std::shared_ptr<GpuAllocator> gpu_allocator, const SymbolVarArray& src,
        const OperatorNodeConfig& config) {
    mgb_assert(
            NV_TENSORRT_VERSION == getInferLibVersion(),
            "TensorRT version mismatch: compiled with %d; detected %d at runtime , may "
            "caused by customized environment, for example LD_LIBRARY_PATH on LINUX "
            "and PATH on Windows!!",
            NV_TENSORRT_VERSION, getInferLibVersion());
    VarNodeArray var_node_array = cg::to_var_node_array(src);
    auto tensor_rt_opr = std::make_unique<TensorRTRuntimeOpr>(
            std::move(engine), std::move(gpu_allocator), var_node_array, config);
    auto ret = cg::to_symbol_var_array(src[0].node()
                                               ->owner_graph()
                                               ->insert_opr(std::move(tensor_rt_opr))
                                               ->output());
    ret.pop_back();  // remove workspace
    return ret;
}

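// Build a TensorRTRuntimeOpr directly from a serialized engine: register the
// builtin TensorRT plugins, deserialize the buffer with an IRuntime that uses a
// GpuAllocator on the comp node of the first input, then forward to make() above.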
SymbolVarArray TensorRTRuntimeOpr::make(
        const void* buf, size_t buf_size, const SymbolVarArray& src,
        const OperatorNodeConfig& config) {
    mgb_throw_if(
            !CompNode::get_device_count(CompNode::DeviceType::CUDA), SystemError,
            "can not create TensorRTRuntimeOpr when CUDA is not available");
    mgb_assert(!src.empty(), "no inputs provided");
    initLibNvInferPlugins(&TensorRTOpr::Logger::instance(), "");
    TensorRTUniquePtr<nvinfer1::IRuntime> runtime{
            nvinfer1::createInferRuntime(TensorRTOpr::Logger::instance()), {}};
    auto gpu_allocator = std::make_shared<GpuAllocator>(src[0].node()->comp_node());
    runtime->setGpuAllocator(gpu_allocator.get());
    auto engine = runtime->deserializeCudaEngine(buf, buf_size, nullptr);
    mgb_assert(engine, "failed to deserialize ICudaEngine");
    return make(to_shared_ptr_engine(engine), gpu_allocator, src, config);
}

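// Serialization support: dump() stores the serialized ICudaEngine as a
// length-prefixed buffer in the graph dump, and load() rebuilds the opr from
// that buffer via the make() overload above.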
void TensorRTRuntimeOpr::LoadDumpImpl::dump(
        serialization::OprDumpContext& ctx, const cg::OperatorNodeBase& opr) {
    TensorRTUniquePtr<nvinfer1::IHostMemory> buf{
            opr.cast_final_safe<Opr>().trt_cuda_engine()->serialize(), {}};
    mgb_assert(buf, "failed to serialize ICudaEngine");
    ctx.dump_buf_with_len(buf->data(), buf->size());
}

cg::OperatorNodeBase* TensorRTRuntimeOpr::LoadDumpImpl::load(
        serialization::OprLoadContext& ctx, const cg::VarNodeArray& inputs,
        const OperatorNodeConfig& config) {
    inputs.at(0)->comp_node().activate();
    auto buf = ctx.load_shared_buf_with_len();
    return Opr::make(buf.data(), buf.size(), cg::to_symbol_var_array(inputs), config)
            .at(0)
            .node()
            ->owner_opr();
}

#endif  // MGB_ENABLE_TENSOR_RT

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}