tensorrt_engine_op.h 8.5 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */

#pragma once

#ifdef PADDLE_WITH_CUDA

G
gongweibao 已提交
19 20 21
#include <string>
#include <unordered_set>
#include <vector>

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/engine.h"

namespace paddle {
29 30 31

DECLARE_int32(tensorrt_engine_batch_size);

32 33
namespace operators {

N
nhzlx 已提交
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
using FluidDT = framework::proto::VarType_Type;
using TRT_DT = nvinfer1::DataType;

namespace {

// Maps a Fluid variable data type to the matching TensorRT data type.
//
// Only FP32 and INT32 are handled. Any other type raises an error instead of
// being silently treated as INT32.
TRT_DT FluidDataType2TRT(FluidDT type) {
  switch (type) {
    case FluidDT::VarType_Type_FP32:
      return TRT_DT::kFLOAT;
    case FluidDT::VarType_Type_INT32:
      return TRT_DT::kINT32;
    default:
      PADDLE_THROW("unknown type");
  }
  // Not reached; kept so every control path visibly returns a value.
  return TRT_DT::kINT32;
}

// Converts a Fluid tensor shape into TensorRT CHW dimensions.
//
// The leading entry (shape[0]) is not part of the result. A rank-4 shape maps
// to (shape[1], shape[2], shape[3]); a rank-2 shape maps to (shape[1], 1, 1).
// Any other rank is rejected.
nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
  PADDLE_ENFORCE_GT(shape.size(), 1UL,
                    "TensorRT' tensor input requires at least 2 dimensions");
  PADDLE_ENFORCE_LE(shape.size(), 4UL,
                    "TensorRT' tensor input requires at most 4 dimensions");
  // Rank 3 passes both range checks above, so report it explicitly here.
  PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL,
                 "TensorRT' tensor input requires exactly 2 or 4 dimensions");
  // DimsCHW takes int extents; make the narrowing from int64_t explicit.
  if (shape.size() == 4UL) {
    return nvinfer1::DimsCHW(static_cast<int>(shape[1]),
                             static_cast<int>(shape[2]),
                             static_cast<int>(shape[3]));
  }
  return nvinfer1::DimsCHW(static_cast<int>(shape[1]), 1, 1);
}

}  // namespace

Y
Yan Chunwei 已提交
65 66 67
using inference::Singleton;
using inference::tensorrt::TRT_EngineManager;

68 69 70 71 72 73 74 75 76
// Operator that stands in for a sub-graph executed by a TensorRT engine.
class TensorRTEngineOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  // No shape inference is performed here; the body is intentionally empty.
  void InferShape(framework::InferShapeContext* ctx) const override {}

  // Picks the kernel from the data type of the first "Xs" input and the
  // place of the execution context.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    const auto& input_names = ctx.Inputs("Xs");
    // front() on an empty list is undefined behaviour, so fail loudly first.
    PADDLE_ENFORCE(!input_names.empty(),
                   "TensorRT engine op requires at least one input in Xs");
    const auto& first_input = input_names.front();

    auto* first_var = ctx.scope().FindVar(first_input);
    PADDLE_ENFORCE_NOT_NULL(first_var, "no input variable called %s",
                            first_input);

    return framework::OpKernelType(
        framework::ToDataType(
            first_var->GetMutable<framework::LoDTensor>()->type()),
        ctx.GetPlace());
  }
};

// Kernel that feeds Fluid tensors to a TensorRT engine, runs it and copies
// the results back into Fluid tensors.
//
// The engine is looked up in the global TRT_EngineManager under the
// "engine_uniq_key" attribute and is built by Prepare() when it is missing.
template <typename DeviceContext, typename T>
class TensorRTEngineKernel : public framework::OpKernel<T> {
 public:
  // Runs one inference step with batch size FLAGS_tensorrt_engine_batch_size.
  void Compute(const framework::ExecutionContext& context) const override {
    const auto engine_name = context.Attr<std::string>("engine_uniq_key");
    const int max_batch_size = context.Attr<int>("max_batch_size");

    // Validate up front: Prepare() uses the batch size flag when it declares
    // input shapes, so a bad value must be rejected before an engine is built.
    PADDLE_ENFORCE(!context.op().Inputs("Xs").empty(),
                   "should pass at least one input");
    PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0);
    PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, max_batch_size);

    auto& engine_manager = Singleton<TRT_EngineManager>::Global();
    if (!engine_manager.HasEngine(engine_name)) {
      Prepare(context);
    }
    auto* engine = engine_manager.Get(engine_name);

    const std::vector<std::string> output_maps =
        context.Attr<std::vector<std::string>>("output_name_mapping");
    const std::unordered_set<std::string> parameters = ParameterNames(context);

    // Convert input tensors from fluid to engine. Names listed in the
    // "parameters" attribute are skipped.
    for (const auto& x : context.Inputs("Xs")) {
      if (parameters.count(x)) continue;
      auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(
          context.scope(), x);
      const void* src = static_cast<const void*>(t.data<void>());
      if (platform::is_cpu_place(t.place())) {
        engine->SetInputFromCPU(x, src, t.memory_size());
      } else {
        engine->SetInputFromGPU(x, src, t.memory_size());
      }
    }

    // Execute the engine.
    engine->Execute(FLAGS_tensorrt_engine_batch_size);

    // Convert output tensors from engine to fluid. Output i of "Ys" is read
    // from the engine tensor named output_maps[i], so the mapping must be at
    // least as long as the output list.
    const auto& output_names = context.Outputs("Ys");
    PADDLE_ENFORCE_LE(output_names.size(), output_maps.size(),
                      "output_name_mapping should name an engine tensor for "
                      "every output in Ys");
    VLOG(4) << "TensorRT Engine Op Outputs:";
    for (size_t i = 0; i < output_names.size(); ++i) {
      const auto& y = output_names[i];
      const auto& trt_name = output_maps[i];
      VLOG(4) << y;

      // The ITensor's dims do not contain the batch dimension, so prepend it
      // when reshaping the Fluid tensor.
      nvinfer1::ITensor* trt_t = engine->GetITensor(trt_name);
      auto dims = trt_t->getDimensions();
      std::vector<int> ddim;
      ddim.reserve(dims.nbDims + 1);
      ddim.push_back(FLAGS_tensorrt_engine_batch_size);
      for (int d = 0; d < dims.nbDims; ++d) {
        ddim.push_back(dims.d[d]);
      }

      auto* fluid_v = context.scope().FindVar(y);
      PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
      auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
      fluid_t->Resize(framework::make_ddim(ddim));

      // TODO(Superjomn) find some way to determine which device to output the
      // tensor.
      // TODO(Superjomn) change this float to dtype size.
      auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) *
                  FLAGS_tensorrt_engine_batch_size;
      engine->GetOutputInGPU(
          trt_name,
          fluid_t->mutable_data<float>(platform::CUDAPlace(
              boost::get<platform::CUDAPlace>(context.GetPlace()).device)),
          size * sizeof(float));
    }

    // Wait for the engine's stream and surface any CUDA failure instead of
    // dropping the status code.
    const cudaError_t sync_status = cudaStreamSynchronize(*engine->stream());
    PADDLE_ENFORCE(sync_status == cudaSuccess,
                   "failed to synchronize the TensorRT engine stream");
  }

 protected:
  // Builds the TensorRT engine for this op: parses the serialized "subgraph"
  // attribute, declares the non-parameter inputs, converts the block and
  // declares the outputs listed in "output_name_mapping".
  void Prepare(const framework::ExecutionContext& context) const {
    VLOG(4) << "Prepare engine";
    // Get the ProgramDesc and pass to convert.
    framework::proto::BlockDesc block_desc;
    PADDLE_ENFORCE(
        block_desc.ParseFromString(context.Attr<std::string>("subgraph")),
        "failed to parse the subgraph attribute of the TensorRT engine op");

    const int max_batch_size = context.Attr<int>("max_batch_size");
    const int workspace_size = context.Attr<int>("workspace_size");
    const std::unordered_set<std::string> parameters = ParameterNames(context);
    const std::vector<std::string> output_maps =
        context.Attr<std::vector<std::string>>("output_name_mapping");

    // TODO(Superjomn) replace this with a different stream
    auto* engine = Singleton<TRT_EngineManager>::Global().Create(
        max_batch_size, workspace_size, nullptr /*engine hold its own stream*/,
        context.Attr<std::string>("engine_uniq_key"),
        boost::get<platform::CUDAPlace>(context.GetPlace()).device);

    engine->InitNetwork();

    framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
    VLOG(4) << "parsed var size " << block.AllVars().size();
    // Add inputs
    VLOG(4) << "declare inputs";
    for (auto& input : context.Inputs("Xs")) {
      if (parameters.count(input)) continue;
      VLOG(4) << "declare input " << input;
      // Every non-parameter input must be described in the subgraph block.
      auto* var = block.FindVar(input);
      PADDLE_ENFORCE(var, "no variable called %s", input);
      PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
                        "TensorRT engine only takes LoDTensor as input");
      auto shape = var->GetShape();
      // For the special batch_size placeholder -1, drop it and pass the real
      // shape of data. An empty shape is left untouched here so that
      // Vec2TRT_Dims can reject it instead of shape[0] reading out of bounds.
      // TODO(Superjomn) fix this with batch broadcast, or it can't handle
      // variational batch size.
      if (!shape.empty() && shape[0] == -1) {
        shape[0] = FLAGS_tensorrt_engine_batch_size;
      }
      engine->DeclareInput(
          input, FluidDataType2TRT(
                     var->Proto()->type().lod_tensor().tensor().data_type()),
          Vec2TRT_Dims(shape));
    }

    inference::Singleton<inference::tensorrt::OpConverter>::Global()
        .ConvertBlock(block_desc, parameters, context.scope(), engine);

    // Add outputs
    for (auto& output : output_maps) {
      engine->DeclareOutput(output);
    }

    engine->FreezeNetwork();
  }

 private:
  // Collects the names in the "parameters" attribute into a set for
  // membership tests.
  static std::unordered_set<std::string> ParameterNames(
      const framework::ExecutionContext& context) {
    const auto& names = context.Attr<std::vector<std::string>>("parameters");
    return std::unordered_set<std::string>(names.begin(), names.end());
  }
};

}  // namespace operators
}  // namespace paddle

#endif  // PADDLE_WITH_CUDA