/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"

// Forward declarations so this translation unit does not need the full
// framework headers for types it only refers to by pointer/reference.
namespace paddle {
namespace framework {
class Scope;

namespace proto {
class OpDesc;
}  // namespace proto
}  // namespace framework
}  // namespace paddle

27 28 29 30 31 32 33 34 35
namespace paddle {
namespace inference {
namespace tensorrt {

/*
 * FC converter convert a MUL op in Fluid to a FC layer in TRT.
 */
class FcOpConverter : public OpConverter {
 public:
36
  nvinfer1::ILayer* reshape_before_fc(nvinfer1::ITensor* before_fc,
37 38
                                      nvinfer1::Dims x_dim, int x_num_col_dims,
                                      std::string output_name) {
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
    // add shuffle before fc
    nvinfer1::Dims reshape_before_fc_dim;
    reshape_before_fc_dim.nbDims = x_num_col_dims + 3;
    // padding shape "* x q x 1 x 1"
    for (int i = 0; i < reshape_before_fc_dim.nbDims; i++) {
      reshape_before_fc_dim.d[i] = 1;
    }
    for (int i = 0; i < x_dim.nbDims; i++) {
      if (i < x_num_col_dims) {
        reshape_before_fc_dim.d[i] = 0;
      } else {
        if (x_dim.d[i] < 0) {
          reshape_before_fc_dim.d[x_num_col_dims] = -1;
          break;
        }
        reshape_before_fc_dim.d[x_num_col_dims] *= x_dim.d[i];
      }
    }
    auto* reshape_before_fc_layer =
        TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *before_fc);
    reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim);
60 61 62
    reshape_before_fc_layer->setName(
        ("fc_op_reshape_before_fc: Shuffle (Output: " + output_name + ")")
            .c_str());
63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
    return reshape_before_fc_layer;
  }

  nvinfer1::ILayer* reshape_after_fc(nvinfer1::ITensor* after_fc,
                                     nvinfer1::Dims x_dim, int x_num_col_dims) {
    // add shuffle after fc
    nvinfer1::Dims reshape_after_fc_dim;
    if (engine_->use_oss() && engine_->with_ernie() && x_dim.nbDims == 4 &&
        x_dim.d[2] == 1 && x_dim.d[3] == 1 && x_num_col_dims == 1) {
      // If use tensorrt'oss, the x_dim and x_num_col_dims need change
      reshape_after_fc_dim.nbDims = 4;
    } else {
      reshape_after_fc_dim.nbDims = x_num_col_dims + 1;
    }
    for (int i = 0; i < reshape_after_fc_dim.nbDims; i++) {
      reshape_after_fc_dim.d[i] = 0;
    }
    auto* reshape_after_fc_layer =
        TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *after_fc);
    reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim);
    return reshape_after_fc_layer;
  }

86
  void operator()(const framework::proto::OpDesc& op,
87
                  const framework::Scope& scope, bool test_mode) override {
88
    VLOG(3) << "convert a fluid fc op to tensorrt fc layer without bias";
Y
Yan Chunwei 已提交
89
    framework::OpDesc op_desc(op, nullptr);
90
    auto output_name = op_desc.Output("Out").front();
91 92 93 94 95 96 97 98
    auto input_names = op_desc.InputNames();
    bool with_bias = input_names.size() >= 3;
    std::string w_name = "Y";
    std::string i_name = "X";
    if (with_bias) {
      w_name = "W";
      i_name = "Input";
    }
99
    // Declare inputs
100
    auto* X = engine_->GetITensor(op_desc.Input(i_name).front());
W
Wangzheee 已提交
101
    auto x_dim = X->getDimensions();
102
    // Declare weights
103
    auto* Y_v = scope.FindVar(op_desc.Input(w_name).front());
104 105 106
    PADDLE_ENFORCE_NOT_NULL(
        Y_v, platform::errors::NotFound(
                 "Can not find %s presistale var of fc in scope.", w_name));
107
    auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
108
    int x_num_col_dims =
P
Pei Yang 已提交
109
        op_desc.HasAttr("x_num_col_dims")
110
            ? BOOST_GET_CONST(int, op_desc.GetAttr("x_num_col_dims"))
P
Pei Yang 已提交
111
            : (op_desc.HasAttr("in_num_col_dims")
112
                   ? BOOST_GET_CONST(int, op_desc.GetAttr("in_num_col_dims"))
P
Pei Yang 已提交
113 114 115
                   : 1);
    const std::string activation_type =
        op_desc.HasAttr("activation_type")
116
            ? BOOST_GET_CONST(std::string, op_desc.GetAttr("activation_type"))
P
Pei Yang 已提交
117
            : "";
118
    // This may trigger a GPU->CPU copy, because TRT's weight can only be
119
    // assigned from CPU memory, which can't be avoided.
120
    float* weight_data = nullptr;
121
    bool enable_int8 = op_desc.HasAttr("enable_int8");
122
    float in_scale = 0.;
123 124
    if (enable_int8) {
#if IS_TRT_VERSION_GE(5000)
125
      CHECK(op_desc.HasAttr(i_name + "_scale"));
126
      in_scale =
127
          BOOST_GET_CONST(float, op_desc.GetAttr(i_name + "_scale")) * 127;
128
      auto weight_scale =
129
          BOOST_GET_CONST(std::vector<float>, op_desc.GetAttr("weight_scale"));
130 131 132 133 134 135 136 137
      weight_data = engine_->GetWeightCPUData(op_desc.Input(w_name).front(),
                                              Y_t, true, weight_scale);
      engine_->SetTensorDynamicRange(X, in_scale);
#endif
    } else {
      weight_data =
          engine_->GetWeightCPUData(op_desc.Input(w_name).front(), Y_t, false);
    }
N
nhzlx 已提交
138

139 140 141 142 143
    PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL,
                      platform::errors::InvalidArgument(
                          "The fc's weight should be a matrix with 2 dims, but "
                          "it's %d-dimensional.",
                          Y_t->dims().size()));  // a matrix
144
    size_t n_output = Y_t->dims()[1];
145 146 147 148 149 150 151 152 153 154 155 156 157
    int m = Y_t->dims()[0];
    int n = Y_t->dims()[1];
    auto tranpose_weight = [](const float* src, float* dst, int m, int n) {
      for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
          dst[j * m + i] = src[i * n + j];
        }
      }
    };

    auto regist_fc = [&](nvinfer1::ITensor* inputs, int n_output,
                         TensorRTEngine::Weight& weight,
                         TensorRTEngine::Weight& bias) {
158
      if (enable_int8) {
159
        // add conv layer
160 161 162 163 164 165 166
        PADDLE_ENFORCE_EQ(
            op_desc.HasAttr("out_threshold"), true,
            platform::errors::InvalidArgument(
                "must have out threshold in fc layers in int8 mode"));
        float out_scale =
            BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold"));
        nvinfer1::DimsHW nv_ksize(1, 1);
167 168 169
        auto* fc_layer_int8 =
            TRT_ENGINE_ADD_LAYER(engine_, Convolution, *inputs, n_output,
                                 nv_ksize, weight.get(), bias.get());
170 171 172
        fc_layer_int8->setName(
            ("fc_op_int8_conv1x1: Convolution (Output: " + output_name + ")")
                .c_str());
173
        engine_->SetTensorDynamicRange(fc_layer_int8->getOutput(0), out_scale);
174 175
        auto* fc_after_reshape_int8 = reshape_after_fc(
            fc_layer_int8->getOutput(0), x_dim, x_num_col_dims);
176
        if (activation_type == "relu") {
177 178 179 180
          fc_after_reshape_int8->setName(
              ("fc_op_int8_reshape_after_fc: Shuffle (Output: " + output_name +
               ")")
                  .c_str());
181
          nvinfer1::IActivationLayer* relu_layer_int8 = TRT_ENGINE_ADD_LAYER(
182
              engine_, Activation, *(fc_after_reshape_int8->getOutput(0)),
183 184 185 186
              nvinfer1::ActivationType::kRELU);
          RreplenishLayerAndOutput(relu_layer_int8, "relu_after_fc_shuffle",
                                   {output_name}, test_mode);
        } else {
187 188
          RreplenishLayerAndOutput(fc_after_reshape_int8,
                                   "fc_op_int8_reshape_after_fc: Shuffle",
189 190
                                   {output_name}, test_mode);
        }
191
      } else {
192
        // add fc layer
193
        auto* fc_layer_float =
194 195
            TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs, n_output,
                                 weight.get(), bias.get());
196 197 198
        fc_layer_float->setName(
            ("fc_op_float: FullyConnected (Output: " + output_name + ")")
                .c_str());
199 200
        auto* fc_after_reshape_float = reshape_after_fc(
            fc_layer_float->getOutput(0), x_dim, x_num_col_dims);
201
        if (activation_type == "relu") {
202 203 204 205
          fc_after_reshape_float->setName(
              ("fc_op_float_reshape_after_fc: Shuffle (Output: " + output_name +
               ")")
                  .c_str());
206
          nvinfer1::IActivationLayer* relu_layer_float = TRT_ENGINE_ADD_LAYER(
207
              engine_, Activation, *(fc_after_reshape_float->getOutput(0)),
208 209 210 211
              nvinfer1::ActivationType::kRELU);
          RreplenishLayerAndOutput(relu_layer_float, "relu_after_fc_shuffle",
                                   {output_name}, test_mode);
        } else {
212
          RreplenishLayerAndOutput(fc_after_reshape_float, "shuffle_after_fc",
213 214
                                   {output_name}, test_mode);
        }
215 216 217 218 219 220 221
      }
    };

    std::vector<float> weight_data_tmp;
    weight_data_tmp.reserve(Y_t->numel());
    memcpy(weight_data_tmp.data(), weight_data, Y_t->numel() * sizeof(float));
    tranpose_weight(weight_data_tmp.data(), weight_data, m, n);
222 223
    TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
                                  static_cast<void*>(weight_data),
N
nhzlx 已提交
224
                                  static_cast<size_t>(Y_t->numel())};
225
    weight.dims.assign({n, m});
226 227 228
    float* bias_data = nullptr;
    int bias_num = 0;
    if (with_bias) {
229
      auto* b_v = scope.GetVar(op_desc.Input("Bias").front());
230 231 232 233 234 235 236 237
      auto* b_t = b_v->GetMutable<framework::LoDTensor>();
      bias_data =
          engine_->GetWeightCPUData(op_desc.Input("Bias").front(), b_t, false);
      bias_num = b_t->numel();
    }
    TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT,
                                static_cast<void*>(bias_data),
                                static_cast<size_t>(bias_num)};
238

239 240 241
    // Running the TRT Static Shape mode: x_num_col_dims-1
    if (!engine_->with_dynamic_shape()) {
      x_num_col_dims--;
242
    }
W
Wangzheee 已提交
243 244 245 246 247
    // If use tensorrt'oss, the x_dim and x_num_col_dims need change
    if (engine_->use_oss() && engine_->with_ernie() && x_dim.nbDims == 4 &&
        x_dim.d[2] == 1 && x_dim.d[3] == 1 && x_num_col_dims == 2) {
      x_num_col_dims = 1;
    }
248 249 250 251 252 253 254
    PADDLE_ENFORCE_GT(
        x_dim.nbDims, x_num_col_dims,
        platform::errors::InvalidArgument(
            "Params and input dims mismatch. Paddle-TRT FC "
            "converter expects x_dim.nbDims > x_num_col_dims, but "
            "x_dim.nbDims : %d, x_num_col_dims : %d.",
            x_dim.nbDims, x_num_col_dims));
255 256
    auto* reshape_before_fc_layer =
        reshape_before_fc(X, x_dim, x_num_col_dims, output_name);
257 258 259
    auto* reshape_itensor = reshape_before_fc_layer->getOutput(0);
    if (enable_int8) {
      engine_->SetTensorDynamicRange(reshape_itensor, in_scale);
P
Pei Yang 已提交
260
    }
261
    regist_fc(reshape_itensor, n_output, weight, bias);
262 263 264 265 266 267 268
  }
};

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

N
nhzlx 已提交
269
REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter);