fc_op.cc
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"

namespace paddle {
namespace framework {
class Scope;

namespace proto {
class OpDesc;
}  // namespace proto
}  // namespace framework
}  // namespace paddle

namespace paddle {
namespace inference {
namespace tensorrt {

/*
 * The FC converter converts a MUL op in Fluid to an FC layer in TRT.
 */
class FcOpConverter : public OpConverter {
 public:
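  // Note: reshape_before_fc pads the input to rank (x_num_col_dims + 3): the
  // first x_num_col_dims dims are copied (a 0 in a Shuffle reshape means
  // "keep this dim"), the remaining dims are collapsed into one, and two
  // trailing 1s are appended, giving the "* x q x 1 x 1" layout the
  // FullyConnected / Convolution layers below operate on.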
  nvinfer1::ILayer* reshape_before_fc(nvinfer1::ITensor* before_fc,
                                      nvinfer1::Dims x_dim,
                                      int x_num_col_dims) {
    // add shuffle before fc
    nvinfer1::Dims reshape_before_fc_dim;
    reshape_before_fc_dim.nbDims = x_num_col_dims + 3;
    // padding shape "* x q x 1 x 1"
    for (int i = 0; i < reshape_before_fc_dim.nbDims; i++) {
      reshape_before_fc_dim.d[i] = 1;
    }
    for (int i = 0; i < x_dim.nbDims; i++) {
      if (i < x_num_col_dims) {
        reshape_before_fc_dim.d[i] = 0;
      } else {
        if (x_dim.d[i] < 0) {
          reshape_before_fc_dim.d[x_num_col_dims] = -1;
          break;
        }
        reshape_before_fc_dim.d[x_num_col_dims] *= x_dim.d[i];
      }
    }
    auto* reshape_before_fc_layer =
        TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *before_fc);
    reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim);
    return reshape_before_fc_layer;
  }

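  // Note: reshape_after_fc restores the rank to (x_num_col_dims + 1); every
  // dim is set to 0, i.e. copied from the FC output, which drops the two
  // trailing 1x1 dims added before the FC. On the ERNIE/OSS path a 4-D shape
  // is kept instead.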
  nvinfer1::ILayer* reshape_after_fc(nvinfer1::ITensor* after_fc,
                                     nvinfer1::Dims x_dim, int x_num_col_dims) {
    // add shuffle after fc
    nvinfer1::Dims reshape_after_fc_dim;
    if (engine_->use_oss() && engine_->with_ernie() && x_dim.nbDims == 4 &&
        x_dim.d[2] == 1 && x_dim.d[3] == 1 && x_num_col_dims == 1) {
      // If TensorRT OSS is used, x_dim and x_num_col_dims need to change.
      reshape_after_fc_dim.nbDims = 4;
    } else {
      reshape_after_fc_dim.nbDims = x_num_col_dims + 1;
    }
    for (int i = 0; i < reshape_after_fc_dim.nbDims; i++) {
      reshape_after_fc_dim.d[i] = 0;
    }
    auto* reshape_after_fc_layer =
        TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *after_fc);
    reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim);
    return reshape_after_fc_layer;
  }

  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
    VLOG(3) << "convert a fluid fc op to tensorrt fc layer without bias";
    framework::OpDesc op_desc(op, nullptr);
    auto output_name = op_desc.Output("Out").front();
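    // A mul op (no bias) names its inputs X / Y, while an fc op carries a
    // Bias input and names them Input / W, hence the renaming below.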
    auto input_names = op_desc.InputNames();
    bool with_bias = input_names.size() >= 3;
    std::string w_name = "Y";
    std::string i_name = "X";
    if (with_bias) {
      w_name = "W";
      i_name = "Input";
    }
    // Declare inputs
    auto* X = engine_->GetITensor(op_desc.Input(i_name).front());
    auto x_dim = X->getDimensions();
    // Declare weights
    auto* Y_v = scope.FindVar(op_desc.Input(w_name).front());
    PADDLE_ENFORCE_NOT_NULL(
        Y_v, platform::errors::NotFound(
                 "Can not find %s presistale var of fc in scope.", w_name));
    auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
    int x_num_col_dims =
        op_desc.HasAttr("x_num_col_dims")
            ? BOOST_GET_CONST(int, op_desc.GetAttr("x_num_col_dims"))
            : (op_desc.HasAttr("in_num_col_dims")
                   ? BOOST_GET_CONST(int, op_desc.GetAttr("in_num_col_dims"))
                   : 1);
    const std::string activation_type =
        op_desc.HasAttr("activation_type")
            ? BOOST_GET_CONST(std::string, op_desc.GetAttr("activation_type"))
            : "";
    // This may trigger a GPU->CPU copy, because TRT's weight can only be
    // assigned from CPU memory, which can't be avoided.
    float* weight_data = nullptr;
    bool enable_int8 = op_desc.HasAttr("enable_int8");
    float in_scale = 0.;
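    // For the int8 path the dynamic range of X is the "<i_name>_scale"
    // attribute scaled by 127, and the weights are converted back to float on
    // the CPU using the "weight_scale" attribute.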
    if (enable_int8) {
#if IS_TRT_VERSION_GE(5000)
      CHECK(op_desc.HasAttr(i_name + "_scale"));
      in_scale =
          BOOST_GET_CONST(float, op_desc.GetAttr(i_name + "_scale")) * 127;
      auto weight_scale =
          BOOST_GET_CONST(std::vector<float>, op_desc.GetAttr("weight_scale"));
      weight_data = engine_->GetWeightCPUData(op_desc.Input(w_name).front(),
                                              Y_t, true, weight_scale);
      engine_->SetTensorDynamicRange(X, in_scale);
#endif
    } else {
      weight_data =
          engine_->GetWeightCPUData(op_desc.Input(w_name).front(), Y_t, false);
    }

    PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL,
                      platform::errors::InvalidArgument(
                          "The fc's weight should be a matrix with 2 dims, but "
                          "it's %d-dimensional.",
                          Y_t->dims().size()));  // a matrix
    size_t n_output = Y_t->dims()[1];
    int m = Y_t->dims()[0];
    int n = Y_t->dims()[1];
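    // Paddle stores the fc weight as (k x n_output) while TRT's
    // FullyConnected / Convolution kernels expect (n_output x k), so the
    // weight is transposed in place further below.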
    auto tranpose_weight = [](const float* src, float* dst, int m, int n) {
      for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
          dst[j * m + i] = src[i * n + j];
        }
      }
    };

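    // regist_fc emits the actual TRT layers: in int8 mode a 1x1 Convolution
    // layer is used (with out_threshold as its output dynamic range),
    // otherwise a FullyConnected layer; the result is reshaped back and an
    // optional ReLU is appended when activation_type == "relu".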
    auto regist_fc = [&](nvinfer1::ITensor* inputs, int n_output,
                         TensorRTEngine::Weight& weight,
                         TensorRTEngine::Weight& bias) {
      if (enable_int8) {
        // add conv layer
        PADDLE_ENFORCE_EQ(
            op_desc.HasAttr("out_threshold"), true,
            platform::errors::InvalidArgument(
                "must have out threshold in fc layers in int8 mode"));
        float out_scale =
            BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold"));
        nvinfer1::DimsHW nv_ksize(1, 1);
        auto* fc_layer_int8 =
            TRT_ENGINE_ADD_LAYER(engine_, Convolution, *inputs, n_output,
                                 nv_ksize, weight.get(), bias.get());
        engine_->SetTensorDynamicRange(fc_layer_int8->getOutput(0), out_scale);
        auto* fc_after_reshape_int8 = reshape_after_fc(
            fc_layer_int8->getOutput(0), x_dim, x_num_col_dims);
        if (activation_type == "relu") {
          nvinfer1::IActivationLayer* relu_layer_int8 = TRT_ENGINE_ADD_LAYER(
              engine_, Activation, *(fc_after_reshape_int8->getOutput(0)),
              nvinfer1::ActivationType::kRELU);
          RreplenishLayerAndOutput(relu_layer_int8, "relu_after_fc_shuffle",
                                   {output_name}, test_mode);
        } else {
          RreplenishLayerAndOutput(fc_after_reshape_int8, "shuffle_after_fc",
                                   {output_name}, test_mode);
        }
      } else {
        // add fc layer
        auto* fc_layer_float =
            TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs, n_output,
                                 weight.get(), bias.get());
        auto* fc_after_reshape_float = reshape_after_fc(
            fc_layer_float->getOutput(0), x_dim, x_num_col_dims);
        if (activation_type == "relu") {
          nvinfer1::IActivationLayer* relu_layer_float = TRT_ENGINE_ADD_LAYER(
              engine_, Activation, *(fc_after_reshape_float->getOutput(0)),
              nvinfer1::ActivationType::kRELU);
          RreplenishLayerAndOutput(relu_layer_float, "relu_after_fc_shuffle",
                                   {output_name}, test_mode);
        } else {
          RreplenishLayerAndOutput(fc_after_reshape_float, "shuffle_after_fc",
                                   {output_name}, test_mode);
        }
      }
    };

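    // Transpose the weight in place: copy it to a temporary buffer first,
    // then write the transposed layout back into weight_data.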
    std::vector<float> weight_data_tmp;
    weight_data_tmp.resize(Y_t->numel());
    memcpy(weight_data_tmp.data(), weight_data, Y_t->numel() * sizeof(float));
    tranpose_weight(weight_data_tmp.data(), weight_data, m, n);
    TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
                                  static_cast<void*>(weight_data),
                                  static_cast<size_t>(Y_t->numel())};
    weight.dims.assign({n, m});
    float* bias_data = nullptr;
    int bias_num = 0;
    if (with_bias) {
      auto* b_v = scope.GetVar(op_desc.Input("Bias").front());
      auto* b_t = b_v->GetMutable<framework::LoDTensor>();
      bias_data =
          engine_->GetWeightCPUData(op_desc.Input("Bias").front(), b_t, false);
      bias_num = b_t->numel();
    }
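    // When the op has no bias, an empty weight (nullptr, count 0) is handed
    // to TRT, which applies no bias.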
    TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT,
                                static_cast<void*>(bias_data),
                                static_cast<size_t>(bias_num)};

    // In TRT static shape (implicit batch) mode the ITensor dims exclude the
    // batch dim, so x_num_col_dims is decreased by 1.
    if (!engine_->with_dynamic_shape()) {
      x_num_col_dims--;
    }
    // If TensorRT OSS is used, x_dim and x_num_col_dims need to change.
    if (engine_->use_oss() && engine_->with_ernie() && x_dim.nbDims == 4 &&
        x_dim.d[2] == 1 && x_dim.d[3] == 1 && x_num_col_dims == 2) {
      x_num_col_dims = 1;
    }
    PADDLE_ENFORCE_GT(
        x_dim.nbDims, x_num_col_dims,
        platform::errors::InvalidArgument(
            "Params and input dims mismatch. Paddle-TRT FC "
            "converter expects x_dim.nbDims > x_num_col_dims, but "
            "x_dim.nbDims : %d, x_num_col_dims : %d.",
            x_dim.nbDims, x_num_col_dims));
    auto* reshape_before_fc_layer = reshape_before_fc(X, x_dim, x_num_col_dims);
    auto* reshape_itensor = reshape_before_fc_layer->getOutput(0);
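    // The reshuffled tensor is a new ITensor, so in int8 mode the dynamic
    // range has to be set on it again.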
    if (enable_int8) {
      engine_->SetTensorDynamicRange(reshape_itensor, in_scale);
    }
    regist_fc(reshape_itensor, n_output, weight, bias);
  }
};

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter);