/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"

// Forward declarations so this TU does not need the heavy framework headers;
// only pointers/references to these types are used below.
namespace paddle {
namespace framework {
class Scope;

namespace proto {
class OpDesc;
}  // namespace proto
}  // namespace framework
}  // namespace paddle

namespace paddle {
namespace inference {
namespace tensorrt {

/*
 * FC converter convert a MUL op in Fluid to a FC layer in TRT.
 */
class FcOpConverter : public OpConverter {
 public:
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
  nvinfer1::ILayer* reshape_before_fc(nvinfer1::ITensor* before_fc,
                                      nvinfer1::Dims x_dim,
                                      int x_num_col_dims) {
    // add shuffle before fc
    nvinfer1::Dims reshape_before_fc_dim;
    reshape_before_fc_dim.nbDims = x_num_col_dims + 3;
    // padding shape "* x q x 1 x 1"
    for (int i = 0; i < reshape_before_fc_dim.nbDims; i++) {
      reshape_before_fc_dim.d[i] = 1;
    }
    for (int i = 0; i < x_dim.nbDims; i++) {
      if (i < x_num_col_dims) {
        reshape_before_fc_dim.d[i] = 0;
      } else {
        if (x_dim.d[i] < 0) {
          reshape_before_fc_dim.d[x_num_col_dims] = -1;
          break;
        }
        reshape_before_fc_dim.d[x_num_col_dims] *= x_dim.d[i];
      }
    }
    auto* reshape_before_fc_layer =
        TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *before_fc);
    reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim);
    return reshape_before_fc_layer;
  }

  nvinfer1::ILayer* reshape_after_fc(nvinfer1::ITensor* after_fc,
                                     nvinfer1::Dims x_dim, int x_num_col_dims) {
    // add shuffle after fc
    nvinfer1::Dims reshape_after_fc_dim;
    if (engine_->use_oss() && engine_->with_ernie() && x_dim.nbDims == 4 &&
        x_dim.d[2] == 1 && x_dim.d[3] == 1 && x_num_col_dims == 1) {
      // If use tensorrt'oss, the x_dim and x_num_col_dims need change
      reshape_after_fc_dim.nbDims = 4;
    } else {
      reshape_after_fc_dim.nbDims = x_num_col_dims + 1;
    }
    for (int i = 0; i < reshape_after_fc_dim.nbDims; i++) {
      reshape_after_fc_dim.d[i] = 0;
    }
    auto* reshape_after_fc_layer =
        TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *after_fc);
    reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim);
    return reshape_after_fc_layer;
  }

83
  void operator()(const framework::proto::OpDesc& op,
84
                  const framework::Scope& scope, bool test_mode) override {
85
    VLOG(3) << "convert a fluid fc op to tensorrt fc layer without bias";
Y
Yan Chunwei 已提交
86
    framework::OpDesc op_desc(op, nullptr);
87
    auto output_name = op_desc.Output("Out").front();
88 89 90 91 92 93 94 95
    auto input_names = op_desc.InputNames();
    bool with_bias = input_names.size() >= 3;
    std::string w_name = "Y";
    std::string i_name = "X";
    if (with_bias) {
      w_name = "W";
      i_name = "Input";
    }
96
    // Declare inputs
97
    auto* X = engine_->GetITensor(op_desc.Input(i_name).front());
W
Wangzheee 已提交
98
    auto x_dim = X->getDimensions();
99
    // Declare weights
100
    auto* Y_v = scope.FindVar(op_desc.Input(w_name).front());
101 102 103
    PADDLE_ENFORCE_NOT_NULL(
        Y_v, platform::errors::NotFound(
                 "Can not find %s presistale var of fc in scope.", w_name));
104
    auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
105
    int x_num_col_dims =
P
Pei Yang 已提交
106
        op_desc.HasAttr("x_num_col_dims")
107
            ? BOOST_GET_CONST(int, op_desc.GetAttr("x_num_col_dims"))
P
Pei Yang 已提交
108
            : (op_desc.HasAttr("in_num_col_dims")
109
                   ? BOOST_GET_CONST(int, op_desc.GetAttr("in_num_col_dims"))
P
Pei Yang 已提交
110 111 112
                   : 1);
    const std::string activation_type =
        op_desc.HasAttr("activation_type")
113
            ? BOOST_GET_CONST(std::string, op_desc.GetAttr("activation_type"))
P
Pei Yang 已提交
114
            : "";
115
    // This may trigger a GPU->CPU copy, because TRT's weight can only be
116
    // assigned from CPU memory, which can't be avoided.
117
    float* weight_data = nullptr;
118
    bool enable_int8 = op_desc.HasAttr("enable_int8");
119
    float in_scale = 0.;
120 121
    if (enable_int8) {
#if IS_TRT_VERSION_GE(5000)
122
      CHECK(op_desc.HasAttr(i_name + "_scale"));
123
      in_scale =
124
          BOOST_GET_CONST(float, op_desc.GetAttr(i_name + "_scale")) * 127;
125
      auto weight_scale =
126
          BOOST_GET_CONST(std::vector<float>, op_desc.GetAttr("weight_scale"));
127 128 129 130 131 132 133 134
      weight_data = engine_->GetWeightCPUData(op_desc.Input(w_name).front(),
                                              Y_t, true, weight_scale);
      engine_->SetTensorDynamicRange(X, in_scale);
#endif
    } else {
      weight_data =
          engine_->GetWeightCPUData(op_desc.Input(w_name).front(), Y_t, false);
    }
N
nhzlx 已提交
135

136 137 138 139 140
    PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL,
                      platform::errors::InvalidArgument(
                          "The fc's weight should be a matrix with 2 dims, but "
                          "it's %d-dimensional.",
                          Y_t->dims().size()));  // a matrix
141
    size_t n_output = Y_t->dims()[1];
142 143 144 145 146 147 148 149 150 151 152 153 154
    int m = Y_t->dims()[0];
    int n = Y_t->dims()[1];
    auto tranpose_weight = [](const float* src, float* dst, int m, int n) {
      for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
          dst[j * m + i] = src[i * n + j];
        }
      }
    };

    auto regist_fc = [&](nvinfer1::ITensor* inputs, int n_output,
                         TensorRTEngine::Weight& weight,
                         TensorRTEngine::Weight& bias) {
155
      if (enable_int8) {
156
        // add conv layer
157 158 159 160 161 162 163
        PADDLE_ENFORCE_EQ(
            op_desc.HasAttr("out_threshold"), true,
            platform::errors::InvalidArgument(
                "must have out threshold in fc layers in int8 mode"));
        float out_scale =
            BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold"));
        nvinfer1::DimsHW nv_ksize(1, 1);
164 165 166
        auto* fc_layer_int8 =
            TRT_ENGINE_ADD_LAYER(engine_, Convolution, *inputs, n_output,
                                 nv_ksize, weight.get(), bias.get());
167 168 169 170
        auto* fc_after_reshape_int8 = reshape_after_fc(
            fc_layer_int8->getOutput(0), x_dim, x_num_col_dims);
        engine_->SetTensorDynamicRange(fc_after_reshape_int8->getOutput(0),
                                       out_scale);
171 172
        if (activation_type == "relu") {
          nvinfer1::IActivationLayer* relu_layer_int8 = TRT_ENGINE_ADD_LAYER(
173
              engine_, Activation, *(fc_after_reshape_int8->getOutput(0)),
174 175 176 177
              nvinfer1::ActivationType::kRELU);
          RreplenishLayerAndOutput(relu_layer_int8, "relu_after_fc_shuffle",
                                   {output_name}, test_mode);
        } else {
178
          RreplenishLayerAndOutput(fc_after_reshape_int8, "shuffle_after_fc",
179 180
                                   {output_name}, test_mode);
        }
181
      } else {
182
        // add fc layer
183
        auto* fc_layer_float =
184 185
            TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs, n_output,
                                 weight.get(), bias.get());
186 187
        auto* fc_after_reshape_float = reshape_after_fc(
            fc_layer_float->getOutput(0), x_dim, x_num_col_dims);
188 189
        if (activation_type == "relu") {
          nvinfer1::IActivationLayer* relu_layer_float = TRT_ENGINE_ADD_LAYER(
190
              engine_, Activation, *(fc_after_reshape_float->getOutput(0)),
191 192 193 194
              nvinfer1::ActivationType::kRELU);
          RreplenishLayerAndOutput(relu_layer_float, "relu_after_fc_shuffle",
                                   {output_name}, test_mode);
        } else {
195
          RreplenishLayerAndOutput(fc_after_reshape_float, "shuffle_after_fc",
196 197
                                   {output_name}, test_mode);
        }
198 199 200 201 202 203 204
      }
    };

    std::vector<float> weight_data_tmp;
    weight_data_tmp.reserve(Y_t->numel());
    memcpy(weight_data_tmp.data(), weight_data, Y_t->numel() * sizeof(float));
    tranpose_weight(weight_data_tmp.data(), weight_data, m, n);
205 206
    TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
                                  static_cast<void*>(weight_data),
N
nhzlx 已提交
207
                                  static_cast<size_t>(Y_t->numel())};
208
    weight.dims.assign({n, m});
209 210 211
    float* bias_data = nullptr;
    int bias_num = 0;
    if (with_bias) {
212
      auto* b_v = scope.GetVar(op_desc.Input("Bias").front());
213 214 215 216 217 218 219 220
      auto* b_t = b_v->GetMutable<framework::LoDTensor>();
      bias_data =
          engine_->GetWeightCPUData(op_desc.Input("Bias").front(), b_t, false);
      bias_num = b_t->numel();
    }
    TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT,
                                static_cast<void*>(bias_data),
                                static_cast<size_t>(bias_num)};
221

222 223 224
    // Running the TRT Static Shape mode: x_num_col_dims-1
    if (!engine_->with_dynamic_shape()) {
      x_num_col_dims--;
225
    }
W
Wangzheee 已提交
226 227 228 229 230
    // If use tensorrt'oss, the x_dim and x_num_col_dims need change
    if (engine_->use_oss() && engine_->with_ernie() && x_dim.nbDims == 4 &&
        x_dim.d[2] == 1 && x_dim.d[3] == 1 && x_num_col_dims == 2) {
      x_num_col_dims = 1;
    }
231 232 233 234 235 236 237
    PADDLE_ENFORCE_GT(
        x_dim.nbDims, x_num_col_dims,
        platform::errors::InvalidArgument(
            "Params and input dims mismatch. Paddle-TRT FC "
            "converter expects x_dim.nbDims > x_num_col_dims, but "
            "x_dim.nbDims : %d, x_num_col_dims : %d.",
            x_dim.nbDims, x_num_col_dims));
238
    auto* reshape_before_fc_layer = reshape_before_fc(X, x_dim, x_num_col_dims);
239 240 241
    auto* reshape_itensor = reshape_before_fc_layer->getOutput(0);
    if (enable_int8) {
      engine_->SetTensorDynamicRange(reshape_itensor, in_scale);
P
Pei Yang 已提交
242
    }
243
    regist_fc(reshape_itensor, n_output, weight, bias);
244 245 246 247 248 249 250
  }
};

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter);