// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/arm/fc_compute.h"
#include <vector>
#include "lite/api/paddle_place.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/backends/arm/math/gemm_prepacked_int8.h"
#include "lite/backends/arm/math/gemv_arm_int8.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace arm {

/// fp32 kernel (the matching Run() reads the input and writes the output as
/// float).
/// Preparation step: this specialization only calls ReInitWhenNeeded(); unlike
/// the int8 specializations it fills neither `scale_` nor `bias_`.
template <>
void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
  ReInitWhenNeeded();
}
/// int8 kernel with fp32 output.
///
/// Preparation step: after ReInitWhenNeeded(), fills `scale_` with the
/// combined factor `weight_scale * input_scale` that Run() hands to the int8
/// GEMM / GEMV routines:
///   - GEMM path (flag_gemm_ set):  m_ entries, all built from weight_scale[0];
///   - GEMV path (flag_gemm_ clear): n_ entries, entry i built from
///     weight_scale[i].
/// NOTE(review): the GEMV path indexes param.weight_scale up to n_ - 1, so it
/// presumably expects one weight scale per output column -- confirm that the
/// caller guarantees this.
template <>
void FcCompute<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() {
  ReInitWhenNeeded();
  auto& param = this->template Param<operators::FcParam>();

  /// update scale
  const float input_scale = param.input_scale;
  const int extend_size = flag_gemm_ ? m_ : n_;
  scale_.resize(extend_size);
  for (int i = 0; i < extend_size; ++i) {
    // GEMM shares weight_scale[0]; GEMV uses the i-th weight scale.
    const int w_idx = flag_gemm_ ? 0 : i;
    scale_[i] = param.weight_scale[w_idx] * input_scale;
  }
}
/// int8 kernel with int8 output.
///
/// Preparation step: after ReInitWhenNeeded(),
///   1. fills `scale_` with `weight_scale * input_scale / output_scale`
///      (m_ entries built from weight_scale[0] on the GEMM path, n_ entries
///      built from weight_scale[i] on the GEMV path);
///   2. if the op has a bias, stores `bias / output_scale` in `bias_` and sets
///      flag_trans_bias_, which makes Run() read `bias_` instead of
///      `param.bias`.
template <>
void FcCompute<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
  ReInitWhenNeeded();
  auto& param = this->template Param<operators::FcParam>();

  /// update scale
  const float input_scale = param.input_scale;
  const float output_scale = param.output_scale;
  const int extend_size = flag_gemm_ ? m_ : n_;
  // Every entry in [0, extend_size) is overwritten below, so scale_ does not
  // need to be seeded with a copy of param.weight_scale first.
  scale_.resize(extend_size);
  for (int i = 0; i < extend_size; ++i) {
    // GEMM shares weight_scale[0]; GEMV uses the i-th weight scale.
    const int w_idx = flag_gemm_ ? 0 : i;
    scale_[i] = param.weight_scale[w_idx] * input_scale / output_scale;
  }

  /// update bias
  if (param.bias) {
    bias_.Resize(param.bias->dims());
    auto ptr = bias_.mutable_data<float>();
    // The source must be the op's bias tensor. Reading bias_.data<float>()
    // here (as the code used to) reads the destination buffer that was just
    // resized, so the rescaled bias would be computed from its own
    // uninitialised contents.
    auto ptr_in = param.bias->data<float>();
    for (int i = 0; i < bias_.numel(); ++i) {
      ptr[i] = ptr_in[i] / output_scale;
    }
    flag_trans_bias_ = true;
  }
}

/// fp32 forward pass of the fully-connected kernel.
///
/// Inputs are taken from the op's FcParam; the output buffer is obtained via
/// param.output->mutable_data<float>().
///   - GEMM path (flag_gemm_ set): a single sgemm call with sizes (m_, n_, k_)
///     on param.w, followed by fill_bias_fc() when a bias is present.
///   - GEMV path: one sgemv call per row i in [0, m_), reading the input at
///     offset i * k_, writing the output at offset i * n_, and using
///     `weights_` as the weight buffer; bias and activation are passed to
///     sgemv directly.
/// When flag_trans_bias_ is set, `bias_` is used in place of `param.bias`.
template <>
void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
  auto& param = this->Param<operators::FcParam>();
  auto& ctx = this->ctx_->template As<ARMContext>();

  auto i_data = param.input->data<float>();
  auto o_data = param.output->mutable_data<float>();
  auto w_data = flag_gemm_ ? param.w->data<float>() : weights_.data<float>();
  const float* b_data = param.bias ? param.bias->data<float>() : nullptr;
  if (flag_trans_bias_) {
    b_data = bias_.data<float>();
  }

  const bool flag_act = (param.activation_type == "relu");
  // Value-initialised: `act` is passed by value to sgemv() even when no
  // activation is requested, so it must never hold an indeterminate value.
  lite_api::ActivationType act{};
  if (flag_act) {
    act = lite_api::ActivationType::kRelu;
  }

  if (flag_gemm_) {
    // sgemm itself runs without bias and without activation here.
    // NOTE(review): flag_act is only forwarded to fill_bias_fc(), which runs
    // only when a bias exists. With activation_type == "relu" and no bias the
    // activation is therefore not applied on this path, unlike the sgemv path
    // below -- confirm whether that combination can occur.
    operators::ActivationParam act_param;
    act_param.has_active = false;
    lite::arm::math::sgemm(false,
                           false,
                           m_,
                           n_,
                           k_,
                           1.f,
                           i_data,
                           k_,
                           w_data,
                           n_,
                           0.f,
                           o_data,
                           n_,
                           nullptr,
                           false,
                           act_param,
                           &ctx);
    if (param.bias) {
      CHECK_EQ(param.bias->numel(), n_);
      lite::arm::math::fill_bias_fc(o_data, b_data, m_, n_, flag_act);
    }
  } else {
    for (int i = 0; i < m_; ++i) {
      auto i_data_batch = i_data + i * k_;
      auto o_data_batch = o_data + i * n_;
      lite::arm::math::sgemv(w_data,
                             i_data_batch,
                             o_data_batch,
                             false,
                             n_,
                             k_,
                             param.bias != nullptr,
                             b_data,
                             flag_act,
                             act,
                             &ctx);
    }
  }
}

/// Forward pass of the int8-input / fp32-output fully-connected kernel.
///
/// Uses `scale_` (filled in PrepareForRun) as the scale array for the int8
/// routines. `weights_` is used as the weight buffer when flag_trans_weights_
/// is set, otherwise `param.w`; `bias_` replaces `param.bias` when
/// flag_trans_bias_ is set.
///   - GEMM path (flag_gemm_ set): one gemm_s8 call with sizes (m_, n_, k_),
///     followed by fill_bias_fc() when a bias is present.
///   - GEMV path: one gemv_int8 call per row i in [0, m_), with input offset
///     i * k_ and output offset i * n_.
template <>
void FcCompute<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
  auto& param = this->Param<operators::FcParam>();
  auto& ctx = this->ctx_->template As<ARMContext>();

  auto i_data = param.input->data<int8_t>();
  auto o_data = param.output->mutable_data<float>();
  auto w_data =
      flag_trans_weights_ ? weights_.data<int8_t>() : param.w->data<int8_t>();
  const float* b_data = param.bias ? param.bias->data<float>() : nullptr;
  if (flag_trans_bias_) {
    b_data = bias_.data<float>();
  }
  const bool has_bias = (param.bias != nullptr);
  const bool flag_relu = (param.activation_type == "relu");

  if (flag_gemm_) {
    // With a bias, the ReLU has to follow the bias add, so it is left to
    // fill_bias_fc(). Without a bias fill_bias_fc() never runs, so the ReLU
    // flag is handed to gemm_s8 instead (the same argument slot in which the
    // int8-output specialization passes flag_relu); otherwise a requested
    // ReLU would be silently dropped on this path.
    const bool relu_in_gemm = flag_relu && !has_bias;
    lite::arm::math::gemm_s8(false,
                             false,
                             m_,
                             n_,
                             k_,
                             i_data,
                             w_data,
                             o_data,
                             nullptr,
                             false,
                             relu_in_gemm,
                             scale_.data(),
                             &ctx);
    if (has_bias) {
      CHECK_EQ(param.bias->numel(), n_);
      lite::arm::math::fill_bias_fc(o_data, b_data, m_, n_, flag_relu);
    }
  } else {
    for (int i = 0; i < m_; ++i) {
      auto i_data_batch = i_data + i * k_;
      auto o_data_batch = o_data + i * n_;
      lite::arm::math::gemv_int8(w_data,
                                 i_data_batch,
                                 o_data_batch,
                                 false,
                                 n_,
                                 k_,
                                 scale_.data(),
                                 has_bias,
                                 b_data,
                                 flag_relu,
                                 &ctx);
    }
  }
}

/// Forward pass of the int8-input / int8-output fully-connected kernel.
///
/// Uses `scale_` (filled in PrepareForRun) as the scale array for the int8
/// routines. `weights_` is used as the weight buffer when flag_trans_weights_
/// is set, otherwise `param.w`; `bias_` replaces `param.bias` when
/// flag_trans_bias_ is set.
///   - GEMM path (flag_gemm_ set): a bias is rejected with a CHECK; a single
///     gemm_s8 call with sizes (m_, n_, k_) receives the ReLU flag directly.
///   - GEMV path: one gemv_int8 call per row i in [0, m_), with input offset
///     i * k_ and output offset i * n_; bias and ReLU flag are passed through.
template <>
void FcCompute<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
  auto& param = this->Param<operators::FcParam>();
  auto& ctx = this->ctx_->template As<ARMContext>();

  auto i_data = param.input->data<int8_t>();
  auto o_data = param.output->mutable_data<int8_t>();
  auto w_data =
      flag_trans_weights_ ? weights_.data<int8_t>() : param.w->data<int8_t>();
  const float* b_data = param.bias ? param.bias->data<float>() : nullptr;
  if (flag_trans_bias_) {
    b_data = bias_.data<float>();
  }
  const bool flag_relu = (param.activation_type == "relu");

  if (flag_gemm_) {
    CHECK(!param.bias) << "fc int8 kernel with int8 output using gemm kernel "
                          "must not have bias";
    lite::arm::math::gemm_s8(false,
                             false,
                             m_,
                             n_,
                             k_,
                             i_data,
                             w_data,
                             o_data,
                             nullptr,
                             false,
                             flag_relu,
                             scale_.data(),
                             &ctx);
  } else {
    for (int i = 0; i < m_; ++i) {
      auto i_data_batch = i_data + i * k_;
      auto o_data_batch = o_data + i * n_;
      lite::arm::math::gemv_int8(w_data,
                                 i_data_batch,
                                 o_data_batch,
                                 false,
                                 n_,
                                 k_,
                                 scale_.data(),
                                 param.bias != nullptr,
                                 b_data,
                                 flag_relu,
                                 &ctx);
    }
  }
}

}  // namespace arm
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

/// Short names for the three FcCompute specializations defined in this file;
/// they are the kernel classes handed to REGISTER_LITE_KERNEL below.
using FcCompute_FP32 =
    paddle::lite::kernels::arm::FcCompute<PRECISION(kFloat),
                                          PRECISION(kFloat)>;
using FcCompute_int8_fp32 =
    paddle::lite::kernels::arm::FcCompute<PRECISION(kInt8),
                                          PRECISION(kFloat)>;
using FcCompute_int8_int8 =
    paddle::lite::kernels::arm::FcCompute<PRECISION(kInt8),
                                          PRECISION(kInt8)>;

/// Registers FcCompute_FP32 for op `fc` with (kARM, kFloat, kNCHW) under the
/// name `def`. All four tensors ("Input", "Bias", "W", "Out") are bound as kARM
/// tensors with no explicit precision argument.
REGISTER_LITE_KERNEL(fc, kARM, kFloat, kNCHW, FcCompute_FP32, def)
    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();

/// Registers FcCompute_int8_int8 for op `fc` with (kARM, kInt8, kNCHW) under
/// the name `int8out`: "Input", "W" and "Out" are bound as int8 tensors, "Bias"
/// as a float tensor.
REGISTER_LITE_KERNEL(fc, kARM, kInt8, kNCHW, FcCompute_int8_int8, int8out)
    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))})
    .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
    .Finalize();

/// Registers FcCompute_int8_fp32 for op `fc` with (kARM, kInt8, kNCHW) under
/// the name `fp32out`: "Input" and "W" are bound as int8 tensors, "Bias" and
/// "Out" as float tensors.
REGISTER_LITE_KERNEL(fc, kARM, kInt8, kNCHW, FcCompute_int8_fp32, fp32out)
    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))})
    .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))})
    .Finalize();