/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef QUANT_OP

#include "operators/kernel/quantize_kernel.h"
#include <cmath>
#include "operators/math/quantize.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#ifndef __aarch64__
// Fallback for 32-bit ARM: the across-all-lanes max instruction only exists
// on aarch64, so fold the four lanes down pairwise by hand.
inline float32_t vmaxvq_f32(float32x4_t r) {
  float32x2_t halves = vmax_f32(vget_low_f32(r), vget_high_f32(r));
  float32x2_t folded = vpmax_f32(halves, halves);
  return vget_lane_f32(folded, 0);
}
#endif

// Quantize a float tensor into int8: y[i] = Round<R>(x[i] * scale).
// The NEON path consumes 16 elements per iteration; the scalar loop at the
// bottom finishes the tail (and does all of the work on non-NEON builds).
template <RoundType R>
static void Quantize(const Tensor *input, const float scale, Tensor *output) {
  const float *x = input->data<const float>();
  int8_t *y = output->mutable_data<int8_t>();
  size_t remain = input->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  size_t loop = remain >> 4;  // number of full 16-element chunks
  remain = remain & 0xF;      // leftover elements for the scalar tail

  float32x4_t vscale = vdupq_n_f32(scale);
  #pragma omp parallel for
  for (size_t i = 0; i < loop; ++i) {
    const float *in = x + (i << 4);
    int8_t *out = y + (i << 4);
    // Scale four vectors of four floats each.
    float32x4_t p0 = vmulq_f32(vld1q_f32(in), vscale);
    float32x4_t p1 = vmulq_f32(vld1q_f32(in + 4), vscale);
    float32x4_t p2 = vmulq_f32(vld1q_f32(in + 8), vscale);
    float32x4_t p3 = vmulq_f32(vld1q_f32(in + 12), vscale);
    // Round with the requested rounding mode, then narrow
    // int32 -> int16 -> int8 and store all 16 results.
    int16x4_t h0 = vmovn_s32(math::vRoundq_f32<R>(p0));
    int16x4_t h1 = vmovn_s32(math::vRoundq_f32<R>(p1));
    int16x4_t h2 = vmovn_s32(math::vRoundq_f32<R>(p2));
    int16x4_t h3 = vmovn_s32(math::vRoundq_f32<R>(p3));
    vst1_s8(out, vmovn_s16(vcombine_s16(h0, h1)));
    vst1_s8(out + 8, vmovn_s16(vcombine_s16(h2, h3)));
  }
  // Advance past everything the vector loop handled.
  x += (loop << 4);
  y += (loop << 4);
#endif
  for (size_t i = 0; i < remain; ++i) {
    y[i] = math::Round<R>(x[i] * scale);
  }
}

// Return max(|x[i]|) over all elements of the input tensor.
// NEON scans 16 floats per iteration where available; the scalar loop
// handles the tail (or the whole tensor on non-NEON builds).
float find_abs_max(const Tensor *input) {
  float max_abs = 0.f;
  const float *x = input->data<const float>();
  size_t remain = input->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  size_t loop = remain >> 4;
  remain = remain & 0xF;
  float32x4_t acc = vdupq_n_f32(0.f);  // per-lane running maxima

  for (size_t i = 0; i < loop; ++i, x += 16) {
    // Absolute value of four vectors, then fold into the accumulator.
    float32x4_t a0 = vabsq_f32(vld1q_f32(x));
    float32x4_t a1 = vabsq_f32(vld1q_f32(x + 4));
    float32x4_t a2 = vabsq_f32(vld1q_f32(x + 8));
    float32x4_t a3 = vabsq_f32(vld1q_f32(x + 12));
    float32x4_t m01 = vmaxq_f32(a0, a1);
    float32x4_t m23 = vmaxq_f32(a2, a3);
    acc = vmaxq_f32(vmaxq_f32(m01, m23), acc);
  }
  max_abs = vmaxvq_f32(acc);  // horizontal max across the four lanes
#endif
  // x already points past the vectorized portion here.
  for (size_t i = 0; i < remain; ++i) {
    max_abs = std::max(max_abs, std::abs(x[i]));
  }
  return max_abs;
}

}  // namespace operators
}  // namespace paddle_mobile
#endif  // __ARM_NEON__

namespace paddle_mobile {
namespace operators {

template <>
120 121 122 123
bool QuantizeKernel<CPU, float>::Init(QuantizeParam<CPU> *param) {
  return true;
}

template <>
L
liuruilong 已提交
125
void QuantizeKernel<CPU, float>::Compute(const QuantizeParam<CPU> &param) {
126
  const Tensor *input = param.input_;
H
hjchen2 已提交
127
  Tensor *output = param.output_;
128
  Tensor *output_scale = param.online_scale_;
H
hjchen2 已提交
129
  float max_abs = 0.f;
130 131
  if (param.offline_) {
    max_abs = param.offline_scale_->data<float>()[0];
132 133 134
  } else {
    max_abs = find_abs_max(input);
  }
H
hjchen2 已提交
135
  max_abs = std::max(max_abs, 1e-6f);
136
  // only support int8 currently
137 138
  float scale = 127 / max_abs;
  param.online_scale_->mutable_data<float>()[0] = max_abs;
139 140
  switch (param.round_type_) {
    case ROUND_NEAREST_TO_EVEN:
141
      Quantize<ROUND_NEAREST_TO_EVEN>(input, scale, output);
142 143
      break;
    case ROUND_NEAREST_TOWARDS_ZERO:
144
      Quantize<ROUND_NEAREST_TOWARDS_ZERO>(input, scale, output);
145 146
      break;
    case ROUND_NEAREST_AWAY_ZERO:
147
      Quantize<ROUND_NEAREST_AWAY_ZERO>(input, scale, output);
148
      break;
149 150 151 152
    default:
      LOG(kLOG_ERROR) << "round type is not supported.";
      break;
  }
153 154 155
}

}  // namespace operators
}  // namespace paddle_mobile

#endif  // QUANT_OP