/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef QUANT_OP

#include "operators/kernel/quantize_kernel.h"
#include <cmath>
#include "operators/math/quantize.h"

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
namespace paddle_mobile {
namespace operators {

#ifndef __aarch64__
// armv7 has no across-lane vmaxvq_f32 instruction (it is A64-only), so
// emulate the horizontal maximum of a 4-lane float vector with two
// pairwise maxima: 4 lanes -> 2 lanes -> 1 lane.
inline float32_t vmaxvq_f32(float32x4_t r) {
  float32x2_t m = vpmax_f32(vget_low_f32(r), vget_high_f32(r));
  m = vpmax_f32(m, m);
  return vget_lane_f32(m, 0);
}
#endif

// Quantizes `input` (float) into `output` (int8) as
//   y[i] = Round<R>(x[i] * scale)
// with the rounding mode R selected at compile time. The NEON path handles
// 16 elements per iteration; a scalar loop handles the tail.
// NOTE(review): the NEON path narrows with vmovn (non-saturating). It relies
// on `scale` being derived from the tensor's max-abs so every product fits in
// int8; an offline scale smaller than the true max-abs would wrap instead of
// saturating — confirm whether vqmovn was intended.
template <RoundType R>
static void Quantize(const Tensor *input, const float scale, Tensor *output) {
  const float *x = input->data<const float>();
  int8_t *y = output->mutable_data<int8_t>();
  size_t remain = input->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  // Split the element count into 16-wide vector iterations plus a tail.
  size_t loop = remain >> 4;
  remain = remain & 0xF;

  // Broadcast the scalar scale to all four lanes.
  float32x4_t __scale = vdupq_n_f32(scale);
  // Each iteration touches a disjoint 16-element chunk, so the parallel
  // loop is race-free.
  #pragma omp parallel for
  for (size_t i = 0; i < loop; ++i) {
    const float *local_x = x + (i << 4);
    int8_t *local_y = y + (i << 4);
    // Load 16 floats and scale them.
    float32x4_t r0 = vld1q_f32(local_x);
    float32x4_t r1 = vld1q_f32(local_x + 4);
    float32x4_t r2 = vld1q_f32(local_x + 8);
    float32x4_t r3 = vld1q_f32(local_x + 12);
    r0 = vmulq_f32(r0, __scale);
    r1 = vmulq_f32(r1, __scale);
    r2 = vmulq_f32(r2, __scale);
    r3 = vmulq_f32(r3, __scale);
    // Round float -> int32 using the compile-time rounding policy.
    int32x4_t q0 = math::vround_f32<R>(r0);
    int32x4_t q1 = math::vround_f32<R>(r1);
    int32x4_t q2 = math::vround_f32<R>(r2);
    int32x4_t q3 = math::vround_f32<R>(r3);
    // Narrow int32 -> int16 (non-saturating; see NOTE above).
    int16x4_t d0 = vmovn_s32(q0);
    int16x4_t d1 = vmovn_s32(q1);
    int16x4_t d2 = vmovn_s32(q2);
    int16x4_t d3 = vmovn_s32(q3);
    int16x8_t q5 = vcombine_s16(d0, d1);
    int16x8_t q6 = vcombine_s16(d2, d3);
    // Narrow int16 -> int8 and store the 16 results.
    int8x8_t d5 = vmovn_s16(q5);
    int8x8_t d6 = vmovn_s16(q6);
    vst1_s8(local_y, d5);
    vst1_s8(local_y + 8, d6);
  }
  // Advance past the vectorized region so the tail loop starts correctly.
  x += (loop << 4);
  y += (loop << 4);
#endif
  // Scalar tail (the whole range when NEON is unavailable).
  for (size_t i = 0; i < remain; ++i) {
    y[i] = math::Round<R>(x[i] * scale);
  }
}

// Returns the maximum absolute value over all elements of `input`.
// Used to derive the dynamic (online) quantization scale in Compute().
float find_abs_max(const Tensor *input) {
  float max_abs = 0.f;
  const float *x = input->data<const float>();
  size_t remain = input->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  // Vector path: fold 16 elements per iteration into a 4-lane running max.
  size_t loop = remain >> 4;
  remain = remain & 0xF;
  float32x4_t __max = {0.f, 0.f, 0.f, 0.f};

  // Note: `x` is advanced inside the loop header, so after the loop it
  // already points at the start of the scalar tail.
  for (size_t i = 0; i < loop; ++i, x += 16) {
    float32x4_t r0 = vld1q_f32(x);
    float32x4_t r1 = vld1q_f32(x + 4);
    float32x4_t r2 = vld1q_f32(x + 8);
    float32x4_t r3 = vld1q_f32(x + 12);
    r0 = vabsq_f32(r0);
    r1 = vabsq_f32(r1);
    r2 = vabsq_f32(r2);
    r3 = vabsq_f32(r3);
    // Tree-reduce the four registers into r0, then merge into the
    // running maximum.
    r0 = vmaxq_f32(r0, r1);
    r1 = vmaxq_f32(r2, r3);
    r0 = vmaxq_f32(r0, r1);
    __max = vmaxq_f32(r0, __max);
  }
  // Horizontal reduction across the 4 lanes (emulated on armv7, see above).
  max_abs = vmaxvq_f32(__max);
#endif
  // Scalar tail (the whole range when NEON is unavailable).
  for (size_t i = 0; i < remain; ++i) {
    max_abs = std::max(max_abs, std::abs(x[i]));
  }
  return max_abs;
}

}  // namespace operators
}  // namespace paddle_mobile
#endif  // __ARM_NEON__

namespace paddle_mobile {
namespace operators {

// No kernel-level state needs to be set up for quantization; always succeeds.
template <>
bool QuantizeKernel<CPU, float>::Init(QuantizeParam<CPU> *param) {
  return true;
}

122
template <>
L
liuruilong 已提交
123
void QuantizeKernel<CPU, float>::Compute(const QuantizeParam<CPU> &param) {
124
  const Tensor *input = param.input_;
H
hjchen2 已提交
125
  Tensor *output = param.output_;
126
  Tensor *output_scale = param.online_scale_;
H
hjchen2 已提交
127
  float max_abs = 0.f;
128 129
  if (param.offline_) {
    max_abs = param.offline_scale_->data<float>()[0];
130 131 132
  } else {
    max_abs = find_abs_max(input);
  }
H
hjchen2 已提交
133
  max_abs = std::max(max_abs, 1e-6f);
134
  // only support int8 currently
135 136
  float scale = 127 / max_abs;
  param.online_scale_->mutable_data<float>()[0] = max_abs;
137 138
  switch (param.round_type_) {
    case ROUND_NEAREST_TO_EVEN:
139
      Quantize<ROUND_NEAREST_TO_EVEN>(input, scale, output);
140 141
      break;
    case ROUND_NEAREST_TOWARDS_ZERO:
142
      Quantize<ROUND_NEAREST_TOWARDS_ZERO>(input, scale, output);
143 144
      break;
    case ROUND_NEAREST_AWAY_ZERO:
145
      Quantize<ROUND_NEAREST_AWAY_ZERO>(input, scale, output);
146
      break;
147 148 149 150
    default:
      LOG(kLOG_ERROR) << "round type is not supported.";
      break;
  }
151 152 153
}

}  // namespace operators
}  // namespace paddle_mobile
#endif  // QUANT_OP