Fix offline quantize to make sure it will not overflow

c509b1e6 · hjchen2 · 4f63b086 · c509b1e6
隐藏空白更改
内联并排

Showing with 68 additions and 6 deletions

src/operators/kernel/arm/quantize_kernel.cpp src/operators/kernel/arm/quantize_kernel.cpp +68 -6

未找到文件。
--- a/src/operators/kernel/arm/quantize_kernel.cpp
+++ b/src/operators/kernel/arm/quantize_kernel.cpp
@@ -34,14 +34,66 @@ inline float32_t vmaxvq_f32(float32x4_t r) {
 #endif

 template <RoundType R>
-static void Quantize(const Tensor *input, const float scale, Tensor *output) {
+inline void QuantizeOffline(const Tensor *input, const float scale,
+                            const float max_abs, Tensor *output) {
+  // Converts the float elements of `input` to int8 in `output`. Every value
+  // is clamped to [-max_abs, max_abs], multiplied by `scale` and rounded
+  // according to R. The clamp is intended to keep the scaled value inside
+  // the int8 range when the caller passes scale = 127 / max_abs (see Quantize).
  const float *x = input->data<const float>();
  int8_t *y = output->mutable_data<int8_t>();
  size_t remain = input->numel();
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  size_t loop = remain >> 4;
  remain = remain & 0xF;
+  // Broadcast the scale and both clamp bounds to all four lanes.
+  const float32x4_t v_scale = vdupq_n_f32(scale);
+  const float32x4_t v_upper = vdupq_n_f32(max_abs);
+  const float32x4_t v_lower = vdupq_n_f32(-max_abs);
+  #pragma omp parallel for
+  for (size_t block = 0; block < loop; ++block) {
+    // One iteration handles 16 consecutive elements.
+    const float *src = x + (block << 4);
+    int8_t *dst = y + (block << 4);
+    float32x4_t f0 = vld1q_f32(src);
+    float32x4_t f1 = vld1q_f32(src + 4);
+    float32x4_t f2 = vld1q_f32(src + 8);
+    float32x4_t f3 = vld1q_f32(src + 12);
+    // Clamp first, then scale and round each group of four lanes.
+    f0 = vmaxq_f32(vminq_f32(f0, v_upper), v_lower);
+    f1 = vmaxq_f32(vminq_f32(f1, v_upper), v_lower);
+    f2 = vmaxq_f32(vminq_f32(f2, v_upper), v_lower);
+    f3 = vmaxq_f32(vminq_f32(f3, v_upper), v_lower);
+    const int32x4_t w0 = math::vRoundq_f32<R>(vmulq_f32(f0, v_scale));
+    const int32x4_t w1 = math::vRoundq_f32<R>(vmulq_f32(f1, v_scale));
+    const int32x4_t w2 = math::vRoundq_f32<R>(vmulq_f32(f2, v_scale));
+    const int32x4_t w3 = math::vRoundq_f32<R>(vmulq_f32(f3, v_scale));
+    // vmovn_* narrows by truncation (no saturation), hence the clamp above.
+    const int16x8_t h0 = vcombine_s16(vmovn_s32(w0), vmovn_s32(w1));
+    const int16x8_t h1 = vcombine_s16(vmovn_s32(w2), vmovn_s32(w3));
+    vst1_s8(dst, vmovn_s16(h0));
+    vst1_s8(dst + 8, vmovn_s16(h1));
+  }
+  // Advance past the vectorized prefix; `remain` holds the tail length.
+  x += (loop << 4);
+  y += (loop << 4);
+#endif
+  // Scalar loop: the tail after NEON, or every element without NEON.
+  for (size_t k = 0; k < remain; ++k) {
+    const float clamped = std::max(std::min(x[k], max_abs), -max_abs);
+    y[k] = math::Round<R>(clamped * scale);
+  }
+}

+template <RoundType R>
+inline void QuantizeOnline(const Tensor *input, const float scale,
+                           Tensor *output) {
+  const float *x = input->data<const float>();
+  int8_t *y = output->mutable_data<int8_t>();
+  size_t remain = input->numel();
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+  size_t loop = remain >> 4;
+  remain = remain & 0xF;
  float32x4_t __scale = vdupq_n_f32(scale);
  #pragma omp parallel for
  for (size_t i = 0; i < loop; ++i) {
@@ -78,6 +130,17 @@ static void Quantize(const Tensor *input, const float scale, Tensor *output) {
  }
 }

+template <RoundType R>
+static void Quantize(const Tensor *input, const float max_abs,
+                     const bool offline, Tensor *output) {
+  // Scale chosen so that an input of magnitude max_abs becomes 127. The
+  // visible caller (Compute) clamps max_abs to at least 1e-6f beforehand.
+  const float scale = 127.f / max_abs;
+  if (!offline) return QuantizeOnline<R>(input, scale, output);
+  // The offline variant additionally clamps inputs to [-max_abs, max_abs].
+  QuantizeOffline<R>(input, scale, max_abs, output);
+}
+
 float find_abs_max(const Tensor *input) {
  float max_abs = 0.f;
  const float *x = input->data<const float>();
@@ -133,18 +196,17 @@ void QuantizeKernel<CPU, float>::Compute(const QuantizeParam<CPU> &param) {
    max_abs = find_abs_max(input);
  }
  max_abs = std::max(max_abs, 1e-6f);
-  // only support int8 currently
-  float scale = 127 / max_abs;
  param.online_scale_->mutable_data<float>()[0] = max_abs;
  switch (param.round_type_) {
    case ROUND_NEAREST_TO_EVEN:
-      Quantize<ROUND_NEAREST_TO_EVEN>(input, scale, output);
+      Quantize<ROUND_NEAREST_TO_EVEN>(input, max_abs, param.offline_, output);
      break;
    case ROUND_NEAREST_TOWARDS_ZERO:
-      Quantize<ROUND_NEAREST_TOWARDS_ZERO>(input, scale, output);
+      Quantize<ROUND_NEAREST_TOWARDS_ZERO>(input, max_abs, param.offline_,
+                                           output);
      break;
    case ROUND_NEAREST_AWAY_ZERO:
-      Quantize<ROUND_NEAREST_AWAY_ZERO>(input, scale, output);
+      Quantize<ROUND_NEAREST_AWAY_ZERO>(input, max_abs, param.offline_, output);
      break;
    default:
      LOG(kLOG_ERROR) << "round type is not supported.";