Commit c6efac34 authored by liyin

Optimize quantize and dequantize ops

Parent f0c7717e
......@@ -10,11 +10,12 @@ licenses(["notice"]) # Apache 2.0
load(
"//mace:mace.bzl",
"if_android",
"if_android_armv7",
"if_hexagon_enabled",
"if_not_hexagon_enabled",
"if_openmp_enabled",
"if_neon_enabled",
"if_not_hexagon_enabled",
"if_opencl_enabled",
"if_openmp_enabled",
"if_quantize_enabled",
)
......@@ -58,6 +59,9 @@ cc_library(
"-DMACE_ENABLE_HEXAGON",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfloat-abi=softfp",
]),
linkopts = ["-ldl"],
deps = [
......
......@@ -40,19 +40,33 @@ struct CPUFreq {
float freq;
};
enum SchedulePolicy {
SCHED_STATIC,
SCHED_GUIDED,
};
namespace {
MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
const std::vector<size_t> &cpu_ids) {
const std::vector<size_t> &cpu_ids,
SchedulePolicy schedule_policy) {
MaceOpenMPThreadCount = omp_num_threads;
#ifdef MACE_ENABLE_OPENMP
VLOG(1) << "Set OpenMP thread count: " << omp_num_threads
<< ", CPU core IDs: " << MakeString(cpu_ids);
omp_set_schedule(omp_sched_guided, 1);
if (schedule_policy == SCHED_GUIDED) {
omp_set_schedule(omp_sched_guided, 1);
} else if (schedule_policy == SCHED_STATIC) {
omp_set_schedule(omp_sched_static, 0);
} else {
LOG(WARNING) << "Unknown schedule policy: " << schedule_policy;
}
omp_set_num_threads(omp_num_threads);
#else
MACE_UNUSED(omp_num_threads);
MACE_UNUSED(schedule_policy);
LOG(WARNING) << "Set OpenMP threads number failed: OpenMP not enabled.";
#endif
......@@ -148,6 +162,7 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
} else {
cores_to_use = num_threads_hint;
}
MACE_CHECK(cores_to_use > 0, "number of cores to use should be > 0");
VLOG(2) << "Use " << num_threads_hint << " threads";
std::vector<size_t> cpu_ids(cores_to_use);
......@@ -156,6 +171,10 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
<< cpu_freq[i].freq;
cpu_ids[i] = cpu_freq[i].core_id;
}
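// Heuristic added by this commit: if the fastest and slowest selected cores
// run at the same frequency, the set is homogeneous, so evenly split static
// chunks beat guided scheduling's dynamic chunk-size bookkeeping.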
SchedulePolicy sched_policy = SCHED_GUIDED;
if (std::abs(cpu_freq[0].freq - cpu_freq[cores_to_use - 1].freq) < 1e-6) {
sched_policy = SCHED_STATIC;
}
#ifdef MACE_ENABLE_QUANTIZE
if (gemm_context) {
......@@ -164,7 +183,9 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
}
#endif // MACE_ENABLE_QUANTIZE
return SetOpenMPThreadsAndAffinityCPUs(num_threads_hint, cpu_ids);
return SetOpenMPThreadsAndAffinityCPUs(num_threads_hint,
cpu_ids,
sched_policy);
}
} // namespace mace
......
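The hunk above makes the OpenMP schedule adaptive: guided scheduling lets faster cores take extra iterations on heterogeneous (big.LITTLE) core sets, while static scheduling splits iterations evenly when every selected core runs at the same frequency. A minimal standalone sketch of the policy choice, assuming OpenMP is enabled (ConfigureSchedule is an illustrative name, not MACE's API):

#include <omp.h>
#include <cmath>

void ConfigureSchedule(float min_freq, float max_freq, int num_threads) {
  if (std::abs(max_freq - min_freq) < 1e-6f) {
    omp_set_schedule(omp_sched_static, 0);   // equal chunks, no rebalancing
  } else {
    omp_set_schedule(omp_sched_guided, 1);   // shrinking chunks, min size 1
  }
  omp_set_num_threads(num_threads);
}

Loops compiled with schedule(runtime), such as the quantize/dequantize loops later in this commit, pick up whichever policy was set here.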
......@@ -1942,13 +1942,21 @@ class Transformer(base_converter.ConverterInterface):
continue
quantized_inputs_names = []
should_quantize = False
has_const = False
for idx, input_tensor in enumerate(op.input):
if input_tensor in self._consts:
has_const = True
break
if not has_const:
continue
for idx, input_tensor in enumerate(op.input):
if self.get_tensor_data_type(input_tensor) \
== mace_pb2.DT_FLOAT:
should_quantize = True
break
if not should_quantize:
continue
else:
......
......@@ -7,6 +7,14 @@ package(
licenses(["notice"]) # Apache 2.0
load(
"//mace:mace.bzl",
"if_android",
"if_android_armv7",
"if_neon_enabled",
"if_openmp_enabled",
)
cc_library(
name = "utils_hdrs",
hdrs = glob([
......@@ -37,7 +45,17 @@ cc_library(
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
],
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfloat-abi=softfp",
]),
linkopts = if_android([
"-llog",
]),
deps = [
":utils_hdrs",
],
......
......@@ -19,6 +19,10 @@
#include <cmath>
#include <limits>
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif // MACE_ENABLE_NEON
#include "mace/utils/logging.h"
namespace mace {
......@@ -156,6 +160,106 @@ inline void Dequantize(const T *input,
}
}
#if defined(MACE_ENABLE_NEON)
template<>
inline void QuantizeWithScaleAndZeropoint<uint8_t>(const float *input,
const index_t size,
float scale,
int32_t zero_point,
uint8_t *output) {
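// vcvtq_s32_f32 truncates toward zero, so a +0.5 rounding bias is folded into
// the zero-point vector below; out-of-range results saturate in vqmovun_s16.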
const float32x4_t vround = vdupq_n_f32(0.5);
const float32x4_t
vzero = vaddq_f32(vround, vcvtq_f32_s32(vdupq_n_s32(zero_point)));
const float recip_scale = 1.f / scale;
const float32x4_t vrecip_scale = vdupq_n_f32(recip_scale);
const index_t block_count = size / 16;
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < block_count; ++i) {
float32x4_t vi0 = vld1q_f32(input + i * 16);
float32x4_t vi1 = vld1q_f32(input + i * 16 + 4);
float32x4_t vi2 = vld1q_f32(input + i * 16 + 8);
float32x4_t vi3 = vld1q_f32(input + i * 16 + 12);
int32x4_t vo0_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi0, vrecip_scale));
int32x4_t vo1_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi1, vrecip_scale));
int32x4_t vo2_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi2, vrecip_scale));
int32x4_t vo3_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi3, vrecip_scale));
uint8x8_t vo0_u8 =
vqmovun_s16(vcombine_s16(vqmovn_s32(vo0_s32), vqmovn_s32(vo1_s32)));
uint8x8_t vo1_u8 =
vqmovun_s16(vcombine_s16(vqmovn_s32(vo2_s32), vqmovn_s32(vo3_s32)));
uint8x16_t vo = vcombine_u8(vo0_u8, vo1_u8);
vst1q_u8(output + i * 16, vo);
}
#pragma omp parallel for schedule(runtime)
for (index_t i = block_count * 16; i < size; ++i) {
output[i] = Saturate<uint8_t>(roundf(zero_point + recip_scale * input[i]));
}
}
template<>
inline void Dequantize<int32_t>(const int32_t *input,
const index_t size,
const float scale,
const int32_t zero_point,
float *output) {
const index_t block_count = size / 4;
const int32x4_t vzero = vdupq_n_s32(zero_point);
const float32x4_t vscale = vdupq_n_f32(scale);
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < block_count; ++i) {
int32x4_t vi = vld1q_s32(input + i * 4);
float32x4_t vo = vmulq_f32(vscale, vcvtq_f32_s32(vsubq_s32(vi, vzero)));
vst1q_f32(output + i * 4, vo);
}
for (index_t i = block_count * 4; i < size; ++i) {
output[i] = scale * (input[i] - zero_point);
}
}
template<>
inline void Dequantize<uint8_t>(const uint8_t *input,
const index_t size,
const float scale,
const int32_t zero_point,
float *output) {
const index_t block_count = size / 16;
const int32x4_t vzero = vdupq_n_s32(zero_point);
const float32x4_t vscale = vdupq_n_f32(scale);
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < block_count; ++i) {
uint8x16_t vi = vld1q_u8(input + i * 16);
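// Widen u8 -> u16 -> u32, reinterpret as s32, subtract the zero point, then
// convert to float and scale: four float32x4 results per 16-byte load.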
float32x4x4_t vo = {
vmulq_f32(vscale,
vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(
vget_low_u16(vmovl_u8(vget_low_u8(vi))))), vzero))),
vmulq_f32(vscale,
vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(
vget_high_u16(vmovl_u8(vget_low_u8(vi))))), vzero))),
vmulq_f32(vscale,
vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(
vget_low_u16(vmovl_u8(vget_high_u8(vi))))), vzero))),
vmulq_f32(vscale,
vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(
vget_high_u16(vmovl_u8(vget_high_u8(vi))))), vzero))),
};
vst1q_f32(output + i * 16, vo.val[0]);
vst1q_f32(output + i * 16 + 4, vo.val[1]);
vst1q_f32(output + i * 16 + 8, vo.val[2]);
vst1q_f32(output + i * 16 + 12, vo.val[3]);
}
for (index_t i = block_count * 16; i < size; ++i) {
output[i] = scale * (input[i] - zero_point);
}
}
#endif // MACE_ENABLE_NEON
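For reference, the per-element contract these NEON specializations reproduce, restated as a minimal scalar sketch (QuantizeOne/DequantizeOne are illustrative names; std::min/std::max stand in for this header's Saturate<uint8_t>):

#include <algorithm>
#include <cmath>
#include <cstdint>

inline uint8_t QuantizeOne(float in, float scale, int32_t zero_point) {
  // Same rounding as the scalar tail loops: round-to-nearest, then clamp.
  const float q = std::round(zero_point + in / scale);
  return static_cast<uint8_t>(std::min(255.f, std::max(0.f, q)));
}

inline float DequantizeOne(uint8_t in, float scale, int32_t zero_point) {
  return scale * (static_cast<int32_t>(in) - zero_point);
}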
template<typename T>
inline void DeQuantize(const Tensor &input,
const float min_in,
......@@ -175,8 +279,8 @@ inline void DeQuantize(const Tensor &input,
}
inline void QuantizeMultiplier(double multiplier,
int32_t* output_multiplier,
int32_t* shift) {
int32_t *output_multiplier,
int32_t *shift) {
const double q = std::frexp(multiplier, shift);
auto qint = static_cast<int64_t>(roundl(q * (1ll << 31)));
if (qint == (1ll << 31)) {
......
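QuantizeMultiplier splits a real-valued multiplier into a Q31 fixed-point mantissa and a power-of-two shift via std::frexp, so multiplier ≈ output_multiplier × 2^(shift − 31). A minimal sketch of the decomposition, assuming qint stays below 2^31 (the truncated branch above handles the equality case):

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const double multiplier = 0.75;
  int shift = 0;
  const double q = std::frexp(multiplier, &shift);  // 0.75 = 0.75 * 2^0
  const auto qint = static_cast<int64_t>(std::llround(q * (1ll << 31)));
  // Recover the original value as qint * 2^(shift - 31).
  std::printf("%lld -> %.10f\n", static_cast<long long>(qint),
              qint * std::ldexp(1.0, shift - 31));  // 1610612736 -> 0.75
  return 0;
}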