diff --git a/mace/core/BUILD.bazel b/mace/core/BUILD.bazel
index 14ddc6b6cfe215497d32b9a5509a3cc1835ea7a6..2c905389aec20c51a2e54672411c1a696ff1b3b9 100644
--- a/mace/core/BUILD.bazel
+++ b/mace/core/BUILD.bazel
@@ -10,11 +10,12 @@ licenses(["notice"])  # Apache 2.0
 load(
     "//mace:mace.bzl",
     "if_android",
+    "if_android_armv7",
     "if_hexagon_enabled",
-    "if_not_hexagon_enabled",
-    "if_openmp_enabled",
     "if_neon_enabled",
+    "if_not_hexagon_enabled",
     "if_opencl_enabled",
+    "if_openmp_enabled",
     "if_quantize_enabled",
 )
 
@@ -58,6 +59,9 @@ cc_library(
         "-DMACE_ENABLE_HEXAGON",
     ]) + if_neon_enabled([
         "-DMACE_ENABLE_NEON",
+    ]) + if_android_armv7([
+        "-mfpu=neon",
+        "-mfloat-abi=softfp",
     ]),
     linkopts = ["-ldl"],
     deps = [
diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc
index 76cd33e231b97fd2becd6453f01c0f48c085b9b8..5db5b36b1bb8bd2d2399f1cfa4ba406e78654a40 100644
--- a/mace/core/runtime/cpu/cpu_runtime.cc
+++ b/mace/core/runtime/cpu/cpu_runtime.cc
@@ -40,19 +40,33 @@ struct CPUFreq {
   float freq;
 };
 
+enum SchedulePolicy {
+  SCHED_STATIC,
+  SCHED_GUIDED,
+};
+
 namespace {
 
 MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
-                                           const std::vector<size_t> &cpu_ids) {
+                                           const std::vector<size_t> &cpu_ids,
+                                           SchedulePolicy schedule_policy) {
   MaceOpenMPThreadCount = omp_num_threads;
 
 #ifdef MACE_ENABLE_OPENMP
   VLOG(1) << "Set OpenMP threads number: " << omp_num_threads
           << ", CPU core IDs: " << MakeString(cpu_ids);
-  omp_set_schedule(omp_sched_guided, 1);
+  if (schedule_policy == SCHED_GUIDED) {
+    omp_set_schedule(omp_sched_guided, 1);
+  } else if (schedule_policy == SCHED_STATIC) {
+    omp_set_schedule(omp_sched_static, 0);
+  } else {
+    LOG(WARNING) << "Unknown schedule policy: " << schedule_policy;
+  }
+
   omp_set_num_threads(omp_num_threads);
 #else
   MACE_UNUSED(omp_num_threads);
+  MACE_UNUSED(schedule_policy);
   LOG(WARNING) << "Set OpenMP threads number failed: OpenMP not enabled.";
 #endif
 
@@ -148,6 +162,7 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
   } else {
     cores_to_use = num_threads_hint;
   }
+  MACE_CHECK(cores_to_use > 0, "number of cores to use should > 0");
 
   VLOG(2) << "Use " << num_threads_hint << " threads";
   std::vector<size_t> cpu_ids(cores_to_use);
@@ -156,6 +171,10 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
             << cpu_freq[i].freq;
     cpu_ids[i] = cpu_freq[i].core_id;
   }
+  SchedulePolicy sched_policy = SCHED_GUIDED;
+  if (std::abs(cpu_freq[0].freq - cpu_freq[cores_to_use - 1].freq) < 1e-6) {
+    sched_policy = SCHED_STATIC;
+  }
 
 #ifdef MACE_ENABLE_QUANTIZE
   if (gemm_context) {
@@ -164,7 +183,9 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
   }
 #endif  // MACE_ENABLE_QUANTIZE
 
-  return SetOpenMPThreadsAndAffinityCPUs(num_threads_hint, cpu_ids);
+  return SetOpenMPThreadsAndAffinityCPUs(num_threads_hint,
+                                         cpu_ids,
+                                         sched_policy);
 }
 
 }  // namespace mace
diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py
index 256a397c7e76b794d2ae33b28984660ecf20e5d0..02fc3a12ace0808e083482ea93ae9fe2d6a7c65a 100644
--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
@@ -1942,13 +1942,21 @@ class Transformer(base_converter.ConverterInterface):
                 continue
 
             quantized_inputs_names = []
+
             should_quantize = False
+            has_const = False
+            for idx, input_tensor in enumerate(op.input):
+                if input_tensor in self._consts:
+                    has_const = True
+                    break
+            if not has_const:
+                continue
+
             for idx, input_tensor in enumerate(op.input):
                 if self.get_tensor_data_type(input_tensor) \
                         == mace_pb2.DT_FLOAT:
                     should_quantize = True
                     break
-
             if not should_quantize:
                 continue
             else:
diff --git a/mace/utils/BUILD.bazel b/mace/utils/BUILD.bazel
index 3207441f099ab3f4bccd080ccfe043c88fa37f57..378210a3905e68188a8de35d2b1a8b1dacdefd39 100644
--- a/mace/utils/BUILD.bazel
+++ b/mace/utils/BUILD.bazel
@@ -7,6 +7,14 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
+load(
+    "//mace:mace.bzl",
+    "if_android",
+    "if_android_armv7",
+    "if_neon_enabled",
+    "if_openmp_enabled",
+)
+
 cc_library(
     name = "utils_hdrs",
     hdrs = glob([
@@ -37,7 +45,17 @@ cc_library(
         "-Werror",
         "-Wextra",
         "-Wno-missing-field-initializers",
-    ],
+    ] + if_openmp_enabled([
+        "-fopenmp",
+    ]) + if_neon_enabled([
+        "-DMACE_ENABLE_NEON",
+    ]) + if_android_armv7([
+        "-mfpu=neon",
+        "-mfloat-abi=softfp",
+    ]),
+    linkopts = if_android([
+        "-llog",
+    ]),
     deps = [
         ":utils_hdrs",
     ],
diff --git a/mace/utils/quantize.h b/mace/utils/quantize.h
index afd5a9af5bc7d57cfa1da8cca38e80b43d88f625..7634833cc1e75763d79901f68b47f46705fa97db 100644
--- a/mace/utils/quantize.h
+++ b/mace/utils/quantize.h
@@ -19,6 +19,10 @@
 #include <cmath>
 #include <limits>
 
+#if defined(MACE_ENABLE_NEON)
+#include <arm_neon.h>
+#endif  // MACE_ENABLE_NEON
+
 #include "mace/utils/logging.h"
 
 namespace mace {
@@ -156,6 +160,106 @@ inline void Dequantize(const T *input,
   }
 }
 
+#if defined(MACE_ENABLE_NEON)
+template<>
+inline void QuantizeWithScaleAndZeropoint(const float *input,
+                                          const index_t size,
+                                          float scale,
+                                          int32_t zero_point,
+                                          uint8_t *output) {
+  const float32x4_t vround = vdupq_n_f32(0.5);
+  const float32x4_t
+      vzero = vaddq_f32(vround, vcvtq_f32_s32(vdupq_n_s32(zero_point)));
+  const float recip_scale = 1.f / scale;
+  const float32x4_t vrecip_scale = vdupq_n_f32(recip_scale);
+  const index_t block_count = size / 16;
+
+#pragma omp parallel for schedule(runtime)
+  for (index_t i = 0; i < block_count; ++i) {
+    float32x4_t vi0 = vld1q_f32(input + i * 16);
+    float32x4_t vi1 = vld1q_f32(input + i * 16 + 4);
+    float32x4_t vi2 = vld1q_f32(input + i * 16 + 8);
+    float32x4_t vi3 = vld1q_f32(input + i * 16 + 12);
+
+    int32x4_t vo0_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi0, vrecip_scale));
+    int32x4_t vo1_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi1, vrecip_scale));
+    int32x4_t vo2_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi2, vrecip_scale));
+    int32x4_t vo3_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi3, vrecip_scale));
+
+    uint8x8_t vo0_u8 =
+        vqmovun_s16(vcombine_s16(vqmovn_s32(vo0_s32), vqmovn_s32(vo1_s32)));
+    uint8x8_t vo1_u8 =
+        vqmovun_s16(vcombine_s16(vqmovn_s32(vo2_s32), vqmovn_s32(vo3_s32)));
+    uint8x16_t vo = vcombine_u8(vo0_u8, vo1_u8);
+
+    vst1q_u8(output + i * 16, vo);
+  }
+
+#pragma omp parallel for schedule(runtime)
+  for (index_t i = block_count * 16; i < size; ++i) {
+    output[i] = Saturate<uint8_t>(roundf(zero_point + recip_scale * input[i]));
+  }
+}
+
+template<>
+inline void Dequantize(const int32_t *input,
+                       const index_t size,
+                       const float scale,
+                       const int32_t zero_point,
+                       float *output) {
+  const index_t block_count = size / 4;
+  const int32x4_t vzero = vdupq_n_s32(zero_point);
+  const float32x4_t vscale = vdupq_n_f32(scale);
+
+#pragma omp parallel for schedule(runtime)
+  for (index_t i = 0; i < block_count; ++i) {
+    int32x4_t vi = vld1q_s32(input + i * 4);
+    float32x4_t vo = vmulq_f32(vscale, vcvtq_f32_s32(vsubq_s32(vi, vzero)));
+    vst1q_f32(output + i * 4, vo);
+  }
+  for (index_t i = block_count * 4; i < size; ++i) {
+    output[i] = scale * (input[i] - zero_point);
+  }
+}
+
+template<>
+inline void Dequantize(const uint8_t *input,
+                       const index_t size,
+                       const float scale,
+                       const int32_t zero_point,
+                       float *output) {
+  const index_t block_count = size / 16;
+  const int32x4_t vzero = vdupq_n_s32(zero_point);
+  const float32x4_t vscale = vdupq_n_f32(scale);
+
+#pragma omp parallel for schedule(runtime)
+  for (index_t i = 0; i < block_count; ++i) {
+    uint8x16_t vi = vld1q_u8(input + i * 16);
+    float32x4x4_t vo = {
+        vmulq_f32(vscale,
+                  vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(
+                      vget_low_u16(vmovl_u8(vget_low_u8(vi))))), vzero))),
+        vmulq_f32(vscale,
+                  vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(
+                      vget_high_u16(vmovl_u8(vget_low_u8(vi))))), vzero))),
+        vmulq_f32(vscale,
+                  vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(
+                      vget_low_u16(vmovl_u8(vget_high_u8(vi))))), vzero))),
+        vmulq_f32(vscale,
+                  vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(
+                      vget_high_u16(vmovl_u8(vget_high_u8(vi))))), vzero))),
+    };
+    vst1q_f32(output + i * 16, vo.val[0]);
+    vst1q_f32(output + i * 16 + 4, vo.val[1]);
+    vst1q_f32(output + i * 16 + 8, vo.val[2]);
+    vst1q_f32(output + i * 16 + 12, vo.val[3]);
+  }
+  for (index_t i = block_count * 16; i < size; ++i) {
+    output[i] = scale * (input[i] - zero_point);
+  }
+}
+#endif  // MACE_ENABLE_NEON
+
 template <typename T>
 inline void DeQuantize(const Tensor &input,
                        const float min_in,
@@ -175,8 +279,8 @@ inline void DeQuantize(const Tensor &input,
 }
 
 inline void QuantizeMultiplier(double multiplier,
-                               int32_t* output_multiplier,
-                               int32_t* shift) {
+                               int32_t *output_multiplier,
+                               int32_t *shift) {
   const double q = std::frexp(multiplier, shift);
   auto qint = static_cast<int64_t>(roundl(q * (1ll << 31)));
   if (qint == (1ll << 31)) {