diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 4fb6d388076f241b8ec8a4331df7ef4612a4b722..b709d57fab731f78fb003ca7297a4afa760767fe 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -49,9 +49,9 @@ docs:
 platform_compatible_tests:
   stage: platform_compatible_tests
   script:
-    - bazel build mace/core:core --define openmp=true
-    - bazel build --config arm_linux_gnueabihf --define openmp=true --define opencl=true --define neon=true //mace/libmace:libmace.so
-    - bazel build --config aarch64_linux_gnu --define openmp=true --define opencl=true --define neon=true //mace/libmace:libmace.so
+    - bazel build mace/core:core --define openmp=false
+    - bazel build --config arm_linux_gnueabihf --define openmp=false --define opencl=true --define neon=true //mace/libmace:libmace.so
+    - bazel build --config aarch64_linux_gnu --define openmp=false --define opencl=true --define neon=true //mace/libmace:libmace.so
 
 build_libraries:
   stage: build_libraries
@@ -202,13 +202,13 @@ so_size_check:
   stage: so_size_check
   script:
    - DYNAMIC_LIB_PATH="bazel-bin/mace/libmace/libmace.so"
-    - bazel build -s --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=false --define quantize=false --cpu=armeabi-v7a
+    - bazel build -s --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=false --define opencl=false --define quantize=false --cpu=armeabi-v7a
     - CURRENT_LIBMACE_SO_SIZE=`ls -l $DYNAMIC_LIB_PATH --block-size=K -s | cut -f 1 -d "K"`
     - TARGET_MACE_WORK_DIR=`mktemp -d`
     - pushd $TARGET_MACE_WORK_DIR
     - GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace.git
     - pushd mace
-    - bazel build -s --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=false --define quantize=false --cpu=armeabi-v7a
+    - bazel build -s --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=false --define opencl=false --define quantize=false --cpu=armeabi-v7a
     - TARGET_LIBMACE_SO_SIZE=`ls -l $DYNAMIC_LIB_PATH --block-size=K -s | cut -f 1 -d "K"`
     - popd
     - popd
diff --git a/WORKSPACE b/WORKSPACE
index 524126a41b27444477f67688afc3acf140bad417..daa855de4784d5b968dbc877bc3cc0031f3e455e 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -79,19 +79,19 @@ new_http_archive(
 
 http_archive(
     name = "gemmlowp",
-    sha256 = "4e9cd60f7871ae9e06dcea5fec1a98ddf1006b32a85883480273e663f143f303",
-    strip_prefix = "gemmlowp-master-66fb41a7cafd2034a50e0b32791359897d657f7a",
+    sha256 = "afbea037aee2d21b625985238486b4219396f9c2550b0fde3157fab4d2580205",
+    strip_prefix = "gemmlowp-master-1f6d8d442805a400c74e63a4a017390733df2e28",
     urls = [
-        "https://cnbj1.fds.api.xiaomi.com/mace/third-party/gemmlowp/gemmlowp-master-66fb41a7cafd2034a50e0b32791359897d657f7a.zip",
+        "http://cnbj1.fds.api.xiaomi.com/mace/third-party/gemmlowp/gemmlowp-master-1f6d8d442805a400c74e63a4a017390733df2e28.zip",
     ],
 )
 
 http_archive(
     name = "tflite",
-    sha256 = "1bb4571ee5cbde427ecfed076b39edaad96ace897ab86bb2495bdb93c706b203",
-    strip_prefix = "tensorflow-mace-ffc8cc7e8c9d1894753509e88b17e251bc6255e3",
+    sha256 = "8b4c1b2ad2d31da9859e17b0ad551b12e1db7ff2faf7e83218901ab48d9fa91a",
+    strip_prefix = "tensorflow-mace-dfabaf85145e4d5ad39f34a0cea57b44c32dbe43",
     urls = [
-        "http://cnbj1.fds.api.xiaomi.com/mace/third-party/tflite/tensorflow-mace-ffc8cc7e8c9d1894753509e88b17e251bc6255e3_custom.zip",
+        "http://cnbj1.fds.api.xiaomi.com/mace/third-party/tflite/tensorflow-mace-dfabaf85145e4d5ad39f34a0cea57b44c32dbe43.zip",
     ],
 )
diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc
index bfe0bb848a02990eee9b14ab2421c9ea4012d66d..e0dac730639276dbd30bf210b466c57d9940feaf 100644
--- a/mace/benchmark/benchmark_model.cc
+++ b/mace/benchmark/benchmark_model.cc
@@ -252,8 +252,7 @@ int Main(int argc, char **argv) {
   MaceEngineConfig config(device_type);
   mace_status = config.SetCPUThreadPolicy(
       FLAGS_omp_num_threads,
-      static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
-      true);
+      static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy));
   if (mace_status != MaceStatus::MACE_SUCCESS) {
     LOG(INFO) << "Set openmp or cpu affinity failed.";
   }
diff --git a/mace/core/device.cc b/mace/core/device.cc
index 535b7193633cf6881fea54f129c0485ddc3ed585..43f600753c7bc56423b99b04bc277ac84b64c1ce 100644
--- a/mace/core/device.cc
+++ b/mace/core/device.cc
@@ -21,10 +21,10 @@ namespace mace {
 
 CPUDevice::CPUDevice(const int num_threads,
                      const CPUAffinityPolicy policy,
-                     const bool use_gemmlowp)
+                     utils::ThreadPool *thread_pool)
     : cpu_runtime_(make_unique<CPURuntime>(num_threads,
                                            policy,
-                                           use_gemmlowp)),
+                                           thread_pool)),
       scratch_buffer_(make_unique<ScratchBuffer>(GetCPUAllocator())) {}
 
 CPUDevice::~CPUDevice() = default;
diff --git a/mace/core/device.h b/mace/core/device.h
index e5fda181ee66e127f953c6f46937481269ccfc16..85019d9485005dc75f5ba19e682022ef237da6b5 100644
--- a/mace/core/device.h
+++ b/mace/core/device.h
@@ -46,7 +46,7 @@ class CPUDevice : public Device {
  public:
   CPUDevice(const int num_threads,
             const CPUAffinityPolicy policy,
-            const bool use_gemmlowp);
+            utils::ThreadPool *thread_pool);
   virtual ~CPUDevice();
 
 #ifdef MACE_ENABLE_OPENCL
diff --git a/mace/core/net.cc b/mace/core/net.cc
index fbe1c1b8b9da81929732a77c176195f29dd688b9..a10d96bb560b2a145146bcffa88e2b4e045f0e10 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -136,7 +136,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
       make_unique<CPUDevice>(
           target_device->cpu_runtime()->num_threads(),
           target_device->cpu_runtime()->policy(),
-          target_device->cpu_runtime()->use_gemmlowp())) {
+          &target_device->cpu_runtime()->thread_pool())) {
   MACE_LATENCY_LOGGER(1, "Constructing SerialNet");
   // quantize model flag
   bool is_quantize_model = IsQuantizedModel(*net_def);
@@ -154,7 +154,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
   }
   for (auto &tensor : net_def->tensors()) {
     tensor_shape_map[tensor.name()] =
-        std::vector(tensor.dims().begin(), tensor.dims().end());
+        std::vector(tensor.dims().begin(), tensor.dims().end());
   }
 
   bool has_data_format = false;
diff --git a/mace/core/quantize.cc b/mace/core/quantize.cc
new file mode 100644
index 0000000000000000000000000000000000000000..167c6da356cb975eaed53ce87343fdd3185ce854
--- /dev/null
+++ b/mace/core/quantize.cc
@@ -0,0 +1,130 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#if defined(MACE_ENABLE_NEON) +#include +#endif // MACE_ENABLE_NEON + +#include "mace/core/quantize.h" + +namespace mace { + +#ifdef MACE_ENABLE_NEON + +template<> +void QuantizeUtil::QuantizeWithScaleAndZeropoint( + const float *input, + const index_t size, + float scale, + int32_t zero_point, + uint8_t *output) { + const float32x4_t vround = vdupq_n_f32(0.5); + const float32x4_t + vzero = vaddq_f32(vround, vcvtq_f32_s32(vdupq_n_s32(zero_point))); + const float recip_scale = 1.f / scale; + const float32x4_t vrecip_scale = vdupq_n_f32(recip_scale); + const index_t block_count = size / 16; + + thread_pool_->Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + float32x4_t vi0 = vld1q_f32(input + i * 16); + float32x4_t vi1 = vld1q_f32(input + i * 16 + 4); + float32x4_t vi2 = vld1q_f32(input + i * 16 + 8); + float32x4_t vi3 = vld1q_f32(input + i * 16 + 12); + + int32x4_t vo0_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi0, vrecip_scale)); + int32x4_t vo1_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi1, vrecip_scale)); + int32x4_t vo2_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi2, vrecip_scale)); + int32x4_t vo3_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi3, vrecip_scale)); + + uint8x8_t vo0_u8 = + vqmovun_s16(vcombine_s16(vqmovn_s32(vo0_s32), vqmovn_s32(vo1_s32))); + uint8x8_t vo1_u8 = + vqmovun_s16(vcombine_s16(vqmovn_s32(vo2_s32), vqmovn_s32(vo3_s32))); + uint8x16_t vo = vcombine_u8(vo0_u8, vo1_u8); + + vst1q_u8(output + i * 16, vo); + } + }, 0, block_count, 1); + + for (index_t i = block_count * 16; i < size; ++i) { + output[i] = + Saturate(roundf(zero_point + recip_scale * input[i])); + } +} + +template<> +void QuantizeUtil::Dequantize(const uint8_t *input, + const index_t size, + const float scale, + const int32_t zero_point, + float *output) { + const index_t block_count = size / 16; + const int32x4_t vzero = vdupq_n_s32(zero_point); + const float32x4_t vscale = vdupq_n_f32(scale); + + thread_pool_->Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + uint8x16_t vi = vld1q_u8(input + i * 16); + float32x4x4_t vo = { + vmulq_f32(vscale, + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( + vget_low_u16(vmovl_u8(vget_low_u8(vi))))), vzero))), + vmulq_f32(vscale, + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( + vget_high_u16(vmovl_u8(vget_low_u8(vi))))), vzero))), + vmulq_f32(vscale, + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( + vget_low_u16(vmovl_u8(vget_high_u8(vi))))), vzero))), + vmulq_f32(vscale, + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( + vget_high_u16(vmovl_u8(vget_high_u8(vi))))), vzero))), + }; + vst1q_f32(output + i * 16, vo.val[0]); + vst1q_f32(output + i * 16 + 4, vo.val[1]); + vst1q_f32(output + i * 16 + 8, vo.val[2]); + vst1q_f32(output + i * 16 + 12, vo.val[3]); + } + }, 0, block_count, 1); + + for (index_t i = block_count * 16; i < size; ++i) { + output[i] = scale * (input[i] - zero_point); + } +} + +template<> +void QuantizeUtil::Dequantize(const int *input, + const index_t size, + const float scale, + const int32_t zero_point, + float *output) { + const index_t block_count = size / 4; + const int32x4_t vzero = vdupq_n_s32(zero_point); + const float32x4_t vscale = vdupq_n_f32(scale); + + thread_pool_->Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + int32x4_t vi = vld1q_s32(input + i * 4); + float32x4_t vo = vmulq_f32(vscale, vcvtq_f32_s32(vsubq_s32(vi, vzero))); + 
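+      // out = scale * (q - zero_point): vsubq_s32 removes the zero point,
+      // vcvtq_f32_s32 converts the four int32 lanes to float, and vmulq_f32
+      // applies the scale; the scalar loop after Compute1D covers the
+      // remaining size % 4 elements.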
vst1q_f32(output + i * 4, vo); + } + }, 0, block_count, 1); + + for (index_t i = block_count * 4; i < size; ++i) { + output[i] = scale * (input[i] - zero_point); + } +} +#endif + +} // namespace mace diff --git a/mace/core/quantize.h b/mace/core/quantize.h new file mode 100644 index 0000000000000000000000000000000000000000..3e755bf0e7af22f0424ef5c84f8384699d041d12 --- /dev/null +++ b/mace/core/quantize.h @@ -0,0 +1,232 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_QUANTIZE_H_ +#define MACE_CORE_QUANTIZE_H_ + +#include +#include +#include + +#include "mace/utils/logging.h" +#include "mace/utils/thread_pool.h" +#include "mace/core/tensor.h" + +namespace mace { + +template +inline void AdjustRange(const float in_min_data, + const float in_max_data, + const bool non_zero, + float *scale, + int32_t *zero_point) { + // re-range to make range include zero float and + // make zero float as integer u8 + const T quantized_min = std::numeric_limits::lowest(); + const T quantized_max = std::numeric_limits::max(); + if (quantized_min < 0) { + MACE_ASSERT(!non_zero, "Cannot nudge to non_zero quantize value."); + } + + float out_max = std::max(0.f, in_max_data); + float out_min = std::min(0.f, in_min_data); + // make in_min_data quantize as greater than 1 + if (non_zero) { + out_min = std::min(out_min, + in_min_data - (out_max - in_min_data) + / (quantized_max - quantized_min - 1)); + } + + *scale = (out_max - out_min) / (quantized_max - quantized_min); + const float kEps = 1e-6; + if (out_min < -kEps && out_max > kEps) { + float quantized_zero = -out_min / *scale; + int32_t + quantized_zero_near_int = static_cast(roundf(quantized_zero)); + *zero_point = quantized_zero_near_int; + if (fabs(quantized_zero - quantized_zero_near_int) > kEps && non_zero) { + *zero_point = static_cast(std::ceil(quantized_zero)); + } + } else if (out_min > -kEps) { + *zero_point = quantized_min; + } else { + *zero_point = quantized_max; + } +} + +template +inline T Saturate(float value) { + int rounded_value = static_cast(value); + if (rounded_value <= std::numeric_limits::lowest()) { + return std::numeric_limits::lowest(); + } else if (rounded_value >= std::numeric_limits::max()) { + return std::numeric_limits::max(); + } else { + return static_cast(rounded_value); + } +} + +inline void FindMinMax(const float *input, + const index_t size, + float *min_val, float *max_val) { + float max_v = std::numeric_limits::lowest(); + float min_v = std::numeric_limits::max(); + for (index_t i = 0; i < size; ++i) { + max_v = std::max(max_v, input[i]); + min_v = std::min(min_v, input[i]); + } + *min_val = min_v; + *max_val = max_v; +} + +inline void QuantizeMultiplier(double multiplier, + int32_t *output_multiplier, + int32_t *shift) { + const double q = std::frexp(multiplier, shift); + auto qint = static_cast(roundl(q * (1ll << 31))); + if (qint == (1ll << 31)) { + qint /= 2; + ++*shift; + } + *output_multiplier = static_cast(qint); + 
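+  // std::frexp splits the multiplier into q * 2^shift with q in [0.5, 1);
+  // rounding q * 2^31 gives a Q31 fixed-point mantissa, so
+  // multiplier ~= output_multiplier * 2^(shift - 31). If rounding carries q
+  // up to exactly 2^31, halving the mantissa and incrementing the shift
+  // keeps the represented value unchanged.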
MACE_CHECK(*output_multiplier <= std::numeric_limits::max()); +} + +inline void GetOutputMultiplierAndShift( + const float lhs_scale, const float rhs_scale, const float output_scale, + int32_t *quantized_multiplier, int *right_shift) { + float real_multiplier = lhs_scale * rhs_scale / output_scale; + MACE_CHECK(real_multiplier > 0.f && real_multiplier < 1.f, real_multiplier); + + int exponent; + QuantizeMultiplier(real_multiplier, quantized_multiplier, &exponent); + *right_shift = -exponent; + MACE_CHECK(*right_shift >= 0); +} + +template +class QuantizeUtil { + public: + explicit QuantizeUtil(utils::ThreadPool *thread_pool) + : thread_pool_(thread_pool) {} + + void QuantizeWithScaleAndZeropoint(const float *input, + const index_t size, + float scale, + int32_t zero_point, + T *output) { + float recip_scale = 1 / scale; + thread_pool_->Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + output[i] = Saturate(roundf(zero_point + recip_scale * input[i])); + } + }, 0, size, 1); + } + + void Quantize(const float *input, + const index_t size, + bool non_zero, + T *output, + float *scale, + int32_t *zero_point) { + float in_min_data; + float in_max_data; + FindMinMax(input, size, &in_min_data, &in_max_data); + + AdjustRange(in_min_data, in_max_data, non_zero, + scale, zero_point); + + QuantizeWithScaleAndZeropoint(input, size, *scale, *zero_point, output); + } + + void Quantize(const Tensor &input, + Tensor *output, + float *min_out, + float *max_out) { + MACE_CHECK(input.size() != 0); + Tensor::MappingGuard input_guard(&input); + Tensor::MappingGuard output_guard(output); + auto *input_data = input.data(); + auto *output_data = output->mutable_data(); + float scale; + int32_t zero_point; + + Quantize(input_data, input.size(), false, output_data, &scale, &zero_point); + + *min_out = scale * (std::numeric_limits::lowest() - zero_point); + *max_out = scale * (std::numeric_limits::max() - zero_point); + } + + void Dequantize(const T *input, + const index_t size, + const float scale, + const int32_t zero_point, + float *output) { + thread_pool_->Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + output[i] = scale * (input[i] - zero_point); + } + }, 0, size, 1); + } + + void DeQuantize(const Tensor &input, + const float min_in, + const float max_in, + Tensor *output) { + MACE_CHECK(input.size() != 0); + Tensor::MappingGuard input_guard(&input); + Tensor::MappingGuard output_guard(output); + auto *input_data = input.data(); + auto *output_data = output->mutable_data(); + float scale; + int32_t zero_point; + + AdjustRange(min_in, max_in, false, &scale, &zero_point); + + Dequantize(input_data, input.size(), scale, zero_point, output_data); + } + + private: + utils::ThreadPool *thread_pool_; +}; + +#ifdef MACE_ENABLE_NEON + +template<> +void QuantizeUtil::QuantizeWithScaleAndZeropoint( + const float *input, + const index_t size, + float scale, + int32_t zero_point, + uint8_t *output); + +template<> +void QuantizeUtil::Dequantize(const uint8_t *input, + const index_t size, + const float scale, + const int32_t zero_point, + float *output); + +template<> +void QuantizeUtil::Dequantize(const int *input, + const index_t size, + const float scale, + const int32_t zero_point, + float *output); + +#endif + +} // namespace mace + +#endif // MACE_CORE_QUANTIZE_H_ diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc index 
ae3689c2f35cceb13d68e7a91b415dfadeb9fc37..ad60447e706613252affbac36d635a8f88193a71 100644 --- a/mace/core/runtime/cpu/cpu_runtime.cc +++ b/mace/core/runtime/cpu/cpu_runtime.cc @@ -68,7 +68,7 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, #else MACE_UNUSED(omp_num_threads); MACE_UNUSED(schedule_policy); - LOG(WARNING) << "Set OpenMP threads number failed: OpenMP not enabled."; + VLOG(2) << "Set OpenMP threads number failed: OpenMP not enabled."; #endif #ifdef MACE_ENABLE_OPENMP @@ -143,7 +143,7 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy( #ifdef MACE_ENABLE_OPENMP omp_set_num_threads(num_threads_hint); #else - LOG(WARNING) << "Set OpenMP threads number failed: OpenMP not enabled."; + VLOG(2) << "Set OpenMP threads number failed: OpenMP not enabled."; #endif return MaceStatus::MACE_SUCCESS; } diff --git a/mace/core/runtime/cpu/cpu_runtime.h b/mace/core/runtime/cpu/cpu_runtime.h index 08584dd91865b33c23b8cdf42e696b43390b14b9..f8cb2111cedd5fa125a6dae264c2210359900c8d 100644 --- a/mace/core/runtime/cpu/cpu_runtime.h +++ b/mace/core/runtime/cpu/cpu_runtime.h @@ -35,24 +35,17 @@ class CPURuntime { public: CPURuntime(const int num_threads, CPUAffinityPolicy policy, - bool use_gemmlowp) + utils::ThreadPool *thread_pool) : num_threads_(num_threads), policy_(policy), gemm_context_(nullptr), - thread_pool_(static_cast(num_threads), policy) { + thread_pool_(thread_pool) { #ifdef MACE_ENABLE_QUANTIZE - if (use_gemmlowp) { - MACE_CHECK_NOTNULL(GetGemmlowpContext()); - } -#else - MACE_UNUSED(use_gemmlowp); + MACE_CHECK_NOTNULL(GetGemmlowpContext()); #endif // MACE_ENABLE_QUANTIZE SetOpenMPThreadsAndAffinityPolicy(num_threads_, policy_, gemm_context_); - // TODO(liyin): After we replace OpenMP to thread_pool, uncomment the - // following line. 
- // thread_pool_.Init(); } #ifdef MACE_ENABLE_QUANTIZE @@ -80,12 +73,8 @@ class CPURuntime { return policy_; } - bool use_gemmlowp() const { - return gemm_context_ != nullptr; - } - utils::ThreadPool &thread_pool() { - return thread_pool_; + return *thread_pool_; } private: @@ -97,7 +86,7 @@ class CPURuntime { int num_threads_; CPUAffinityPolicy policy_; void *gemm_context_; - utils::ThreadPool thread_pool_; + utils::ThreadPool *thread_pool_; }; } // namespace mace diff --git a/mace/core/runtime/hexagon/hexagon_device.h b/mace/core/runtime/hexagon/hexagon_device.h index f80607d3196582f850d0911fec0429784cabaca0..b17b19e5469cb5bb01e42f9beecdba286d8454af 100644 --- a/mace/core/runtime/hexagon/hexagon_device.h +++ b/mace/core/runtime/hexagon/hexagon_device.h @@ -31,8 +31,9 @@ namespace mace { class HexagonDevice : public CPUDevice { public: - explicit HexagonDevice(DeviceType device_type) - : CPUDevice(0, AFFINITY_NONE, false), + explicit HexagonDevice(DeviceType device_type, + utils::ThreadPool *thread_pool) + : CPUDevice(0, AFFINITY_NONE, thread_pool), device_type_(device_type) {} DeviceType device_type() const override { @@ -44,9 +45,9 @@ class HexagonDevice : public CPUDevice { }; std::unique_ptr CreateHexagonControlWrapper( - DeviceType device_type) { + Device *device) { std::unique_ptr hexagon_controller; - + auto device_type = device->device_type(); switch (device_type) { #ifdef MACE_ENABLE_HEXAGON case HEXAGON: @@ -55,11 +56,10 @@ std::unique_ptr CreateHexagonControlWrapper( #endif #ifdef MACE_ENABLE_HTA case HTA: - hexagon_controller = make_unique(); + hexagon_controller = make_unique(device); break; #endif - default: - LOG(FATAL) << "Not supported Hexagon device type: " << device_type; + default:LOG(FATAL) << "Not supported Hexagon device type: " << device_type; } return hexagon_controller; diff --git a/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc index 0b285ee2bd7171a4f21baddfee31a0f695d48982..a617e7c7f5f534d8bb765529c28524c1807b96ea 100644 --- a/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc +++ b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "mace/core/runtime/hexagon/hexagon_dsp_wrapper.h" + #include #include #include @@ -22,7 +24,6 @@ #include #include -#include "mace/core/runtime/hexagon/hexagon_dsp_wrapper.h" #include "mace/core/runtime/hexagon/hexagon_dsp_ops.h" #include "mace/core/types.h" #include "mace/port/env.h" diff --git a/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc b/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc index c4191e7f25dff4f55d6cec283df7a6b0d733b94b..06dadc3a9ae986cf8da9d2da8e4e212edf3d93cd 100644 --- a/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc +++ b/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc @@ -26,11 +26,15 @@ #include "mace/core/runtime/hexagon/hexagon_hta_ops.h" #include "mace/core/types.h" #include "mace/utils/memory.h" -#include "mace/utils/quantize.h" +#include "mace/core/quantize.h" #include "third_party/hta/hta_hexagon_api.h" namespace mace { +HexagonHTAWrapper::HexagonHTAWrapper(Device *device) + : device_(device), quantize_util_(&device->cpu_runtime()->thread_pool()) { +} + int HexagonHTAWrapper::GetVersion() { int version; MACE_CHECK(hexagon_hta_nn_version(&version) == 0, "get version error"); @@ -237,8 +241,8 @@ bool HexagonHTAWrapper::ExecuteGraph(const Tensor &input_tensor, } bool HexagonHTAWrapper::ExecuteGraphNew( - const std::map &input_tensors, - std::map *output_tensors) { + const std::map &input_tensors, + std::map *output_tensors) { VLOG(2) << "Execute graph new: " << nn_id_; uint32_t num_inputs = static_cast(input_tensors.size()); uint32_t num_outputs = static_cast(output_tensors->size()); @@ -261,11 +265,11 @@ bool HexagonHTAWrapper::ExecuteGraphNew( const float *input_data = input_tensor->data(); uint8_t *input_data_u8 = input_info_[i].tensor_u8->mutable_data(); - QuantizeWithScaleAndZeropoint(input_data, - input_tensor->size(), - input_info_[i].scale, - input_info_[i].zero_point, - input_data_u8); + quantize_util_.QuantizeWithScaleAndZeropoint(input_data, + input_tensor->size(), + input_info_[i].scale, + input_info_[i].zero_point, + input_data_u8); inputs[i].data = const_cast( reinterpret_cast( @@ -315,11 +319,11 @@ bool HexagonHTAWrapper::ExecuteGraphNew( const uint8_t *output_data_u8 = output_info_[i].tensor_u8->data(); float *output_data = output_tensor->mutable_data(); - Dequantize(output_data_u8, - output_info_[i].tensor_u8->size(), - output_info_[i].scale, - output_info_[i].zero_point, - output_data); + quantize_util_.Dequantize(output_data_u8, + output_info_[i].tensor_u8->size(), + output_info_[i].scale, + output_info_[i].zero_point, + output_data); } return res == 0; diff --git a/mace/core/runtime/hexagon/hexagon_hta_wrapper.h b/mace/core/runtime/hexagon/hexagon_hta_wrapper.h index 66d02e0290c82f7bfcadf17cdba94db8e035db94..af8294b1c2993111ba9f5d31986d6c8346a765a8 100644 --- a/mace/core/runtime/hexagon/hexagon_hta_wrapper.h +++ b/mace/core/runtime/hexagon/hexagon_hta_wrapper.h @@ -19,15 +19,18 @@ #include #include +#include "mace/utils/thread_pool.h" +#include "mace/core/quantize.h" #include "mace/core/runtime/hexagon/hexagon_control_wrapper.h" #include "mace/core/tensor.h" +#include "mace/core/device.h" #include "mace/public/mace.h" namespace mace { class HexagonHTAWrapper : public HexagonControlWrapper { public: - HexagonHTAWrapper() = default; + explicit HexagonHTAWrapper(Device *device); int GetVersion() override; bool Config() override; @@ -46,6 +49,9 @@ class HexagonHTAWrapper : public HexagonControlWrapper { void ResetPerfInfo() override; void SetDebugLevel(int level) override; + private: + Device *device_; + QuantizeUtil 
quantize_util_; MACE_DISABLE_COPY_AND_ASSIGN(HexagonHTAWrapper); }; } // namespace mace diff --git a/mace/core/runtime/opencl/gpu_device.cc b/mace/core/runtime/opencl/gpu_device.cc index 2bdf6802af34983fa1d0b1c3ae8527b46f762152..a4d3f8b268679f89bce320bb08b84330a02cdbdb 100644 --- a/mace/core/runtime/opencl/gpu_device.cc +++ b/mace/core/runtime/opencl/gpu_device.cc @@ -25,8 +25,10 @@ GPUDevice::GPUDevice(std::shared_ptr> tuner, std::shared_ptr opencl_binary_storage, const int num_threads, CPUAffinityPolicy cpu_affinity_policy, - bool use_gemmlowp) : - CPUDevice(num_threads, cpu_affinity_policy, use_gemmlowp), + utils::ThreadPool *thread_pool) : + CPUDevice(num_threads, + cpu_affinity_policy, + thread_pool), runtime_(new OpenCLRuntime(opencl_cache_storage, priority, perf, opencl_binary_storage, tuner)), allocator_(new OpenCLAllocator(runtime_.get())), @@ -35,7 +37,7 @@ GPUDevice::GPUDevice(std::shared_ptr> tuner, GPUDevice::~GPUDevice() = default; -GPURuntime* GPUDevice::gpu_runtime() { +GPURuntime *GPUDevice::gpu_runtime() { return gpu_runtime_.get(); } diff --git a/mace/core/runtime/opencl/gpu_device.h b/mace/core/runtime/opencl/gpu_device.h index 768ea378b5bf3dd2128b2cceb97cfca69e0f0323..ef2ceb5a46e943a337b713b2d6c1b7ee846153e5 100644 --- a/mace/core/runtime/opencl/gpu_device.h +++ b/mace/core/runtime/opencl/gpu_device.h @@ -33,7 +33,7 @@ class GPUDevice : public CPUDevice { std::shared_ptr opencl_binary_storage = nullptr, const int num_threads = -1, CPUAffinityPolicy cpu_affinity_policy = AFFINITY_NONE, - bool use_gemmlowp = false); + utils::ThreadPool *thread_pool = nullptr); ~GPUDevice(); GPURuntime *gpu_runtime() override; Allocator *allocator() override; diff --git a/mace/core/testing/test_benchmark.h b/mace/core/testing/test_benchmark.h index 89494eabc4effeca33896ea1fd411acb640a35f4..1082510ac33e067c5ef27c0bc45cce7a9b978540 100644 --- a/mace/core/testing/test_benchmark.h +++ b/mace/core/testing/test_benchmark.h @@ -20,6 +20,8 @@ #include #include +#include "mace/core/types.h" + #define MACE_BENCHMARK(n) \ static ::mace::testing::Benchmark *__benchmark_##n = \ (new ::mace::testing::Benchmark(#n, (n))) diff --git a/mace/core/testing/test_benchmark_main.cc b/mace/core/testing/test_benchmark_main.cc index da78d3ffc1646e2187f05db04d16dc16c96a8acf..9da650f8192f5e384a2367098938deff19902d81 100644 --- a/mace/core/testing/test_benchmark_main.cc +++ b/mace/core/testing/test_benchmark_main.cc @@ -33,8 +33,7 @@ int main(int argc, char **argv) { // config runtime mace::ops::test::OpTestContext::Get( FLAGS_omp_num_threads, - static_cast(FLAGS_cpu_affinity_policy), - true); + static_cast(FLAGS_cpu_affinity_policy)); mace::testing::Benchmark::Run(FLAGS_filter.c_str()); return 0; diff --git a/mace/core/types.h b/mace/core/types.h index 4ac00a54f736ace49e50219b81f32a53996272c4..8dde57fd48d4bfd29405b28bfdcbc05a67d0c897 100644 --- a/mace/core/types.h +++ b/mace/core/types.h @@ -54,6 +54,12 @@ MACE_MAPPING_DATA_TYPE_AND_ENUM(half, DT_HALF); MACE_MAPPING_DATA_TYPE_AND_ENUM(float, DT_FLOAT); MACE_MAPPING_DATA_TYPE_AND_ENUM(uint8_t, DT_UINT8); MACE_MAPPING_DATA_TYPE_AND_ENUM(int32_t, DT_INT32); + +enum FrameworkType { + TENSORFLOW = 0, + CAFFE = 1, +}; + } // namespace mace #endif // MACE_CORE_TYPES_H_ diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index 8009fda180a7d186ec9e27b0c0751cd34eeb0a11..7cb97fe77cb1a7f4ee6e2e1cf41aaa0d2062070e 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -19,7 +19,7 @@ #include "mace/core/arg_helper.h" #include "mace/core/memory_optimizer.h" 
-#include "mace/utils/quantize.h" +#include "mace/core/quantize.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/opencl_runtime.h" @@ -95,8 +95,8 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, model_data_size = std::max( model_data_size, static_cast(const_tensor.offset() + - const_tensor.data_size() * - GetEnumTypeSize(const_tensor.data_type()))); + const_tensor.data_size() * + GetEnumTypeSize(const_tensor.data_type()))); } VLOG(3) << "Model data size: " << model_data_size; @@ -163,11 +163,13 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, auto quantized_data = reinterpret_cast( model_data + const_tensor.offset()); auto dequantized_data = tensor->mutable_data(); - Dequantize(quantized_data, - tensor->size(), - const_tensor.scale(), - const_tensor.zero_point(), - dequantized_data); + QuantizeUtil + quantize_util(&device->cpu_runtime()->thread_pool()); + quantize_util.Dequantize(quantized_data, + tensor->size(), + const_tensor.scale(), + const_tensor.zero_point(), + dequantized_data); } else { tensor->CopyBytes(model_data + const_tensor.offset(), const_tensor.data_size() * @@ -185,14 +187,14 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, if (device_type == DeviceType::CPU) { tensor_buffer_ = std::unique_ptr( new Buffer(device->allocator(), - const_cast(model_data), + const_cast(model_data), model_data_size)); } else { tensor_buffer_ = std::unique_ptr( new Buffer(device->allocator())); MACE_RETURN_IF_ERROR(tensor_buffer_->Allocate(model_data_size)); tensor_buffer_->Map(nullptr); - tensor_buffer_->Copy(const_cast(model_data), + tensor_buffer_->Copy(const_cast(model_data), 0, model_data_size); tensor_buffer_->UnMap(); } diff --git a/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc b/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc index 442e3a1a25f5d22bc8198e0c5b6d87894738f4e8..f3aac339dc957f33432e7036e86c19a2951bfbfa 100755 --- a/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc +++ b/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc @@ -112,8 +112,7 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine( mace::MaceEngineConfig config(mace_context.device_type); status = config.SetCPUThreadPolicy( omp_num_threads, - static_cast(cpu_affinity_policy), - true); + static_cast(cpu_affinity_policy)); if (status != mace::MaceStatus::MACE_SUCCESS) { __android_log_print(ANDROID_LOG_ERROR, "image_classify attrs", diff --git a/mace/examples/cli/BUILD.bazel b/mace/examples/cli/BUILD.bazel index efd4454dafa4fa6d790908b6234822532b0c4098..edd9170d56399309c71b183f44e21ebffe7b3c6d 100644 --- a/mace/examples/cli/BUILD.bazel +++ b/mace/examples/cli/BUILD.bazel @@ -5,6 +5,7 @@ load( "if_darwin", "if_hexagon_enabled", "if_hta_enabled", + "if_linux", "if_opencl_enabled", "if_openmp_enabled", ) @@ -21,13 +22,12 @@ cc_binary( linkopts = [ "-lm", "-ldl", - ] + if_darwin( - [], + ] + if_linux(["-lpthread"]) + if_darwin( + ["-lpthread"], default_value = ["-fuse-ld=gold"], ) + if_openmp_enabled([ "-fopenmp", ]) + if_android([ - "-ldl", "-pie", "-llog", ]), @@ -60,11 +60,10 @@ cc_binary( linkopts = [ "-lm", "-ldl", - ] + if_darwin( - [], + ] + if_linux(["-lpthread"]) + if_darwin( + ["-lpthread"], default_value = ["-fuse-ld=gold"], ) + if_android([ - "-ldl", "-pie", "-llog", ]), diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index da43f5e27dafdecf90c3e18ce83bb66d42165343..c5e16b762a57e6eddcebc269d7f369ffabac28dd 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -149,7 
+149,7 @@ GPUContextBuilder &GPUContextBuilder::SetOpenCLBinaryPaths( return *this; } -GPUContextBuilder& GPUContextBuilder::SetOpenCLBinary( +GPUContextBuilder &GPUContextBuilder::SetOpenCLBinary( const unsigned char *data, const size_t size) { impl_->SetOpenCLBinary(data, size); return *this; @@ -161,7 +161,7 @@ GPUContextBuilder &GPUContextBuilder::SetOpenCLParameterPath( return *this; } -GPUContextBuilder& GPUContextBuilder::SetOpenCLParameter( +GPUContextBuilder &GPUContextBuilder::SetOpenCLParameter( const unsigned char *data, const size_t size) { impl_->SetOpenCLParameter(data, size); return *this; @@ -181,8 +181,7 @@ class MaceEngineConfig::Impl { MaceStatus SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint); MaceStatus SetCPUThreadPolicy(int num_threads_hint, - CPUAffinityPolicy policy, - bool use_gemmlowp); + CPUAffinityPolicy policy); inline DeviceType device_type() const { return device_type_; @@ -196,10 +195,6 @@ class MaceEngineConfig::Impl { return cpu_affinity_policy_; } - inline bool use_gemmlowp() const { - return use_gemmlowp_; - } - inline std::shared_ptr gpu_context() const { return gpu_context_; } @@ -216,7 +211,6 @@ class MaceEngineConfig::Impl { DeviceType device_type_; int num_threads_; CPUAffinityPolicy cpu_affinity_policy_; - bool use_gemmlowp_; std::shared_ptr gpu_context_; GPUPriorityHint gpu_priority_hint_; GPUPerfHint gpu_perf_hint_; @@ -226,7 +220,6 @@ MaceEngineConfig::Impl::Impl(const DeviceType device_type) : device_type_(device_type), num_threads_(-1), cpu_affinity_policy_(CPUAffinityPolicy::AFFINITY_NONE), - use_gemmlowp_(false), gpu_context_(new GPUContext), gpu_priority_hint_(GPUPriorityHint::PRIORITY_LOW), gpu_perf_hint_(GPUPerfHint::PERF_NORMAL) {} @@ -247,15 +240,12 @@ MaceStatus MaceEngineConfig::Impl::SetGPUHints( MaceStatus MaceEngineConfig::Impl::SetCPUThreadPolicy( int num_threads, - CPUAffinityPolicy policy, - bool use_gemmlowp) { + CPUAffinityPolicy policy) { num_threads_ = num_threads; cpu_affinity_policy_ = policy; - use_gemmlowp_ = use_gemmlowp; return MaceStatus::MACE_SUCCESS; } - MaceEngineConfig::MaceEngineConfig( const DeviceType device_type) : impl_(new MaceEngineConfig::Impl(device_type)) {} @@ -275,9 +265,8 @@ MaceStatus MaceEngineConfig::SetGPUHints( MaceStatus MaceEngineConfig::SetCPUThreadPolicy( int num_threads_hint, - CPUAffinityPolicy policy, - bool use_gemmlowp) { - return impl_->SetCPUThreadPolicy(num_threads_hint, policy, use_gemmlowp); + CPUAffinityPolicy policy) { + return impl_->SetCPUThreadPolicy(num_threads_hint, policy); } // Mace Tensor @@ -407,6 +396,7 @@ class MaceEngine::Impl { #endif std::map input_info_map_; std::map output_info_map_; + std::unique_ptr thread_pool_; MACE_DISABLE_COPY_AND_ASSIGN(Impl); }; @@ -418,16 +408,19 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config) device_(nullptr), ws_(new Workspace()), net_(nullptr), - is_quantized_model_(false) + is_quantized_model_(false), + thread_pool_(new utils::ThreadPool(config.impl_->num_threads(), + config.impl_->cpu_affinity_policy())) #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) - , hexagon_controller_(nullptr) +, hexagon_controller_(nullptr) #endif { LOG(INFO) << "Creating MaceEngine, MACE version: " << MaceVersion(); + thread_pool_->Init(); if (device_type_ == DeviceType::CPU) { device_.reset(new CPUDevice(config.impl_->num_threads(), config.impl_->cpu_affinity_policy(), - config.impl_->use_gemmlowp())); + thread_pool_.get())); } #ifdef MACE_ENABLE_OPENCL if (device_type_ == DeviceType::GPU) { @@ -439,12 +432,13 @@ 
MaceEngine::Impl::Impl(const MaceEngineConfig &config) config.impl_->gpu_context()->opencl_binary_storage(), config.impl_->num_threads(), config.impl_->cpu_affinity_policy(), - config.impl_->use_gemmlowp())); + thread_pool_.get())); } #endif #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) - if (device_type_ == DeviceType::HEXAGON || device_type_ == DeviceType::HTA) { - device_.reset(new HexagonDevice(device_type_)); + if (device_type_ == DeviceType::HEXAGON + || device_type_ == DeviceType::HTA) { + device_.reset(new HexagonDevice(device_type_, thread_pool_.get())); } #endif MACE_CHECK_NOTNULL(device_); @@ -506,7 +500,7 @@ MaceStatus MaceEngine::Impl::Init( } #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) if (device_type_ == HEXAGON || device_type_ == HTA) { - hexagon_controller_ = CreateHexagonControlWrapper(device_type_); + hexagon_controller_ = CreateHexagonControlWrapper(device_.get()); MACE_CHECK(hexagon_controller_->Config(), "hexagon config error"); MACE_CHECK(hexagon_controller_->Init(), "hexagon init error"); hexagon_controller_->SetDebugLevel( @@ -518,26 +512,26 @@ MaceStatus MaceEngine::Impl::Init( } } else { #endif - MACE_RETURN_IF_ERROR(ws_->LoadModelTensor(*net_def, - device_.get(), - model_data)); - - MemoryOptimizer mem_optimizer; - // Init model - net_ = std::unique_ptr(new SerialNet(op_registry_.get(), - net_def, - ws_.get(), - device_.get(), - &mem_optimizer)); - - // Preallocate all output tensors of ops - MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(*net_def, - &mem_optimizer, - device_.get())); - if (device_type_ == DeviceType::GPU) { - ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator()); - } - MACE_RETURN_IF_ERROR(net_->Init()); + MACE_RETURN_IF_ERROR(ws_->LoadModelTensor(*net_def, + device_.get(), + model_data)); + + MemoryOptimizer mem_optimizer; + // Init model + net_ = std::unique_ptr(new SerialNet(op_registry_.get(), + net_def, + ws_.get(), + device_.get(), + &mem_optimizer)); + + // Preallocate all output tensors of ops + MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(*net_def, + &mem_optimizer, + device_.get())); + if (device_type_ == DeviceType::GPU) { + ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator()); + } + MACE_RETURN_IF_ERROR(net_->Init()); #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) } #endif @@ -554,10 +548,10 @@ MaceStatus MaceEngine::Impl::Init( auto fs = GetFileSystem(); MACE_RETURN_IF_ERROR(fs->NewReadOnlyMemoryRegionFromFile( - model_data_file.c_str(), &model_data_)); + model_data_file.c_str(), &model_data_)); MACE_RETURN_IF_ERROR(Init(net_def, input_nodes, output_nodes, - reinterpret_cast(model_data_->data()))); + reinterpret_cast(model_data_->data()))); if (device_type_ == DeviceType::GPU || device_type_ == DeviceType::HEXAGON || device_type_ == DeviceType::HTA || @@ -611,18 +605,18 @@ MaceStatus MaceEngine::Impl::TransposeInput( Tensor::MappingGuard input_guard(input_tensor); if (input_dt == DataType::DT_FLOAT) { auto input_data = input_tensor->mutable_data(); - return ops::Transpose(input.second.data().get(), + return ops::Transpose(thread_pool_.get(), + input.second.data().get(), input.second.shape(), dst_dims, - input_data, - input_dt); + input_data); } else if (input_dt == DataType::DT_INT32) { auto input_data = input_tensor->mutable_data(); - return ops::Transpose(input.second.data().get(), + return ops::Transpose(thread_pool_.get(), + input.second.data().get(), input.second.shape(), dst_dims, - input_data, - input_dt); + input_data); } else { 
LOG(FATAL) << "MACE do not support the input data type: " << input_dt; } @@ -668,7 +662,7 @@ MaceStatus MaceEngine::Impl::TransposeOutput( output->second.data_format() == NCHW) { dst_dims = {0, 3, 1, 2}; } else { - LOG(FATAL) <<"Not supported output data format: " + LOG(FATAL) << "Not supported output data format: " << output->second.data_format() << " vs " << output_tensor->data_format(); } @@ -688,17 +682,18 @@ MaceStatus MaceEngine::Impl::TransposeOutput( Tensor::MappingGuard output_guard(output_tensor); if (output_dt == DataType::DT_FLOAT) { auto output_data = output_tensor->data(); - return ops::Transpose(output_data, + return ops::Transpose(thread_pool_.get(), + output_data, output_tensor->shape(), dst_dims, output->second.data().get()); } else if (output_dt == DataType::DT_INT32) { auto output_data = output_tensor->data(); - return ops::Transpose(output_data, + return ops::Transpose(thread_pool_.get(), + output_data, output_tensor->shape(), dst_dims, - output->second.data().get(), - output_dt); + output->second.data().get()); } else { LOG(FATAL) << "MACE do not support the output data type: " << output_dt; return MaceStatus::MACE_INVALID_ARGS; @@ -719,8 +714,8 @@ MaceStatus MaceEngine::Impl::TransposeOutput( output_size * sizeof(float)); } else if (output_dt == DataType::DT_INT32) { std::memcpy(output->second.data().get(), - output_tensor->data(), - output_size * sizeof(int)); + output_tensor->data(), + output_size * sizeof(int)); } else { LOG(FATAL) << "MACE do not support the output data type: " << output_dt; } @@ -736,8 +731,8 @@ MaceStatus MaceEngine::Impl::Run( std::map *outputs, RunMetadata *run_metadata) { MACE_CHECK_NOTNULL(outputs); - std::map input_tensors; - std::map output_tensors; + std::map input_tensors; + std::map output_tensors; for (auto &input : inputs) { if (input_info_map_.find(input.first) == input_info_map_.end()) { LOG(FATAL) << "'" << input.first @@ -766,7 +761,7 @@ MaceStatus MaceEngine::Impl::Run( hexagon_controller_->ExecuteGraphNew(input_tensors, &output_tensors); } else { #endif - MACE_RETURN_IF_ERROR(net_->Run(run_metadata)); + MACE_RETURN_IF_ERROR(net_->Run(run_metadata)); #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) } #endif @@ -785,7 +780,7 @@ MaceStatus MaceEngine::Impl::Run( return MaceStatus::MACE_SUCCESS; } -MaceEngine::MaceEngine(const MaceEngineConfig &config): +MaceEngine::MaceEngine(const MaceEngineConfig &config) : impl_(make_unique(config)) {} MaceEngine::~MaceEngine() = default; @@ -797,7 +792,6 @@ MaceStatus MaceEngine::Init(const NetDef *net_def, return impl_->Init(net_def, input_nodes, output_nodes, model_data); } - MaceStatus MaceEngine::Init(const NetDef *net_def, const std::vector &input_nodes, const std::vector &output_nodes, diff --git a/mace/ops/BUILD.bazel b/mace/ops/BUILD.bazel index 255250fd945e388981fb46f7fa5443f624059227..5d2d2cb26668c6ac304c38fbbe14c8e95da96303 100644 --- a/mace/ops/BUILD.bazel +++ b/mace/ops/BUILD.bazel @@ -279,7 +279,6 @@ cc_library( srcs = glob( [ "*.cc", - "arm/*.cc", # remove it after refactor ], exclude = [ "*_test.cc", @@ -303,7 +302,6 @@ cc_library( hdrs = glob( [ "*.h", - "arm/*.h", # remove it after refactor ], exclude = [ "ops_registry.h", diff --git a/mace/ops/activation.cc b/mace/ops/activation.cc index 29fee227df0ebac83d9a2e8c9a275a62aff8c68a..bcdcd8e062b21c91b3a44bf8dd999237a385f3c6 100644 --- a/mace/ops/activation.cc +++ b/mace/ops/activation.cc @@ -15,9 +15,14 @@ #include "mace/ops/activation.h" #include - #include "mace/core/operator.h" +#if defined(MACE_ENABLE_NEON) 
+#include "mace/ops/arm/fp32/activation.h" +#else +#include "mace/ops/ref/activation.h" +#endif + #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/activation.h" @@ -27,52 +32,54 @@ namespace mace { namespace ops { -template +template class ActivationOp; -template <> +template<> class ActivationOp : public Operation { public: explicit ActivationOp(OpConstructContext *context) : Operation(context), - activation_(ops::StringToActivationType( + activation_type_(ops::StringToActivationType( Operation::GetOptionalArg("activation", "NOOP"))), - relux_max_limit_(Operation::GetOptionalArg("max_limit", - 0.0f)), - leakyrelu_coefficient_(Operation::GetOptionalArg( - "leakyrelu_coefficient", 0.0f)) {} + activation_delegator_(activation_type_, + Operation::GetOptionalArg("max_limit", + 0.0f), + Operation::GetOptionalArg( + "leakyrelu_coefficient", 0.0f)) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); const Tensor *input = this->Input(0); Tensor *output = this->Output(0); - MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - const float *input_ptr = input->data(); - float *output_ptr = output->mutable_data(); - if (activation_ == PRELU) { + if (activation_type_ == PRELU) { + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + const float *input_ptr = input->data(); + float *output_ptr = output->mutable_data(); MACE_CHECK(this->InputSize() > 1); const Tensor *alpha = this->Input(1); const float *alpha_ptr = alpha->data(); const index_t outer_size = output->dim(0); const index_t inner_size = output->dim(2) * output->dim(3); - PReLUActivation(input_ptr, outer_size, input->dim(1), inner_size, + PReLUActivation(context, input_ptr, outer_size, input->dim(1), inner_size, alpha_ptr, output_ptr); } else { - DoActivation(input_ptr, output_ptr, output->size(), activation_, - relux_max_limit_, leakyrelu_coefficient_); + activation_delegator_.Compute(context, input, output); } return MaceStatus::MACE_SUCCESS; } private: - ActivationType activation_; - float relux_max_limit_; - float leakyrelu_coefficient_; + ActivationType activation_type_; +#if defined(MACE_ENABLE_NEON) + arm::fp32::Activation activation_delegator_; +#else + ref::Activation activation_delegator_; +#endif // MACE_ENABLE_NEON }; - #ifdef MACE_ENABLE_OPENCL template class ActivationOp : public Operation { @@ -114,7 +121,6 @@ class ActivationOp : public Operation { }; #endif // MACE_ENABLE_OPENCL - void RegisterActivation(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Activation", ActivationOp, DeviceType::CPU, float); diff --git a/mace/ops/activation.h b/mace/ops/activation.h index 9981652c78d4290289fc2ce8392adc6550fe267c..9ceae6e07ff983e5c577406d60b6616c56da4fc3 100644 --- a/mace/ops/activation.h +++ b/mace/ops/activation.h @@ -20,8 +20,8 @@ #include #include "mace/core/types.h" +#include "mace/core/op_context.h" #include "mace/ops/common/activation_type.h" -#include "mace/ops/arm/activation_neon.h" #include "mace/utils/logging.h" namespace mace { @@ -41,118 +41,39 @@ inline ActivationType StringToActivationType(const std::string type) { } else if (type == "NOOP") { return ActivationType::NOOP; } else if (type == "LEAKYRELU") { - return ActivationType ::LEAKYRELU; + return ActivationType::LEAKYRELU; } else { LOG(FATAL) << "Unknown activation type: " << type; } return ActivationType::NOOP; } -template -void DoActivation(const T *input_ptr, - T *output_ptr, - const index_t size, - const ActivationType type, - const float relux_max_limit, - const float 
leakyrelu_coefficient) { - MACE_CHECK(DataTypeToEnum::value != DataType::DT_HALF); - - switch (type) { - case NOOP: - break; - case RELU: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = std::max(input_ptr[i], static_cast(0)); - } - break; - case RELUX: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = std::min(std::max(input_ptr[i], static_cast(0)), - static_cast(relux_max_limit)); - } - break; - case TANH: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = std::tanh(input_ptr[i]); - } - break; - case SIGMOID: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = 1 / (1 + std::exp(-input_ptr[i])); - } - break; - case LEAKYRELU: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = std::max(input_ptr[i], static_cast(0)) - + leakyrelu_coefficient * std::min(input_ptr[i], static_cast(0)); - } - break; - default: - LOG(FATAL) << "Unknown activation type: " << type; - } -} - -template<> -inline void DoActivation(const float *input_ptr, - float *output_ptr, - const index_t size, - const ActivationType type, - const float relux_max_limit, - const float leakyrelu_coefficient) { - switch (type) { - case NOOP: - break; - case RELU: - ReluNeon(input_ptr, size, output_ptr); - break; - case RELUX: - ReluxNeon(input_ptr, relux_max_limit, size, output_ptr); - break; - case TANH: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = std::tanh(input_ptr[i]); - } - break; - case SIGMOID: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = 1 / (1 + std::exp(-input_ptr[i])); - } - break; - case LEAKYRELU: - LeakyReluNeon(input_ptr, leakyrelu_coefficient, size, output_ptr); - break; - default: - LOG(FATAL) << "Unknown activation type: " << type; - } -} - -template -void PReLUActivation(const T *input_ptr, +template +void PReLUActivation(const OpContext *context, + const T *input_ptr, const index_t outer_size, const index_t input_chan, const index_t inner_size, const T *alpha_ptr, T *output_ptr) { -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t i = 0; i < outer_size; ++i) { - for (index_t chan_idx = 0; chan_idx < input_chan; ++chan_idx) { - for (index_t j = 0; j < inner_size; ++j) { - index_t idx = i * input_chan * inner_size + chan_idx * inner_size + j; - if (input_ptr[idx] < 0) { - output_ptr[idx] = input_ptr[idx] * alpha_ptr[chan_idx]; - } else { - output_ptr[idx] = input_ptr[idx]; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t chan_idx = start1; chan_idx < end1; chan_idx += step1) { + for (index_t j = 0; j < inner_size; ++j) { + index_t idx = i * input_chan * inner_size + chan_idx * inner_size + j; + if (input_ptr[idx] < 0) { + output_ptr[idx] = input_ptr[idx] * alpha_ptr[chan_idx]; + } else { + output_ptr[idx] = input_ptr[idx]; + } } } } - } + }, 0, outer_size, 1, 0, input_chan, 1); } } // namespace ops diff --git a/mace/ops/addn.cc b/mace/ops/addn.cc index 5e387d87684d833eb40c5ebe30e564ef74bb55cd..ea6458d475751a064cacb118cef64ef498a29e48 100644 --- a/mace/ops/addn.cc +++ b/mace/ops/addn.cc @@ -42,61 +42,23 @@ class 
AddNOp : public Operation { MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); - Tensor *output_tensor = this->Output(0); - size_t input_size = this->inputs_.size(); - MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(inputs_[0])); - index_t size = output_tensor->size(); - Tensor::MappingGuard output_map(output_tensor); - float *output_data = output_tensor->mutable_data(); - memset(output_data, 0, size * sizeof(float)); - int64_t cost = size * input_size; - int64_t groups = 1; - if (cost > kCostPerGroup) { - groups = cost / kCostPerGroup; - } - int64_t element_per_group = size / groups; + Tensor *output = this->Output(0); + MACE_RETURN_IF_ERROR(output->ResizeLike(inputs_[0])); + const index_t size = output->size(); - std::vector mappers; - for (size_t i = 0; i < input_size; ++i) { - MACE_CHECK(inputs_[0]->dim_size() == inputs_[i]->dim_size()); - MACE_CHECK(inputs_[0]->size() == inputs_[i]->size()) - << "Input 0: " << MakeString(inputs_[0]->shape()) - << ", size: " << inputs_[0]->size() << ". Input " << i << ": " - << MakeString(inputs_[i]->shape()) << ", size: " << inputs_[i]->size(); - mappers.emplace_back(Tensor::MappingGuard(inputs_[i])); - } + Tensor::MappingGuard output_guard(output); + auto output_data = output->mutable_data(); + memset(output_data, 0, size * sizeof(float)); -#pragma omp parallel for - for (int64_t i = 0; i < size; i += element_per_group) { - int64_t count = std::min(element_per_group, size - i); - int nn = count >> 2; - int remain = count - (nn << 2); - for (size_t j = 0; j < input_size; ++j) { - const float *input_data = inputs_[j]->data(); - const float *input_ptr = input_data + i; - float *output_ptr = output_data + i; - for (int k = 0; k < nn; ++k) { -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) - float32x4_t in = vld1q_f32(input_ptr); - float32x4_t out = vld1q_f32(output_ptr); - out = vaddq_f32(out, in); - vst1q_f32(output_ptr, out); -#else - for (int m = 0; m < 4; ++m) { - output_ptr[m] += input_ptr[m]; - } -#endif + for (auto &input : inputs_) { + Tensor::MappingGuard input_guard(input); + auto input_data = input->data(); - input_ptr += 4; - output_ptr += 4; - } - for (int k = 0; k < remain; ++k) { - *output_ptr += *input_ptr; - ++input_ptr; - ++output_ptr; - } + for (index_t j = 0; j < size; ++j) { + output_data[j] += input_data[j]; } } + return MaceStatus::MACE_SUCCESS; } }; diff --git a/mace/ops/argmax.cc b/mace/ops/argmax.cc index 71a1090205b15e38d4357fa78ded3883ef9ea536..32007d6ccbcd59cd78670ad7f46aced4a3e6fa4c 100644 --- a/mace/ops/argmax.cc +++ b/mace/ops/argmax.cc @@ -71,7 +71,6 @@ class ArgMaxOp : public Operation { index_t inner_size = input->dim(axis_value); if (argmin_) { -#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < outer_size; ++i) { int idx = 0; T min_value = std::numeric_limits::max(); @@ -85,7 +84,6 @@ class ArgMaxOp : public Operation { output_data[i] = idx; } } else { -#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < outer_size; ++i) { int idx = 0; T max_value = std::numeric_limits::lowest(); diff --git a/mace/ops/arm/activation_neon.cc b/mace/ops/arm/activation_neon.cc deleted file mode 100644 index 09cfd8d4e0e0bd7ba09bf5f7e31c1bb57afa818b..0000000000000000000000000000000000000000 --- a/mace/ops/arm/activation_neon.cc +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#if defined(MACE_ENABLE_NEON) -#include -#endif - -#include -#include "mace/ops/arm/activation_neon.h" - -namespace mace { -namespace ops { - -void ReluNeon(const float *input, const index_t size, float *output) { -#if defined(MACE_ENABLE_NEON) - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i <= size - 4; i += 4) { - float32x4_t v = vld1q_f32(input + i); - v = vmaxq_f32(v, vzero); - vst1q_f32(output + i, v); - } - // remain - for (index_t i = (size >> 2) << 2; i < size; ++i) { - output[i] = std::max(input[i], 0.f); - } -#else -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::max(input[i], 0.f); - } -#endif -} - -void ReluxNeon(const float *input, const float limit, - const index_t size, float *output) { -#if defined(MACE_ENABLE_NEON) - float32x4_t vzero = vdupq_n_f32(0.f); - float32x4_t vlimit = vdupq_n_f32(limit); -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i <= size - 4; i += 4) { - float32x4_t v = vld1q_f32(input + i); - v = vmaxq_f32(v, vzero); - v = vminq_f32(v, vlimit); - vst1q_f32(output + i, v); - } - // remain - for (index_t i = (size >> 2) << 2; i < size; ++i) { - output[i] = std::min(std::max(input[i], 0.f), limit); - } -#else -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::min(std::max(input[i], 0.f), limit); - } -#endif -} - -void LeakyReluNeon(const float *input, const float alpha, - const index_t size, float *output) { -#if defined(MACE_ENABLE_NEON) - float32x4_t vzero = vdupq_n_f32(0.f); - float32x4_t valpha = vdupq_n_f32(alpha); -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i <= size - 4; i += 4) { - float32x4_t v = vld1q_f32(input + i); - float32x4_t u = vminq_f32(v, vzero);; - v = vmaxq_f32(v, vzero); - v = vmlaq_f32(v, valpha, u); - - vst1q_f32(output + i, v); - } - // remain - for (index_t i = (size >> 2) << 2; i < size; ++i) { - output[i] = std::max(input[i], 0.f) + std::min(input[i], 0.f) * alpha; - } -#else -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::max(input[i], 0.f) + std::min(input[i], 0.f) * alpha; - } -#endif -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/arm/deconv_2d_neon.h b/mace/ops/arm/deconv_2d_neon.h deleted file mode 100644 index f45fa923bdd19c6420a4ab0e6b751541ce3b1f76..0000000000000000000000000000000000000000 --- a/mace/ops/arm/deconv_2d_neon.h +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_ARM_DECONV_2D_NEON_H_ -#define MACE_OPS_ARM_DECONV_2D_NEON_H_ - -#include "mace/core/types.h" -#include "mace/ops/arm/common_neon.h" - -namespace mace { -namespace ops { - -void Deconv2dNeonK2x2S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Deconv2dNeonK2x2S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Deconv2dNeonK3x3S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Deconv2dNeonK3x3S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Deconv2dNeonK4x4S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Deconv2dNeonK4x4S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ARM_DECONV_2D_NEON_H_ diff --git a/mace/ops/arm/deconv_2d_neon_2x2.cc b/mace/ops/arm/deconv_2d_neon_2x2.cc deleted file mode 100644 index 674864c8b6527631d4d5800a9e892bc662826bc7..0000000000000000000000000000000000000000 --- a/mace/ops/arm/deconv_2d_neon_2x2.cc +++ /dev/null @@ -1,262 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
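The deleted header above declares six fixed-size deconvolution kernels (2x2/3x3/4x4, stride 1 and 2). As the scalar tail loops in the removed implementations below show, they all compute transposed convolution as a scatter-accumulate: each input pixel is multiplied by the whole KxK filter and added into a KxK window of the output. A single-channel scalar sketch of that pattern (names and signature are illustrative, not the MACE API; batching, channels and output cropping are handled by the caller in the real code):

```cpp
// Scalar reference for the pattern the deleted NEON kernels vectorize.
// The output buffer is assumed to be pre-zeroed, with
// out_h = (in_h - 1) * stride + k and out_w = (in_w - 1) * stride + k.
void Deconv2dScalar(const float *input, int in_h, int in_w,
                    const float *filter, int k,    // k x k filter
                    int stride,
                    float *output, int out_w) {
  for (int i = 0; i < in_h; ++i) {
    for (int j = 0; j < in_w; ++j) {
      const float val = input[i * in_w + j];
      for (int di = 0; di < k; ++di) {
        for (int dj = 0; dj < k; ++dj) {
          output[(i * stride + di) * out_w + (j * stride + dj)] +=
              val * filter[di * k + dj];
        }
      }
    }
  }
}
```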
- -#include "mace/utils/macros.h" -#include "mace/ops/arm/deconv_2d_neon.h" - -namespace mace { -namespace ops { - -void Deconv2dNeonK2x2S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - - const index_t out_img_size = outh * outw; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t oc = 0; oc < outch; oc += 2) { - if (oc + 1 < outch) { - float *out_base0 = output + (b * outch + oc) * out_img_size; - float *out_base1 = out_base0 + out_img_size; - for (index_t ic = 0; ic < inch; ++ic) { - const float *input_base = input + (b * inch + ic) * h * w; - const float *kernel_base0 = filter + (oc * inch + ic) * 4; - const float *kernel_base1 = kernel_base0 + inch * 4; - const float *in = input_base; - // output channel 0 - const float *k0 = kernel_base0; - // output channel 1 - const float *k1 = kernel_base1; -#if defined(MACE_ENABLE_NEON) - // load filter - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); -#endif - for (index_t i = 0; i < h; ++i) { - float *out_row_base0 = out_base0 + i * outw; - float *out_row0_0 = out_row_base0; - float *out_row0_1 = out_row_base0 + outw; - - float *out_row_base1 = out_base1 + i * outw; - float *out_row1_0 = out_row_base1; - float *out_row1_1 = out_row_base1 + outw; - - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t in_vec = vld1q_f32(in); - - float32x4_t out00, out01, out02, out03; - float32x4_t out10, out11, out12, out13; - - out00 = vld1q_f32(out_row0_0); - out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); - vst1q_f32(out_row0_0, out00); - - out01 = vld1q_f32(out_row0_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); - vst1q_f32(out_row0_0 + 1, out01); - - out02 = vld1q_f32(out_row0_1); - out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); - vst1q_f32(out_row0_1, out02); - - out03 = vld1q_f32(out_row0_1 + 1); - out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); - vst1q_f32(out_row0_1 + 1, out03); - - out10 = vld1q_f32(out_row1_0); - out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); - vst1q_f32(out_row1_0, out10); - - out11 = vld1q_f32(out_row1_0 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); - vst1q_f32(out_row1_0 + 1, out11); - - out12 = vld1q_f32(out_row1_1); - out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); - vst1q_f32(out_row1_1, out12); - - out13 = vld1q_f32(out_row1_1 + 1); - out13 = neon_vfma_lane_3(out13, in_vec, k1_vec); - vst1q_f32(out_row1_1 + 1, out13); - - in += 4; - out_row0_0 += 4; - out_row0_1 += 4; - out_row1_0 += 4; - out_row1_1 += 4; - } -#endif - for (; j < w; ++j) { - float val = in[0]; - for (int k = 0; k < 2; ++k) { - out_row0_0[k] += val * k0[k]; - out_row0_1[k] += val * k0[k + 2]; - out_row1_0[k] += val * k1[k]; - out_row1_1[k] += val * k1[k + 2]; - } - in++; - out_row0_0++; - out_row0_1++; - out_row1_0++; - out_row1_1++; - } - } - } - } else { - float *out_base0 = output + (b * outch + oc) * outh * outw; - for (index_t ic = 0; ic < inch; ++ic) { - const float *input_base = input + (b * inch + ic) * h * w; - const float *kernel_base0 = filter + (oc * inch + ic) * 4; - const float *in = input_base; - const float *k0 = kernel_base0; - -#if defined(MACE_ENABLE_NEON) - // load filter - float32x4_t k0_vec = 
vld1q_f32(k0); -#endif - for (index_t i = 0; i < h; ++i) { - float *out_row_base0 = out_base0 + i * outw; - float *out_row0_0 = out_row_base0; - float *out_row0_1 = out_row_base0 + outw; - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t in_vec = vld1q_f32(in); - float32x4_t out00, out01, out02, out03; - - out00 = vld1q_f32(out_row0_0); - out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); - vst1q_f32(out_row0_0, out00); - - out01 = vld1q_f32(out_row0_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); - vst1q_f32(out_row0_0 + 1, out01); - - out02 = vld1q_f32(out_row0_1); - out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); - vst1q_f32(out_row0_1, out02); - - out03 = vld1q_f32(out_row0_1 + 1); - out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); - vst1q_f32(out_row0_1 + 1, out03); - - in += 4; - out_row0_0 += 4; - out_row0_1 += 4; - } -#endif - for (; j < w; ++j) { - float val = in[0]; - for (int k = 0; k < 2; ++k) { - out_row0_0[k] += val * k0[k]; - out_row0_1[k] += val * k0[k + 2]; - } - in++; - out_row0_0++; - out_row0_1++; - } - } - } - } - } - } -} - -void Deconv2dNeonK2x2S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t out_img_size = outh * outw; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t oc = 0; oc < outch; ++oc) { - float *out_base = output + (b * outch + oc) * out_img_size; - for (index_t ic = 0; ic < inch; ++ic) { - const float *input_base = input + (b * inch + ic) * h * w; - const float *kernel_base = filter + (oc * inch + ic) * 4; - const float *in = input_base; - const float *k0 = kernel_base; -#if defined(MACE_ENABLE_NEON) - float32x4_t k0_vec = vld1q_f32(k0); -#endif - for (index_t i = 0; i < h; ++i) { - float *out_row_base = out_base + i * 2 * outw; - float *out_row_0 = out_row_base; - float *out_row_1 = out_row_0 + outw; - - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t in_vec = vld1q_f32(in); - - // out row 0 - float32x4x2_t out00 = vld2q_f32(out_row_0); - out00.val[0] = - neon_vfma_lane_0(out00.val[0], in_vec, k0_vec); - out00.val[1] = - neon_vfma_lane_1(out00.val[1], in_vec, k0_vec); - vst2q_f32(out_row_0, out00); - - // out row 1 - float32x4x2_t out10 = vld2q_f32(out_row_1); - out10.val[0] = - neon_vfma_lane_2(out10.val[0], in_vec, k0_vec); - out10.val[1] = - neon_vfma_lane_3(out10.val[1], in_vec, k0_vec); - vst2q_f32(out_row_1, out10); - - in += 4; - out_row_0 += 8; - out_row_1 += 8; - } -#endif - for (; j < w; ++j) { - float val = in[0]; - for (int k = 0; k < 2; ++k) { - out_row_0[k] += val * k0[k]; - out_row_1[k] += val * k0[k + 2]; - } - in++; - out_row_0 += 2; - out_row_1 += 2; - } - } - } - } - } -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/arm/deconv_2d_neon_3x3.cc b/mace/ops/arm/deconv_2d_neon_3x3.cc deleted file mode 100644 index 04f62325817f5a02919ea859c3e5c5ba4a974f40..0000000000000000000000000000000000000000 --- a/mace/ops/arm/deconv_2d_neon_3x3.cc +++ /dev/null @@ -1,392 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
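The 3x3 and 4x4 kernels that follow lean heavily on the neon_vfma_lane_0..3 helpers from mace/ops/arm/common_neon.h, which this patch does not touch. Judging from their use (acc = neon_vfma_lane_N(acc, in_vec, k_vec)), they are fused multiply-accumulate wrappers that broadcast one lane of the filter vector; a presumed sketch, not the actual MACE source:

```cpp
#include <arm_neon.h>

// Presumed shape of the helpers: acc += v * k[lane].
static inline float32x4_t neon_vfma_lane_0(float32x4_t acc, float32x4_t v,
                                            float32x4_t k) {
#if defined(__aarch64__)
  return vfmaq_laneq_f32(acc, v, k, 0);
#else
  return vmlaq_lane_f32(acc, v, vget_low_f32(k), 0);
#endif
}

static inline float32x4_t neon_vfma_lane_2(float32x4_t acc, float32x4_t v,
                                            float32x4_t k) {
#if defined(__aarch64__)
  return vfmaq_laneq_f32(acc, v, k, 2);
#else
  return vmlaq_lane_f32(acc, v, vget_high_f32(k), 0);
#endif
}
// Lanes 1 and 3 follow the same pattern.
```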
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/utils/macros.h" -#include "mace/ops/arm/deconv_2d_neon.h" - -namespace mace { -namespace ops { - -void Deconv2dNeonK3x3S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - - const index_t out_img_size = outh * outw; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t oc = 0; oc < outch; oc += 2) { - if (oc + 1 < outch) { - float *out_base0 = output + (b * outch + oc) * out_img_size; - float *out_base1 = out_base0 + out_img_size; - for (index_t ic = 0; ic < inch; ++ic) { - const float *input_base = input + (b * inch + ic) * h * w; - const float *kernel_base0 = filter + (oc * inch + ic) * 9; - const float *kernel_base1 = kernel_base0 + inch * 9; - const float *in = input_base; - - // output channel 0 - const float *k0_0 = kernel_base0; - const float *k0_1 = kernel_base0 + 3; - const float *k0_2 = kernel_base0 + 5; - // output channel 1 - const float *k1_0 = kernel_base1; - const float *k1_1 = kernel_base1 + 3; - const float *k1_2 = kernel_base1 + 5; - -#if defined(MACE_ENABLE_NEON) - // load filter - float32x4_t k00_vec, k01_vec, k02_vec; - float32x4_t k10_vec, k11_vec, k12_vec; - - k00_vec = vld1q_f32(k0_0); - k01_vec = vld1q_f32(k0_1); - k02_vec = vld1q_f32(k0_2); - - k10_vec = vld1q_f32(k1_0); - k11_vec = vld1q_f32(k1_1); - k12_vec = vld1q_f32(k1_2); -#endif - for (index_t i = 0; i < h; ++i) { - float *out_row_base0 = out_base0 + i * outw; - float *out_row0_0 = out_row_base0; - float *out_row0_1 = out_row_base0 + outw; - float *out_row0_2 = out_row_base0 + 2 * outw; - - float *out_row_base1 = out_base1 + i * outw; - float *out_row1_0 = out_row_base1; - float *out_row1_1 = out_row_base1 + outw; - float *out_row1_2 = out_row_base1 + 2 * outw; - - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t in_vec = vld1q_f32(in); - - float32x4_t out00, out01, out02; - float32x4_t out10, out11, out12; - float32x4_t out20, out21, out22; - - out00 = vld1q_f32(out_row0_0); - out00 = neon_vfma_lane_0(out00, in_vec, k00_vec); - vst1q_f32(out_row0_0, out00); - - out01 = vld1q_f32(out_row0_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k00_vec); - vst1q_f32(out_row0_0 + 1, out01); - - out02 = vld1q_f32(out_row0_0 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k00_vec); - vst1q_f32(out_row0_0 + 2, out02); - - out10 = vld1q_f32(out_row0_1 + 0); - out10 = neon_vfma_lane_0(out10, in_vec, k01_vec); - vst1q_f32(out_row0_1 + 0, out10); - - out11 = vld1q_f32(out_row0_1 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k01_vec); - vst1q_f32(out_row0_1 + 1, out11); - - out12 = vld1q_f32(out_row0_1 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k01_vec); - vst1q_f32(out_row0_1 + 2, out12); - - out20 = vld1q_f32(out_row0_2 + 0); - out20 = neon_vfma_lane_1(out20, in_vec, k02_vec); - 
vst1q_f32(out_row0_2 + 0, out20); - - out21 = vld1q_f32(out_row0_2 + 1); - out21 = neon_vfma_lane_2(out21, in_vec, k02_vec); - vst1q_f32(out_row0_2 + 1, out21); - - out22 = vld1q_f32(out_row0_2 + 2); - out22 = neon_vfma_lane_3(out22, in_vec, k02_vec); - vst1q_f32(out_row0_2 + 2, out22); - - out00 = vld1q_f32(out_row1_0 + 0); - out00 = neon_vfma_lane_0(out00, in_vec, k10_vec); - vst1q_f32(out_row1_0 + 0, out00); - - out01 = vld1q_f32(out_row1_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k10_vec); - vst1q_f32(out_row1_0 + 1, out01); - - out02 = vld1q_f32(out_row1_0 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k10_vec); - vst1q_f32(out_row1_0 + 2, out02); - - out10 = vld1q_f32(out_row1_1 + 0); - out10 = neon_vfma_lane_0(out10, in_vec, k11_vec); - vst1q_f32(out_row1_1 + 0, out10); - - out11 = vld1q_f32(out_row1_1 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k11_vec); - vst1q_f32(out_row1_1 + 1, out11); - - out12 = vld1q_f32(out_row1_1 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k11_vec); - vst1q_f32(out_row1_1 + 2, out12); - - out20 = vld1q_f32(out_row1_2 + 0); - out20 = neon_vfma_lane_1(out20, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 0, out20); - - out21 = vld1q_f32(out_row1_2 + 1); - out21 = neon_vfma_lane_2(out21, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 1, out21); - - out22 = vld1q_f32(out_row1_2 + 2); - out22 = neon_vfma_lane_3(out22, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 2, out22); - - in += 4; - out_row0_0 += 4; - out_row0_1 += 4; - out_row0_2 += 4; - out_row1_0 += 4; - out_row1_1 += 4; - out_row1_2 += 4; - } -#endif - for (; j < w; ++j) { - float val = in[0]; - for (int k = 0; k < 3; ++k) { - out_row0_0[k] += val * k0_0[k]; - out_row0_1[k] += val * k0_1[k]; - out_row0_2[k] += val * k0_2[k + 1]; - out_row1_0[k] += val * k1_0[k]; - out_row1_1[k] += val * k1_1[k]; - out_row1_2[k] += val * k1_2[k + 1]; - } - in++; - out_row0_0++; - out_row0_1++; - out_row0_2++; - out_row1_0++; - out_row1_1++; - out_row1_2++; - } - } - } - } else { - float *out_base0 = output + (b * outch + oc) * outh * outw; - for (index_t ic = 0; ic < inch; ++ic) { - const float *input_base = input + (b * inch + ic) * h * w; - const float *kernel_base0 = filter + (oc * inch + ic) * 9; - const float *in = input_base; - const float *k0_0 = kernel_base0; - const float *k0_1 = kernel_base0 + 3; - const float *k0_2 = kernel_base0 + 5; - -#if defined(MACE_ENABLE_NEON) - // load filter - float32x4_t k00_vec = vld1q_f32(k0_0); - float32x4_t k01_vec = vld1q_f32(k0_1); - float32x4_t k02_vec = vld1q_f32(k0_2); -#endif - for (index_t i = 0; i < h; ++i) { - float *out_row_base0 = out_base0 + i * outw; - float *out_row0_0 = out_row_base0; - float *out_row0_1 = out_row_base0 + outw; - float *out_row0_2 = out_row_base0 + 2 * outw; - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t in_vec = vld1q_f32(in); - - float32x4_t out00, out01, out02; - float32x4_t out10, out11, out12; - float32x4_t out20, out21, out22; - - out00 = vld1q_f32(out_row0_0 + 0); - out00 = neon_vfma_lane_0(out00, in_vec, k00_vec); - vst1q_f32(out_row0_0 + 0, out00); - - out01 = vld1q_f32(out_row0_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k00_vec); - vst1q_f32(out_row0_0 + 1, out01); - - out02 = vld1q_f32(out_row0_0 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k00_vec); - vst1q_f32(out_row0_0 + 2, out02); - - out10 = vld1q_f32(out_row0_1 + 0); - out10 = neon_vfma_lane_0(out10, in_vec, k01_vec); - vst1q_f32(out_row0_1 + 0, out10); - - out11 = vld1q_f32(out_row0_1 + 1); - out11 = 
neon_vfma_lane_1(out11, in_vec, k01_vec); - vst1q_f32(out_row0_1 + 1, out11); - - out12 = vld1q_f32(out_row0_1 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k01_vec); - vst1q_f32(out_row0_1 + 2, out12); - - out20 = vld1q_f32(out_row0_2 + 0); - out20 = neon_vfma_lane_1(out20, in_vec, k02_vec); - vst1q_f32(out_row0_2 + 0, out20); - - out21 = vld1q_f32(out_row0_2 + 1); - out21 = neon_vfma_lane_2(out21, in_vec, k02_vec); - vst1q_f32(out_row0_2 + 1, out21); - - out22 = vld1q_f32(out_row0_2 + 2); - out22 = neon_vfma_lane_3(out22, in_vec, k02_vec); - vst1q_f32(out_row0_2 + 2, out22); - - in += 4; - out_row0_0 += 4; - out_row0_1 += 4; - out_row0_2 += 4; - } -#endif - for (; j < w; ++j) { - float val = in[0]; - for (int k = 0; k < 3; ++k) { - out_row0_0[k] += val * k0_0[k]; - out_row0_1[k] += val * k0_1[k]; - out_row0_2[k] += val * k0_2[k + 1]; - } - in++; - out_row0_0++; - out_row0_1++; - out_row0_2++; - } - } - } - } - } - } -} - -void Deconv2dNeonK3x3S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t out_img_size = outh * outw; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t oc = 0; oc < outch; ++oc) { - float *out_base = output + (b * outch + oc) * out_img_size; - for (index_t ic = 0; ic < inch; ++ic) { - const float *input_base = input + (b * inch + ic) * h * w; - const float *kernel_base = filter + (oc * inch + ic) * 9; - const float *in = input_base; - - const float *k0 = kernel_base; - const float *k1 = kernel_base + 3; - const float *k2 = kernel_base + 5; - -#if defined(MACE_ENABLE_NEON) - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); - float32x4_t k2_vec = vld1q_f32(k2); -#endif - for (index_t i = 0; i < h; ++i) { - float *out_row_base = out_base + i * 2 * outw; - float *out_row_0 = out_row_base; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (index_t n = 0; n + 9 < outw; n += 8) { - float32x4_t in_vec = vld1q_f32(in); - - // out row 0 - float32x4x2_t out00 = vld2q_f32(out_row_0); - out00.val[0] = - neon_vfma_lane_0(out00.val[0], in_vec, k0_vec); - out00.val[1] = - neon_vfma_lane_1(out00.val[1], in_vec, k0_vec); - vst2q_f32(out_row_0, out00); - - float32x4x2_t out01 = vld2q_f32(out_row_0 + 2); - out01.val[0] = - neon_vfma_lane_2(out01.val[0], in_vec, k0_vec); - vst2q_f32(out_row_0 + 2, out01); - - // out row 1 - float32x4x2_t out10 = vld2q_f32(out_row_1); - out10.val[0] = - neon_vfma_lane_0(out10.val[0], in_vec, k1_vec); - out10.val[1] = - neon_vfma_lane_1(out10.val[1], in_vec, k1_vec); - vst2q_f32(out_row_1, out10); - - float32x4x2_t out11 = vld2q_f32(out_row_1 + 2); - out11.val[0] = - neon_vfma_lane_2(out11.val[0], in_vec, k1_vec); - vst2q_f32(out_row_1 + 2, out11); - - // out row 2 - float32x4x2_t out20 = vld2q_f32(out_row_2); - out20.val[0] = - neon_vfma_lane_1(out20.val[0], in_vec, k2_vec); - out20.val[1] = - neon_vfma_lane_2(out20.val[1], in_vec, k2_vec); - vst2q_f32(out_row_2, out20); - - float32x4x2_t out21 = vld2q_f32(out_row_2 + 2); - out21.val[0] = - neon_vfma_lane_3(out21.val[0], in_vec, k2_vec); - vst2q_f32(out_row_2 + 2, out21); - - in += 4; - out_row_0 += 8; - out_row_1 += 8; - out_row_2 += 8; 
- j += 4; - } -#endif - for (; j < w; ++j) { - float val = in[0]; - - for (int k = 0; k < 3; ++k) { - out_row_0[k] += val * k0[k]; - out_row_1[k] += val * k1[k]; - out_row_2[k] += val * k2[k + 1]; - } - - in++; - out_row_0 += 2; - out_row_1 += 2; - out_row_2 += 2; - } - } - } - } - } -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/arm/deconv_2d_neon_4x4.cc b/mace/ops/arm/deconv_2d_neon_4x4.cc deleted file mode 100644 index 443a188f322c448c6e8bf36b14b3babc91725cf4..0000000000000000000000000000000000000000 --- a/mace/ops/arm/deconv_2d_neon_4x4.cc +++ /dev/null @@ -1,506 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/utils/macros.h" -#include "mace/ops/arm/deconv_2d_neon.h" - -namespace mace { -namespace ops { - -void Deconv2dNeonK4x4S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t w = in_shape[3]; - const index_t h = in_shape[2]; - const index_t inch = in_shape[1]; - - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t outch = out_shape[1]; - const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t oc = 0; oc < outch; oc += 2) { - if (oc + 1 < outch) { - float *out_base = output + (b * outch + oc) * out_img_size; - float *out_base1 = out_base + out_img_size; - for (index_t q = 0; q < inch; q++) { - const float *input_base = input + (b * inch + q) * h * w; - const float *in = input_base; - const float *kernel_base = filter + (oc * inch + q) * 16; - const float *k0 = kernel_base; - const float *k1 = kernel_base + 4; - const float *k2 = kernel_base + 8; - const float *k3 = kernel_base + 12; - - const float *kernel_base1 = kernel_base + inch * 16; - const float *k10 = kernel_base1; - const float *k11 = kernel_base1 + 4; - const float *k12 = kernel_base1 + 8; - const float *k13 = kernel_base1 + 12; -#if defined(MACE_ENABLE_NEON) - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); - float32x4_t k2_vec = vld1q_f32(k2); - float32x4_t k3_vec = vld1q_f32(k3); - - float32x4_t k10_vec = vld1q_f32(k10); - float32x4_t k11_vec = vld1q_f32(k11); - float32x4_t k12_vec = vld1q_f32(k12); - float32x4_t k13_vec = vld1q_f32(k13); -#endif - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + i * outw; - - float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; - - float *out_row1 = out_base1 + i * outw; - - float *out_row1_0 = out_row1; - float *out_row1_1 = out_row1_0 + outw; - float *out_row1_2 = out_row1_1 + outw; - float *out_row1_3 = out_row1_2 + outw; - - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t in_vec = vld1q_f32(in); - float32x4_t out00, out01, out02, out03; - float32x4_t out10, out11, out12, out13; - 
- out00 = vld1q_f32(out_row_0); - out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); - vst1q_f32(out_row_0, out00); - - out10 = vld1q_f32(out_row1_0); - out10 = neon_vfma_lane_0(out10, in_vec, k10_vec); - vst1q_f32(out_row1_0, out10); - - out01 = vld1q_f32(out_row_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); - vst1q_f32(out_row_0 + 1, out01); - - out11 = vld1q_f32(out_row1_0 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k10_vec); - vst1q_f32(out_row1_0 + 1, out11); - - out02 = vld1q_f32(out_row_0 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); - vst1q_f32(out_row_0 + 2, out02); - - out12 = vld1q_f32(out_row1_0 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k10_vec); - vst1q_f32(out_row1_0 + 2, out12); - - out03 = vld1q_f32(out_row_0 + 3); - out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); - vst1q_f32(out_row_0 + 3, out03); - - out13 = vld1q_f32(out_row1_0 + 3); - out13 = neon_vfma_lane_3(out13, in_vec, k10_vec); - vst1q_f32(out_row1_0 + 3, out13); - - // - out00 = vld1q_f32(out_row_1); - out00 = neon_vfma_lane_0(out00, in_vec, k1_vec); - vst1q_f32(out_row_1, out00); - - out10 = vld1q_f32(out_row1_1); - out10 = neon_vfma_lane_0(out10, in_vec, k11_vec); - vst1q_f32(out_row1_1, out10); - - out01 = vld1q_f32(out_row_1 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k1_vec); - vst1q_f32(out_row_1 + 1, out01); - - out11 = vld1q_f32(out_row1_1 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k11_vec); - vst1q_f32(out_row1_1 + 1, out11); - - out02 = vld1q_f32(out_row_1 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k1_vec); - vst1q_f32(out_row_1 + 2, out02); - - out12 = vld1q_f32(out_row1_1 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k11_vec); - vst1q_f32(out_row1_1 + 2, out12); - - out03 = vld1q_f32(out_row_1 + 3); - out03 = neon_vfma_lane_3(out03, in_vec, k1_vec); - vst1q_f32(out_row_1 + 3, out03); - - out13 = vld1q_f32(out_row1_1 + 3); - out13 = neon_vfma_lane_3(out13, in_vec, k11_vec); - vst1q_f32(out_row1_1 + 3, out13); - - // - out00 = vld1q_f32(out_row_2 + 0); - out00 = neon_vfma_lane_0(out00, in_vec, k2_vec); - vst1q_f32(out_row_2 + 0, out00); - - out10 = vld1q_f32(out_row1_2 + 0); - out10 = neon_vfma_lane_0(out10, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 0, out10); - - out01 = vld1q_f32(out_row_2 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k2_vec); - vst1q_f32(out_row_2 + 1, out01); - - out11 = vld1q_f32(out_row1_2 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 1, out11); - - out02 = vld1q_f32(out_row_2 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k2_vec); - vst1q_f32(out_row_2 + 2, out02); - - out12 = vld1q_f32(out_row1_2 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 2, out12); - - out03 = vld1q_f32(out_row_2 + 3); - out03 = neon_vfma_lane_3(out03, in_vec, k2_vec); - vst1q_f32(out_row_2 + 3, out03); - - out13 = vld1q_f32(out_row1_2 + 3); - out13 = neon_vfma_lane_3(out13, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 3, out13); - - // - out00 = vld1q_f32(out_row_3 + 0); - out00 = neon_vfma_lane_0(out00, in_vec, k3_vec); - vst1q_f32(out_row_3 + 0, out00); - - out10 = vld1q_f32(out_row1_3 + 0); - out10 = neon_vfma_lane_0(out10, in_vec, k13_vec); - vst1q_f32(out_row1_3 + 0, out10); - - out01 = vld1q_f32(out_row_3 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k3_vec); - vst1q_f32(out_row_3 + 1, out01); - - out11 = vld1q_f32(out_row1_3 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k13_vec); - vst1q_f32(out_row1_3 + 1, out11); - - out02 = vld1q_f32(out_row_3 + 2); - out02 = 
neon_vfma_lane_2(out02, in_vec, k3_vec); - vst1q_f32(out_row_3 + 2, out02); - - out12 = vld1q_f32(out_row1_3 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k13_vec); - vst1q_f32(out_row1_3 + 2, out12); - - out03 = vld1q_f32(out_row_3 + 3); - out03 = neon_vfma_lane_3(out03, in_vec, k3_vec); - vst1q_f32(out_row_3 + 3, out03); - - out13 = vld1q_f32(out_row1_3 + 3); - out13 = neon_vfma_lane_3(out13, in_vec, k13_vec); - vst1q_f32(out_row1_3 + 3, out13); - - in += 4; - out_row_0 += 4; - out_row_1 += 4; - out_row_2 += 4; - out_row_3 += 4; - out_row1_0 += 4; - out_row1_1 += 4; - out_row1_2 += 4; - out_row1_3 += 4; - } -#endif - for (; j < w; j++) { - float val = in[0]; - for (int k = 0; k < 4; ++k) { - out_row_0[k] += val * k0[k]; - out_row_1[k] += val * k1[k]; - out_row_2[k] += val * k2[k]; - out_row_3[k] += val * k3[k]; - out_row1_0[k] += val * k10[k]; - out_row1_1[k] += val * k11[k]; - out_row1_2[k] += val * k12[k]; - out_row1_3[k] += val * k13[k]; - } - in++; - out_row_0++; - out_row_1++; - out_row_2++; - out_row_3++; - out_row1_0++; - out_row1_1++; - out_row1_2++; - out_row1_3++; - } - } - } - } else { - float *out_base = output + (b * outch + oc) * out_img_size; - for (index_t q = 0; q < inch; q++) { - const float *input_base = input + (b * inch + q) * h * w; - const float *kernel_base = filter + (oc * inch + q) * 16; - const float *in = input_base; - const float *k0 = kernel_base; - const float *k1 = kernel_base + 4; - const float *k2 = kernel_base + 8; - const float *k3 = kernel_base + 12; -#if defined(MACE_ENABLE_NEON) - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); - float32x4_t k2_vec = vld1q_f32(k2); - float32x4_t k3_vec = vld1q_f32(k3); -#endif - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + i * outw; - float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; - int j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t in_vec = vld1q_f32(in); - - float32x4_t out00 = vld1q_f32(out_row_0); - out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); - vst1q_f32(out_row_0, out00); - - float32x4_t out01 = vld1q_f32(out_row_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); - vst1q_f32(out_row_0 + 1, out01); - - float32x4_t out02 = vld1q_f32(out_row_0 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); - vst1q_f32(out_row_0 + 2, out02); - - float32x4_t out03 = vld1q_f32(out_row_0 + 3); - out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); - vst1q_f32(out_row_0 + 3, out03); - - // - float32x4_t out10 = vld1q_f32(out_row_1); - out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); - vst1q_f32(out_row_1, out10); - - float32x4_t out11 = vld1q_f32(out_row_1 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); - vst1q_f32(out_row_1 + 1, out11); - - float32x4_t out12 = vld1q_f32(out_row_1 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); - vst1q_f32(out_row_1 + 2, out12); - - float32x4_t out13 = vld1q_f32(out_row_1 + 3); - out13 = neon_vfma_lane_3(out13, in_vec, k1_vec); - vst1q_f32(out_row_1 + 3, out13); - - // - float32x4_t out20 = vld1q_f32(out_row_2 + 0); - out20 = neon_vfma_lane_0(out20, in_vec, k2_vec); - vst1q_f32(out_row_2 + 0, out20); - - float32x4_t out21 = vld1q_f32(out_row_2 + 1); - out21 = neon_vfma_lane_1(out21, in_vec, k2_vec); - vst1q_f32(out_row_2 + 1, out21); - - float32x4_t out22 = vld1q_f32(out_row_2 + 2); - out22 = neon_vfma_lane_2(out22, in_vec, k2_vec); - vst1q_f32(out_row_2 + 2, out22); - - float32x4_t out23 
= vld1q_f32(out_row_2 + 3); - out23 = neon_vfma_lane_3(out23, in_vec, k2_vec); - vst1q_f32(out_row_2 + 3, out23); - - // - float32x4_t out30 = vld1q_f32(out_row_3 + 0); - out30 = neon_vfma_lane_0(out30, in_vec, k3_vec); - vst1q_f32(out_row_3 + 0, out30); - - float32x4_t out31 = vld1q_f32(out_row_3 + 1); - out31 = neon_vfma_lane_1(out31, in_vec, k3_vec); - vst1q_f32(out_row_3 + 1, out31); - - float32x4_t out32 = vld1q_f32(out_row_3 + 2); - out32 = neon_vfma_lane_2(out32, in_vec, k3_vec); - vst1q_f32(out_row_3 + 2, out32); - - float32x4_t out33 = vld1q_f32(out_row_3 + 3); - out33 = neon_vfma_lane_3(out33, in_vec, k3_vec); - vst1q_f32(out_row_3 + 3, out33); - - in += 4; - out_row_0 += 4; - out_row_1 += 4; - out_row_2 += 4; - out_row_3 += 4; - } -#endif - for (; j < w; j++) { - float val = in[0]; - for (int k = 0; k < 4; ++k) { - out_row_0[k] += val * k0[k]; - out_row_1[k] += val * k1[k]; - out_row_2[k] += val * k2[k]; - out_row_3[k] += val * k3[k]; - } - in++; - out_row_0++; - out_row_1++; - out_row_2++; - out_row_3++; - } - } - } - } - } - } -} - -void Deconv2dNeonK4x4S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t w = in_shape[3]; - const index_t h = in_shape[2]; - const index_t inch = in_shape[1]; - - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t outch = out_shape[1]; - const index_t out_img_size = outh * outw; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t p = 0; p < outch; p++) { - float *out_base = output + (b * outch + p) * out_img_size; - for (index_t q = 0; q < inch; q++) { - const float *input_base = input + (b * inch + q) * h * w; - const float *kernel_base = filter + (p * inch + q) * 16; - const float *in = input_base; - - const float *k0 = kernel_base; - const float *k1 = kernel_base + 4; - const float *k2 = kernel_base + 8; - const float *k3 = kernel_base + 12; -#if defined(MACE_ENABLE_NEON) - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); - float32x4_t k2_vec = vld1q_f32(k2); - float32x4_t k3_vec = vld1q_f32(k3); -#endif - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + 2 * i * outw; - - float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; - - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (index_t n = 0; n + 9 < outw; n += 8) { - float32x4_t in_vec = vld1q_f32(in); - - // row 0 - float32x4x2_t out0 = vld2q_f32(out_row_0); - out0.val[0] = - neon_vfma_lane_0(out0.val[0], in_vec, k0_vec); - out0.val[1] = - neon_vfma_lane_1(out0.val[1], in_vec, k0_vec); - vst2q_f32(out_row_0, out0); - out0 = vld2q_f32(out_row_0 + 2); - out0.val[0] = - neon_vfma_lane_2(out0.val[0], in_vec, k0_vec); - out0.val[1] = - neon_vfma_lane_3(out0.val[1], in_vec, k0_vec); - vst2q_f32(out_row_0 + 2, out0); - - // row 1 - float32x4x2_t out1 = vld2q_f32(out_row_1); - out1.val[0] = - neon_vfma_lane_0(out1.val[0], in_vec, k1_vec); - out1.val[1] = - neon_vfma_lane_1(out1.val[1], in_vec, k1_vec); - vst2q_f32(out_row_1, out1); - out1 = vld2q_f32(out_row_1 + 2); - out1.val[0] = - neon_vfma_lane_2(out1.val[0], in_vec, k1_vec); - out1.val[1] = - neon_vfma_lane_3(out1.val[1], in_vec, k1_vec); - vst2q_f32(out_row_1 + 2, out1); - - // row 2 - float32x4x2_t out2 = vld2q_f32(out_row_2); - out2.val[0] = - neon_vfma_lane_0(out2.val[0], in_vec, k2_vec); - out2.val[1] = - 
neon_vfma_lane_1(out2.val[1], in_vec, k2_vec); - vst2q_f32(out_row_2, out2); - out2 = vld2q_f32(out_row_2 + 2); - out2.val[0] = - neon_vfma_lane_2(out2.val[0], in_vec, k2_vec); - out2.val[1] = - neon_vfma_lane_3(out2.val[1], in_vec, k2_vec); - vst2q_f32(out_row_2 + 2, out2); - - // row 3 - float32x4x2_t out3 = vld2q_f32(out_row_3); - out3.val[0] = - neon_vfma_lane_0(out3.val[0], in_vec, k3_vec); - out3.val[1] = - neon_vfma_lane_1(out3.val[1], in_vec, k3_vec); - vst2q_f32(out_row_3, out3); - out3 = vld2q_f32(out_row_3 + 2); - out3.val[0] = - neon_vfma_lane_2(out3.val[0], in_vec, k3_vec); - out3.val[1] = - neon_vfma_lane_3(out3.val[1], in_vec, k3_vec); - vst2q_f32(out_row_3 + 2, out3); - - in += 4; - out_row_0 += 8; - out_row_1 += 8; - out_row_2 += 8; - out_row_3 += 8; - j += 4; - } -#endif - for (; j < w; j++) { - float val = in[0]; - for (int k = 0; k < 4; ++k) { - out_row_0[k] += val * k0[k]; - out_row_1[k] += val * k1[k]; - out_row_2[k] += val * k2[k]; - out_row_3[k] += val * k3[k]; - } - in++; - out_row_0 += 2; - out_row_1 += 2; - out_row_2 += 2; - out_row_3 += 2; - } - } - } - } - } -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/arm/depthwise_deconv2d_neon.h b/mace/ops/arm/depthwise_deconv2d_neon.h deleted file mode 100644 index 8df6dba15bd61d22054f0d0ecac2b35bd060ec76..0000000000000000000000000000000000000000 --- a/mace/ops/arm/depthwise_deconv2d_neon.h +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_ARM_DEPTHWISE_DECONV2D_NEON_H_ -#define MACE_OPS_ARM_DEPTHWISE_DECONV2D_NEON_H_ - -#include "mace/core/types.h" -#include "mace/ops/arm/common_neon.h" - -namespace mace { -namespace ops { - -void DepthwiseDeconv2dNeonK3x3S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void DepthwiseDeconv2dNeonK3x3S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void DepthwiseDeconv2dNeonK4x4S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void DepthwiseDeconv2dNeonK4x4S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void GroupDeconv2dNeonK3x3S1(const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void GroupDeconv2dNeonK3x3S2(const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void GroupDeconv2dNeonK4x4S1(const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void GroupDeconv2dNeonK4x4S2(const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ARM_DEPTHWISE_DECONV2D_NEON_H_ diff --git a/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc b/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc deleted file mode 100644 index 6bba47c280bfb1fe22055c7440e9180b6afdc98e..0000000000000000000000000000000000000000 --- a/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc +++ /dev/null @@ -1,629 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
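The depthwise and group variants deleted below differ from the plain deconv kernels only in how channels are indexed: a depthwise kernel reads exactly one input channel and one per-channel filter (no accumulation across input channels), while the group variant partitions channels into independent groups. A small illustration of the NCHW base offsets visible in the removed code (helper names are mine):

```cpp
#include <cstddef>

// Depthwise: output channel c depends only on input channel c
// and on the c-th KxK filter.
inline std::size_t DepthwiseInputBase(std::size_t b, std::size_t c,
                                      std::size_t channels,
                                      std::size_t in_h, std::size_t in_w) {
  return (b * channels + c) * in_h * in_w;
}

inline std::size_t DepthwiseFilterBase(std::size_t c, std::size_t k) {
  return c * k * k;  // one KxK filter per channel
}
```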
- -#include "mace/utils/macros.h" -#include "mace/ops/arm/depthwise_deconv2d_neon.h" - -namespace mace { -namespace ops { - -void DepthwiseDeconv2dNeonK3x3S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t channels = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - const index_t in_img_size = h * w; - - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t out_img_size = outh * outw; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t c = 0; c < channels; ++c) { - const index_t offset = b * channels + c; - float *out_base = output + offset * out_img_size; - const float *input_base = input + offset * in_img_size; - const float *kernel_base = filter + c * 9; - const float *in = input_base; - const float *k0 = kernel_base; - const float *k1 = kernel_base + 3; - const float *k2 = kernel_base + 5; - -#if defined(MACE_ENABLE_NEON) - // load filter - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); - float32x4_t k2_vec = vld1q_f32(k2); -#endif - for (index_t i = 0; i < h; ++i) { - float *out_row_base = out_base + i * outw; - float *out_row0 = out_row_base; - float *out_row1 = out_row_base + outw; - float *out_row2 = out_row_base + 2 * outw; - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t in_vec = vld1q_f32(in); - - float32x4_t out00, out01, out02; - float32x4_t out10, out11, out12; - float32x4_t out20, out21, out22; - - out00 = vld1q_f32(out_row0 + 0); - out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); - vst1q_f32(out_row0 + 0, out00); - - out01 = vld1q_f32(out_row0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); - vst1q_f32(out_row0 + 1, out01); - - out02 = vld1q_f32(out_row0 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); - vst1q_f32(out_row0 + 2, out02); - - out10 = vld1q_f32(out_row1 + 0); - out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); - vst1q_f32(out_row1 + 0, out10); - - out11 = vld1q_f32(out_row1 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); - vst1q_f32(out_row1 + 1, out11); - - out12 = vld1q_f32(out_row1 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); - vst1q_f32(out_row1 + 2, out12); - - out20 = vld1q_f32(out_row2 + 0); - out20 = neon_vfma_lane_1(out20, in_vec, k2_vec); - vst1q_f32(out_row2 + 0, out20); - - out21 = vld1q_f32(out_row2 + 1); - out21 = neon_vfma_lane_2(out21, in_vec, k2_vec); - vst1q_f32(out_row2 + 1, out21); - - out22 = vld1q_f32(out_row2 + 2); - out22 = neon_vfma_lane_3(out22, in_vec, k2_vec); - vst1q_f32(out_row2 + 2, out22); - - in += 4; - out_row0 += 4; - out_row1 += 4; - out_row2 += 4; - } -#endif - for (; j < w; ++j) { - float val = in[0]; - for (int k = 0; k < 3; ++k) { - out_row0[k] += val * k0[k]; - out_row1[k] += val * k1[k]; - out_row2[k] += val * k2[k + 1]; - } - in++; - out_row0++; - out_row1++; - out_row2++; - } - } - } - } -} - -void DepthwiseDeconv2dNeonK3x3S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t channels = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - const index_t in_img_size = h * w; - - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t out_img_size = outh * outw; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) 
{ - for (index_t c = 0; c < channels; ++c) { - const index_t offset = b * channels + c; - float *out_base = output + offset * out_img_size; - const float *input_base = input + offset * in_img_size; - const float *kernel_base = filter + c * 9; - const float *in = input_base; - - const float *k0 = kernel_base; - const float *k1 = kernel_base + 3; - const float *k2 = kernel_base + 5; - -#if defined(MACE_ENABLE_NEON) - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); - float32x4_t k2_vec = vld1q_f32(k2); -#endif - for (index_t i = 0; i < h; ++i) { - float *out_row_base = out_base + i * 2 * outw; - float *out_row_0 = out_row_base; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (index_t n = 0; n + 9 < outw; n += 8) { - float32x4_t in_vec = vld1q_f32(in); - - // out row 0 - float32x4x2_t out00 = vld2q_f32(out_row_0); - out00.val[0] = - neon_vfma_lane_0(out00.val[0], in_vec, k0_vec); - out00.val[1] = - neon_vfma_lane_1(out00.val[1], in_vec, k0_vec); - vst2q_f32(out_row_0, out00); - - float32x4x2_t out01 = vld2q_f32(out_row_0 + 2); - out01.val[0] = - neon_vfma_lane_2(out01.val[0], in_vec, k0_vec); - vst2q_f32(out_row_0 + 2, out01); - - // out row 1 - float32x4x2_t out10 = vld2q_f32(out_row_1); - out10.val[0] = - neon_vfma_lane_0(out10.val[0], in_vec, k1_vec); - out10.val[1] = - neon_vfma_lane_1(out10.val[1], in_vec, k1_vec); - vst2q_f32(out_row_1, out10); - - float32x4x2_t out11 = vld2q_f32(out_row_1 + 2); - out11.val[0] = - neon_vfma_lane_2(out11.val[0], in_vec, k1_vec); - vst2q_f32(out_row_1 + 2, out11); - - // out row 2 - float32x4x2_t out20 = vld2q_f32(out_row_2); - out20.val[0] = - neon_vfma_lane_1(out20.val[0], in_vec, k2_vec); - out20.val[1] = - neon_vfma_lane_2(out20.val[1], in_vec, k2_vec); - vst2q_f32(out_row_2, out20); - - float32x4x2_t out21 = vld2q_f32(out_row_2 + 2); - out21.val[0] = - neon_vfma_lane_3(out21.val[0], in_vec, k2_vec); - vst2q_f32(out_row_2 + 2, out21); - - in += 4; - out_row_0 += 8; - out_row_1 += 8; - out_row_2 += 8; - j += 4; - } -#endif - for (; j < w; ++j) { - float val = in[0]; - - for (int k = 0; k < 3; ++k) { - out_row_0[k] += val * k0[k]; - out_row_1[k] += val * k1[k]; - out_row_2[k] += val * k2[k + 1]; - } - - in++; - out_row_0 += 2; - out_row_1 += 2; - out_row_2 += 2; - } - } - } - } -} - -void GroupDeconv2dNeonK3x3S1(const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - - const index_t in_img_size = h * w; - const index_t out_img_size = outh * outw; - - const index_t inch_g = inch / group; - const index_t outch_g = outch / group; - -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (int g = 0; g < group; ++g) { - for (index_t oc = 0; oc < outch_g; oc += 2) { - if (oc + 1 < outch_g) { - const index_t out_offset = b * outch + outch_g * g + oc; - float *out_base0 = output + out_offset * out_img_size; - float *out_base1 = out_base0 + out_img_size; - for (index_t ic = 0; ic < inch_g; ++ic) { - const index_t in_offset = b * inch + inch_g * g + ic; - const float *input_base = input + in_offset * in_img_size; - const index_t kernel_offset = (oc * group + g) * inch_g + ic; - const float *kernel_base0 = 
filter + kernel_offset * 9; - const float *kernel_base1 = kernel_base0 + inch * 9; - const float *in = input_base; - - // output channel 0 - const float *k0_0 = kernel_base0; - const float *k0_1 = kernel_base0 + 3; - const float *k0_2 = kernel_base0 + 5; - // output channel 1 - const float *k1_0 = kernel_base1; - const float *k1_1 = kernel_base1 + 3; - const float *k1_2 = kernel_base1 + 5; - -#if defined(MACE_ENABLE_NEON) - // load filter - float32x4_t k00_vec, k01_vec, k02_vec; - float32x4_t k10_vec, k11_vec, k12_vec; - - k00_vec = vld1q_f32(k0_0); - k01_vec = vld1q_f32(k0_1); - k02_vec = vld1q_f32(k0_2); - - k10_vec = vld1q_f32(k1_0); - k11_vec = vld1q_f32(k1_1); - k12_vec = vld1q_f32(k1_2); -#endif - for (index_t i = 0; i < h; ++i) { - float *out_row_base0 = out_base0 + i * outw; - float *out_row0_0 = out_row_base0; - float *out_row0_1 = out_row_base0 + outw; - float *out_row0_2 = out_row_base0 + 2 * outw; - - float *out_row_base1 = out_base1 + i * outw; - float *out_row1_0 = out_row_base1; - float *out_row1_1 = out_row_base1 + outw; - float *out_row1_2 = out_row_base1 + 2 * outw; - - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t in_vec = vld1q_f32(in); - - float32x4_t out00, out01, out02; - float32x4_t out10, out11, out12; - float32x4_t out20, out21, out22; - - out00 = vld1q_f32(out_row0_0); - out00 = neon_vfma_lane_0(out00, in_vec, k00_vec); - vst1q_f32(out_row0_0, out00); - - out01 = vld1q_f32(out_row0_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k00_vec); - vst1q_f32(out_row0_0 + 1, out01); - - out02 = vld1q_f32(out_row0_0 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k00_vec); - vst1q_f32(out_row0_0 + 2, out02); - - out10 = vld1q_f32(out_row0_1 + 0); - out10 = neon_vfma_lane_0(out10, in_vec, k01_vec); - vst1q_f32(out_row0_1 + 0, out10); - - out11 = vld1q_f32(out_row0_1 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k01_vec); - vst1q_f32(out_row0_1 + 1, out11); - - out12 = vld1q_f32(out_row0_1 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k01_vec); - vst1q_f32(out_row0_1 + 2, out12); - - out20 = vld1q_f32(out_row0_2 + 0); - out20 = neon_vfma_lane_1(out20, in_vec, k02_vec); - vst1q_f32(out_row0_2 + 0, out20); - - out21 = vld1q_f32(out_row0_2 + 1); - out21 = neon_vfma_lane_2(out21, in_vec, k02_vec); - vst1q_f32(out_row0_2 + 1, out21); - - out22 = vld1q_f32(out_row0_2 + 2); - out22 = neon_vfma_lane_3(out22, in_vec, k02_vec); - vst1q_f32(out_row0_2 + 2, out22); - - out00 = vld1q_f32(out_row1_0 + 0); - out00 = neon_vfma_lane_0(out00, in_vec, k10_vec); - vst1q_f32(out_row1_0 + 0, out00); - - out01 = vld1q_f32(out_row1_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k10_vec); - vst1q_f32(out_row1_0 + 1, out01); - - out02 = vld1q_f32(out_row1_0 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k10_vec); - vst1q_f32(out_row1_0 + 2, out02); - - out10 = vld1q_f32(out_row1_1 + 0); - out10 = neon_vfma_lane_0(out10, in_vec, k11_vec); - vst1q_f32(out_row1_1 + 0, out10); - - out11 = vld1q_f32(out_row1_1 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k11_vec); - vst1q_f32(out_row1_1 + 1, out11); - - out12 = vld1q_f32(out_row1_1 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k11_vec); - vst1q_f32(out_row1_1 + 2, out12); - - out20 = vld1q_f32(out_row1_2 + 0); - out20 = neon_vfma_lane_1(out20, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 0, out20); - - out21 = vld1q_f32(out_row1_2 + 1); - out21 = neon_vfma_lane_2(out21, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 1, out21); - - out22 = vld1q_f32(out_row1_2 + 2); - out22 = neon_vfma_lane_3(out22, 
in_vec, k12_vec); - vst1q_f32(out_row1_2 + 2, out22); - - in += 4; - out_row0_0 += 4; - out_row0_1 += 4; - out_row0_2 += 4; - out_row1_0 += 4; - out_row1_1 += 4; - out_row1_2 += 4; - } -#endif - for (; j < w; ++j) { - float val = in[0]; - for (int k = 0; k < 3; ++k) { - out_row0_0[k] += val * k0_0[k]; - out_row0_1[k] += val * k0_1[k]; - out_row0_2[k] += val * k0_2[k + 1]; - out_row1_0[k] += val * k1_0[k]; - out_row1_1[k] += val * k1_1[k]; - out_row1_2[k] += val * k1_2[k + 1]; - } - in++; - out_row0_0++; - out_row0_1++; - out_row0_2++; - out_row1_0++; - out_row1_1++; - out_row1_2++; - } - } - } - } else { - const index_t out_offset = b * outch + outch_g * g + oc; - float *out_base0 = output + out_offset * out_img_size; - for (index_t ic = 0; ic < inch_g; ++ic) { - const index_t in_offset = (b * group + g) * inch_g + ic; - const float *input_base = input + in_offset * in_img_size; - const index_t kernel_offset = (oc * group + g) * inch_g + ic; - const float *kernel_base0 = filter + kernel_offset * 9; - const float *in = input_base; - const float *k0_0 = kernel_base0; - const float *k0_1 = kernel_base0 + 3; - const float *k0_2 = kernel_base0 + 5; - -#if defined(MACE_ENABLE_NEON) - // load filter - float32x4_t k00_vec = vld1q_f32(k0_0); - float32x4_t k01_vec = vld1q_f32(k0_1); - float32x4_t k02_vec = vld1q_f32(k0_2); -#endif - for (index_t i = 0; i < h; ++i) { - float *out_row_base0 = out_base0 + i * outw; - float *out_row0_0 = out_row_base0; - float *out_row0_1 = out_row_base0 + outw; - float *out_row0_2 = out_row_base0 + 2 * outw; - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t in_vec = vld1q_f32(in); - - float32x4_t out00, out01, out02; - float32x4_t out10, out11, out12; - float32x4_t out20, out21, out22; - - out00 = vld1q_f32(out_row0_0 + 0); - out00 = neon_vfma_lane_0(out00, in_vec, k00_vec); - vst1q_f32(out_row0_0 + 0, out00); - - out01 = vld1q_f32(out_row0_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k00_vec); - vst1q_f32(out_row0_0 + 1, out01); - - out02 = vld1q_f32(out_row0_0 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k00_vec); - vst1q_f32(out_row0_0 + 2, out02); - - out10 = vld1q_f32(out_row0_1 + 0); - out10 = neon_vfma_lane_0(out10, in_vec, k01_vec); - vst1q_f32(out_row0_1 + 0, out10); - - out11 = vld1q_f32(out_row0_1 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k01_vec); - vst1q_f32(out_row0_1 + 1, out11); - - out12 = vld1q_f32(out_row0_1 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k01_vec); - vst1q_f32(out_row0_1 + 2, out12); - - out20 = vld1q_f32(out_row0_2 + 0); - out20 = neon_vfma_lane_1(out20, in_vec, k02_vec); - vst1q_f32(out_row0_2 + 0, out20); - - out21 = vld1q_f32(out_row0_2 + 1); - out21 = neon_vfma_lane_2(out21, in_vec, k02_vec); - vst1q_f32(out_row0_2 + 1, out21); - - out22 = vld1q_f32(out_row0_2 + 2); - out22 = neon_vfma_lane_3(out22, in_vec, k02_vec); - vst1q_f32(out_row0_2 + 2, out22); - - in += 4; - out_row0_0 += 4; - out_row0_1 += 4; - out_row0_2 += 4; - } -#endif - for (; j < w; ++j) { - float val = in[0]; - for (int k = 0; k < 3; ++k) { - out_row0_0[k] += val * k0_0[k]; - out_row0_1[k] += val * k0_1[k]; - out_row0_2[k] += val * k0_2[k + 1]; - } - in++; - out_row0_0++; - out_row0_1++; - out_row0_2++; - } - } - } - } - } - } - } -} - -void GroupDeconv2dNeonK3x3S2(const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - 
const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - - const index_t in_img_size = h * w; - const index_t out_img_size = outh * outw; - - const index_t inch_g = inch / group; - const index_t outch_g = outch / group; - -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (int g = 0; g < group; ++g) { - for (index_t oc = 0; oc < outch_g; ++oc) { - const index_t out_offset = b * outch + outch_g * g + oc; - float *out_base = output + out_offset * out_img_size; - for (index_t ic = 0; ic < inch_g; ++ic) { - const index_t in_offset = b * inch + inch_g * g + ic; - const float *input_base = input + in_offset * in_img_size; - const index_t kernel_offset = (oc * group + g) * inch_g + ic; - const float *kernel_base = filter + kernel_offset * 9; - const float *in = input_base; - - const float *k0 = kernel_base; - const float *k1 = kernel_base + 3; - const float *k2 = kernel_base + 5; - -#if defined(MACE_ENABLE_NEON) - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); - float32x4_t k2_vec = vld1q_f32(k2); -#endif - for (index_t i = 0; i < h; ++i) { - float *out_row_base = out_base + i * 2 * outw; - float *out_row_0 = out_row_base; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (index_t n = 0; n + 9 < outw; n += 8) { - float32x4_t in_vec = vld1q_f32(in); - - // out row 0 - float32x4x2_t out00 = vld2q_f32(out_row_0); - out00.val[0] = - neon_vfma_lane_0(out00.val[0], in_vec, k0_vec); - out00.val[1] = - neon_vfma_lane_1(out00.val[1], in_vec, k0_vec); - vst2q_f32(out_row_0, out00); - - float32x4x2_t out01 = vld2q_f32(out_row_0 + 2); - out01.val[0] = - neon_vfma_lane_2(out01.val[0], in_vec, k0_vec); - vst2q_f32(out_row_0 + 2, out01); - - // out row 1 - float32x4x2_t out10 = vld2q_f32(out_row_1); - out10.val[0] = - neon_vfma_lane_0(out10.val[0], in_vec, k1_vec); - out10.val[1] = - neon_vfma_lane_1(out10.val[1], in_vec, k1_vec); - vst2q_f32(out_row_1, out10); - - float32x4x2_t out11 = vld2q_f32(out_row_1 + 2); - out11.val[0] = - neon_vfma_lane_2(out11.val[0], in_vec, k1_vec); - vst2q_f32(out_row_1 + 2, out11); - - // out row 2 - float32x4x2_t out20 = vld2q_f32(out_row_2); - out20.val[0] = - neon_vfma_lane_1(out20.val[0], in_vec, k2_vec); - out20.val[1] = - neon_vfma_lane_2(out20.val[1], in_vec, k2_vec); - vst2q_f32(out_row_2, out20); - - float32x4x2_t out21 = vld2q_f32(out_row_2 + 2); - out21.val[0] = - neon_vfma_lane_3(out21.val[0], in_vec, k2_vec); - vst2q_f32(out_row_2 + 2, out21); - - in += 4; - out_row_0 += 8; - out_row_1 += 8; - out_row_2 += 8; - j += 4; - } -#endif - for (; j < w; ++j) { - float val = in[0]; - - for (int k = 0; k < 3; ++k) { - out_row_0[k] += val * k0[k]; - out_row_1[k] += val * k1[k]; - out_row_2[k] += val * k2[k + 1]; - } - - in++; - out_row_0 += 2; - out_row_1 += 2; - out_row_2 += 2; - } - } - } - } - } - } -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc b/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc deleted file mode 100644 index 677eb152bb5f7d984a9f7bd003bcbf0e42a1da1f..0000000000000000000000000000000000000000 --- a/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc +++ /dev/null @@ -1,807 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/utils/macros.h" -#include "mace/ops/arm/deconv_2d_neon.h" - -namespace mace { -namespace ops { - -void DepthwiseDeconv2dNeonK4x4S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t batch = in_shape[0]; - const index_t channels = in_shape[1]; - const index_t w = in_shape[3]; - const index_t h = in_shape[2]; - const index_t in_img_size = h * w; - - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t out_img_size = outh * outw; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { - const index_t offset = b * channels + c; - float *out_base = output + offset * out_img_size; - const float *input_base = input + offset * in_img_size; - const float *kernel_base = filter + c * 16; - const float *in = input_base; - const float *k0 = kernel_base; - const float *k1 = kernel_base + 4; - const float *k2 = kernel_base + 8; - const float *k3 = kernel_base + 12; -#if defined(MACE_ENABLE_NEON) - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); - float32x4_t k2_vec = vld1q_f32(k2); - float32x4_t k3_vec = vld1q_f32(k3); -#endif - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + i * outw; - float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t in_vec = vld1q_f32(in); - - float32x4_t out00 = vld1q_f32(out_row_0); - out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); - vst1q_f32(out_row_0, out00); - - float32x4_t out01 = vld1q_f32(out_row_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); - vst1q_f32(out_row_0 + 1, out01); - - float32x4_t out02 = vld1q_f32(out_row_0 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); - vst1q_f32(out_row_0 + 2, out02); - - float32x4_t out03 = vld1q_f32(out_row_0 + 3); - out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); - vst1q_f32(out_row_0 + 3, out03); - - // - float32x4_t out10 = vld1q_f32(out_row_1); - out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); - vst1q_f32(out_row_1, out10); - - float32x4_t out11 = vld1q_f32(out_row_1 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); - vst1q_f32(out_row_1 + 1, out11); - - float32x4_t out12 = vld1q_f32(out_row_1 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); - vst1q_f32(out_row_1 + 2, out12); - - float32x4_t out13 = vld1q_f32(out_row_1 + 3); - out13 = neon_vfma_lane_3(out13, in_vec, k1_vec); - vst1q_f32(out_row_1 + 3, out13); - - // - float32x4_t out20 = vld1q_f32(out_row_2 + 0); - out20 = neon_vfma_lane_0(out20, in_vec, k2_vec); - vst1q_f32(out_row_2 + 0, out20); - - float32x4_t out21 = vld1q_f32(out_row_2 + 1); - out21 = neon_vfma_lane_1(out21, in_vec, k2_vec); - vst1q_f32(out_row_2 + 1, out21); - - float32x4_t out22 = vld1q_f32(out_row_2 + 2); - out22 = neon_vfma_lane_2(out22, in_vec, k2_vec); - vst1q_f32(out_row_2 + 2, out22); - - float32x4_t out23 = 
vld1q_f32(out_row_2 + 3); - out23 = neon_vfma_lane_3(out23, in_vec, k2_vec); - vst1q_f32(out_row_2 + 3, out23); - - // - float32x4_t out30 = vld1q_f32(out_row_3 + 0); - out30 = neon_vfma_lane_0(out30, in_vec, k3_vec); - vst1q_f32(out_row_3 + 0, out30); - - float32x4_t out31 = vld1q_f32(out_row_3 + 1); - out31 = neon_vfma_lane_1(out31, in_vec, k3_vec); - vst1q_f32(out_row_3 + 1, out31); - - float32x4_t out32 = vld1q_f32(out_row_3 + 2); - out32 = neon_vfma_lane_2(out32, in_vec, k3_vec); - vst1q_f32(out_row_3 + 2, out32); - - float32x4_t out33 = vld1q_f32(out_row_3 + 3); - out33 = neon_vfma_lane_3(out33, in_vec, k3_vec); - vst1q_f32(out_row_3 + 3, out33); - - in += 4; - out_row_0 += 4; - out_row_1 += 4; - out_row_2 += 4; - out_row_3 += 4; - } -#endif - for (; j < w; j++) { - float val = in[0]; - for (int k = 0; k < 4; ++k) { - out_row_0[k] += val * k0[k]; - out_row_1[k] += val * k1[k]; - out_row_2[k] += val * k2[k]; - out_row_3[k] += val * k3[k]; - } - in++; - out_row_0++; - out_row_1++; - out_row_2++; - out_row_3++; - } - } - } - } -} - -void DepthwiseDeconv2dNeonK4x4S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t w = in_shape[3]; - const index_t h = in_shape[2]; - const index_t channels = in_shape[1]; - const index_t in_img_size = h * w; - - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t out_img_size = outh * outw; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t c = 0; c < channels; ++c) { - const index_t offset = b * channels + c; - float *out_base = output + offset * out_img_size; - const float *input_base = input + offset * in_img_size; - const float *kernel_base = filter + c * 16; - const float *in = input_base; - - const float *k0 = kernel_base; - const float *k1 = kernel_base + 4; - const float *k2 = kernel_base + 8; - const float *k3 = kernel_base + 12; -#if defined(MACE_ENABLE_NEON) - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); - float32x4_t k2_vec = vld1q_f32(k2); - float32x4_t k3_vec = vld1q_f32(k3); -#endif - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + 2 * i * outw; - - float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; - - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (index_t n = 0; n + 9 < outw; n += 8) { - float32x4_t in_vec = vld1q_f32(in); - - // row 0 - float32x4x2_t out0 = vld2q_f32(out_row_0); - out0.val[0] = - neon_vfma_lane_0(out0.val[0], in_vec, k0_vec); - out0.val[1] = - neon_vfma_lane_1(out0.val[1], in_vec, k0_vec); - vst2q_f32(out_row_0, out0); - out0 = vld2q_f32(out_row_0 + 2); - out0.val[0] = - neon_vfma_lane_2(out0.val[0], in_vec, k0_vec); - out0.val[1] = - neon_vfma_lane_3(out0.val[1], in_vec, k0_vec); - vst2q_f32(out_row_0 + 2, out0); - - // row 1 - float32x4x2_t out1 = vld2q_f32(out_row_1); - out1.val[0] = - neon_vfma_lane_0(out1.val[0], in_vec, k1_vec); - out1.val[1] = - neon_vfma_lane_1(out1.val[1], in_vec, k1_vec); - vst2q_f32(out_row_1, out1); - out1 = vld2q_f32(out_row_1 + 2); - out1.val[0] = - neon_vfma_lane_2(out1.val[0], in_vec, k1_vec); - out1.val[1] = - neon_vfma_lane_3(out1.val[1], in_vec, k1_vec); - vst2q_f32(out_row_1 + 2, out1); - - // row 2 - float32x4x2_t out2 = vld2q_f32(out_row_2); - out2.val[0] = - neon_vfma_lane_0(out2.val[0], in_vec, k2_vec); - out2.val[1] = - neon_vfma_lane_1(out2.val[1], 
in_vec, k2_vec); - vst2q_f32(out_row_2, out2); - out2 = vld2q_f32(out_row_2 + 2); - out2.val[0] = - neon_vfma_lane_2(out2.val[0], in_vec, k2_vec); - out2.val[1] = - neon_vfma_lane_3(out2.val[1], in_vec, k2_vec); - vst2q_f32(out_row_2 + 2, out2); - - // row 3 - float32x4x2_t out3 = vld2q_f32(out_row_3); - out3.val[0] = - neon_vfma_lane_0(out3.val[0], in_vec, k3_vec); - out3.val[1] = - neon_vfma_lane_1(out3.val[1], in_vec, k3_vec); - vst2q_f32(out_row_3, out3); - out3 = vld2q_f32(out_row_3 + 2); - out3.val[0] = - neon_vfma_lane_2(out3.val[0], in_vec, k3_vec); - out3.val[1] = - neon_vfma_lane_3(out3.val[1], in_vec, k3_vec); - vst2q_f32(out_row_3 + 2, out3); - - in += 4; - out_row_0 += 8; - out_row_1 += 8; - out_row_2 += 8; - out_row_3 += 8; - j += 4; - } -#endif - for (; j < w; j++) { - float val = in[0]; - for (int k = 0; k < 4; ++k) { - out_row_0[k] += val * k0[k]; - out_row_1[k] += val * k1[k]; - out_row_2[k] += val * k2[k]; - out_row_3[k] += val * k3[k]; - } - in++; - out_row_0 += 2; - out_row_1 += 2; - out_row_2 += 2; - out_row_3 += 2; - } - } - } - } -} - -void GroupDeconv2dNeonK4x4S1(const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t w = in_shape[3]; - const index_t h = in_shape[2]; - const index_t inch = in_shape[1]; - - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t outch = out_shape[1]; - - const index_t in_img_size = h * w; - const index_t out_img_size = outh * outw; - - const index_t inch_g = inch / group; - const index_t outch_g = outch / group; - -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (int g = 0; g < group; ++g) { - for (index_t oc = 0; oc < outch_g; oc += 2) { - if (oc + 1 < outch_g) { - const index_t out_offset = - (b * outch + outch_g * g + oc) * out_img_size; - float *out_base = output + out_offset; - float *out_base1 = out_base + out_img_size; - for (index_t ic = 0; ic < inch_g; ic++) { - const index_t in_offset = - (b * inch + inch_g * g + ic) * in_img_size; - const float *input_base = input + in_offset; - const float *in = input_base; - const index_t kernel_offset = - ((oc * group + g) * inch_g + ic) * 16; - const float *kernel_base = filter + kernel_offset; - const float *k0 = kernel_base; - const float *k1 = kernel_base + 4; - const float *k2 = kernel_base + 8; - const float *k3 = kernel_base + 12; - - const float *kernel_base1 = kernel_base + inch * 16; - const float *k10 = kernel_base1; - const float *k11 = kernel_base1 + 4; - const float *k12 = kernel_base1 + 8; - const float *k13 = kernel_base1 + 12; -#if defined(MACE_ENABLE_NEON) - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); - float32x4_t k2_vec = vld1q_f32(k2); - float32x4_t k3_vec = vld1q_f32(k3); - - float32x4_t k10_vec = vld1q_f32(k10); - float32x4_t k11_vec = vld1q_f32(k11); - float32x4_t k12_vec = vld1q_f32(k12); - float32x4_t k13_vec = vld1q_f32(k13); -#endif - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + i * outw; - - float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; - - float *out_row1 = out_base1 + i * outw; - - float *out_row1_0 = out_row1; - float *out_row1_1 = out_row1_0 + outw; - float *out_row1_2 = out_row1_1 + outw; - float *out_row1_3 = out_row1_2 + outw; - - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t 
in_vec = vld1q_f32(in); - float32x4_t out00, out01, out02, out03; - float32x4_t out10, out11, out12, out13; - - out00 = vld1q_f32(out_row_0); - out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); - vst1q_f32(out_row_0, out00); - - out10 = vld1q_f32(out_row1_0); - out10 = neon_vfma_lane_0(out10, in_vec, k10_vec); - vst1q_f32(out_row1_0, out10); - - out01 = vld1q_f32(out_row_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); - vst1q_f32(out_row_0 + 1, out01); - - out11 = vld1q_f32(out_row1_0 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k10_vec); - vst1q_f32(out_row1_0 + 1, out11); - - out02 = vld1q_f32(out_row_0 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); - vst1q_f32(out_row_0 + 2, out02); - - out12 = vld1q_f32(out_row1_0 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k10_vec); - vst1q_f32(out_row1_0 + 2, out12); - - out03 = vld1q_f32(out_row_0 + 3); - out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); - vst1q_f32(out_row_0 + 3, out03); - - out13 = vld1q_f32(out_row1_0 + 3); - out13 = neon_vfma_lane_3(out13, in_vec, k10_vec); - vst1q_f32(out_row1_0 + 3, out13); - - // - out00 = vld1q_f32(out_row_1); - out00 = neon_vfma_lane_0(out00, in_vec, k1_vec); - vst1q_f32(out_row_1, out00); - - out10 = vld1q_f32(out_row1_1); - out10 = neon_vfma_lane_0(out10, in_vec, k11_vec); - vst1q_f32(out_row1_1, out10); - - out01 = vld1q_f32(out_row_1 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k1_vec); - vst1q_f32(out_row_1 + 1, out01); - - out11 = vld1q_f32(out_row1_1 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k11_vec); - vst1q_f32(out_row1_1 + 1, out11); - - out02 = vld1q_f32(out_row_1 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k1_vec); - vst1q_f32(out_row_1 + 2, out02); - - out12 = vld1q_f32(out_row1_1 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k11_vec); - vst1q_f32(out_row1_1 + 2, out12); - - out03 = vld1q_f32(out_row_1 + 3); - out03 = neon_vfma_lane_3(out03, in_vec, k1_vec); - vst1q_f32(out_row_1 + 3, out03); - - out13 = vld1q_f32(out_row1_1 + 3); - out13 = neon_vfma_lane_3(out13, in_vec, k11_vec); - vst1q_f32(out_row1_1 + 3, out13); - - // - out00 = vld1q_f32(out_row_2 + 0); - out00 = neon_vfma_lane_0(out00, in_vec, k2_vec); - vst1q_f32(out_row_2 + 0, out00); - - out10 = vld1q_f32(out_row1_2 + 0); - out10 = neon_vfma_lane_0(out10, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 0, out10); - - out01 = vld1q_f32(out_row_2 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k2_vec); - vst1q_f32(out_row_2 + 1, out01); - - out11 = vld1q_f32(out_row1_2 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 1, out11); - - out02 = vld1q_f32(out_row_2 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k2_vec); - vst1q_f32(out_row_2 + 2, out02); - - out12 = vld1q_f32(out_row1_2 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 2, out12); - - out03 = vld1q_f32(out_row_2 + 3); - out03 = neon_vfma_lane_3(out03, in_vec, k2_vec); - vst1q_f32(out_row_2 + 3, out03); - - out13 = vld1q_f32(out_row1_2 + 3); - out13 = neon_vfma_lane_3(out13, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 3, out13); - - // - out00 = vld1q_f32(out_row_3 + 0); - out00 = neon_vfma_lane_0(out00, in_vec, k3_vec); - vst1q_f32(out_row_3 + 0, out00); - - out10 = vld1q_f32(out_row1_3 + 0); - out10 = neon_vfma_lane_0(out10, in_vec, k13_vec); - vst1q_f32(out_row1_3 + 0, out10); - - out01 = vld1q_f32(out_row_3 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k3_vec); - vst1q_f32(out_row_3 + 1, out01); - - out11 = vld1q_f32(out_row1_3 + 1); - out11 = neon_vfma_lane_1(out11, 
in_vec, k13_vec); - vst1q_f32(out_row1_3 + 1, out11); - - out02 = vld1q_f32(out_row_3 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k3_vec); - vst1q_f32(out_row_3 + 2, out02); - - out12 = vld1q_f32(out_row1_3 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k13_vec); - vst1q_f32(out_row1_3 + 2, out12); - - out03 = vld1q_f32(out_row_3 + 3); - out03 = neon_vfma_lane_3(out03, in_vec, k3_vec); - vst1q_f32(out_row_3 + 3, out03); - - out13 = vld1q_f32(out_row1_3 + 3); - out13 = neon_vfma_lane_3(out13, in_vec, k13_vec); - vst1q_f32(out_row1_3 + 3, out13); - - in += 4; - out_row_0 += 4; - out_row_1 += 4; - out_row_2 += 4; - out_row_3 += 4; - out_row1_0 += 4; - out_row1_1 += 4; - out_row1_2 += 4; - out_row1_3 += 4; - } -#endif - for (; j < w; j++) { - float val = in[0]; - for (int k = 0; k < 4; ++k) { - out_row_0[k] += val * k0[k]; - out_row_1[k] += val * k1[k]; - out_row_2[k] += val * k2[k]; - out_row_3[k] += val * k3[k]; - out_row1_0[k] += val * k10[k]; - out_row1_1[k] += val * k11[k]; - out_row1_2[k] += val * k12[k]; - out_row1_3[k] += val * k13[k]; - } - in++; - out_row_0++; - out_row_1++; - out_row_2++; - out_row_3++; - out_row1_0++; - out_row1_1++; - out_row1_2++; - out_row1_3++; - } - } - } - } else { - const index_t out_offset = - (b * outch + outch_g * g + oc) * out_img_size; - float *out_base = output + out_offset; - for (index_t ic = 0; ic < inch_g; ++ic) { - const index_t in_offset = - (b * inch + inch_g * g + ic) * in_img_size; - const index_t kernel_offset = - ((oc * group + g) * inch_g + ic) * 16; - - const float *input_base = input + in_offset; - const float *kernel_base = filter + kernel_offset; - const float *in = input_base; - const float *k0 = kernel_base; - const float *k1 = kernel_base + 4; - const float *k2 = kernel_base + 8; - const float *k3 = kernel_base + 12; -#if defined(MACE_ENABLE_NEON) - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); - float32x4_t k2_vec = vld1q_f32(k2); - float32x4_t k3_vec = vld1q_f32(k3); -#endif - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + i * outw; - float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t in_vec = vld1q_f32(in); - - float32x4_t out00 = vld1q_f32(out_row_0); - out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); - vst1q_f32(out_row_0, out00); - - float32x4_t out01 = vld1q_f32(out_row_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); - vst1q_f32(out_row_0 + 1, out01); - - float32x4_t out02 = vld1q_f32(out_row_0 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); - vst1q_f32(out_row_0 + 2, out02); - - float32x4_t out03 = vld1q_f32(out_row_0 + 3); - out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); - vst1q_f32(out_row_0 + 3, out03); - - // - float32x4_t out10 = vld1q_f32(out_row_1); - out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); - vst1q_f32(out_row_1, out10); - - float32x4_t out11 = vld1q_f32(out_row_1 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); - vst1q_f32(out_row_1 + 1, out11); - - float32x4_t out12 = vld1q_f32(out_row_1 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); - vst1q_f32(out_row_1 + 2, out12); - - float32x4_t out13 = vld1q_f32(out_row_1 + 3); - out13 = neon_vfma_lane_3(out13, in_vec, k1_vec); - vst1q_f32(out_row_1 + 3, out13); - - // - float32x4_t out20 = vld1q_f32(out_row_2 + 0); - out20 = neon_vfma_lane_0(out20, in_vec, k2_vec); - vst1q_f32(out_row_2 + 0, 
out20); - - float32x4_t out21 = vld1q_f32(out_row_2 + 1); - out21 = neon_vfma_lane_1(out21, in_vec, k2_vec); - vst1q_f32(out_row_2 + 1, out21); - - float32x4_t out22 = vld1q_f32(out_row_2 + 2); - out22 = neon_vfma_lane_2(out22, in_vec, k2_vec); - vst1q_f32(out_row_2 + 2, out22); - - float32x4_t out23 = vld1q_f32(out_row_2 + 3); - out23 = neon_vfma_lane_3(out23, in_vec, k2_vec); - vst1q_f32(out_row_2 + 3, out23); - - // - float32x4_t out30 = vld1q_f32(out_row_3 + 0); - out30 = neon_vfma_lane_0(out30, in_vec, k3_vec); - vst1q_f32(out_row_3 + 0, out30); - - float32x4_t out31 = vld1q_f32(out_row_3 + 1); - out31 = neon_vfma_lane_1(out31, in_vec, k3_vec); - vst1q_f32(out_row_3 + 1, out31); - - float32x4_t out32 = vld1q_f32(out_row_3 + 2); - out32 = neon_vfma_lane_2(out32, in_vec, k3_vec); - vst1q_f32(out_row_3 + 2, out32); - - float32x4_t out33 = vld1q_f32(out_row_3 + 3); - out33 = neon_vfma_lane_3(out33, in_vec, k3_vec); - vst1q_f32(out_row_3 + 3, out33); - - in += 4; - out_row_0 += 4; - out_row_1 += 4; - out_row_2 += 4; - out_row_3 += 4; - } -#endif - for (; j < w; j++) { - float val = in[0]; - for (int k = 0; k < 4; ++k) { - out_row_0[k] += val * k0[k]; - out_row_1[k] += val * k1[k]; - out_row_2[k] += val * k2[k]; - out_row_3[k] += val * k3[k]; - } - in++; - out_row_0++; - out_row_1++; - out_row_2++; - out_row_3++; - } - } - } - } - } - } - } -} - -void GroupDeconv2dNeonK4x4S2(const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t w = in_shape[3]; - const index_t h = in_shape[2]; - const index_t inch = in_shape[1]; - - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t outch = out_shape[1]; - const index_t in_img_size = h * w; - const index_t out_img_size = outh * outw; - - const index_t inch_g = inch / group; - const index_t outch_g = outch / group; - -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (int g = 0; g < group; ++g) { - for (index_t oc = 0; oc < outch_g; oc++) { - const index_t out_offset = - (b * outch + outch_g * g + oc) * out_img_size; - float *out_base = output + out_offset; - for (index_t ic = 0; ic < inch_g; ic++) { - const index_t in_offset = - (b * inch + inch_g * g + ic) * in_img_size; - const index_t kernel_offset = - ((oc * group + g) * inch_g + ic) * 16; - const float *input_base = input + in_offset; - const float *kernel_base = filter + kernel_offset; - const float *in = input_base; - - const float *k0 = kernel_base; - const float *k1 = kernel_base + 4; - const float *k2 = kernel_base + 8; - const float *k3 = kernel_base + 12; -#if defined(MACE_ENABLE_NEON) - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); - float32x4_t k2_vec = vld1q_f32(k2); - float32x4_t k3_vec = vld1q_f32(k3); -#endif - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + 2 * i * outw; - - float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; - - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (index_t n = 0; n + 9 < outw; n += 8) { - float32x4_t in_vec = vld1q_f32(in); - - // row 0 - float32x4x2_t out0 = vld2q_f32(out_row_0); - out0.val[0] = - neon_vfma_lane_0(out0.val[0], in_vec, k0_vec); - out0.val[1] = - neon_vfma_lane_1(out0.val[1], in_vec, k0_vec); - vst2q_f32(out_row_0, out0); - out0 = vld2q_f32(out_row_0 + 2); - out0.val[0] = - neon_vfma_lane_2(out0.val[0], in_vec, 
k0_vec); - out0.val[1] = - neon_vfma_lane_3(out0.val[1], in_vec, k0_vec); - vst2q_f32(out_row_0 + 2, out0); - - // row 1 - float32x4x2_t out1 = vld2q_f32(out_row_1); - out1.val[0] = - neon_vfma_lane_0(out1.val[0], in_vec, k1_vec); - out1.val[1] = - neon_vfma_lane_1(out1.val[1], in_vec, k1_vec); - vst2q_f32(out_row_1, out1); - out1 = vld2q_f32(out_row_1 + 2); - out1.val[0] = - neon_vfma_lane_2(out1.val[0], in_vec, k1_vec); - out1.val[1] = - neon_vfma_lane_3(out1.val[1], in_vec, k1_vec); - vst2q_f32(out_row_1 + 2, out1); - - // row 2 - float32x4x2_t out2 = vld2q_f32(out_row_2); - out2.val[0] = - neon_vfma_lane_0(out2.val[0], in_vec, k2_vec); - out2.val[1] = - neon_vfma_lane_1(out2.val[1], in_vec, k2_vec); - vst2q_f32(out_row_2, out2); - out2 = vld2q_f32(out_row_2 + 2); - out2.val[0] = - neon_vfma_lane_2(out2.val[0], in_vec, k2_vec); - out2.val[1] = - neon_vfma_lane_3(out2.val[1], in_vec, k2_vec); - vst2q_f32(out_row_2 + 2, out2); - - // row 3 - float32x4x2_t out3 = vld2q_f32(out_row_3); - out3.val[0] = - neon_vfma_lane_0(out3.val[0], in_vec, k3_vec); - out3.val[1] = - neon_vfma_lane_1(out3.val[1], in_vec, k3_vec); - vst2q_f32(out_row_3, out3); - out3 = vld2q_f32(out_row_3 + 2); - out3.val[0] = - neon_vfma_lane_2(out3.val[0], in_vec, k3_vec); - out3.val[1] = - neon_vfma_lane_3(out3.val[1], in_vec, k3_vec); - vst2q_f32(out_row_3 + 2, out3); - - in += 4; - out_row_0 += 8; - out_row_1 += 8; - out_row_2 += 8; - out_row_3 += 8; - j += 4; - } -#endif - for (; j < w; j++) { - float val = in[0]; - for (int k = 0; k < 4; ++k) { - out_row_0[k] += val * k0[k]; - out_row_1[k] += val * k1[k]; - out_row_2[k] += val * k2[k]; - out_row_3[k] += val * k3[k]; - } - in++; - out_row_0 += 2; - out_row_1 += 2; - out_row_2 += 2; - out_row_3 += 2; - } - } - } - } - } - } -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/arm/fp32/activation.cc b/mace/ops/arm/fp32/activation.cc new file mode 100644 index 0000000000000000000000000000000000000000..cac3badb523262663820b93e2527588f49be4923 --- /dev/null +++ b/mace/ops/arm/fp32/activation.cc @@ -0,0 +1,183 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/ops/arm/fp32/activation.h" + +#include +#include + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +Activation::Activation(ActivationType type, + const float limit, + const float leakyrelu_coefficient) + : type_(type), + limit_(limit), + leakyrelu_coefficient_(leakyrelu_coefficient) {} + +MaceStatus Activation::Compute(const OpContext *context, + const Tensor *input, + Tensor *output) { + Tensor::MappingGuard input_guard(input); + if (input != output) { + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + Tensor::MappingGuard output_guard(output); + DoActivation(context, input, output); + } else { + DoActivation(context, input, output); + } + + return MaceStatus::MACE_SUCCESS; +} + +void Activation::DoActivation(const OpContext *context, + const Tensor *input, + Tensor *output) { + auto input_data = input->data(); + auto output_data = output->mutable_data(); + const index_t size = input->size(); + + utils::ThreadPool &thread_pool = + context->device()->cpu_runtime()->thread_pool(); + + switch (type_) { + case RELU: { + const float32x4_t vzero = vdupq_n_f32(0.f); + const index_t block_count = size / 4; + + thread_pool.Compute1D( + [=](index_t start, index_t end, index_t step) { + auto input_ptr = input_data + start * 4; + auto output_ptr = output_data + start * 4; + + for (index_t i = start; i < end; i += step) { + float32x4_t v = vld1q_f32(input_ptr); + v = vmaxq_f32(v, vzero); + vst1q_f32(output_ptr, v); + + input_ptr += 4; + output_ptr += 4; + } + }, + 0, block_count, 1); + + // remain + for (index_t i = block_count * 4; i < size; ++i) { + output_data[i] = std::max(0.f, input_data[i]); + } + + break; + } + + case RELUX: { + const float32x4_t vzero = vdupq_n_f32(0.f); + const float32x4_t vlimit = vdupq_n_f32(limit_); + const index_t block_count = size / 4; + + thread_pool.Compute1D( + [=](index_t start, index_t end, index_t step) { + auto input_ptr = input_data + start * 4; + auto output_ptr = output_data + start * 4; + + for (index_t i = start; i < end; i += step) { + float32x4_t v = vld1q_f32(input_ptr); + v = vmaxq_f32(v, vzero); + v = vminq_f32(v, vlimit); + vst1q_f32(output_ptr, v); + + input_ptr += 4; + output_ptr += 4; + } + }, + 0, block_count, 1); + + // remain + for (index_t i = block_count * 4; i < size; ++i) { + output_data[i] = std::max(0.f, std::min(limit_, input_data[i])); + } + + break; + } + + case LEAKYRELU: { + const float32x4_t vzero = vdupq_n_f32(0.f); + const float32x4_t valpha = vdupq_n_f32(leakyrelu_coefficient_); + const index_t block_count = size / 4; + + thread_pool.Compute1D( + [=](index_t start, index_t end, index_t step) { + auto input_ptr = input_data + start * 4; + auto output_ptr = output_data + start * 4; + + for (index_t i = start; i < end; i += step) { + float32x4_t v = vld1q_f32(input_ptr); + float32x4_t u = vminq_f32(v, vzero); + v = vmaxq_f32(v, vzero); + v = vmlaq_f32(v, valpha, u); + vst1q_f32(output_ptr, v); + + input_ptr += 4; + output_ptr += 4; + } + }, + 0, block_count, 1); + + // remain + for (index_t i = block_count * 4; i < size; ++i) { + output_data[i] = std::max(input_data[i], 0.f) + + std::min(input_data[i], 0.f) * leakyrelu_coefficient_; + } + + break; + } + + case TANH: { + thread_pool.Compute1D( + [=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + output_data[i] = std::tanh(input_data[i]); + } + }, + 0, size, 1); + + break; + } + + case SIGMOID: { + thread_pool.Compute1D( + [=](index_t start, index_t end, index_t step) { + for (index_t i = start; i 
< end; i += step) { + output_data[i] = 1 / (1 + std::exp(-(input_data[i]))); + } + }, + 0, size, 1); + + break; + } + + case NOOP: + break; + + default: + MACE_NOT_IMPLEMENTED; + } +} + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/activation.h b/mace/ops/arm/fp32/activation.h new file mode 100644 index 0000000000000000000000000000000000000000..265915d0c3a8d3bdbab3e4c0d0f60521730dec34 --- /dev/null +++ b/mace/ops/arm/fp32/activation.h @@ -0,0 +1,53 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_ARM_FP32_ACTIVATION_H_ +#define MACE_OPS_ARM_FP32_ACTIVATION_H_ + +#include "mace/core/op_context.h" +#include "mace/ops/common/activation_type.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +class Activation { + public: + explicit Activation(ActivationType type, + const float limit, + const float leakyrelu_coefficient); + ~Activation() = default; + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + Tensor *output); + + private: + void DoActivation(const OpContext *context, + const Tensor *input, + Tensor *output); + + ActivationType type_; + const float limit_; + const float leakyrelu_coefficient_; +}; + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_FP32_ACTIVATION_H_ diff --git a/mace/ops/arm/fp32/bias_add.cc b/mace/ops/arm/fp32/bias_add.cc new file mode 100644 index 0000000000000000000000000000000000000000..de4b6d575b194b253243cdfb3ffe7ceebec3f045 --- /dev/null +++ b/mace/ops/arm/fp32/bias_add.cc @@ -0,0 +1,95 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/ops/arm/fp32/bias_add.h" + +#include + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus BiasAdd::Compute(const OpContext *context, + const Tensor *input, + const Tensor *bias, + Tensor *output) { + Tensor::MappingGuard input_guard(input); + Tensor::MappingGuard bias_guard(bias); + if (input != output) { + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + if (bias == nullptr) { + output->Copy(*input); + } else { + Tensor::MappingGuard output_guard(output); + AddBias(context, input, bias, output); + } + } else { + if (bias != nullptr) { + AddBias(context, input, bias, output); + } + } + + return MaceStatus::MACE_SUCCESS; +} + +void BiasAdd::AddBias(const OpContext *context, + const Tensor *input, + const Tensor *bias, + mace::Tensor *output) { + auto input_data = input->data(); + auto bias_data = bias->data(); + auto output_data = output->mutable_data(); + + const index_t batch = input->dim(0); + const index_t channels = input->dim(1); + const index_t height = output->dim(2); + const index_t width = output->dim(3); + const index_t image_size = height * width; + const index_t block_count = image_size / 4; + const index_t remain = image_size % 4; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const index_t offset = (b * channels + c) * image_size; + auto input_ptr = input_data + offset; + auto output_ptr = output_data + offset; + const float bias = bias_data[c]; + float32x4_t vbias = vdupq_n_f32(bias); + + for (index_t i = 0; i < block_count; ++i) { + float32x4_t v = vld1q_f32(input_ptr); + v = vaddq_f32(v, vbias); + vst1q_f32(output_ptr, v); + + input_ptr += 4; + output_ptr += 4; + } + for (index_t i = 0; i < remain; ++i) { + (*output_ptr++) = (*input_ptr++) + bias; + } + } + } + }, 0, batch, 1, 0, channels, 1); +} + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace + diff --git a/mace/ops/arm/fp32/bias_add.h b/mace/ops/arm/fp32/bias_add.h new file mode 100644 index 0000000000000000000000000000000000000000..a3e6849157472bc9df8117299cf3f0d01ca203d8 --- /dev/null +++ b/mace/ops/arm/fp32/bias_add.h @@ -0,0 +1,48 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_OPS_ARM_FP32_BIAS_ADD_H_ +#define MACE_OPS_ARM_FP32_BIAS_ADD_H_ + +#include "mace/core/op_context.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +class BiasAdd { + public: + BiasAdd() = default; + ~BiasAdd() = default; + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *bias, + Tensor *output); + + private: + void AddBias(const OpContext *context, + const Tensor *input, + const Tensor *bias, + Tensor *output); +}; + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_FP32_BIAS_ADD_H_ diff --git a/mace/ops/arm/common_neon.h b/mace/ops/arm/fp32/common_neon.h similarity index 90% rename from mace/ops/arm/common_neon.h rename to mace/ops/arm/fp32/common_neon.h index 8d28f5581c6ad43dd90fe1965e16e6ab7bec48c8..8ac2cb7c787bf386fb15678bfd014ae760933dba 100644 --- a/mace/ops/arm/common_neon.h +++ b/mace/ops/arm/fp32/common_neon.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_ARM_COMMON_NEON_H_ -#define MACE_OPS_ARM_COMMON_NEON_H_ +#ifndef MACE_OPS_ARM_FP32_COMMON_NEON_H_ +#define MACE_OPS_ARM_FP32_COMMON_NEON_H_ #if defined(MACE_ENABLE_NEON) #include @@ -21,6 +21,8 @@ namespace mace { namespace ops { +namespace arm { +namespace fp32 { #ifdef MACE_ENABLE_NEON inline float32x4_t neon_vfma_lane_0(float32x4_t a, @@ -64,7 +66,9 @@ inline float32x4_t neon_vfma_lane_3(float32x4_t a, } #endif +} // namespace fp32 +} // namespace arm } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_COMMON_NEON_H_ +#endif // MACE_OPS_ARM_FP32_COMMON_NEON_H_ diff --git a/mace/ops/arm/fp32/conv_2d.cc b/mace/ops/arm/fp32/conv_2d.cc index 2602279423dc753be085c66bf67bb4cbee86bcc7..357b47754b0b9bf814302be042f56651883594a5 100644 --- a/mace/ops/arm/fp32/conv_2d.cc +++ b/mace/ops/arm/fp32/conv_2d.cc @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "mace/ops/arm/fp32/conv_2d.h" + #include #include #include -#include "mace/ops/arm/fp32/conv_2d.h" #include "mace/utils/memory.h" namespace mace { @@ -195,7 +196,7 @@ MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context, void Conv2dBase::PadInput(const Tensor &src, const int pad_top, const int pad_left, - mace::Tensor *dst) { + Tensor *dst) { if (dst == &src) return; const index_t batch = src.dim(0); const index_t channels = src.dim(1); @@ -211,7 +212,6 @@ void Conv2dBase::PadInput(const Tensor &src, const index_t img_size = height * width; const index_t padded_img_size = padded_height * padded_width; -#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channels; ++c) { const index_t bc = b * channels + c; @@ -238,7 +238,7 @@ void Conv2dBase::PadInput(const Tensor &src, } } -void Conv2dBase::UnPadOutput(const mace::Tensor &src, mace::Tensor *dst) { +void Conv2dBase::UnPadOutput(const Tensor &src, Tensor *dst) { if (dst == &src) return; const index_t batch = dst->dim(0); const index_t channels = dst->dim(1); @@ -253,7 +253,6 @@ void Conv2dBase::UnPadOutput(const mace::Tensor &src, mace::Tensor *dst) { const index_t img_size = height * width; const index_t padded_img_size = padded_height * padded_width; -#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channels; ++c) { const index_t bc = (b * channels + c); diff --git a/mace/ops/arm/fp32/conv_2d.h b/mace/ops/arm/fp32/conv_2d.h index 1383767bf278f1f6c11aec8047732aca98afa45a..dc8d0effd101e77df88473c884fcdb670768379e 100644 --- a/mace/ops/arm/fp32/conv_2d.h +++ b/mace/ops/arm/fp32/conv_2d.h @@ -31,9 +31,9 @@ namespace fp32 { class Conv2dBase { public: - Conv2dBase(const std::vector strides, - const std::vector dilations, - const std::vector paddings, + Conv2dBase(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, const Padding padding_type) : strides_(strides), dilations_(dilations), diff --git a/mace/ops/arm/fp32/conv_2d_1x1.h b/mace/ops/arm/fp32/conv_2d_1x1.h index 68b792fd96b3c5dd77504614894d3008bbd01e01..cde94ea01927ad544bb347eaea53bcb55b01f7f8 100644 --- a/mace/ops/arm/fp32/conv_2d_1x1.h +++ b/mace/ops/arm/fp32/conv_2d_1x1.h @@ -29,7 +29,7 @@ namespace fp32 { class Conv2dK1x1 : public Conv2dBase { public: - Conv2dK1x1(const std::vector paddings, const Padding padding_type) + Conv2dK1x1(const std::vector &paddings, const Padding padding_type) : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} virtual ~Conv2dK1x1() {} @@ -37,7 +37,7 @@ class Conv2dK1x1 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; private: Gemm gemm_; diff --git a/mace/ops/arm/fp32/conv_2d_1xn.cc b/mace/ops/arm/fp32/conv_2d_1xn.cc index 1ff99d8021438d8b851b65d6ee2c662e01e72917..3be9e3eb5dca7ecf4ecf66b1371796872c5cd0b5 100644 --- a/mace/ops/arm/fp32/conv_2d_1xn.cc +++ b/mace/ops/arm/fp32/conv_2d_1xn.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "mace/ops/arm/fp32/conv_2d_1xn.h" + #include #include -#include "mace/ops/arm/fp32/conv_2d_1xn.h" namespace mace { namespace ops { @@ -37,11 +38,11 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, &padded_input, &padded_output); const Tensor *in_tensor = input; - if (padded_input.get() != nullptr) { + if (padded_input != nullptr) { in_tensor = padded_input.get(); } Tensor *out_tensor = output; - if (padded_output.get() != nullptr) { + if (padded_output != nullptr) { out_tensor = padded_output.get(); } out_tensor->Clear(); @@ -53,82 +54,90 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, auto input_data = in_tensor->data(); auto output_data = out_tensor->mutable_data(); - auto in_shape = in_tensor->shape(); - auto out_shape = out_tensor->shape(); - - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; m += 4) { - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - if (m + 3 < out_channels) { - float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; - float *out_ptr2_base = - output_data + b * out_batch_size + (m + 2) * out_image_size; - float *out_ptr3_base = - output_data + b * out_batch_size + (m + 3) * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter_data + m * in_channels * 7 + c * 7; - const float - *filter_ptr1 = filter_data + (m + 1) * in_channels * 7 + c * 7; - const float - *filter_ptr2 = filter_data + (m + 2) * in_channels * 7 + c * 7; - const float - *filter_ptr3 = filter_data + (m + 3) * in_channels * 7 + c * 7; - /* load filter (4 outch x 1 height x 4 width) */ - float32x4_t vf00, vf01; - float32x4_t vf10, vf11; - float32x4_t vf20, vf21; - float32x4_t vf30, vf31; - vf00 = vld1q_f32(filter_ptr0); - vf01 = vld1q_f32(filter_ptr0 + 3); - vf10 = vld1q_f32(filter_ptr1); - vf11 = vld1q_f32(filter_ptr1 + 3); - vf20 = vld1q_f32(filter_ptr2); - vf21 = vld1q_f32(filter_ptr2 + 3); - vf30 = vld1q_f32(filter_ptr3); - vf31 = vld1q_f32(filter_ptr3 + 3); - - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // output (4 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo0, vo1, vo2, vo3; - // load output - index_t out_offset = h * out_width + w; - vo0 = vld1q_f32(out_ptr0_base + out_offset); - vo1 = vld1q_f32(out_ptr1_base + out_offset); - vo2 = vld1q_f32(out_ptr2_base + out_offset); - vo3 = vld1q_f32(out_ptr3_base + out_offset); - - // input (3 slide) - float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; - // input offset - index_t in_offset = h * in_width + w; - // load input - vi0 = vld1q_f32(in_ptr_base + in_offset); - vi4 = vld1q_f32(in_ptr_base + in_offset + 4); - vi8 = vld1q_f32(in_ptr_base + in_offset + 8); - vi1 = vextq_f32(vi0, vi4, 1); - vi2 = vextq_f32(vi0, vi4, 2); - vi3 = vextq_f32(vi0, vi4, 3); - vi5 = vextq_f32(vi4, vi8, 1); - vi6 = vextq_f32(vi4, vi8, 2); + 
auto &in_shape = in_tensor->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + if (m + 3 < out_channels) { + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = + output_data + b * out_batch_size + (m + 1) * out_image_size; + float *out_ptr2_base = + output_data + b * out_batch_size + (m + 2) * out_image_size; + float *out_ptr3_base = + output_data + b * out_batch_size + (m + 3) * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + m * in_channels * 7 + c * 7; + const float + *filter_ptr1 = filter_data + (m + 1) * in_channels * 7 + c * 7; + const float + *filter_ptr2 = filter_data + (m + 2) * in_channels * 7 + c * 7; + const float + *filter_ptr3 = filter_data + (m + 3) * in_channels * 7 + c * 7; + /* load filter (4 outch x 1 height x 4 width) */ + float32x4_t vf00, vf01; + float32x4_t vf10, vf11; + float32x4_t vf20, vf21; + float32x4_t vf30, vf31; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + vf10 = vld1q_f32(filter_ptr1); + vf11 = vld1q_f32(filter_ptr1 + 3); + vf20 = vld1q_f32(filter_ptr2); + vf21 = vld1q_f32(filter_ptr2 + 3); + vf30 = vld1q_f32(filter_ptr3); + vf31 = vld1q_f32(filter_ptr3 + 3); + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // output (4 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0, vo1, vo2, vo3; + // load output + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + vo1 = vld1q_f32(out_ptr1_base + out_offset); + vo2 = vld1q_f32(out_ptr2_base + out_offset); + vo3 = vld1q_f32(out_ptr3_base + out_offset); + + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; + // input offset + index_t in_offset = h * in_width + w; + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi8 = vld1q_f32(in_ptr_base + in_offset + 8); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + vi5 = vextq_f32(vi4, vi8, 1); + vi6 = vextq_f32(vi4, vi8, 2); #if defined(__aarch64__) - /* outch 0 */ + /* outch 0 */ vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); @@ -161,92 +170,7 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3); #else - /* outch 0 */ - vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); - vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); - 
vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); - vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); - vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); - vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); - vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); - /* outch 1 */ - vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); - vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); - vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); - vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); - vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1); - vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0); - vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1); - /* outch 2 */ - vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); - vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); - vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); - vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); - vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1); - vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0); - vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1); - /* outch 3 */ - vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); - vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); - vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); - vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); - vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1); - vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); - vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1); -#endif - - vst1q_f32(out_ptr0_base + out_offset, vo0); - vst1q_f32(out_ptr1_base + out_offset, vo1); - vst1q_f32(out_ptr2_base + out_offset, vo2); - vst1q_f32(out_ptr3_base + out_offset, vo3); - } // w - } // h - } // c - } else { - for (index_t mm = m; mm < out_channels; ++mm) { - float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr0 = filter_data + mm * in_channels * 7 + c * 7; - /* load filter (1 outch x 1 height x 4 width) */ - float32x4_t vf00, vf01; - vf00 = vld1q_f32(filter_ptr0); - vf01 = vld1q_f32(filter_ptr0 + 3); - - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // output (1 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo0; - // load output - index_t out_offset = h * out_width + w; - vo0 = vld1q_f32(out_ptr0_base + out_offset); - - // input (3 slide) - float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; - // input offset - index_t in_offset = h * in_width + w; - // load input - vi0 = vld1q_f32(in_ptr_base + in_offset); - vi4 = vld1q_f32(in_ptr_base + in_offset + 4); - vi8 = vld1q_f32(in_ptr_base + in_offset + 8); - vi1 = vextq_f32(vi0, vi4, 1); - vi2 = vextq_f32(vi0, vi4, 2); - vi3 = vextq_f32(vi0, vi4, 3); - vi5 = vextq_f32(vi4, vi8, 1); - vi6 = vextq_f32(vi4, vi8, 2); - -#if defined(__aarch64__) - vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); - vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); - vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); - vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); - vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); - vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); - vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); -#else + /* outch 0 */ vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); @@ -254,16 +178,103 @@ MaceStatus 
Conv2dK1x7S1::Compute(const OpContext *context, vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); + /* outch 1 */ + vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); + vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); + vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); + vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); + vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1); + vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0); + vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1); + /* outch 2 */ + vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); + vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); + vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); + vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); + vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1); + vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0); + vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1); + /* outch 3 */ + vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); + vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); + vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); + vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); + vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1); + vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); + vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1); #endif vst1q_f32(out_ptr0_base + out_offset, vo0); + vst1q_f32(out_ptr1_base + out_offset, vo1); + vst1q_f32(out_ptr2_base + out_offset, vo2); + vst1q_f32(out_ptr3_base + out_offset, vo3); } // w } // h } // c - } - } // if - } // m - } // b + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 7 + c * 7; + /* load filter (1 outch x 1 height x 4 width) */ + float32x4_t vf00, vf01; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0; + // load output + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; + // input offset + index_t in_offset = h * in_width + w; + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi8 = vld1q_f32(in_ptr_base + in_offset + 8); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + vi5 = vextq_f32(vi4, vi8, 1); + vi6 = vextq_f32(vi4, vi8, 2); + +#if defined(__aarch64__) + vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); + vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); + vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); + vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); + vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); + vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); + vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); +#else + vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi4, 
vget_low_f32(vf01), 1); + vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); + vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); +#endif + + vst1q_f32(out_ptr0_base + out_offset, vo0); + } // w + } // h + } // c + } + } // if + } // m + } // b + }, 0, batch, 1, 0, out_channels, 4); + UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } @@ -284,11 +295,11 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, &padded_input, &padded_output); const Tensor *in_tensor = input; - if (padded_input.get() != nullptr) { + if (padded_input != nullptr) { in_tensor = padded_input.get(); } Tensor *out_tensor = output; - if (padded_output.get() != nullptr) { + if (padded_output != nullptr) { out_tensor = padded_output.get(); } out_tensor->Clear(); @@ -300,206 +311,84 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, auto input_data = in_tensor->data(); auto output_data = out_tensor->mutable_data(); - auto in_shape = in_tensor->shape(); - auto out_shape = out_tensor->shape(); - - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; m += 4) { - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - if (m + 3 < out_channels) { - float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; - float *out_ptr2_base = - output_data + b * out_batch_size + (m + 2) * out_image_size; - float *out_ptr3_base = - output_data + b * out_batch_size + (m + 3) * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter_data + m * in_channels * 7 + c * 7; - const float - *filter_ptr1 = filter_data + (m + 1) * in_channels * 7 + c * 7; - const float - *filter_ptr2 = filter_data + (m + 2) * in_channels * 7 + c * 7; - const float - *filter_ptr3 = filter_data + (m + 3) * in_channels * 7 + c * 7; - /* load filter (4 outch x 4 height x 1 width) */ - float32x4_t vf00, vf01; - float32x4_t vf10, vf11; - float32x4_t vf20, vf21; - float32x4_t vf30, vf31; - vf00 = vld1q_f32(filter_ptr0); - vf01 = vld1q_f32(filter_ptr0 + 3); - vf10 = vld1q_f32(filter_ptr1); - vf11 = vld1q_f32(filter_ptr1 + 3); - vf20 = vld1q_f32(filter_ptr2); - vf21 = vld1q_f32(filter_ptr2 + 3); - vf30 = vld1q_f32(filter_ptr3); - vf31 = vld1q_f32(filter_ptr3 + 3); - - for (index_t h = 0; h + 3 < out_height; h += 4) { - for (index_t w = 0; w < out_width; ++w) { - // load output - index_t out_offset = h * out_width + w; - // output (4 outch x 4 height x 1 width): vo_outch_height - float32x4_t vo0 = {out_ptr0_base[out_offset], - out_ptr0_base[out_offset + out_width], - out_ptr0_base[out_offset + 2 * out_width], - out_ptr0_base[out_offset + 3 * out_width]}; - float32x4_t vo1 = {out_ptr1_base[out_offset], - out_ptr1_base[out_offset + out_width], - out_ptr1_base[out_offset + 2 * out_width], - out_ptr1_base[out_offset + 3 * out_width]}; - float32x4_t vo2 = {out_ptr2_base[out_offset], - out_ptr2_base[out_offset + out_width], 
- out_ptr2_base[out_offset + 2 * out_width], - out_ptr2_base[out_offset + 3 * out_width]}; - float32x4_t vo3 = {out_ptr3_base[out_offset], - out_ptr3_base[out_offset + out_width], - out_ptr3_base[out_offset + 2 * out_width], - out_ptr3_base[out_offset + 3 * out_width]}; - - // input offset - index_t in_offset = h * in_width + w; - // input (3 slide) - float32x4_t vi0 = {in_ptr_base[in_offset], - in_ptr_base[in_offset + in_width], - in_ptr_base[in_offset + 2 * in_width], - in_ptr_base[in_offset + 3 * in_width]}; - float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], - in_ptr_base[in_offset + 5 * in_width], - in_ptr_base[in_offset + 6 * in_width], - in_ptr_base[in_offset + 7 * in_width]}; - float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], - in_ptr_base[in_offset + 9 * in_width]}; - float32x4_t vi1 = vextq_f32(vi0, vi4, 1); - float32x4_t vi2 = vextq_f32(vi0, vi4, 2); - float32x4_t vi3 = vextq_f32(vi0, vi4, 3); - float32x4_t vi5 = vextq_f32(vi4, vi8, 1); - float32x4_t vi6 = vextq_f32(vi4, vi8, 2); - -#if defined(__aarch64__) - /* outch 0 */ - vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); - vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); - vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); - vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); - vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); - vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); - vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); - /* outch 1 */ - vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0); - vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1); - vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2); - vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3); - vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1); - vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2); - vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3); - /* outch 2 */ - vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0); - vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1); - vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2); - vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3); - vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1); - vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2); - vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3); - /* outch 3 */ - vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0); - vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1); - vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2); - vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3); - vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1); - vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); - vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3); -#else - /* outch 0 */ - vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); - vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); - vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); - vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); - vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); - vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); - vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); - /* outch 1 */ - vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); - vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); - vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); - vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); - vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1); - vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0); - vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1); - /* outch 2 */ - vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); - vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); - vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); - vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); - vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1); - vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0); - 
vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1); - /* outch 3 */ - vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); - vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); - vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); - vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); - vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1); - vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); - vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1); -#endif - - out_ptr0_base[out_offset] = vo0[0]; - out_ptr0_base[out_offset + out_width] = vo0[1]; - out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; - out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; - out_ptr1_base[out_offset] = vo1[0]; - out_ptr1_base[out_offset + out_width] = vo1[1]; - out_ptr1_base[out_offset + 2 * out_width] = vo1[2]; - out_ptr1_base[out_offset + 3 * out_width] = vo1[3]; - out_ptr2_base[out_offset] = vo2[0]; - out_ptr2_base[out_offset + out_width] = vo2[1]; - out_ptr2_base[out_offset + 2 * out_width] = vo2[2]; - out_ptr2_base[out_offset + 3 * out_width] = vo2[3]; - out_ptr3_base[out_offset] = vo3[0]; - out_ptr3_base[out_offset + out_width] = vo3[1]; - out_ptr3_base[out_offset + 2 * out_width] = vo3[2]; - out_ptr3_base[out_offset + 3 * out_width] = vo3[3]; - } // w - } // h - } // c - } else { - for (index_t mm = m; mm < out_channels; ++mm) { + auto &in_shape = in_tensor->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + if (m + 3 < out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = + output_data + b * out_batch_size + (m + 1) * out_image_size; + float *out_ptr2_base = + output_data + b * out_batch_size + (m + 2) * out_image_size; + float *out_ptr3_base = + output_data + b * out_batch_size + (m + 3) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = input_data + b * in_batch_size + c * in_image_size; const float - *filter_ptr0 = filter_data + mm * in_channels * 7 + c * 7; - /* load filter (1 outch x 4 height x 1 width) */ + *filter_ptr0 = filter_data + m * in_channels * 7 + c * 7; + const float + *filter_ptr1 = filter_data + (m + 1) * in_channels * 7 + c * 7; + const float + *filter_ptr2 = filter_data + (m + 2) * in_channels * 7 + c * 7; + const float + *filter_ptr3 = filter_data + (m + 3) * in_channels * 7 + c * 7; + /* load filter (4 outch x 4 height x 1 width) */ float32x4_t vf00, vf01; + float32x4_t vf10, vf11; + float32x4_t vf20, vf21; + float32x4_t vf30, vf31; vf00 = vld1q_f32(filter_ptr0); vf01 = vld1q_f32(filter_ptr0 + 3); + vf10 = vld1q_f32(filter_ptr1); + vf11 = vld1q_f32(filter_ptr1 + 3); + 
vf20 = vld1q_f32(filter_ptr2); + vf21 = vld1q_f32(filter_ptr2 + 3); + vf30 = vld1q_f32(filter_ptr3); + vf31 = vld1q_f32(filter_ptr3 + 3); for (index_t h = 0; h + 3 < out_height; h += 4) { for (index_t w = 0; w < out_width; ++w) { // load output index_t out_offset = h * out_width + w; - // output (1 outch x 4 height x 1 width): vo_outch_height + // output (4 outch x 4 height x 1 width): vo_outch_height float32x4_t vo0 = {out_ptr0_base[out_offset], out_ptr0_base[out_offset + out_width], out_ptr0_base[out_offset + 2 * out_width], out_ptr0_base[out_offset + 3 * out_width]}; + float32x4_t vo1 = {out_ptr1_base[out_offset], + out_ptr1_base[out_offset + out_width], + out_ptr1_base[out_offset + 2 * out_width], + out_ptr1_base[out_offset + 3 * out_width]}; + float32x4_t vo2 = {out_ptr2_base[out_offset], + out_ptr2_base[out_offset + out_width], + out_ptr2_base[out_offset + 2 * out_width], + out_ptr2_base[out_offset + 3 * out_width]}; + float32x4_t vo3 = {out_ptr3_base[out_offset], + out_ptr3_base[out_offset + out_width], + out_ptr3_base[out_offset + 2 * out_width], + out_ptr3_base[out_offset + 3 * out_width]}; // input offset index_t in_offset = h * in_width + w; @@ -513,9 +402,7 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, in_ptr_base[in_offset + 6 * in_width], in_ptr_base[in_offset + 7 * in_width]}; float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], - in_ptr_base[in_offset + 9 * in_width], - in_ptr_base[in_offset + 10 * in_width], - in_ptr_base[in_offset + 11 * in_width]}; + in_ptr_base[in_offset + 9 * in_width]}; float32x4_t vi1 = vextq_f32(vi0, vi4, 1); float32x4_t vi2 = vextq_f32(vi0, vi4, 2); float32x4_t vi3 = vextq_f32(vi0, vi4, 3); @@ -523,6 +410,7 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, float32x4_t vi6 = vextq_f32(vi4, vi8, 2); #if defined(__aarch64__) + /* outch 0 */ vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); @@ -530,7 +418,32 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); + /* outch 1 */ + vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0); + vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1); + vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2); + vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3); + vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1); + vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2); + vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3); + /* outch 2 */ + vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0); + vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1); + vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2); + vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3); + vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1); + vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2); + vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3); + /* outch 3 */ + vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0); + vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1); + vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2); + vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3); + vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1); + vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); + vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3); #else + /* outch 0 */ vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); @@ -538,26 +451,131 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); vo0 = vmlaq_lane_f32(vo0, 
vi6, vget_high_f32(vf01), 1); + /* outch 1 */ + vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); + vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); + vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); + vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); + vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1); + vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0); + vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1); + /* outch 2 */ + vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); + vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); + vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); + vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); + vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1); + vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0); + vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1); + /* outch 3 */ + vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); + vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); + vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); + vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); + vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1); + vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); + vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1); #endif out_ptr0_base[out_offset] = vo0[0]; out_ptr0_base[out_offset + out_width] = vo0[1]; out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; + out_ptr1_base[out_offset] = vo1[0]; + out_ptr1_base[out_offset + out_width] = vo1[1]; + out_ptr1_base[out_offset + 2 * out_width] = vo1[2]; + out_ptr1_base[out_offset + 3 * out_width] = vo1[3]; + out_ptr2_base[out_offset] = vo2[0]; + out_ptr2_base[out_offset + out_width] = vo2[1]; + out_ptr2_base[out_offset + 2 * out_width] = vo2[2]; + out_ptr2_base[out_offset + 3 * out_width] = vo2[3]; + out_ptr3_base[out_offset] = vo3[0]; + out_ptr3_base[out_offset + out_width] = vo3[1]; + out_ptr3_base[out_offset + 2 * out_width] = vo3[2]; + out_ptr3_base[out_offset + 3 * out_width] = vo3[3]; } // w } // h } // c - } - } // if - } // m - } // b + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 7 + c * 7; + /* load filter (1 outch x 4 height x 1 width) */ + float32x4_t vf00, vf01; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + + for (index_t h = 0; h + 3 < out_height; h += 4) { + for (index_t w = 0; w < out_width; ++w) { + // load output + index_t out_offset = h * out_width + w; + // output (1 outch x 4 height x 1 width): vo_outch_height + float32x4_t vo0 = {out_ptr0_base[out_offset], + out_ptr0_base[out_offset + out_width], + out_ptr0_base[out_offset + 2 * out_width], + out_ptr0_base[out_offset + 3 * out_width]}; + + // input offset + index_t in_offset = h * in_width + w; + // input (3 slide) + float32x4_t vi0 = {in_ptr_base[in_offset], + in_ptr_base[in_offset + in_width], + in_ptr_base[in_offset + 2 * in_width], + in_ptr_base[in_offset + 3 * in_width]}; + float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], + in_ptr_base[in_offset + 5 * in_width], + in_ptr_base[in_offset + 6 * in_width], + in_ptr_base[in_offset + 7 * in_width]}; + float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], + in_ptr_base[in_offset + 9 * in_width], + 
in_ptr_base[in_offset + 10 * in_width], + in_ptr_base[in_offset + 11 * in_width]}; + float32x4_t vi1 = vextq_f32(vi0, vi4, 1); + float32x4_t vi2 = vextq_f32(vi0, vi4, 2); + float32x4_t vi3 = vextq_f32(vi0, vi4, 3); + float32x4_t vi5 = vextq_f32(vi4, vi8, 1); + float32x4_t vi6 = vextq_f32(vi4, vi8, 2); + +#if defined(__aarch64__) + vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); + vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); + vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); + vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); + vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); + vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); + vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); +#else + vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); + vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); + vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); +#endif + + out_ptr0_base[out_offset] = vo0[0]; + out_ptr0_base[out_offset + out_width] = vo0[1]; + out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; + out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; + } // w + } // h + } // c + } + } // if + } // m + } // b + }, 0, batch, 1, 0, out_channels, 4); + UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } - -// ==== - MaceStatus Conv2dK1x15S1::Compute(const OpContext *context, const Tensor *input, const Tensor *filter, @@ -590,91 +608,104 @@ MaceStatus Conv2dK1x15S1::Compute(const OpContext *context, auto input_data = in_tensor->data(); auto output_data = out_tensor->mutable_data(); - auto in_shape = in_tensor->shape(); - auto out_shape = out_tensor->shape(); + auto &in_shape = in_tensor->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; const index_t tile_height = - out_shape[1] < 4 ? 
RoundUpDiv4(out_shape[2]) : out_shape[2]; - -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; ++m) { - for (index_t h = 0; h < out_shape[2]; h += tile_height) { - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - float *out_ptr_base = - output_data + b * out_batch_size + m * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float *filter_ptr = filter_data + m * in_channels * 15 + c * 15; - /* load filter (1 outch x 4 height x 1 width) */ - float32x4_t vf0, vf1, vf2, vf3; - vf0 = vld1q_f32(filter_ptr); - vf1 = vld1q_f32(filter_ptr + 4); - vf2 = vld1q_f32(filter_ptr + 8); - vf3 = vld1q_f32(filter_ptr + 11); - - for (index_t ht = 0; ht < tile_height && h + ht < out_height; ++ht) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // output (1 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo; - // load output - index_t out_offset = (h + ht) * out_width + w; - vo = vld1q_f32(out_ptr_base + out_offset); - - // input (3 slide) - float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9, - vi10, vi11, vi12, vi13, vi14, vi16; - // input offset - index_t in_offset = (h + ht) * in_width + w; - // load input - vi0 = vld1q_f32(in_ptr_base + in_offset); - vi4 = vld1q_f32(in_ptr_base + in_offset + 4); - vi8 = vld1q_f32(in_ptr_base + in_offset + 8); - vi12 = vld1q_f32(in_ptr_base + in_offset + 12); - vi16 = vld1q_f32(in_ptr_base + in_offset + 16); - vi1 = vextq_f32(vi0, vi4, 1); - vi2 = vextq_f32(vi0, vi4, 2); - vi3 = vextq_f32(vi0, vi4, 3); - vi5 = vextq_f32(vi4, vi8, 1); - vi6 = vextq_f32(vi4, vi8, 2); - vi7 = vextq_f32(vi4, vi8, 3); - vi9 = vextq_f32(vi8, vi12, 1); - vi10 = vextq_f32(vi8, vi12, 2); - vi11 = vextq_f32(vi8, vi12, 3); - vi13 = vextq_f32(vi12, vi16, 1); - vi14 = vextq_f32(vi12, vi16, 2); - - vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0); - vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1); - vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0); - vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1); - vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0); - vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1); - vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0); - vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1); - vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0); - vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1); - vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0); - vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1); - vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1); - vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0); - vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1); - - vst1q_f32(out_ptr_base + out_offset, vo); - } // w - } // ht - } // c - } // h - } // m - } // b + out_channels < 4 ? 
RoundUpDiv4(out_height) : out_height; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + for (index_t h = 0; h < out_height; h += tile_height) { + float *out_ptr_base = + output_data + b * out_batch_size + m * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr = filter_data + m * in_channels * 15 + c * 15; + /* load filter (1 outch x 4 height x 1 width) */ + float32x4_t vf0, vf1, vf2, vf3; + vf0 = vld1q_f32(filter_ptr); + vf1 = vld1q_f32(filter_ptr + 4); + vf2 = vld1q_f32(filter_ptr + 8); + vf3 = vld1q_f32(filter_ptr + 11); + + for (index_t ht = 0; ht < tile_height && h + ht < out_height; + ++ht) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo; + // load output + index_t out_offset = (h + ht) * out_width + w; + vo = vld1q_f32(out_ptr_base + out_offset); + + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9, + vi10, vi11, vi12, vi13, vi14, vi16; + // input offset + index_t in_offset = (h + ht) * in_width + w; + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi8 = vld1q_f32(in_ptr_base + in_offset + 8); + vi12 = vld1q_f32(in_ptr_base + in_offset + 12); + vi16 = vld1q_f32(in_ptr_base + in_offset + 16); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + vi5 = vextq_f32(vi4, vi8, 1); + vi6 = vextq_f32(vi4, vi8, 2); + vi7 = vextq_f32(vi4, vi8, 3); + vi9 = vextq_f32(vi8, vi12, 1); + vi10 = vextq_f32(vi8, vi12, 2); + vi11 = vextq_f32(vi8, vi12, 3); + vi13 = vextq_f32(vi12, vi16, 1); + vi14 = vextq_f32(vi12, vi16, 2); + + vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1); + vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0); + vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1); + + vst1q_f32(out_ptr_base + out_offset, vo); + } // w + } // ht + } // c + } // h + } // m + } // b + }, 0, batch, 1, 0, out_channels, 1); + UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } @@ -711,106 +742,119 @@ MaceStatus Conv2dK15x1S1::Compute(const OpContext *context, auto input_data = in_tensor->data(); auto output_data = out_tensor->mutable_data(); - auto in_shape = in_tensor->shape(); - auto out_shape = out_tensor->shape(); + auto &in_shape = in_tensor->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = 
in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; const index_t tile_width = - out_shape[1] < 4 ? RoundUpDiv4(out_shape[3]) : out_shape[3]; - -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; ++m) { - for (index_t w = 0; w < out_shape[3]; w += tile_width) { - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - float *out_ptr_base = - output_data + b * out_batch_size + m * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float *filter_ptr = filter_data + m * in_channels * 15 + c * 15; - /* load filter (1 outch x 4 height x 1 width) */ - float32x4_t vf0, vf1, vf2, vf3; - vf0 = vld1q_f32(filter_ptr); - vf1 = vld1q_f32(filter_ptr + 4); - vf2 = vld1q_f32(filter_ptr + 8); - vf3 = vld1q_f32(filter_ptr + 11); - - for (index_t h = 0; h + 3 < out_height; h += 4) { - for (index_t wt = 0; wt < tile_width && w + wt < out_width; ++wt) { - // load output - index_t out_offset = h * out_width + w + wt; - // output (1 outch x 4 height x 1 width): vo_outch_height - float32x4_t vo = {out_ptr_base[out_offset], - out_ptr_base[out_offset + out_width], - out_ptr_base[out_offset + 2 * out_width], - out_ptr_base[out_offset + 3 * out_width]}; - - // input offset - index_t in_offset = h * in_width + w + wt; - // input (3 slide) - float32x4_t vi0 = {in_ptr_base[in_offset], - in_ptr_base[in_offset + in_width], - in_ptr_base[in_offset + 2 * in_width], - in_ptr_base[in_offset + 3 * in_width]}; - float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], - in_ptr_base[in_offset + 5 * in_width], - in_ptr_base[in_offset + 6 * in_width], - in_ptr_base[in_offset + 7 * in_width]}; - float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], - in_ptr_base[in_offset + 9 * in_width], - in_ptr_base[in_offset + 10 * in_width], - in_ptr_base[in_offset + 11 * in_width]}; - float32x4_t vi12 = {in_ptr_base[in_offset + 12 * in_width], - in_ptr_base[in_offset + 13 * in_width], - in_ptr_base[in_offset + 14 * in_width], - in_ptr_base[in_offset + 15 * in_width]}; - float32x4_t vi16 = {in_ptr_base[in_offset + 16 * in_width], - in_ptr_base[in_offset + 17 * in_width]}; - float32x4_t vi1 = vextq_f32(vi0, vi4, 1); - float32x4_t vi2 = vextq_f32(vi0, vi4, 2); - float32x4_t vi3 = vextq_f32(vi0, vi4, 3); - float32x4_t vi5 = vextq_f32(vi4, vi8, 1); - float32x4_t vi6 = vextq_f32(vi4, vi8, 2); - float32x4_t vi7 = vextq_f32(vi4, vi8, 3); - float32x4_t vi9 = vextq_f32(vi8, vi12, 1); - float32x4_t vi10 = vextq_f32(vi8, vi12, 2); - float32x4_t vi11 = vextq_f32(vi8, vi12, 3); - float32x4_t vi13 = vextq_f32(vi12, vi16, 1); - float32x4_t vi14 = vextq_f32(vi12, vi16, 2); - - vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0); - vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1); - vo = 
vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0); - vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1); - vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0); - vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1); - vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0); - vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1); - vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0); - vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1); - vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0); - vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1); - vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1); - vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0); - vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1); - - out_ptr_base[out_offset] = vo[0]; - out_ptr_base[out_offset + out_width] = vo[1]; - out_ptr_base[out_offset + 2 * out_width] = vo[2]; - out_ptr_base[out_offset + 3 * out_width] = vo[3]; - } // wt - } // h - } // c - } // w - } // m - } // b + out_channels < 4 ? RoundUpDiv4(out_width) : out_width; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + for (index_t w = 0; w < out_width; w += tile_width) { + float *out_ptr_base = + output_data + b * out_batch_size + m * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr = filter_data + m * in_channels * 15 + c * 15; + /* load filter (1 outch x 4 height x 1 width) */ + float32x4_t vf0, vf1, vf2, vf3; + vf0 = vld1q_f32(filter_ptr); + vf1 = vld1q_f32(filter_ptr + 4); + vf2 = vld1q_f32(filter_ptr + 8); + vf3 = vld1q_f32(filter_ptr + 11); + + for (index_t h = 0; h + 3 < out_height; h += 4) { + for (index_t wt = 0; wt < tile_width && w + wt < out_width; + ++wt) { + // load output + index_t out_offset = h * out_width + w + wt; + // output (1 outch x 4 height x 1 width): vo_outch_height + float32x4_t vo = {out_ptr_base[out_offset], + out_ptr_base[out_offset + out_width], + out_ptr_base[out_offset + 2 * out_width], + out_ptr_base[out_offset + 3 * out_width]}; + + // input offset + index_t in_offset = h * in_width + w + wt; + // input (3 slide) + float32x4_t vi0 = {in_ptr_base[in_offset], + in_ptr_base[in_offset + in_width], + in_ptr_base[in_offset + 2 * in_width], + in_ptr_base[in_offset + 3 * in_width]}; + float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], + in_ptr_base[in_offset + 5 * in_width], + in_ptr_base[in_offset + 6 * in_width], + in_ptr_base[in_offset + 7 * in_width]}; + float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], + in_ptr_base[in_offset + 9 * in_width], + in_ptr_base[in_offset + 10 * in_width], + in_ptr_base[in_offset + 11 * in_width]}; + float32x4_t vi12 = {in_ptr_base[in_offset + 12 * in_width], + in_ptr_base[in_offset + 13 * in_width], + in_ptr_base[in_offset + 14 * in_width], + in_ptr_base[in_offset + 15 * in_width]}; + float32x4_t vi16 = {in_ptr_base[in_offset + 16 * in_width], + in_ptr_base[in_offset + 17 * in_width]}; + float32x4_t vi1 = vextq_f32(vi0, vi4, 1); + float32x4_t vi2 = vextq_f32(vi0, vi4, 2); + float32x4_t vi3 = vextq_f32(vi0, vi4, 3); + float32x4_t vi5 = vextq_f32(vi4, vi8, 1); + float32x4_t vi6 = vextq_f32(vi4, vi8, 2); + float32x4_t vi7 = vextq_f32(vi4, vi8, 3); + float32x4_t vi9 = vextq_f32(vi8, vi12, 1); + float32x4_t vi10 
= vextq_f32(vi8, vi12, 2); + float32x4_t vi11 = vextq_f32(vi8, vi12, 3); + float32x4_t vi13 = vextq_f32(vi12, vi16, 1); + float32x4_t vi14 = vextq_f32(vi12, vi16, 2); + + vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1); + vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0); + vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1); + + out_ptr_base[out_offset] = vo[0]; + out_ptr_base[out_offset + out_width] = vo[1]; + out_ptr_base[out_offset + 2 * out_width] = vo[2]; + out_ptr_base[out_offset + 3 * out_width] = vo[3]; + } // wt + } // h + } // c + } // w + } // m + } // b + }, 0, batch, 1, 0, out_channels, 1); + UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } diff --git a/mace/ops/arm/fp32/conv_2d_1xn.h b/mace/ops/arm/fp32/conv_2d_1xn.h index a4a5e8995f9ebf5b85c2622684c13e558eb2900f..0bdd66737907627f7dd44e1cb94c24803ea0c8fc 100644 --- a/mace/ops/arm/fp32/conv_2d_1xn.h +++ b/mace/ops/arm/fp32/conv_2d_1xn.h @@ -28,7 +28,7 @@ namespace fp32 { class Conv2dK1x7S1 : public Conv2dBase { public: - Conv2dK1x7S1(const std::vector paddings, const Padding padding_type) + Conv2dK1x7S1(const std::vector &paddings, const Padding padding_type) : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} virtual ~Conv2dK1x7S1() {} @@ -36,12 +36,12 @@ class Conv2dK1x7S1 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; class Conv2dK7x1S1 : public Conv2dBase { public: - Conv2dK7x1S1(const std::vector paddings, const Padding padding_type) + Conv2dK7x1S1(const std::vector &paddings, const Padding padding_type) : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} virtual ~Conv2dK7x1S1() {} @@ -49,12 +49,12 @@ class Conv2dK7x1S1 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; class Conv2dK1x15S1 : public Conv2dBase { public: - Conv2dK1x15S1(const std::vector paddings, const Padding padding_type) + Conv2dK1x15S1(const std::vector &paddings, const Padding padding_type) : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} virtual ~Conv2dK1x15S1() {} @@ -62,12 +62,12 @@ class Conv2dK1x15S1 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; class Conv2dK15x1S1 : public Conv2dBase { public: - Conv2dK15x1S1(const std::vector paddings, const Padding padding_type) + Conv2dK15x1S1(const std::vector &paddings, const Padding padding_type) : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} virtual ~Conv2dK15x1S1() {} @@ -75,7 +75,7 @@ class Conv2dK15x1S1 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; } // namespace fp32 diff --git a/mace/ops/arm/fp32/conv_2d_3x3.cc 
b/mace/ops/arm/fp32/conv_2d_3x3.cc index a8ce5fa64074c08362d0e839a80d111221bc19cb..95c3034138d9ecab67d1aae0ee770ff07ab20788 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3.cc +++ b/mace/ops/arm/fp32/conv_2d_3x3.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "mace/ops/arm/fp32/conv_2d_3x3.h" + #include #include -#include "mace/ops/arm/fp32/conv_2d_3x3.h" namespace mace { namespace ops { @@ -36,11 +37,11 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, &padded_input, &padded_output); const Tensor *in_tensor = input; - if (padded_input.get() != nullptr) { + if (padded_input != nullptr) { in_tensor = padded_input.get(); } Tensor *out_tensor = output; - if (padded_output.get() != nullptr) { + if (padded_output != nullptr) { out_tensor = padded_output.get(); } out_tensor->Clear(); @@ -52,291 +53,41 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, auto input_data = in_tensor->data(); auto output_data = out_tensor->mutable_data(); - auto in_shape = in_tensor->shape(); - auto out_shape = out_tensor->shape(); - - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; m += 2) { - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - if (m + 1 < out_channels) { - float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float - *in_ptr0 = input_data + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter_data + m * in_channels * 9 + c * 9; - - float *out_ptr1 = out_ptr1_base; - const float *in_ptr1 = - input_data + b * in_batch_size + c * in_image_size + 1 * in_width; - const float *in_ptr2 = - input_data + b * in_batch_size + c * in_image_size + 2 * in_width; - const float *in_ptr3 = - input_data + b * in_batch_size + c * in_image_size + 3 * in_width; - const float - *filter_ptr1 = filter_data + (m + 1) * in_channels * 9 + c * 9; - -#if defined(__aarch64__) - float *out_ptr0 = out_ptr0_base; - - // load filter (2 outch x 3 height x 3 width): vf_outch_height - float32x4_t vf00, vf01, vf02; - float32x4_t vf10, vf11, vf12; - vf00 = vld1q_f32(filter_ptr0); - vf01 = vld1q_f32(filter_ptr0 + 3); - vf02 = vld1q_f32(filter_ptr0 + 6); - - vf10 = vld1q_f32(filter_ptr1); - vf11 = vld1q_f32(filter_ptr1 + 3); - vf12 = vld1q_f32(filter_ptr1 + 6); - - for (index_t h = 0; h + 1 < out_height; h += 2) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input (4 height x 3 slide): vi_height_slide - float32x4_t vi00, vi01, vi02; // reg count: 14 - float32x4_t vi10, vi11, vi12; - float32x4_t vi20, vi21, vi22; - float32x4_t vi30, vi31, vi32; - float32x4_t vo20, vo30; // tmp use - - // output (4 outch x 2 height x 4 width): vo_outch_height - float32x4_t vo00, vo01; - float32x4_t vo10, vo11; - - // load input - vi00 = vld1q_f32(in_ptr0); - vo00 = vld1q_f32(in_ptr0 + 4); // reuse vo00: vi0n - vi10 = vld1q_f32(in_ptr1); - vo10 = vld1q_f32(in_ptr1 
+ 4); - vi20 = vld1q_f32(in_ptr2); - vo20 = vld1q_f32(in_ptr2 + 4); - vi30 = vld1q_f32(in_ptr3); - vo30 = vld1q_f32(in_ptr3 + 4); - - vi01 = vextq_f32(vi00, vo00, 1); - vi02 = vextq_f32(vi00, vo00, 2); - vi11 = vextq_f32(vi10, vo10, 1); - vi12 = vextq_f32(vi10, vo10, 2); - vi21 = vextq_f32(vi20, vo20, 1); - vi22 = vextq_f32(vi20, vo20, 2); - vi31 = vextq_f32(vi30, vo30, 1); - vi32 = vextq_f32(vi30, vo30, 2); - - // load ouptut - vo00 = vld1q_f32(out_ptr0); - vo01 = vld1q_f32(out_ptr0 + out_width); - vo10 = vld1q_f32(out_ptr1); - vo11 = vld1q_f32(out_ptr1 + out_width); - - // outch 0, height 0 - vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); // reg count: 18 - vo00 = vfmaq_laneq_f32(vo00, vi01, vf00, 1); - vo00 = vfmaq_laneq_f32(vo00, vi02, vf00, 2); - vo00 = vfmaq_laneq_f32(vo00, vi10, vf01, 0); - vo00 = vfmaq_laneq_f32(vo00, vi11, vf01, 1); - vo00 = vfmaq_laneq_f32(vo00, vi12, vf01, 2); - vo00 = vfmaq_laneq_f32(vo00, vi20, vf02, 0); - vo00 = vfmaq_laneq_f32(vo00, vi21, vf02, 1); - vo00 = vfmaq_laneq_f32(vo00, vi22, vf02, 2); - - // outch 0, height 1 - vo01 = vfmaq_laneq_f32(vo01, vi10, vf00, 0); - vo01 = vfmaq_laneq_f32(vo01, vi11, vf00, 1); - vo01 = vfmaq_laneq_f32(vo01, vi12, vf00, 2); - vo01 = vfmaq_laneq_f32(vo01, vi20, vf01, 0); - vo01 = vfmaq_laneq_f32(vo01, vi21, vf01, 1); - vo01 = vfmaq_laneq_f32(vo01, vi22, vf01, 2); - vo01 = vfmaq_laneq_f32(vo01, vi30, vf02, 0); - vo01 = vfmaq_laneq_f32(vo01, vi31, vf02, 1); - vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 2); - - // outch 1, height 0 - vo10 = vfmaq_laneq_f32(vo10, vi00, vf10, 0); - vo10 = vfmaq_laneq_f32(vo10, vi01, vf10, 1); - vo10 = vfmaq_laneq_f32(vo10, vi02, vf10, 2); - vo10 = vfmaq_laneq_f32(vo10, vi10, vf11, 0); - vo10 = vfmaq_laneq_f32(vo10, vi11, vf11, 1); - vo10 = vfmaq_laneq_f32(vo10, vi12, vf11, 2); - vo10 = vfmaq_laneq_f32(vo10, vi20, vf12, 0); - vo10 = vfmaq_laneq_f32(vo10, vi21, vf12, 1); - vo10 = vfmaq_laneq_f32(vo10, vi22, vf12, 2); - - // outch 1, height 1 - vo11 = vfmaq_laneq_f32(vo11, vi10, vf10, 0); - vo11 = vfmaq_laneq_f32(vo11, vi11, vf10, 1); - vo11 = vfmaq_laneq_f32(vo11, vi12, vf10, 2); - vo11 = vfmaq_laneq_f32(vo11, vi20, vf11, 0); - vo11 = vfmaq_laneq_f32(vo11, vi21, vf11, 1); - vo11 = vfmaq_laneq_f32(vo11, vi22, vf11, 2); - vo11 = vfmaq_laneq_f32(vo11, vi30, vf12, 0); - vo11 = vfmaq_laneq_f32(vo11, vi31, vf12, 1); - vo11 = vfmaq_laneq_f32(vo11, vi32, vf12, 2); - - vst1q_f32(out_ptr0, vo00); - vst1q_f32(out_ptr0 + out_width, vo01); - vst1q_f32(out_ptr1, vo10); - vst1q_f32(out_ptr1 + out_width, vo11); - - in_ptr0 += 4; - in_ptr1 += 4; - in_ptr2 += 4; - in_ptr3 += 4; - - out_ptr0 += 4; - out_ptr1 += 4; - } // w - - in_ptr0 += 2 + in_width; - in_ptr1 += 2 + in_width; - in_ptr2 += 2 + in_width; - in_ptr3 += 2 + in_width; - - out_ptr0 += out_width; - out_ptr1 += out_width; - } // h -#else // arm v7 - float *out_ptr0 = out_ptr0_base; - - // load filter (2 outch x 3 height x 3 width): vf_outch_height - float32x2_t vf001, vf023, vf045, vf067, vf089; - float32x2_t vf101, vf123, vf145, vf167, vf189; - vf001 = vld1_f32(filter_ptr0); - vf023 = vld1_f32(filter_ptr0 + 2); - vf045 = vld1_f32(filter_ptr0 + 4); - vf067 = vld1_f32(filter_ptr0 + 6); - vf089 = vld1_f32(filter_ptr0 + 8); - - vf101 = vld1_f32(filter_ptr1); - vf123 = vld1_f32(filter_ptr1 + 2); - vf145 = vld1_f32(filter_ptr1 + 4); - vf167 = vld1_f32(filter_ptr1 + 6); - vf189 = vld1_f32(filter_ptr1 + 8); - - for (index_t h = 0; h + 1 < out_height; h += 2) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input (4 height x 3 slide): vi_height_slide - 
float32x4_t vi00, vi01, vi02; // reg count: 14 - float32x4_t vi10, vi11, vi12; - float32x4_t vi20, vi21, vi22; - float32x4_t vi30, vi31, vi32; - float32x4_t vo20, vo30; // tmp use - - // output (4 outch x 2 height x 4 width): vo_outch_height - float32x4_t vo00, vo01; - float32x4_t vo10, vo11; - - // load input - vi00 = vld1q_f32(in_ptr0); - vo00 = vld1q_f32(in_ptr0 + 4); // reuse vo00: vi0n - vi10 = vld1q_f32(in_ptr1); - vo10 = vld1q_f32(in_ptr1 + 4); - vi20 = vld1q_f32(in_ptr2); - vo20 = vld1q_f32(in_ptr2 + 4); - vi30 = vld1q_f32(in_ptr3); - vo30 = vld1q_f32(in_ptr3 + 4); - - vi01 = vextq_f32(vi00, vo00, 1); - vi02 = vextq_f32(vi00, vo00, 2); - vi11 = vextq_f32(vi10, vo10, 1); - vi12 = vextq_f32(vi10, vo10, 2); - vi21 = vextq_f32(vi20, vo20, 1); - vi22 = vextq_f32(vi20, vo20, 2); - vi31 = vextq_f32(vi30, vo30, 1); - vi32 = vextq_f32(vi30, vo30, 2); - - // load ouptut - vo00 = vld1q_f32(out_ptr0); - vo01 = vld1q_f32(out_ptr0 + out_width); - vo10 = vld1q_f32(out_ptr1); - vo11 = vld1q_f32(out_ptr1 + out_width); - - // outch 0, height 0 - vo00 = vmlaq_lane_f32(vo00, vi00, vf001, 0); - vo00 = vmlaq_lane_f32(vo00, vi01, vf001, 1); - vo00 = vmlaq_lane_f32(vo00, vi02, vf023, 0); - vo00 = vmlaq_lane_f32(vo00, vi10, vf023, 1); - vo00 = vmlaq_lane_f32(vo00, vi11, vf045, 0); - vo00 = vmlaq_lane_f32(vo00, vi12, vf045, 1); - vo00 = vmlaq_lane_f32(vo00, vi20, vf067, 0); - vo00 = vmlaq_lane_f32(vo00, vi21, vf067, 1); - vo00 = vmlaq_lane_f32(vo00, vi22, vf089, 0); - - // outch 0, height 1 - vo01 = vmlaq_lane_f32(vo01, vi10, vf001, 0); - vo01 = vmlaq_lane_f32(vo01, vi11, vf001, 1); - vo01 = vmlaq_lane_f32(vo01, vi12, vf023, 0); - vo01 = vmlaq_lane_f32(vo01, vi20, vf023, 1); - vo01 = vmlaq_lane_f32(vo01, vi21, vf045, 0); - vo01 = vmlaq_lane_f32(vo01, vi22, vf045, 1); - vo01 = vmlaq_lane_f32(vo01, vi30, vf067, 0); - vo01 = vmlaq_lane_f32(vo01, vi31, vf067, 1); - vo01 = vmlaq_lane_f32(vo01, vi32, vf089, 0); - - // outch 1, height 0 - vo10 = vmlaq_lane_f32(vo10, vi00, vf101, 0); - vo10 = vmlaq_lane_f32(vo10, vi01, vf101, 1); - vo10 = vmlaq_lane_f32(vo10, vi02, vf123, 0); - vo10 = vmlaq_lane_f32(vo10, vi10, vf123, 1); - vo10 = vmlaq_lane_f32(vo10, vi11, vf145, 0); - vo10 = vmlaq_lane_f32(vo10, vi12, vf145, 1); - vo10 = vmlaq_lane_f32(vo10, vi20, vf167, 0); - vo10 = vmlaq_lane_f32(vo10, vi21, vf167, 1); - vo10 = vmlaq_lane_f32(vo10, vi22, vf189, 0); - - // outch 1, height 1 - vo11 = vmlaq_lane_f32(vo11, vi10, vf101, 0); - vo11 = vmlaq_lane_f32(vo11, vi11, vf101, 1); - vo11 = vmlaq_lane_f32(vo11, vi12, vf123, 0); - vo11 = vmlaq_lane_f32(vo11, vi20, vf123, 1); - vo11 = vmlaq_lane_f32(vo11, vi21, vf145, 0); - vo11 = vmlaq_lane_f32(vo11, vi22, vf145, 1); - vo11 = vmlaq_lane_f32(vo11, vi30, vf167, 0); - vo11 = vmlaq_lane_f32(vo11, vi31, vf167, 1); - vo11 = vmlaq_lane_f32(vo11, vi32, vf189, 0); - - vst1q_f32(out_ptr0, vo00); - vst1q_f32(out_ptr0 + out_width, vo01); - vst1q_f32(out_ptr1, vo10); - vst1q_f32(out_ptr1 + out_width, vo11); - - in_ptr0 += 4; - in_ptr1 += 4; - in_ptr2 += 4; - in_ptr3 += 4; - - out_ptr0 += 4; - out_ptr1 += 4; - } // w - - in_ptr0 += 2 + in_width; - in_ptr1 += 2 + in_width; - in_ptr2 += 2 + in_width; - in_ptr3 += 2 + in_width; - - out_ptr0 += out_width; - out_ptr1 += out_width; - } // h -#endif - } // c - } else { - for (index_t mm = m; mm < out_channels; ++mm) { + auto &in_shape = in_tensor->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = 
in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + if (m + 1 < out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = + output_data + b * out_batch_size + (m + 1) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr0 = - input_data + b * in_batch_size + c * in_image_size; + const float + *in_ptr0 = input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + m * in_channels * 9 + c * 9; + + float *out_ptr1 = out_ptr1_base; const float *in_ptr1 = input_data + b * in_batch_size + c * in_image_size + 1 * in_width; @@ -347,61 +98,70 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, input_data + b * in_batch_size + c * in_image_size + 3 * in_width; const float - *filter_ptr0 = filter_data + mm * in_channels * 9 + c * 9; + *filter_ptr1 = filter_data + (m + 1) * in_channels * 9 + c * 9; #if defined(__aarch64__) float *out_ptr0 = out_ptr0_base; - // load filter (1 outch x 3 height x 3 width): vf_outch_height + // load filter (2 outch x 3 height x 3 width): vf_outch_height float32x4_t vf00, vf01, vf02; + float32x4_t vf10, vf11, vf12; vf00 = vld1q_f32(filter_ptr0); vf01 = vld1q_f32(filter_ptr0 + 3); - vf02 = vld1q_f32(filter_ptr0 + 5); + vf02 = vld1q_f32(filter_ptr0 + 6); + + vf10 = vld1q_f32(filter_ptr1); + vf11 = vld1q_f32(filter_ptr1 + 3); + vf12 = vld1q_f32(filter_ptr1 + 6); for (index_t h = 0; h + 1 < out_height; h += 2) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input (4 height x 3 slide): vi_height_slide - float32x4_t vi00, vi01, vi02, vi0n; - float32x4_t vi10, vi11, vi12, vi1n; - float32x4_t vi20, vi21, vi22, vi2n; - float32x4_t vi30, vi31, vi32, vi3n; + float32x4_t vi00, vi01, vi02; // reg count: 14 + float32x4_t vi10, vi11, vi12; + float32x4_t vi20, vi21, vi22; + float32x4_t vi30, vi31, vi32; + float32x4_t vo20, vo30; // tmp use - // output (1 outch x 2 height x 4 width): vo_outch_height + // output (4 outch x 2 height x 4 width): vo_outch_height float32x4_t vo00, vo01; + float32x4_t vo10, vo11; // load input vi00 = vld1q_f32(in_ptr0); - vi0n = vld1q_f32(in_ptr0 + 4); + vo00 = vld1q_f32(in_ptr0 + 4); // reuse vo00: vi0n vi10 = vld1q_f32(in_ptr1); - vi1n = vld1q_f32(in_ptr1 + 4); + vo10 = vld1q_f32(in_ptr1 + 4); vi20 = vld1q_f32(in_ptr2); - vi2n = vld1q_f32(in_ptr2 + 4); + vo20 = vld1q_f32(in_ptr2 + 4); vi30 = vld1q_f32(in_ptr3); - vi3n = vld1q_f32(in_ptr3 + 4); + vo30 = vld1q_f32(in_ptr3 + 4); - vi01 = vextq_f32(vi00, vi0n, 1); - vi02 = vextq_f32(vi00, vi0n, 2); - vi11 = vextq_f32(vi10, vi1n, 1); - vi12 = vextq_f32(vi10, vi1n, 2); - vi21 = vextq_f32(vi20, vi2n, 1); - vi22 = vextq_f32(vi20, vi2n, 2); - vi31 = vextq_f32(vi30, vi3n, 1); - vi32 = vextq_f32(vi30, vi3n, 2); + vi01 = vextq_f32(vi00, vo00, 1); + vi02 = vextq_f32(vi00, vo00, 2); + vi11 = 
vextq_f32(vi10, vo10, 1); + vi12 = vextq_f32(vi10, vo10, 2); + vi21 = vextq_f32(vi20, vo20, 1); + vi22 = vextq_f32(vi20, vo20, 2); + vi31 = vextq_f32(vi30, vo30, 1); + vi32 = vextq_f32(vi30, vo30, 2); // load ouptut vo00 = vld1q_f32(out_ptr0); vo01 = vld1q_f32(out_ptr0 + out_width); + vo10 = vld1q_f32(out_ptr1); + vo11 = vld1q_f32(out_ptr1 + out_width); // outch 0, height 0 - vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); + vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); // reg count: 18 vo00 = vfmaq_laneq_f32(vo00, vi01, vf00, 1); vo00 = vfmaq_laneq_f32(vo00, vi02, vf00, 2); vo00 = vfmaq_laneq_f32(vo00, vi10, vf01, 0); vo00 = vfmaq_laneq_f32(vo00, vi11, vf01, 1); vo00 = vfmaq_laneq_f32(vo00, vi12, vf01, 2); - vo00 = vfmaq_laneq_f32(vo00, vi20, vf02, 1); - vo00 = vfmaq_laneq_f32(vo00, vi21, vf02, 2); - vo00 = vfmaq_laneq_f32(vo00, vi22, vf02, 3); + vo00 = vfmaq_laneq_f32(vo00, vi20, vf02, 0); + vo00 = vfmaq_laneq_f32(vo00, vi21, vf02, 1); + vo00 = vfmaq_laneq_f32(vo00, vi22, vf02, 2); // outch 0, height 1 vo01 = vfmaq_laneq_f32(vo01, vi10, vf00, 0); @@ -410,12 +170,36 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, vo01 = vfmaq_laneq_f32(vo01, vi20, vf01, 0); vo01 = vfmaq_laneq_f32(vo01, vi21, vf01, 1); vo01 = vfmaq_laneq_f32(vo01, vi22, vf01, 2); - vo01 = vfmaq_laneq_f32(vo01, vi30, vf02, 1); - vo01 = vfmaq_laneq_f32(vo01, vi31, vf02, 2); - vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 3); + vo01 = vfmaq_laneq_f32(vo01, vi30, vf02, 0); + vo01 = vfmaq_laneq_f32(vo01, vi31, vf02, 1); + vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 2); + + // outch 1, height 0 + vo10 = vfmaq_laneq_f32(vo10, vi00, vf10, 0); + vo10 = vfmaq_laneq_f32(vo10, vi01, vf10, 1); + vo10 = vfmaq_laneq_f32(vo10, vi02, vf10, 2); + vo10 = vfmaq_laneq_f32(vo10, vi10, vf11, 0); + vo10 = vfmaq_laneq_f32(vo10, vi11, vf11, 1); + vo10 = vfmaq_laneq_f32(vo10, vi12, vf11, 2); + vo10 = vfmaq_laneq_f32(vo10, vi20, vf12, 0); + vo10 = vfmaq_laneq_f32(vo10, vi21, vf12, 1); + vo10 = vfmaq_laneq_f32(vo10, vi22, vf12, 2); + + // outch 1, height 1 + vo11 = vfmaq_laneq_f32(vo11, vi10, vf10, 0); + vo11 = vfmaq_laneq_f32(vo11, vi11, vf10, 1); + vo11 = vfmaq_laneq_f32(vo11, vi12, vf10, 2); + vo11 = vfmaq_laneq_f32(vo11, vi20, vf11, 0); + vo11 = vfmaq_laneq_f32(vo11, vi21, vf11, 1); + vo11 = vfmaq_laneq_f32(vo11, vi22, vf11, 2); + vo11 = vfmaq_laneq_f32(vo11, vi30, vf12, 0); + vo11 = vfmaq_laneq_f32(vo11, vi31, vf12, 1); + vo11 = vfmaq_laneq_f32(vo11, vi32, vf12, 2); vst1q_f32(out_ptr0, vo00); vst1q_f32(out_ptr0 + out_width, vo01); + vst1q_f32(out_ptr1, vo10); + vst1q_f32(out_ptr1 + out_width, vo11); in_ptr0 += 4; in_ptr1 += 4; @@ -423,6 +207,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, in_ptr3 += 4; out_ptr0 += 4; + out_ptr1 += 4; } // w in_ptr0 += 2 + in_width; @@ -431,76 +216,112 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, in_ptr3 += 2 + in_width; out_ptr0 += out_width; - } // h + out_ptr1 += out_width; + } // h #else // arm v7 float *out_ptr0 = out_ptr0_base; - // load filter (1 outch x 3 height x 3 width): vf_outch_height - float32x2_t vf01, vf23, vf45, vf67, vf78; - vf01 = vld1_f32(filter_ptr0); - vf23 = vld1_f32(filter_ptr0 + 2); - vf45 = vld1_f32(filter_ptr0 + 4); - vf67 = vld1_f32(filter_ptr0 + 6); - vf78 = vld1_f32(filter_ptr0 + 7); + // load filter (2 outch x 3 height x 3 width): vf_outch_height + float32x2_t vf001, vf023, vf045, vf067, vf089; + float32x2_t vf101, vf123, vf145, vf167, vf189; + vf001 = vld1_f32(filter_ptr0); + vf023 = vld1_f32(filter_ptr0 + 2); + vf045 = vld1_f32(filter_ptr0 + 4); + 
vf067 = vld1_f32(filter_ptr0 + 6); + vf089 = vld1_f32(filter_ptr0 + 8); + + vf101 = vld1_f32(filter_ptr1); + vf123 = vld1_f32(filter_ptr1 + 2); + vf145 = vld1_f32(filter_ptr1 + 4); + vf167 = vld1_f32(filter_ptr1 + 6); + vf189 = vld1_f32(filter_ptr1 + 8); for (index_t h = 0; h + 1 < out_height; h += 2) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input (4 height x 3 slide): vi_height_slide - float32x4_t vi00, vi01, vi02, vi0n; - float32x4_t vi10, vi11, vi12, vi1n; - float32x4_t vi20, vi21, vi22, vi2n; - float32x4_t vi30, vi31, vi32, vi3n; + float32x4_t vi00, vi01, vi02; // reg count: 14 + float32x4_t vi10, vi11, vi12; + float32x4_t vi20, vi21, vi22; + float32x4_t vi30, vi31, vi32; + float32x4_t vo20, vo30; // tmp use - // output (1 outch x 2 height x 4 width): vo_outch_height + // output (4 outch x 2 height x 4 width): vo_outch_height float32x4_t vo00, vo01; + float32x4_t vo10, vo11; // load input vi00 = vld1q_f32(in_ptr0); - vi0n = vld1q_f32(in_ptr0 + 4); + vo00 = vld1q_f32(in_ptr0 + 4); // reuse vo00: vi0n vi10 = vld1q_f32(in_ptr1); - vi1n = vld1q_f32(in_ptr1 + 4); + vo10 = vld1q_f32(in_ptr1 + 4); vi20 = vld1q_f32(in_ptr2); - vi2n = vld1q_f32(in_ptr2 + 4); + vo20 = vld1q_f32(in_ptr2 + 4); vi30 = vld1q_f32(in_ptr3); - vi3n = vld1q_f32(in_ptr3 + 4); + vo30 = vld1q_f32(in_ptr3 + 4); - vi01 = vextq_f32(vi00, vi0n, 1); - vi02 = vextq_f32(vi00, vi0n, 2); - vi11 = vextq_f32(vi10, vi1n, 1); - vi12 = vextq_f32(vi10, vi1n, 2); - vi21 = vextq_f32(vi20, vi2n, 1); - vi22 = vextq_f32(vi20, vi2n, 2); - vi31 = vextq_f32(vi30, vi3n, 1); - vi32 = vextq_f32(vi30, vi3n, 2); + vi01 = vextq_f32(vi00, vo00, 1); + vi02 = vextq_f32(vi00, vo00, 2); + vi11 = vextq_f32(vi10, vo10, 1); + vi12 = vextq_f32(vi10, vo10, 2); + vi21 = vextq_f32(vi20, vo20, 1); + vi22 = vextq_f32(vi20, vo20, 2); + vi31 = vextq_f32(vi30, vo30, 1); + vi32 = vextq_f32(vi30, vo30, 2); // load ouptut vo00 = vld1q_f32(out_ptr0); vo01 = vld1q_f32(out_ptr0 + out_width); + vo10 = vld1q_f32(out_ptr1); + vo11 = vld1q_f32(out_ptr1 + out_width); // outch 0, height 0 - vo00 = vmlaq_lane_f32(vo00, vi00, vf01, 0); - vo00 = vmlaq_lane_f32(vo00, vi01, vf01, 1); - vo00 = vmlaq_lane_f32(vo00, vi02, vf23, 0); - vo00 = vmlaq_lane_f32(vo00, vi10, vf23, 1); - vo00 = vmlaq_lane_f32(vo00, vi11, vf45, 0); - vo00 = vmlaq_lane_f32(vo00, vi12, vf45, 1); - vo00 = vmlaq_lane_f32(vo00, vi20, vf67, 0); - vo00 = vmlaq_lane_f32(vo00, vi21, vf67, 1); - vo00 = vmlaq_lane_f32(vo00, vi22, vf78, 1); + vo00 = vmlaq_lane_f32(vo00, vi00, vf001, 0); + vo00 = vmlaq_lane_f32(vo00, vi01, vf001, 1); + vo00 = vmlaq_lane_f32(vo00, vi02, vf023, 0); + vo00 = vmlaq_lane_f32(vo00, vi10, vf023, 1); + vo00 = vmlaq_lane_f32(vo00, vi11, vf045, 0); + vo00 = vmlaq_lane_f32(vo00, vi12, vf045, 1); + vo00 = vmlaq_lane_f32(vo00, vi20, vf067, 0); + vo00 = vmlaq_lane_f32(vo00, vi21, vf067, 1); + vo00 = vmlaq_lane_f32(vo00, vi22, vf089, 0); // outch 0, height 1 - vo01 = vmlaq_lane_f32(vo01, vi10, vf01, 0); - vo01 = vmlaq_lane_f32(vo01, vi11, vf01, 1); - vo01 = vmlaq_lane_f32(vo01, vi12, vf23, 0); - vo01 = vmlaq_lane_f32(vo01, vi20, vf23, 1); - vo01 = vmlaq_lane_f32(vo01, vi21, vf45, 0); - vo01 = vmlaq_lane_f32(vo01, vi22, vf45, 1); - vo01 = vmlaq_lane_f32(vo01, vi30, vf67, 0); - vo01 = vmlaq_lane_f32(vo01, vi31, vf67, 1); - vo01 = vmlaq_lane_f32(vo01, vi32, vf78, 1); + vo01 = vmlaq_lane_f32(vo01, vi10, vf001, 0); + vo01 = vmlaq_lane_f32(vo01, vi11, vf001, 1); + vo01 = vmlaq_lane_f32(vo01, vi12, vf023, 0); + vo01 = vmlaq_lane_f32(vo01, vi20, vf023, 1); + vo01 = vmlaq_lane_f32(vo01, vi21, vf045, 0); 
+ vo01 = vmlaq_lane_f32(vo01, vi22, vf045, 1); + vo01 = vmlaq_lane_f32(vo01, vi30, vf067, 0); + vo01 = vmlaq_lane_f32(vo01, vi31, vf067, 1); + vo01 = vmlaq_lane_f32(vo01, vi32, vf089, 0); + + // outch 1, height 0 + vo10 = vmlaq_lane_f32(vo10, vi00, vf101, 0); + vo10 = vmlaq_lane_f32(vo10, vi01, vf101, 1); + vo10 = vmlaq_lane_f32(vo10, vi02, vf123, 0); + vo10 = vmlaq_lane_f32(vo10, vi10, vf123, 1); + vo10 = vmlaq_lane_f32(vo10, vi11, vf145, 0); + vo10 = vmlaq_lane_f32(vo10, vi12, vf145, 1); + vo10 = vmlaq_lane_f32(vo10, vi20, vf167, 0); + vo10 = vmlaq_lane_f32(vo10, vi21, vf167, 1); + vo10 = vmlaq_lane_f32(vo10, vi22, vf189, 0); + + // outch 1, height 1 + vo11 = vmlaq_lane_f32(vo11, vi10, vf101, 0); + vo11 = vmlaq_lane_f32(vo11, vi11, vf101, 1); + vo11 = vmlaq_lane_f32(vo11, vi12, vf123, 0); + vo11 = vmlaq_lane_f32(vo11, vi20, vf123, 1); + vo11 = vmlaq_lane_f32(vo11, vi21, vf145, 0); + vo11 = vmlaq_lane_f32(vo11, vi22, vf145, 1); + vo11 = vmlaq_lane_f32(vo11, vi30, vf167, 0); + vo11 = vmlaq_lane_f32(vo11, vi31, vf167, 1); + vo11 = vmlaq_lane_f32(vo11, vi32, vf189, 0); vst1q_f32(out_ptr0, vo00); vst1q_f32(out_ptr0 + out_width, vo01); + vst1q_f32(out_ptr1, vo10); + vst1q_f32(out_ptr1 + out_width, vo11); in_ptr0 += 4; in_ptr1 += 4; @@ -508,6 +329,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, in_ptr3 += 4; out_ptr0 += 4; + out_ptr1 += 4; } // w in_ptr0 += 2 + in_width; @@ -516,13 +338,204 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, in_ptr3 += 2 + in_width; out_ptr0 += out_width; + out_ptr1 += out_width; } // h #endif } // c - } // mm - } // if - } // m - } // b + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr0 = + input_data + b * in_batch_size + c * in_image_size; + const float *in_ptr1 = + input_data + b * in_batch_size + c * in_image_size + + 1 * in_width; + const float *in_ptr2 = + input_data + b * in_batch_size + c * in_image_size + + 2 * in_width; + const float *in_ptr3 = + input_data + b * in_batch_size + c * in_image_size + + 3 * in_width; + const float + *filter_ptr0 = filter_data + mm * in_channels * 9 + c * 9; + +#if defined(__aarch64__) + float *out_ptr0 = out_ptr0_base; + + // load filter (1 outch x 3 height x 3 width): vf_outch_height + float32x4_t vf00, vf01, vf02; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + vf02 = vld1q_f32(filter_ptr0 + 5); + + for (index_t h = 0; h + 1 < out_height; h += 2) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input (4 height x 3 slide): vi_height_slide + float32x4_t vi00, vi01, vi02, vi0n; + float32x4_t vi10, vi11, vi12, vi1n; + float32x4_t vi20, vi21, vi22, vi2n; + float32x4_t vi30, vi31, vi32, vi3n; + + // output (1 outch x 2 height x 4 width): vo_outch_height + float32x4_t vo00, vo01; + + // load input + vi00 = vld1q_f32(in_ptr0); + vi0n = vld1q_f32(in_ptr0 + 4); + vi10 = vld1q_f32(in_ptr1); + vi1n = vld1q_f32(in_ptr1 + 4); + vi20 = vld1q_f32(in_ptr2); + vi2n = vld1q_f32(in_ptr2 + 4); + vi30 = vld1q_f32(in_ptr3); + vi3n = vld1q_f32(in_ptr3 + 4); + + vi01 = vextq_f32(vi00, vi0n, 1); + vi02 = vextq_f32(vi00, vi0n, 2); + vi11 = vextq_f32(vi10, vi1n, 1); + vi12 = vextq_f32(vi10, vi1n, 2); + vi21 = vextq_f32(vi20, vi2n, 1); + vi22 = vextq_f32(vi20, vi2n, 2); + vi31 = vextq_f32(vi30, vi3n, 1); + vi32 = vextq_f32(vi30, vi3n, 2); + + // load ouptut + vo00 = vld1q_f32(out_ptr0); + vo01 = vld1q_f32(out_ptr0 + out_width); + 
+ // outch 0, height 0 + vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); + vo00 = vfmaq_laneq_f32(vo00, vi01, vf00, 1); + vo00 = vfmaq_laneq_f32(vo00, vi02, vf00, 2); + vo00 = vfmaq_laneq_f32(vo00, vi10, vf01, 0); + vo00 = vfmaq_laneq_f32(vo00, vi11, vf01, 1); + vo00 = vfmaq_laneq_f32(vo00, vi12, vf01, 2); + vo00 = vfmaq_laneq_f32(vo00, vi20, vf02, 1); + vo00 = vfmaq_laneq_f32(vo00, vi21, vf02, 2); + vo00 = vfmaq_laneq_f32(vo00, vi22, vf02, 3); + + // outch 0, height 1 + vo01 = vfmaq_laneq_f32(vo01, vi10, vf00, 0); + vo01 = vfmaq_laneq_f32(vo01, vi11, vf00, 1); + vo01 = vfmaq_laneq_f32(vo01, vi12, vf00, 2); + vo01 = vfmaq_laneq_f32(vo01, vi20, vf01, 0); + vo01 = vfmaq_laneq_f32(vo01, vi21, vf01, 1); + vo01 = vfmaq_laneq_f32(vo01, vi22, vf01, 2); + vo01 = vfmaq_laneq_f32(vo01, vi30, vf02, 1); + vo01 = vfmaq_laneq_f32(vo01, vi31, vf02, 2); + vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 3); + + vst1q_f32(out_ptr0, vo00); + vst1q_f32(out_ptr0 + out_width, vo01); + + in_ptr0 += 4; + in_ptr1 += 4; + in_ptr2 += 4; + in_ptr3 += 4; + + out_ptr0 += 4; + } // w + + in_ptr0 += 2 + in_width; + in_ptr1 += 2 + in_width; + in_ptr2 += 2 + in_width; + in_ptr3 += 2 + in_width; + + out_ptr0 += out_width; + } // h +#else // arm v7 + float *out_ptr0 = out_ptr0_base; + + // load filter (1 outch x 3 height x 3 width): vf_outch_height + float32x2_t vf01, vf23, vf45, vf67, vf78; + vf01 = vld1_f32(filter_ptr0); + vf23 = vld1_f32(filter_ptr0 + 2); + vf45 = vld1_f32(filter_ptr0 + 4); + vf67 = vld1_f32(filter_ptr0 + 6); + vf78 = vld1_f32(filter_ptr0 + 7); + + for (index_t h = 0; h + 1 < out_height; h += 2) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input (4 height x 3 slide): vi_height_slide + float32x4_t vi00, vi01, vi02, vi0n; + float32x4_t vi10, vi11, vi12, vi1n; + float32x4_t vi20, vi21, vi22, vi2n; + float32x4_t vi30, vi31, vi32, vi3n; + + // output (1 outch x 2 height x 4 width): vo_outch_height + float32x4_t vo00, vo01; + + // load input + vi00 = vld1q_f32(in_ptr0); + vi0n = vld1q_f32(in_ptr0 + 4); + vi10 = vld1q_f32(in_ptr1); + vi1n = vld1q_f32(in_ptr1 + 4); + vi20 = vld1q_f32(in_ptr2); + vi2n = vld1q_f32(in_ptr2 + 4); + vi30 = vld1q_f32(in_ptr3); + vi3n = vld1q_f32(in_ptr3 + 4); + + vi01 = vextq_f32(vi00, vi0n, 1); + vi02 = vextq_f32(vi00, vi0n, 2); + vi11 = vextq_f32(vi10, vi1n, 1); + vi12 = vextq_f32(vi10, vi1n, 2); + vi21 = vextq_f32(vi20, vi2n, 1); + vi22 = vextq_f32(vi20, vi2n, 2); + vi31 = vextq_f32(vi30, vi3n, 1); + vi32 = vextq_f32(vi30, vi3n, 2); + + // load ouptut + vo00 = vld1q_f32(out_ptr0); + vo01 = vld1q_f32(out_ptr0 + out_width); + + // outch 0, height 0 + vo00 = vmlaq_lane_f32(vo00, vi00, vf01, 0); + vo00 = vmlaq_lane_f32(vo00, vi01, vf01, 1); + vo00 = vmlaq_lane_f32(vo00, vi02, vf23, 0); + vo00 = vmlaq_lane_f32(vo00, vi10, vf23, 1); + vo00 = vmlaq_lane_f32(vo00, vi11, vf45, 0); + vo00 = vmlaq_lane_f32(vo00, vi12, vf45, 1); + vo00 = vmlaq_lane_f32(vo00, vi20, vf67, 0); + vo00 = vmlaq_lane_f32(vo00, vi21, vf67, 1); + vo00 = vmlaq_lane_f32(vo00, vi22, vf78, 1); + + // outch 0, height 1 + vo01 = vmlaq_lane_f32(vo01, vi10, vf01, 0); + vo01 = vmlaq_lane_f32(vo01, vi11, vf01, 1); + vo01 = vmlaq_lane_f32(vo01, vi12, vf23, 0); + vo01 = vmlaq_lane_f32(vo01, vi20, vf23, 1); + vo01 = vmlaq_lane_f32(vo01, vi21, vf45, 0); + vo01 = vmlaq_lane_f32(vo01, vi22, vf45, 1); + vo01 = vmlaq_lane_f32(vo01, vi30, vf67, 0); + vo01 = vmlaq_lane_f32(vo01, vi31, vf67, 1); + vo01 = vmlaq_lane_f32(vo01, vi32, vf78, 1); + + vst1q_f32(out_ptr0, vo00); + vst1q_f32(out_ptr0 + out_width, vo01); + + in_ptr0 += 4; + 
in_ptr1 += 4; + in_ptr2 += 4; + in_ptr3 += 4; + + out_ptr0 += 4; + } // w + + in_ptr0 += 2 + in_width; + in_ptr1 += 2 + in_width; + in_ptr2 += 2 + in_width; + in_ptr3 += 2 + in_width; + + out_ptr0 += out_width; + } // h +#endif + } // c + } // mm + } // if + } // m + } // b + }, 0, batch, 1, 0, out_channels, 2); UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; @@ -544,11 +557,11 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, &padded_input, &padded_output); const Tensor *in_tensor = input; - if (padded_input.get() != nullptr) { + if (padded_input != nullptr) { in_tensor = padded_input.get(); } Tensor *out_tensor = output; - if (padded_output.get() != nullptr) { + if (padded_output != nullptr) { out_tensor = padded_output.get(); } out_tensor->Clear(); @@ -560,153 +573,163 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, auto input_data = in_tensor->data(); auto output_data = out_tensor->mutable_data(); - auto in_shape = in_tensor->shape(); - auto out_shape = out_tensor->shape(); - - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; ++m) { - for (index_t c = 0; c < in_shape[1]; ++c) { - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const float - *in_base = input_data + b * in_batch_size + c * in_image_size; - const float *filter_ptr = filter_data + m * in_channels * 9 + c * 9; - float *out_base = output_data + b * out_batch_size + m * out_image_size; + auto &in_shape = in_tensor->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + for (index_t c = 0; c < in_channels; ++c) { + const float + *in_base = input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr = filter_data + m * in_channels * 9 + c * 9; + float + *out_base = output_data + b * out_batch_size + m * out_image_size; #if defined(__aarch64__) - // load filter (1 outch x 3 height x 3 width): vf_outch_height - float32x4_t vf00, vf01, vf02; - vf00 = vld1q_f32(filter_ptr); - vf01 = vld1q_f32(filter_ptr + 3); - vf02 = vld1q_f32(filter_ptr + 5); - - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - float32x4x2_t vi0, vi1, vi2; - float32x4_t vi0n, vi1n, vi2n; - - // input (3 height x 3 slide): vi_height_slide - float32x4_t vi00, vi01, vi02; - 
float32x4_t vi10, vi11, vi12; - float32x4_t vi20, vi21, vi22; - - // output (1 outch x 1 height x 4 width): vo - float32x4_t vo; - - // load input - index_t in_h = h * 2; - index_t in_w = w * 2; - index_t in_offset = in_h * in_width + in_w; - vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] - vi1 = vld2q_f32(in_base + in_offset + in_width); - vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); - - vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] - vi1n = vld1q_f32(in_base + in_offset + in_width + 8); - vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); - - // load ouptut - index_t out_offset = h * out_width + w; - vo = vld1q_f32(out_base + out_offset); - - vi00 = vi0.val[0]; // [0.2.4.6] - vi01 = vi0.val[1]; // [1.3.5.7] - vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8] - vi10 = vi1.val[0]; - vi11 = vi1.val[1]; - vi12 = vextq_f32(vi10, vi1n, 1); - vi20 = vi2.val[0]; - vi21 = vi2.val[1]; - vi22 = vextq_f32(vi20, vi2n, 1); - - // outch 0, height 0 - vo = vfmaq_laneq_f32(vo, vi00, vf00, 0); - vo = vfmaq_laneq_f32(vo, vi01, vf00, 1); - vo = vfmaq_laneq_f32(vo, vi02, vf00, 2); - vo = vfmaq_laneq_f32(vo, vi10, vf01, 0); - vo = vfmaq_laneq_f32(vo, vi11, vf01, 1); - vo = vfmaq_laneq_f32(vo, vi12, vf01, 2); - vo = vfmaq_laneq_f32(vo, vi20, vf02, 1); - vo = vfmaq_laneq_f32(vo, vi21, vf02, 2); - vo = vfmaq_laneq_f32(vo, vi22, vf02, 3); - - vst1q_f32(out_base + out_offset, vo); - } // w - } // h + // load filter (1 outch x 3 height x 3 width): vf_outch_height + float32x4_t vf00, vf01, vf02; + vf00 = vld1q_f32(filter_ptr); + vf01 = vld1q_f32(filter_ptr + 3); + vf02 = vld1q_f32(filter_ptr + 5); + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + float32x4x2_t vi0, vi1, vi2; + float32x4_t vi0n, vi1n, vi2n; + + // input (3 height x 3 slide): vi_height_slide + float32x4_t vi00, vi01, vi02; + float32x4_t vi10, vi11, vi12; + float32x4_t vi20, vi21, vi22; + + // output (1 outch x 1 height x 4 width): vo + float32x4_t vo; + + // load input + index_t in_h = h * 2; + index_t in_w = w * 2; + index_t in_offset = in_h * in_width + in_w; + vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] + vi1 = vld2q_f32(in_base + in_offset + in_width); + vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); + + vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] + vi1n = vld1q_f32(in_base + in_offset + in_width + 8); + vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); + + // load ouptut + index_t out_offset = h * out_width + w; + vo = vld1q_f32(out_base + out_offset); + + vi00 = vi0.val[0]; // [0.2.4.6] + vi01 = vi0.val[1]; // [1.3.5.7] + vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8] + vi10 = vi1.val[0]; + vi11 = vi1.val[1]; + vi12 = vextq_f32(vi10, vi1n, 1); + vi20 = vi2.val[0]; + vi21 = vi2.val[1]; + vi22 = vextq_f32(vi20, vi2n, 1); + + // outch 0, height 0 + vo = vfmaq_laneq_f32(vo, vi00, vf00, 0); + vo = vfmaq_laneq_f32(vo, vi01, vf00, 1); + vo = vfmaq_laneq_f32(vo, vi02, vf00, 2); + vo = vfmaq_laneq_f32(vo, vi10, vf01, 0); + vo = vfmaq_laneq_f32(vo, vi11, vf01, 1); + vo = vfmaq_laneq_f32(vo, vi12, vf01, 2); + vo = vfmaq_laneq_f32(vo, vi20, vf02, 1); + vo = vfmaq_laneq_f32(vo, vi21, vf02, 2); + vo = vfmaq_laneq_f32(vo, vi22, vf02, 3); + + vst1q_f32(out_base + out_offset, vo); + } // w + } // h #else // arm v7 - // load filter (1 outch x 3 height x 3 width): vf_outch_height - float32x2_t vf01, vf23, vf45, vf67, vf78; - vf01 = vld1_f32(filter_ptr); - vf23 = vld1_f32(filter_ptr + 2); - vf45 = vld1_f32(filter_ptr + 4); - vf67 = 
vld1_f32(filter_ptr + 6); - vf78 = vld1_f32(filter_ptr + 7); - - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - float32x4x2_t vi0, vi1, vi2; - float32x4_t vi0n, vi1n, vi2n; - - // input (3 height x 3 slide): vi_height_slide - float32x4_t vi00, vi01, vi02; - float32x4_t vi10, vi11, vi12; - float32x4_t vi20, vi21, vi22; - - // output (1 outch x 1 height x 4 width): vo - float32x4_t vo; - - // load input - index_t in_h = h * 2; - index_t in_w = w * 2; - index_t in_offset = in_h * in_width + in_w; - vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] - vi1 = vld2q_f32(in_base + in_offset + in_width); - vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); - - vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] - vi1n = vld1q_f32(in_base + in_offset + in_width + 8); - vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); - - // load ouptut - index_t out_offset = h * out_width + w; - vo = vld1q_f32(out_base + out_offset); - - vi00 = vi0.val[0]; // [0.2.4.6] - vi01 = vi0.val[1]; // [1.3.5.7] - vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8] - vi10 = vi1.val[0]; - vi11 = vi1.val[1]; - vi12 = vextq_f32(vi10, vi1n, 1); - vi20 = vi2.val[0]; - vi21 = vi2.val[1]; - vi22 = vextq_f32(vi20, vi2n, 1); - - // outch 0, height 0 - vo = vmlaq_lane_f32(vo, vi00, vf01, 0); - vo = vmlaq_lane_f32(vo, vi01, vf01, 1); - vo = vmlaq_lane_f32(vo, vi02, vf23, 0); - vo = vmlaq_lane_f32(vo, vi10, vf23, 1); - vo = vmlaq_lane_f32(vo, vi11, vf45, 0); - vo = vmlaq_lane_f32(vo, vi12, vf45, 1); - vo = vmlaq_lane_f32(vo, vi20, vf67, 0); - vo = vmlaq_lane_f32(vo, vi21, vf67, 1); - vo = vmlaq_lane_f32(vo, vi22, vf78, 1); - - vst1q_f32(out_base + out_offset, vo); - } // w - } // h + // load filter (1 outch x 3 height x 3 width): vf_outch_height + float32x2_t vf01, vf23, vf45, vf67, vf78; + vf01 = vld1_f32(filter_ptr); + vf23 = vld1_f32(filter_ptr + 2); + vf45 = vld1_f32(filter_ptr + 4); + vf67 = vld1_f32(filter_ptr + 6); + vf78 = vld1_f32(filter_ptr + 7); + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + float32x4x2_t vi0, vi1, vi2; + float32x4_t vi0n, vi1n, vi2n; + + // input (3 height x 3 slide): vi_height_slide + float32x4_t vi00, vi01, vi02; + float32x4_t vi10, vi11, vi12; + float32x4_t vi20, vi21, vi22; + + // output (1 outch x 1 height x 4 width): vo + float32x4_t vo; + + // load input + index_t in_h = h * 2; + index_t in_w = w * 2; + index_t in_offset = in_h * in_width + in_w; + vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] + vi1 = vld2q_f32(in_base + in_offset + in_width); + vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); + + vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] + vi1n = vld1q_f32(in_base + in_offset + in_width + 8); + vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); + + // load ouptut + index_t out_offset = h * out_width + w; + vo = vld1q_f32(out_base + out_offset); + + vi00 = vi0.val[0]; // [0.2.4.6] + vi01 = vi0.val[1]; // [1.3.5.7] + vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8] + vi10 = vi1.val[0]; + vi11 = vi1.val[1]; + vi12 = vextq_f32(vi10, vi1n, 1); + vi20 = vi2.val[0]; + vi21 = vi2.val[1]; + vi22 = vextq_f32(vi20, vi2n, 1); + + // outch 0, height 0 + vo = vmlaq_lane_f32(vo, vi00, vf01, 0); + vo = vmlaq_lane_f32(vo, vi01, vf01, 1); + vo = vmlaq_lane_f32(vo, vi02, vf23, 0); + vo = vmlaq_lane_f32(vo, vi10, vf23, 1); + vo = vmlaq_lane_f32(vo, vi11, vf45, 0); + vo = vmlaq_lane_f32(vo, vi12, vf45, 1); + vo = vmlaq_lane_f32(vo, vi20, vf67, 0); + vo = 
vmlaq_lane_f32(vo, vi21, vf67, 1); + vo = vmlaq_lane_f32(vo, vi22, vf78, 1); + + vst1q_f32(out_base + out_offset, vo); + } // w + } // h #endif - } // c - } // m - } // b + } // c + } // m + } // b + }, 0, batch, 1, 0, out_channels, 1); UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; diff --git a/mace/ops/arm/fp32/conv_2d_3x3.h b/mace/ops/arm/fp32/conv_2d_3x3.h index 66d47801c39fee076ca0fd0bddff806a8e30c127..bd96501d98f32ebe9ffe0bad98cccee67bc0b062 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3.h +++ b/mace/ops/arm/fp32/conv_2d_3x3.h @@ -28,7 +28,7 @@ namespace fp32 { class Conv2dK3x3S1 : public Conv2dBase { public: - Conv2dK3x3S1(const std::vector paddings, const Padding padding_type) + Conv2dK3x3S1(const std::vector &paddings, const Padding padding_type) : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} virtual ~Conv2dK3x3S1() {} @@ -36,12 +36,12 @@ class Conv2dK3x3S1 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; class Conv2dK3x3S2 : public Conv2dBase { public: - Conv2dK3x3S2(const std::vector paddings, const Padding padding_type) + Conv2dK3x3S2(const std::vector &paddings, const Padding padding_type) : Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {} virtual ~Conv2dK3x3S2() {} @@ -49,7 +49,7 @@ class Conv2dK3x3S2 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; } // namespace fp32 diff --git a/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc b/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc index b894a60a964ff9b149abc5d93852f76a658b9b94..ab2517bf6295691de4ba00fd22d9e651e1e13fee 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc +++ b/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
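(Editorial sketch, not part of the patch.) The hunks above and below replace "#pragma omp parallel for collapse(2)" loops with utils::ThreadPool::Compute2D, and the calling convention can be read off the call sites: a functor taking (start, end, step) for each of two dimensions, followed by the two iteration ranges and their steps, where the second step matches the kernel's output-channel blocking (1, 2 or 4). The stand-in below only illustrates that convention under those assumptions; ToyThreadPool and Example are hypothetical names, and a real pool would split the ranges across worker threads instead of running the functor once serially.

#include <cstdint>
#include <functional>

using index_t = int64_t;

struct ToyThreadPool {
  // Serial stand-in: a real thread pool would partition [start0, end0) and
  // [start1, end1) into per-thread sub-ranges and call the functor once per
  // sub-range; here it is invoked a single time with the full ranges.
  void Compute2D(const std::function<void(index_t, index_t, index_t,
                                          index_t, index_t, index_t)> &func,
                 index_t start0, index_t end0, index_t step0,
                 index_t start1, index_t end1, index_t step1) {
    func(start0, end0, step0, start1, end1, step1);
  }
};

void Example(ToyThreadPool *pool, index_t batch, index_t out_channels) {
  pool->Compute2D([=](index_t start0, index_t end0, index_t step0,
                      index_t start1, index_t end1, index_t step1) {
    for (index_t b = start0; b < end0; b += step0) {
      for (index_t m = start1; m < end1; m += step1) {
        // per-(batch, output-channel-block) kernel body, i.e. the loops that
        // previously sat under the OpenMP pragma
      }
    }
  }, 0, batch, 1, 0, out_channels, 4);  // step1 == the kernel's channel block
}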
+#include "mace/ops/arm/fp32/conv_2d_3x3_winograd.h" + #include -#include "mace/ops/arm/fp32/conv_2d_3x3_winograd.h" #include "mace/ops/common/conv_pool_2d_util.h" #include "mace/utils/memory.h" #include "mace/utils/math.h" @@ -136,13 +137,15 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context, auto transformed_filter_data = transformed_filter_->mutable_data(); switch (out_tile_size) { case 2: - TransformFilter4x4(filter_data, + TransformFilter4x4(context, + filter_data, in_channels, out_channels, transformed_filter_data); break; case 6: - TransformFilter8x8(filter_data, + TransformFilter8x8(context, + filter_data, in_channels, out_channels, transformed_filter_data); @@ -153,7 +156,8 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context, switch (out_tile_size) { case 2: - TransformInput4x4(padded_in_data, + TransformInput4x4(context, + padded_in_data, batch, padded_in_height, padded_in_width, @@ -162,7 +166,8 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context, transformed_in_data); break; case 6: - TransformInput8x8(padded_in_data, + TransformInput8x8(context, + padded_in_data, batch, padded_in_height, padded_in_width, @@ -212,7 +217,8 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context, switch (out_tile_size) { case 2: - TransformOutput4x4(transformed_out_data, + TransformOutput4x4(context, + transformed_out_data, batch, padded_out_height, padded_out_width, @@ -221,7 +227,8 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context, padded_out_data); break; case 6: - TransformOutput8x8(transformed_out_data, + TransformOutput8x8(context, + transformed_out_data, batch, padded_out_height, padded_out_width, @@ -238,72 +245,78 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context, } // OCHW => TOC -void Conv2dK3x3Winograd::TransformFilter4x4(const float *filter, +void Conv2dK3x3Winograd::TransformFilter4x4(const OpContext *context, + const float *filter, const index_t in_channels, const index_t out_channels, float *output) { const index_t stride = out_channels * in_channels; -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t m = 0; m < out_channels; ++m) { - for (index_t c = 0; c < in_channels; ++c) { - float g0, g1, g2, g3, g4, g5, g6, g7, g8; - float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, - s15; - - // load filter - index_t filter_offset = (m * in_channels + c) * 9; - g0 = filter[filter_offset]; - g1 = filter[filter_offset + 1]; - g2 = filter[filter_offset + 2]; - g3 = filter[filter_offset + 3]; - g4 = filter[filter_offset + 4]; - g5 = filter[filter_offset + 5]; - g6 = filter[filter_offset + 6]; - g7 = filter[filter_offset + 7]; - g8 = filter[filter_offset + 8]; - - // s = G * g * GT - s0 = g0; - s1 = (g0 + g2 + g1) * 0.5f; - s2 = (g0 + g2 - g1) * 0.5f; - s3 = g2; - s4 = (g0 + g6 + g3) * 0.5f; - s5 = ((g0 + g6 + g3) + (g2 + g8 + g5) + (g1 + g7 + g4)) * 0.25f; - s6 = ((g0 + g6 + g3) + (g2 + g8 + g5) - (g1 + g7 + g4)) * 0.25f; - s7 = (g2 + g8 + g5) * 0.5f; - s8 = (g0 + g6 - g3) * 0.5f; - s9 = ((g0 + g6 - g3) + (g2 + g8 - g5) + (g1 + g7 - g4)) * 0.25f; - s10 = ((g0 + g6 - g3) + (g2 + g8 - g5) - (g1 + g7 - g4)) * 0.25f; - s11 = (g2 + g8 - g5) * 0.5f; - s12 = g6; - s13 = (g6 + g8 + g7) * 0.5f; - s14 = (g6 + g8 - g7) * 0.5f; - s15 = g8; - - // store output - index_t output_offset = m * in_channels + c; - output[output_offset + 0 * stride] = s0; - output[output_offset + 1 * stride] = s1; - output[output_offset + 2 * stride] = s2; - output[output_offset + 3 * stride] = s3; - 
- output[output_offset + 4 * stride] = s4; - output[output_offset + 5 * stride] = s5; - output[output_offset + 6 * stride] = s6; - output[output_offset + 7 * stride] = s7; - - output[output_offset + 8 * stride] = s8; - output[output_offset + 9 * stride] = s9; - output[output_offset + 10 * stride] = s10; - output[output_offset + 11 * stride] = s11; - - output[output_offset + 12 * stride] = s12; - output[output_offset + 13 * stride] = s13; - output[output_offset + 14 * stride] = s14; - output[output_offset + 15 * stride] = s15; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t m = start0; m < end0; m += step0) { + for (index_t c = start1; c < end1; c += step1) { + float g0, g1, g2, g3, g4, g5, g6, g7, g8; + float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, + s15; + + // load filter + index_t filter_offset = (m * in_channels + c) * 9; + g0 = filter[filter_offset]; + g1 = filter[filter_offset + 1]; + g2 = filter[filter_offset + 2]; + g3 = filter[filter_offset + 3]; + g4 = filter[filter_offset + 4]; + g5 = filter[filter_offset + 5]; + g6 = filter[filter_offset + 6]; + g7 = filter[filter_offset + 7]; + g8 = filter[filter_offset + 8]; + + // s = G * g * GT + s0 = g0; + s1 = (g0 + g2 + g1) * 0.5f; + s2 = (g0 + g2 - g1) * 0.5f; + s3 = g2; + s4 = (g0 + g6 + g3) * 0.5f; + s5 = ((g0 + g6 + g3) + (g2 + g8 + g5) + (g1 + g7 + g4)) * 0.25f; + s6 = ((g0 + g6 + g3) + (g2 + g8 + g5) - (g1 + g7 + g4)) * 0.25f; + s7 = (g2 + g8 + g5) * 0.5f; + s8 = (g0 + g6 - g3) * 0.5f; + s9 = ((g0 + g6 - g3) + (g2 + g8 - g5) + (g1 + g7 - g4)) * 0.25f; + s10 = ((g0 + g6 - g3) + (g2 + g8 - g5) - (g1 + g7 - g4)) * 0.25f; + s11 = (g2 + g8 - g5) * 0.5f; + s12 = g6; + s13 = (g6 + g8 + g7) * 0.5f; + s14 = (g6 + g8 - g7) * 0.5f; + s15 = g8; + + // store output + index_t output_offset = m * in_channels + c; + output[output_offset + 0 * stride] = s0; + output[output_offset + 1 * stride] = s1; + output[output_offset + 2 * stride] = s2; + output[output_offset + 3 * stride] = s3; + + output[output_offset + 4 * stride] = s4; + output[output_offset + 5 * stride] = s5; + output[output_offset + 6 * stride] = s6; + output[output_offset + 7 * stride] = s7; + + output[output_offset + 8 * stride] = s8; + output[output_offset + 9 * stride] = s9; + output[output_offset + 10 * stride] = s10; + output[output_offset + 11 * stride] = s11; + + output[output_offset + 12 * stride] = s12; + output[output_offset + 13 * stride] = s13; + output[output_offset + 14 * stride] = s14; + output[output_offset + 15 * stride] = s15; + } } - } + }, 0, out_channels, 1, 0, in_channels, 1); } // OCHW => TOC @@ -325,7 +338,8 @@ void Conv2dK3x3Winograd::TransformFilter4x4(const float *filter, ⎢ ⎥ ⎣ 0 0 1 ⎦ */ -void Conv2dK3x3Winograd::TransformFilter8x8(const float *filter, +void Conv2dK3x3Winograd::TransformFilter8x8(const OpContext *context, + const float *filter, const index_t in_channels, const index_t out_channels, float *output) { @@ -340,43 +354,49 @@ void Conv2dK3x3Winograd::TransformFilter8x8(const float *filter, {1.0f / 45, -1.0f / 90, 1.0f / 180}, {0.0f, 0.0f, 1.0f}}; -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t m = 0; m < out_channels; ++m) { - for (index_t c = 0; c < in_channels; ++c) { - // load filter - index_t filter_offset = (m * in_channels + c) * 9; - float g0, g1, g2, g3, g4, g5, g6, g7, g8; - g0 = filter[filter_offset]; - g1 = 
filter[filter_offset + 1]; - g2 = filter[filter_offset + 2]; - g3 = filter[filter_offset + 3]; - g4 = filter[filter_offset + 4]; - g5 = filter[filter_offset + 5]; - g6 = filter[filter_offset + 6]; - g7 = filter[filter_offset + 7]; - g8 = filter[filter_offset + 8]; - - float s[3][8]; - for (int i = 0; i < 8; ++i) { - s[0][i] = g0 * G[i][0] + g1 * G[i][1] + g2 * G[i][2]; - s[1][i] = g3 * G[i][0] + g4 * G[i][1] + g5 * G[i][2]; - s[2][i] = g6 * G[i][0] + g7 * G[i][1] + g8 * G[i][2]; - } + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t m = start0; m < end0; m += step0) { + for (index_t c = start1; c < end1; c += step1) { + // load filter + index_t filter_offset = (m * in_channels + c) * 9; + float g0, g1, g2, g3, g4, g5, g6, g7, g8; + g0 = filter[filter_offset]; + g1 = filter[filter_offset + 1]; + g2 = filter[filter_offset + 2]; + g3 = filter[filter_offset + 3]; + g4 = filter[filter_offset + 4]; + g5 = filter[filter_offset + 5]; + g6 = filter[filter_offset + 6]; + g7 = filter[filter_offset + 7]; + g8 = filter[filter_offset + 8]; + + float s[3][8]; + for (int i = 0; i < 8; ++i) { + s[0][i] = g0 * G[i][0] + g1 * G[i][1] + g2 * G[i][2]; + s[1][i] = g3 * G[i][0] + g4 * G[i][1] + g5 * G[i][2]; + s[2][i] = g6 * G[i][0] + g7 * G[i][1] + g8 * G[i][2]; + } - // store output - index_t output_offset = m * in_channels + c; - for (int i = 0; i < 8; ++i) { - for (int j = 0; j < 8; ++j) { - output[output_offset + (i * 8 + j) * stride] = - G[i][0] * s[0][j] + G[i][1] * s[1][j] + G[i][2] * s[2][j]; + // store output + index_t output_offset = m * in_channels + c; + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 8; ++j) { + output[output_offset + (i * 8 + j) * stride] = + G[i][0] * s[0][j] + G[i][1] * s[1][j] + G[i][2] * s[2][j]; + } } } } - } + }, 0, out_channels, 1, 0, in_channels, 1); } // NCHW => NTCB (T: in tile pixels, B: tile indices) -void Conv2dK3x3Winograd::TransformInput4x4(const float *input, +void Conv2dK3x3Winograd::TransformInput4x4(const OpContext *context, + const float *input, const index_t batch, const index_t in_height, const index_t in_width, @@ -388,86 +408,93 @@ void Conv2dK3x3Winograd::TransformInput4x4(const float *input, const index_t input_batch_size = in_height_width * in_channels; const index_t output_batch_size = 16 * in_channels * tile_count; -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t n = 0; n < batch; ++n) { - for (index_t c = 0; c < in_channels; ++c) { - index_t tile_index = 0; - for (index_t h = 0; h < in_height - 2; h += 2) { - for (index_t w = 0; w < in_width - 2; w += 2) { - float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, - d15; - float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, - s15; - - // load tile data - const float *input_ptr = input + n * input_batch_size + - c * in_height_width + h * in_width + w; - d0 = input_ptr[0]; - d1 = input_ptr[1]; - d2 = input_ptr[2]; - d3 = input_ptr[3]; - - d4 = input_ptr[in_width]; - d5 = input_ptr[in_width + 1]; - d6 = input_ptr[in_width + 2]; - d7 = input_ptr[in_width + 3]; - - d8 = input_ptr[2 * in_width]; - d9 = input_ptr[2 * in_width + 1]; - d10 = input_ptr[2 * in_width + 2]; - d11 = input_ptr[2 * in_width + 3]; - - d12 = input_ptr[3 * in_width]; - d13 = input_ptr[3 * in_width + 1]; - d14 = input_ptr[3 * in_width + 2]; - d15 = input_ptr[3 * in_width + 3]; - - // s = BT * d * B - s0 = (d0 
- d8) - (d2 - d10); - s1 = (d1 - d9) + (d2 - d10); - s2 = (d2 - d10) - (d1 - d9); - s3 = (d1 - d9) - (d3 - d11); - s4 = (d4 + d8) - (d6 + d10); - s5 = (d5 + d9) + (d6 + d10); - s6 = (d6 + d10) - (d5 + d9); - s7 = (d5 + d9) - (d7 + d11); - s8 = (d8 - d4) - (d10 - d6); - s9 = (d9 - d5) + (d10 - d6); - s10 = (d10 - d6) - (d9 - d5); - s11 = (d9 - d5) - (d11 - d7); - s12 = (d4 - d12) - (d6 - d14); - s13 = (d5 - d13) + (d6 - d14); - s14 = (d6 - d14) - (d5 - d13); - s15 = (d5 - d13) - (d7 - d15); - - // store output - float *output_ptr = - output + n * output_batch_size + c * tile_count + tile_index; - output_ptr[0] = s0; - output_ptr[1 * stride] = s1; - output_ptr[2 * stride] = s2; - output_ptr[3 * stride] = s3; - - output_ptr[4 * stride] = s4; - output_ptr[5 * stride] = s5; - output_ptr[6 * stride] = s6; - output_ptr[7 * stride] = s7; - - output_ptr[8 * stride] = s8; - output_ptr[9 * stride] = s9; - output_ptr[10 * stride] = s10; - output_ptr[11 * stride] = s11; - - output_ptr[12 * stride] = s12; - output_ptr[13 * stride] = s13; - output_ptr[14 * stride] = s14; - output_ptr[15 * stride] = s15; - - ++tile_index; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t n = start0; n < end0; n += step0) { + for (index_t c = start1; c < end1; c += step1) { + index_t tile_index = 0; + for (index_t h = 0; h < in_height - 2; h += 2) { + for (index_t w = 0; w < in_width - 2; w += 2) { + float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, + d14, + d15; + float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, + s14, + s15; + + // load tile data + const float *input_ptr = input + n * input_batch_size + + c * in_height_width + h * in_width + w; + d0 = input_ptr[0]; + d1 = input_ptr[1]; + d2 = input_ptr[2]; + d3 = input_ptr[3]; + + d4 = input_ptr[in_width]; + d5 = input_ptr[in_width + 1]; + d6 = input_ptr[in_width + 2]; + d7 = input_ptr[in_width + 3]; + + d8 = input_ptr[2 * in_width]; + d9 = input_ptr[2 * in_width + 1]; + d10 = input_ptr[2 * in_width + 2]; + d11 = input_ptr[2 * in_width + 3]; + + d12 = input_ptr[3 * in_width]; + d13 = input_ptr[3 * in_width + 1]; + d14 = input_ptr[3 * in_width + 2]; + d15 = input_ptr[3 * in_width + 3]; + + // s = BT * d * B + s0 = (d0 - d8) - (d2 - d10); + s1 = (d1 - d9) + (d2 - d10); + s2 = (d2 - d10) - (d1 - d9); + s3 = (d1 - d9) - (d3 - d11); + s4 = (d4 + d8) - (d6 + d10); + s5 = (d5 + d9) + (d6 + d10); + s6 = (d6 + d10) - (d5 + d9); + s7 = (d5 + d9) - (d7 + d11); + s8 = (d8 - d4) - (d10 - d6); + s9 = (d9 - d5) + (d10 - d6); + s10 = (d10 - d6) - (d9 - d5); + s11 = (d9 - d5) - (d11 - d7); + s12 = (d4 - d12) - (d6 - d14); + s13 = (d5 - d13) + (d6 - d14); + s14 = (d6 - d14) - (d5 - d13); + s15 = (d5 - d13) - (d7 - d15); + + // store output + float *output_ptr = + output + n * output_batch_size + c * tile_count + tile_index; + output_ptr[0] = s0; + output_ptr[1 * stride] = s1; + output_ptr[2 * stride] = s2; + output_ptr[3 * stride] = s3; + + output_ptr[4 * stride] = s4; + output_ptr[5 * stride] = s5; + output_ptr[6 * stride] = s6; + output_ptr[7 * stride] = s7; + + output_ptr[8 * stride] = s8; + output_ptr[9 * stride] = s9; + output_ptr[10 * stride] = s10; + output_ptr[11 * stride] = s11; + + output_ptr[12 * stride] = s12; + output_ptr[13 * stride] = s13; + output_ptr[14 * stride] = s14; + output_ptr[15 * stride] = s15; + + ++tile_index; + } } } } - } + }, 0, batch, 1, 0, in_channels, 
1); } // NCHW => NTCB (T: in tile pixels, B: tile indices) @@ -489,7 +516,8 @@ void Conv2dK3x3Winograd::TransformInput4x4(const float *input, ⎢ ⎥ ⎣0 -1 0 21/4 0 -21/4 0 1⎦ */ -void Conv2dK3x3Winograd::TransformInput8x8(const float *input, +void Conv2dK3x3Winograd::TransformInput8x8(const OpContext *context, + const float *input, const index_t batch, const index_t in_height, const index_t in_width, @@ -501,89 +529,94 @@ void Conv2dK3x3Winograd::TransformInput8x8(const float *input, const index_t input_batch_size = in_height_width * in_channels; const index_t output_batch_size = 64 * in_channels * tile_count; -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t n = 0; n < batch; ++n) { - for (index_t c = 0; c < in_channels; ++c) { - index_t tile_index = 0; - float s[8][8]; - for (index_t h = 0; h < in_height - 2; h += 6) { - for (index_t w = 0; w < in_width - 2; w += 6) { - const float *input_ptr = input + n * input_batch_size + - c * in_height_width + h * in_width + w; - - for (int i = 0; i < 8; ++i) { - float d0, d1, d2, d3, d4, d5, d6, d7; - d0 = input_ptr[0]; - d1 = input_ptr[1]; - d2 = input_ptr[2]; - d3 = input_ptr[3]; - d4 = input_ptr[4]; - d5 = input_ptr[5]; - d6 = input_ptr[6]; - d7 = input_ptr[7]; - - s[i][0] = d0 - d6 + (d4 - d2) * 5.25; - s[i][7] = d7 - d1 + (d3 - d5) * 5.25; - - float u = d2 + d6 - d4 * 4.25; - float v = d1 + d5 - d3 * 4.25; - s[i][1] = u + v; - s[i][2] = u - v; - - u = d6 + d2 * 0.25 - d4 * 1.25; - v = d1 * 0.5 - d3 * 2.5 + d5 * 2; - s[i][3] = u + v; - s[i][4] = u - v; - - u = d6 + (d2 - d4 * 1.25) * 4; - v = d1 * 2 - d3 * 2.5 + d5 * 0.5; - s[i][5] = u + v; - s[i][6] = u - v; - - input_ptr += in_width; - } - - float *output_ptr = - output + n * output_batch_size + c * tile_count + tile_index; - for (int i = 0; i < 8; ++i) { - float d0, d1, d2, d3, d4, d5, d6, d7; - d0 = s[0][i]; - d1 = s[1][i]; - d2 = s[2][i]; - d3 = s[3][i]; - d4 = s[4][i]; - d5 = s[5][i]; - d6 = s[6][i]; - d7 = s[7][i]; - - output_ptr[i * stride] = d0 - d6 + (d4 - d2) * 5.25; - output_ptr[(56 + i) * stride] = d7 - d1 + (d3 - d5) * 5.25; - - float u = d2 + d6 - d4 * 4.25; - float v = d1 + d5 - d3 * 4.25; - output_ptr[(8 + i) * stride] = u + v; - output_ptr[(16 + i) * stride] = u - v; - - u = d6 + d2 * 0.25 - d4 * 1.25; - v = d1 * 0.5 - d3 * 2.5 + d5 * 2; - output_ptr[(24 + i) * stride] = u + v; - output_ptr[(32 + i) * stride] = u - v; - - u = d6 + (d2 - d4 * 1.25) * 4; - v = d1 * 2 - d3 * 2.5 + d5 * 0.5; - output_ptr[(40 + i) * stride] = u + v; - output_ptr[(48 + i) * stride] = u - v; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t n = start0; n < end0; n += step0) { + for (index_t c = start1; c < end1; c += step1) { + index_t tile_index = 0; + float s[8][8]; + for (index_t h = 0; h < in_height - 2; h += 6) { + for (index_t w = 0; w < in_width - 2; w += 6) { + const float *input_ptr = input + n * input_batch_size + + c * in_height_width + h * in_width + w; + + for (int i = 0; i < 8; ++i) { + float d0, d1, d2, d3, d4, d5, d6, d7; + d0 = input_ptr[0]; + d1 = input_ptr[1]; + d2 = input_ptr[2]; + d3 = input_ptr[3]; + d4 = input_ptr[4]; + d5 = input_ptr[5]; + d6 = input_ptr[6]; + d7 = input_ptr[7]; + + s[i][0] = d0 - d6 + (d4 - d2) * 5.25; + s[i][7] = d7 - d1 + (d3 - d5) * 5.25; + + float u = d2 + d6 - d4 * 4.25; + float v = d1 + d5 - d3 * 4.25; + s[i][1] = u + v; + s[i][2] = u - v; + + u = d6 + d2 * 0.25 - 
d4 * 1.25; + v = d1 * 0.5 - d3 * 2.5 + d5 * 2; + s[i][3] = u + v; + s[i][4] = u - v; + + u = d6 + (d2 - d4 * 1.25) * 4; + v = d1 * 2 - d3 * 2.5 + d5 * 0.5; + s[i][5] = u + v; + s[i][6] = u - v; + + input_ptr += in_width; + } + + float *output_ptr = + output + n * output_batch_size + c * tile_count + tile_index; + for (int i = 0; i < 8; ++i) { + float d0, d1, d2, d3, d4, d5, d6, d7; + d0 = s[0][i]; + d1 = s[1][i]; + d2 = s[2][i]; + d3 = s[3][i]; + d4 = s[4][i]; + d5 = s[5][i]; + d6 = s[6][i]; + d7 = s[7][i]; + + output_ptr[i * stride] = d0 - d6 + (d4 - d2) * 5.25; + output_ptr[(56 + i) * stride] = d7 - d1 + (d3 - d5) * 5.25; + + float u = d2 + d6 - d4 * 4.25; + float v = d1 + d5 - d3 * 4.25; + output_ptr[(8 + i) * stride] = u + v; + output_ptr[(16 + i) * stride] = u - v; + + u = d6 + d2 * 0.25 - d4 * 1.25; + v = d1 * 0.5 - d3 * 2.5 + d5 * 2; + output_ptr[(24 + i) * stride] = u + v; + output_ptr[(32 + i) * stride] = u - v; + + u = d6 + (d2 - d4 * 1.25) * 4; + v = d1 * 2 - d3 * 2.5 + d5 * 0.5; + output_ptr[(40 + i) * stride] = u + v; + output_ptr[(48 + i) * stride] = u - v; + } + + ++tile_index; } - - ++tile_index; } } } - } + }, 0, batch, 1, 0, in_channels, 1); } // NTOB => NToOB => NOHoWo -void Conv2dK3x3Winograd::TransformOutput4x4(const float *input, +void Conv2dK3x3Winograd::TransformOutput4x4(const OpContext *context, + const float *input, index_t batch, index_t out_height, index_t out_width, @@ -595,65 +628,70 @@ void Conv2dK3x3Winograd::TransformOutput4x4(const float *input, const index_t out_image_size = out_height * out_width; const index_t output_batch_size = out_channels * out_image_size; -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t n = 0; n < batch; ++n) { - for (index_t m = 0; m < out_channels; ++m) { - index_t tile_offset = 0; - for (index_t h = 0; h < out_height; h += 2) { - for (index_t w = 0; w < out_width; w += 2) { - float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, - d15; - float s0, s1, s2, s3, s4, s5, s6, s7; - float v0, v1, v2, v3; - - const float *input_ptr = - input + n * input_batch_size + m * tile_count + tile_offset; - d0 = input_ptr[0]; - d1 = input_ptr[1 * stride]; - d2 = input_ptr[2 * stride]; - d3 = input_ptr[3 * stride]; - - d4 = input_ptr[4 * stride]; - d5 = input_ptr[5 * stride]; - d6 = input_ptr[6 * stride]; - d7 = input_ptr[7 * stride]; - - d8 = input_ptr[8 * stride]; - d9 = input_ptr[9 * stride]; - d10 = input_ptr[10 * stride]; - d11 = input_ptr[11 * stride]; - - d12 = input_ptr[12 * stride]; - d13 = input_ptr[13 * stride]; - d14 = input_ptr[14 * stride]; - d15 = input_ptr[15 * stride]; - - s0 = d0 + d1 + d2; - s1 = d1 - d2 - d3; - s2 = d4 + d5 + d6; - s3 = d5 - d6 - d7; - s4 = d8 + d9 + d10; - s5 = d9 - d10 - d11; - s6 = d12 + d13 + d14; - s7 = d13 - d14 - d15; - - v0 = s0 + s2 + s4; - v1 = s1 + s3 + s5; - v2 = s2 - s4 - s6; - v3 = s3 - s5 - s7; - - float *output_ptr = output + n * output_batch_size + - m * out_image_size + h * out_width + w; - output_ptr[0] = v0; - output_ptr[1] = v1; - output_ptr[out_width] = v2; - output_ptr[out_width + 1] = v3; - - ++tile_offset; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t n = start0; n < end0; n += step0) { + for (index_t m = start1; m < end1; m += step1) { + index_t tile_offset = 0; + for (index_t h = 0; h < out_height; h += 2) { + for (index_t w = 0; w < out_width; w += 2) { + float d0, d1, 
d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, + d14, + d15; + float s0, s1, s2, s3, s4, s5, s6, s7; + float v0, v1, v2, v3; + + const float *input_ptr = + input + n * input_batch_size + m * tile_count + tile_offset; + d0 = input_ptr[0]; + d1 = input_ptr[1 * stride]; + d2 = input_ptr[2 * stride]; + d3 = input_ptr[3 * stride]; + + d4 = input_ptr[4 * stride]; + d5 = input_ptr[5 * stride]; + d6 = input_ptr[6 * stride]; + d7 = input_ptr[7 * stride]; + + d8 = input_ptr[8 * stride]; + d9 = input_ptr[9 * stride]; + d10 = input_ptr[10 * stride]; + d11 = input_ptr[11 * stride]; + + d12 = input_ptr[12 * stride]; + d13 = input_ptr[13 * stride]; + d14 = input_ptr[14 * stride]; + d15 = input_ptr[15 * stride]; + + s0 = d0 + d1 + d2; + s1 = d1 - d2 - d3; + s2 = d4 + d5 + d6; + s3 = d5 - d6 - d7; + s4 = d8 + d9 + d10; + s5 = d9 - d10 - d11; + s6 = d12 + d13 + d14; + s7 = d13 - d14 - d15; + + v0 = s0 + s2 + s4; + v1 = s1 + s3 + s5; + v2 = s2 - s4 - s6; + v3 = s3 - s5 - s7; + + float *output_ptr = output + n * output_batch_size + + m * out_image_size + h * out_width + w; + output_ptr[0] = v0; + output_ptr[1] = v1; + output_ptr[out_width] = v2; + output_ptr[out_width + 1] = v3; + + ++tile_offset; + } } } } - } + }, 0, batch, 1, 0, out_channels, 1); } // NTOB => NToOB => NOHoWo @@ -671,7 +709,8 @@ void Conv2dK3x3Winograd::TransformOutput4x4(const float *input, ⎢ ⎥ ⎣0 1 -1 32 -32 1 -1 1⎦ */ -void Conv2dK3x3Winograd::TransformOutput8x8(const float *input, +void Conv2dK3x3Winograd::TransformOutput8x8(const OpContext *context, + const float *input, index_t batch, index_t out_height, index_t out_width, @@ -683,78 +722,82 @@ void Conv2dK3x3Winograd::TransformOutput8x8(const float *input, const index_t out_image_size = out_height * out_width; const index_t output_batch_size = out_channels * out_image_size; -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t n = 0; n < batch; ++n) { - for (index_t m = 0; m < out_channels; ++m) { - index_t tile_offset = 0; - float s[8][6]; - for (index_t h = 0; h < out_height; h += 6) { - for (index_t w = 0; w < out_width; w += 6) { - const float *input_ptr = - input + n * input_batch_size + m * tile_count + tile_offset; - for (int i = 0; i < 8; ++i) { - float d0, d1, d2, d3, d4, d5, d6, d7; - - d0 = input_ptr[0]; - d1 = input_ptr[1 * stride]; - d2 = input_ptr[2 * stride]; - d3 = input_ptr[3 * stride]; - d4 = input_ptr[4 * stride]; - d5 = input_ptr[5 * stride]; - d6 = input_ptr[6 * stride]; - d7 = input_ptr[7 * stride]; - - float u = d1 + d2; - float v = d1 - d2; - float w = d3 + d4; - float x = d3 - d4; - float y = d5 + d6; - float z = d5 - d6; - - s[i][0] = d0 + u + w + y * 32; - s[i][1] = v + x + x + z * 16; - s[i][2] = u + w * 4 + y * 8; - s[i][3] = v + x * 8 + z * 4; - s[i][4] = u + w * 16 + y + y; - s[i][5] = v + x * 32 + z + d7; - - input_ptr += 8 * stride; - } - - float *output_ptr = output + n * output_batch_size + - m * out_image_size + h * out_width + w; - - for (int i = 0; i < 6; ++i) { - float d0, d1, d2, d3, d4, d5, d6, d7; - d0 = s[0][i]; - d1 = s[1][i]; - d2 = s[2][i]; - d3 = s[3][i]; - d4 = s[4][i]; - d5 = s[5][i]; - d6 = s[6][i]; - d7 = s[7][i]; - - float u = d1 + d2; - float v = d1 - d2; - float w = d3 + d4; - float x = d3 - d4; - float y = d5 + d6; - float z = d5 - d6; - - output_ptr[i] = d0 + u + w + y * 32; - output_ptr[1 * out_width + i] = v + x + x + z * 16; - output_ptr[2 * out_width + i] = u + w * 4 + y * 8; - output_ptr[3 * out_width + i] = v + x * 8 + z * 4; - output_ptr[4 * out_width + i] = u + w * 16 + y + y; - output_ptr[5 * 
out_width + i] = v + x * 32 + z + d7; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t n = start0; n < end0; n += step0) { + for (index_t m = start1; m < end1; m += step1) { + index_t tile_offset = 0; + float s[8][6]; + for (index_t h = 0; h < out_height; h += 6) { + for (index_t w = 0; w < out_width; w += 6) { + const float *input_ptr = + input + n * input_batch_size + m * tile_count + tile_offset; + for (int i = 0; i < 8; ++i) { + float d0, d1, d2, d3, d4, d5, d6, d7; + + d0 = input_ptr[0]; + d1 = input_ptr[1 * stride]; + d2 = input_ptr[2 * stride]; + d3 = input_ptr[3 * stride]; + d4 = input_ptr[4 * stride]; + d5 = input_ptr[5 * stride]; + d6 = input_ptr[6 * stride]; + d7 = input_ptr[7 * stride]; + + float u = d1 + d2; + float v = d1 - d2; + float w = d3 + d4; + float x = d3 - d4; + float y = d5 + d6; + float z = d5 - d6; + + s[i][0] = d0 + u + w + y * 32; + s[i][1] = v + x + x + z * 16; + s[i][2] = u + w * 4 + y * 8; + s[i][3] = v + x * 8 + z * 4; + s[i][4] = u + w * 16 + y + y; + s[i][5] = v + x * 32 + z + d7; + + input_ptr += 8 * stride; + } + + float *output_ptr = output + n * output_batch_size + + m * out_image_size + h * out_width + w; + + for (int i = 0; i < 6; ++i) { + float d0, d1, d2, d3, d4, d5, d6, d7; + d0 = s[0][i]; + d1 = s[1][i]; + d2 = s[2][i]; + d3 = s[3][i]; + d4 = s[4][i]; + d5 = s[5][i]; + d6 = s[6][i]; + d7 = s[7][i]; + + float u = d1 + d2; + float v = d1 - d2; + float w = d3 + d4; + float x = d3 - d4; + float y = d5 + d6; + float z = d5 - d6; + + output_ptr[i] = d0 + u + w + y * 32; + output_ptr[1 * out_width + i] = v + x + x + z * 16; + output_ptr[2 * out_width + i] = u + w * 4 + y * 8; + output_ptr[3 * out_width + i] = v + x * 8 + z * 4; + output_ptr[4 * out_width + i] = u + w * 16 + y + y; + output_ptr[5 * out_width + i] = v + x * 32 + z + d7; + } + + ++tile_offset; } - - ++tile_offset; } } } - } + }, 0, batch, 1, 0, out_channels, 1); } } // namespace fp32 diff --git a/mace/ops/arm/fp32/conv_2d_3x3_winograd.h b/mace/ops/arm/fp32/conv_2d_3x3_winograd.h index 3ed8646b17c12424a884611ac22698c6d3a9bf05..53118a6aea3b2d8d3a75b08fa5d0b0f84ef69203 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3_winograd.h +++ b/mace/ops/arm/fp32/conv_2d_3x3_winograd.h @@ -31,7 +31,7 @@ namespace fp32 { class Conv2dK3x3Winograd : public Conv2dBase { public: - Conv2dK3x3Winograd(const std::vector paddings, + Conv2dK3x3Winograd(const std::vector &paddings, const Padding padding_type) : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type), gemm_(), @@ -44,20 +44,23 @@ class Conv2dK3x3Winograd : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; private: - void TransformFilter4x4(const float *filter, + void TransformFilter4x4(const OpContext *context, + const float *filter, const index_t in_channels, const index_t out_channels, float *output); - void TransformFilter8x8(const float *filter, + void TransformFilter8x8(const OpContext *context, + const float *filter, const index_t in_channels, const index_t out_channels, float *output); - void TransformInput4x4(const float *input, + void TransformInput4x4(const OpContext *context, + const float *input, const index_t batch, const index_t in_height, const index_t in_width, @@ -65,7 +68,8 @@ class Conv2dK3x3Winograd : public Conv2dBase { const index_t tile_count, float *output); - void 
TransformInput8x8(const float *input, + void TransformInput8x8(const OpContext *context, + const float *input, const index_t batch, const index_t in_height, const index_t in_width, @@ -73,7 +77,8 @@ class Conv2dK3x3Winograd : public Conv2dBase { const index_t tile_count, float *output); - void TransformOutput4x4(const float *input, + void TransformOutput4x4(const OpContext *context, + const float *input, index_t batch, index_t out_height, index_t out_width, @@ -81,7 +86,8 @@ class Conv2dK3x3Winograd : public Conv2dBase { index_t tile_count, float *output); - void TransformOutput8x8(const float *input, + void TransformOutput8x8(const OpContext *context, + const float *input, index_t batch, index_t out_height, index_t out_width, diff --git a/mace/ops/arm/fp32/conv_2d_5x5.cc b/mace/ops/arm/fp32/conv_2d_5x5.cc index 264e48fa13f91756c47fae6f5b9db9ed7f2cc57c..1b41ec7ccd87a14e5683e1f84bc6f967e159b5b3 100644 --- a/mace/ops/arm/fp32/conv_2d_5x5.cc +++ b/mace/ops/arm/fp32/conv_2d_5x5.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "mace/ops/arm/fp32/conv_2d_5x5.h" + #include #include -#include "mace/ops/arm/fp32/conv_2d_5x5.h" namespace mace { namespace ops { @@ -91,11 +92,11 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, &padded_input, &padded_output); const Tensor *in_tensor = input; - if (padded_input.get() != nullptr) { + if (padded_input != nullptr) { in_tensor = padded_input.get(); } Tensor *out_tensor = output; - if (padded_output.get() != nullptr) { + if (padded_output != nullptr) { out_tensor = padded_output.get(); } out_tensor->Clear(); @@ -107,104 +108,62 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, auto input_data = in_tensor->data(); auto output_data = out_tensor->mutable_data(); - auto in_shape = in_tensor->shape(); - auto out_shape = out_tensor->shape(); - - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; m += 4) { - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - if (m + 3 < out_channels) { - float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; - float *out_ptr2_base = - output_data + b * out_batch_size + (m + 2) * out_image_size; - float *out_ptr3_base = - output_data + b * out_batch_size + (m + 3) * out_image_size; - - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr0 = filter_data + m * in_channels * 25 + c * 25; - const float *filter_ptr1 = - filter_data + (m + 1) * in_channels * 25 + c * 25; - const float *filter_ptr2 = - filter_data + (m + 2) * in_channels * 25 + c * 25; - const float *filter_ptr3 = - filter_data + (m + 3) * in_channels * 25 + c * 25; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input offset - index_t in_offset = h * in_width + w; - // output (4 outch x 1 height x 4 width): 
vo_outch_height - float32x4_t vo0, vo1, vo2, vo3; - // load output - index_t out_offset = h * out_width + w; - vo0 = vld1q_f32(out_ptr0_base + out_offset); - vo1 = vld1q_f32(out_ptr1_base + out_offset); - vo2 = vld1q_f32(out_ptr2_base + out_offset); - vo3 = vld1q_f32(out_ptr3_base + out_offset); - for (index_t r = 0; r < 5; ++r) { - // input (3 slide) - float32x4_t vi0, vi1, vi2, vi3, vi4; - // load input - vi0 = vld1q_f32(in_ptr_base + in_offset); - vi4 = vld1q_f32(in_ptr_base + in_offset + 4); - vi1 = vextq_f32(vi0, vi4, 1); - vi2 = vextq_f32(vi0, vi4, 2); - vi3 = vextq_f32(vi0, vi4, 3); - - MACE_Conv2dNeonK5x5SnLoadCalc4; - - in_offset += in_width; - filter_ptr0 += 5; - filter_ptr1 += 5; - filter_ptr2 += 5; - filter_ptr3 += 5; - } // r - - vst1q_f32(out_ptr0_base + out_offset, vo0); - vst1q_f32(out_ptr1_base + out_offset, vo1); - vst1q_f32(out_ptr2_base + out_offset, vo2); - vst1q_f32(out_ptr3_base + out_offset, vo3); - - filter_ptr0 -= 25; - filter_ptr1 -= 25; - filter_ptr2 -= 25; - filter_ptr3 -= 25; - } // w - } // h - } // c - } else { - for (index_t mm = m; mm < out_channels; ++mm) { + auto &in_shape = in_tensor->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + if (m + 3 < out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = + output_data + b * out_batch_size + (m + 1) * out_image_size; + float *out_ptr2_base = + output_data + b * out_batch_size + (m + 2) * out_image_size; + float *out_ptr3_base = + output_data + b * out_batch_size + (m + 3) * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = input_data + b * in_batch_size + c * in_image_size; const float - *filter_ptr0 = filter_data + mm * in_channels * 25 + c * 25; + *filter_ptr0 = filter_data + m * in_channels * 25 + c * 25; + const float *filter_ptr1 = + filter_data + (m + 1) * in_channels * 25 + c * 25; + const float *filter_ptr2 = + filter_data + (m + 2) * in_channels * 25 + c * 25; + const float *filter_ptr3 = + filter_data + (m + 3) * in_channels * 25 + c * 25; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset index_t in_offset = h * in_width + w; - // output (1 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo0; + // output (4 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0, vo1, vo2, vo3; // load output index_t out_offset = h * out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); + vo1 = vld1q_f32(out_ptr1_base + out_offset); + vo2 = vld1q_f32(out_ptr2_base + out_offset); + vo3 = vld1q_f32(out_ptr3_base + out_offset); for (index_t r = 0; 
r < 5; ++r) { // input (3 slide) float32x4_t vi0, vi1, vi2, vi3, vi4; @@ -215,21 +174,71 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, vi2 = vextq_f32(vi0, vi4, 2); vi3 = vextq_f32(vi0, vi4, 3); - MACE_Conv2dNeonK5x5SnLoadCalc1; + MACE_Conv2dNeonK5x5SnLoadCalc4; in_offset += in_width; filter_ptr0 += 5; + filter_ptr1 += 5; + filter_ptr2 += 5; + filter_ptr3 += 5; } // r vst1q_f32(out_ptr0_base + out_offset, vo0); + vst1q_f32(out_ptr1_base + out_offset, vo1); + vst1q_f32(out_ptr2_base + out_offset, vo2); + vst1q_f32(out_ptr3_base + out_offset, vo3); + filter_ptr0 -= 25; + filter_ptr1 -= 25; + filter_ptr2 -= 25; + filter_ptr3 -= 25; } // w } // h } // c - } // mm - } // if - } // m - } // b + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 25 + c * 25; + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t in_offset = h * in_width + w; + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0; + // load output + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + for (index_t r = 0; r < 5; ++r) { + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4; + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + + MACE_Conv2dNeonK5x5SnLoadCalc1; + + in_offset += in_width; + filter_ptr0 += 5; + } // r + + vst1q_f32(out_ptr0_base + out_offset, vo0); + filter_ptr0 -= 25; + } // w + } // h + } // c + } // mm + } // if + } // m + } // b + }, 0, batch, 1, 0, out_channels, 4); UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; diff --git a/mace/ops/arm/fp32/conv_2d_5x5.h b/mace/ops/arm/fp32/conv_2d_5x5.h index 154d74a849f38c5b114f70d897946a220a722d2c..b6fdf9bbda9d7edc7593a08e30ce6f30987de2a4 100644 --- a/mace/ops/arm/fp32/conv_2d_5x5.h +++ b/mace/ops/arm/fp32/conv_2d_5x5.h @@ -28,7 +28,7 @@ namespace fp32 { class Conv2dK5x5S1 : public Conv2dBase { public: - Conv2dK5x5S1(const std::vector paddings, const Padding padding_type) + Conv2dK5x5S1(const std::vector &paddings, const Padding padding_type) : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} virtual ~Conv2dK5x5S1() {} @@ -36,7 +36,7 @@ class Conv2dK5x5S1 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; diff --git a/mace/ops/arm/fp32/conv_2d_7x7.cc b/mace/ops/arm/fp32/conv_2d_7x7.cc index 86d3e468f494bb42e3f5c3ecaf608adca72cea5a..4ee8a045a8c61e72fb615816af0fc9c52b77f9b9 100644 --- a/mace/ops/arm/fp32/conv_2d_7x7.cc +++ b/mace/ops/arm/fp32/conv_2d_7x7.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
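(Editorial sketch, not part of the patch.) The 5x5 and 7x7 kernels above rely on one NEON idiom throughout: two vld1q_f32 loads plus vextq_f32 produce every shifted view of an input row, and those views are multiply-accumulated into four adjacent output pixels (the blocked branch additionally handles four output channels at once via the MACE_Conv2dNeonK5x5SnLoadCalc4 macro). Below is a minimal sketch of that sliding-window step for a single 5-tap row; Conv1dRow5Tap is a hypothetical helper, and it uses vmlaq_n_f32 with scalar filter taps where the patch uses the lane variants.

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>

// out[j] += sum_{k=0..4} in[j + k] * filter[k], for j = 0..3.
static inline float32x4_t Conv1dRow5Tap(const float *in,
                                        const float *filter,
                                        float32x4_t acc) {
  float32x4_t vi0 = vld1q_f32(in);           // in[0..3]
  float32x4_t vi4 = vld1q_f32(in + 4);       // in[4..7]
  float32x4_t vi1 = vextq_f32(vi0, vi4, 1);  // in[1..4]
  float32x4_t vi2 = vextq_f32(vi0, vi4, 2);  // in[2..5]
  float32x4_t vi3 = vextq_f32(vi0, vi4, 3);  // in[3..6]
  acc = vmlaq_n_f32(acc, vi0, filter[0]);
  acc = vmlaq_n_f32(acc, vi1, filter[1]);
  acc = vmlaq_n_f32(acc, vi2, filter[2]);
  acc = vmlaq_n_f32(acc, vi3, filter[3]);
  acc = vmlaq_n_f32(acc, vi4, filter[4]);
  return acc;
}
#endif  // ARM NEON only; illustrates the vld1q_f32 + vextq_f32 idiom above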
+#include "mace/ops/arm/fp32/conv_2d_7x7.h" + #include #include -#include "mace/ops/arm/fp32/conv_2d_7x7.h" namespace mace { namespace ops { @@ -168,11 +169,11 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, &padded_input, &padded_output); const Tensor *in_tensor = input; - if (padded_input.get() != nullptr) { + if (padded_input != nullptr) { in_tensor = padded_input.get(); } Tensor *out_tensor = output; - if (padded_output.get() != nullptr) { + if (padded_output != nullptr) { out_tensor = padded_output.get(); } out_tensor->Clear(); @@ -184,111 +185,61 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, auto input_data = in_tensor->data(); auto output_data = out_tensor->mutable_data(); - auto in_shape = in_tensor->shape(); - auto out_shape = out_tensor->shape(); - - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; m += 4) { - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - if (m + 3 < out_channels) { - float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; - float *out_ptr2_base = - output_data + b * out_batch_size + (m + 2) * out_image_size; - float *out_ptr3_base = - output_data + b * out_batch_size + (m + 3) * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; - const float *filter_ptr1 = - filter_data + (m + 1) * in_channels * 49 + c * 49; - const float *filter_ptr2 = - filter_data + (m + 2) * in_channels * 49 + c * 49; - const float *filter_ptr3 = - filter_data + (m + 3) * in_channels * 49 + c * 49; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input offset - index_t in_offset = h * in_width + w; - // output (4 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo0, vo1, vo2, vo3; - // load output - index_t out_offset = h * out_width + w; - vo0 = vld1q_f32(out_ptr0_base + out_offset); - vo1 = vld1q_f32(out_ptr1_base + out_offset); - vo2 = vld1q_f32(out_ptr2_base + out_offset); - vo3 = vld1q_f32(out_ptr3_base + out_offset); - for (index_t r = 0; r < 7; ++r) { - // input (3 slide) - float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6; - float32x4_t vi8; // for tmp use - // load input - vi0 = vld1q_f32(in_ptr_base + in_offset); - vi4 = vld1q_f32(in_ptr_base + in_offset + 4); - vi8 = vld1q_f32(in_ptr_base + in_offset + 8); - vi1 = vextq_f32(vi0, vi4, 1); - vi2 = vextq_f32(vi0, vi4, 2); - vi3 = vextq_f32(vi0, vi4, 3); - vi5 = vextq_f32(vi4, vi8, 1); - vi6 = vextq_f32(vi4, vi8, 2); - -#if defined(__aarch64__) - MACE_Conv2dArmv8NeonK7x7SnLoadCalc4; -#else - MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; -#endif - - in_offset += in_width; - filter_ptr0 += 7; - filter_ptr1 += 7; - filter_ptr2 += 7; - filter_ptr3 += 7; - } // r - - vst1q_f32(out_ptr0_base + out_offset, vo0); - vst1q_f32(out_ptr1_base + out_offset, vo1); - vst1q_f32(out_ptr2_base + 
out_offset, vo2); - vst1q_f32(out_ptr3_base + out_offset, vo3); - - filter_ptr0 -= 49; - filter_ptr1 -= 49; - filter_ptr2 -= 49; - filter_ptr3 -= 49; - } // w - } // h - } // c - } else { - for (index_t mm = m; mm < out_channels; ++mm) { + auto &in_shape = in_tensor->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + if (m + 3 < out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = + output_data + b * out_batch_size + (m + 1) * out_image_size; + float *out_ptr2_base = + output_data + b * out_batch_size + (m + 2) * out_image_size; + float *out_ptr3_base = + output_data + b * out_batch_size + (m + 3) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = input_data + b * in_batch_size + c * in_image_size; const float - *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; + *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; + const float *filter_ptr1 = + filter_data + (m + 1) * in_channels * 49 + c * 49; + const float *filter_ptr2 = + filter_data + (m + 2) * in_channels * 49 + c * 49; + const float *filter_ptr3 = + filter_data + (m + 3) * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset index_t in_offset = h * in_width + w; - // output (1 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo0; + // output (4 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0, vo1, vo2, vo3; // load output index_t out_offset = h * out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); + vo1 = vld1q_f32(out_ptr1_base + out_offset); + vo2 = vld1q_f32(out_ptr2_base + out_offset); + vo3 = vld1q_f32(out_ptr3_base + out_offset); for (index_t r = 0; r < 7; ++r) { // input (3 slide) float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6; @@ -304,24 +255,82 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, vi6 = vextq_f32(vi4, vi8, 2); #if defined(__aarch64__) - MACE_Conv2dArmv8NeonK7x7SnLoadCalc1; + MACE_Conv2dArmv8NeonK7x7SnLoadCalc4; #else - MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; + MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; #endif in_offset += in_width; filter_ptr0 += 7; + filter_ptr1 += 7; + filter_ptr2 += 7; + filter_ptr3 += 7; } // r vst1q_f32(out_ptr0_base + out_offset, vo0); + vst1q_f32(out_ptr1_base + out_offset, vo1); + vst1q_f32(out_ptr2_base + out_offset, vo2); + vst1q_f32(out_ptr3_base + out_offset, vo3); + filter_ptr0 -= 49; + filter_ptr1 -= 49; + filter_ptr2 -= 49; + filter_ptr3 -= 49; } // w } // h } // c - } // mm - } // if - } // m - } // b + } else { + for (index_t mm = m; mm < out_channels; 
++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t in_offset = h * in_width + w; + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0; + // load output + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + for (index_t r = 0; r < 7; ++r) { + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6; + float32x4_t vi8; // for tmp use + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi8 = vld1q_f32(in_ptr_base + in_offset + 8); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + vi5 = vextq_f32(vi4, vi8, 1); + vi6 = vextq_f32(vi4, vi8, 2); + +#if defined(__aarch64__) + MACE_Conv2dArmv8NeonK7x7SnLoadCalc1; +#else + MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; +#endif + + in_offset += in_width; + filter_ptr0 += 7; + } // r + + vst1q_f32(out_ptr0_base + out_offset, vo0); + filter_ptr0 -= 49; + } // w + } // h + } // c + } // mm + } // if + } // m + } // b + }, 0, batch, 1, 0, out_channels, 4); UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; @@ -342,11 +351,11 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, &padded_input, &padded_output); const Tensor *in_tensor = input; - if (padded_input.get() != nullptr) { + if (padded_input != nullptr) { in_tensor = padded_input.get(); } Tensor *out_tensor = output; - if (padded_output.get() != nullptr) { + if (padded_output != nullptr) { out_tensor = padded_output.get(); } out_tensor->Clear(); @@ -358,118 +367,63 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, auto input_data = in_tensor->data(); auto output_data = out_tensor->mutable_data(); - auto in_shape = in_tensor->shape(); - auto out_shape = out_tensor->shape(); - - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; m += 4) { - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - if (m + 3 < out_channels) { - float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; - float *out_ptr2_base = - output_data + b * out_batch_size + (m + 2) * out_image_size; - float *out_ptr3_base = - output_data + b * out_batch_size + (m + 3) * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; - const float *filter_ptr1 = - filter_data + (m + 1) * in_channels * 49 + c * 49; - const float *filter_ptr2 = - filter_data + (m + 2) * in_channels * 49 + c * 49; - const float *filter_ptr3 = 
- filter_data + (m + 3) * in_channels * 49 + c * 49; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input offset - index_t in_h = h * 2; - index_t in_w = w * 2; - index_t in_offset = in_h * in_width + in_w; - // output (4 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo0, vo1, vo2, vo3; - // load output - index_t out_offset = h * out_width + w; - vo0 = vld1q_f32(out_ptr0_base + out_offset); - vo1 = vld1q_f32(out_ptr1_base + out_offset); - vo2 = vld1q_f32(out_ptr2_base + out_offset); - vo3 = vld1q_f32(out_ptr3_base + out_offset); - for (index_t r = 0; r < 7; ++r) { - // input (3 slide) - float32x4x2_t vvi0, vvi1; // to de-interleave - float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6; - // load input - // [0.2.4.6, 1.3.5.7] - vvi0 = vld2q_f32(in_ptr_base + in_offset); - // [8.10.12.14, 9.11.13.15] - vvi1 = vld2q_f32(in_ptr_base + in_offset + 8); - vi0 = vvi0.val[0]; // [0.2.4.6] - vi1 = vvi0.val[1]; // [1.3.5.7] - vi2 = vextq_f32(vi0, vvi1.val[0], 1); // [2.4.6.8] - vi3 = vextq_f32(vi1, vvi1.val[1], 1); // [3.5.7.9] - vi4 = vextq_f32(vi0, vvi1.val[0], 2); // [4.6.8.10] - vi5 = vextq_f32(vi1, vvi1.val[1], 2); // [5.7.9.11] - vi6 = vextq_f32(vi0, vvi1.val[0], 3); // [6.8.10.12] - -#if defined(__aarch64__) - MACE_Conv2dArmv8NeonK7x7SnLoadCalc4; -#else - MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; -#endif - - in_offset += in_width; - filter_ptr0 += 7; - filter_ptr1 += 7; - filter_ptr2 += 7; - filter_ptr3 += 7; - } // r - - vst1q_f32(out_ptr0_base + out_offset, vo0); - vst1q_f32(out_ptr1_base + out_offset, vo1); - vst1q_f32(out_ptr2_base + out_offset, vo2); - vst1q_f32(out_ptr3_base + out_offset, vo3); - - filter_ptr0 -= 49; - filter_ptr1 -= 49; - filter_ptr2 -= 49; - filter_ptr3 -= 49; - } // w - } // h - } // c - } else { - for (index_t mm = m; mm < out_channels; ++mm) { + auto &in_shape = in_tensor->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + if (m + 3 < out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = + output_data + b * out_batch_size + (m + 1) * out_image_size; + float *out_ptr2_base = + output_data + b * out_batch_size + (m + 2) * out_image_size; + float *out_ptr3_base = + output_data + b * out_batch_size + (m + 3) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = input_data + b * in_batch_size + c * in_image_size; const float - *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; + *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; + const float *filter_ptr1 = + filter_data + (m + 1) * in_channels * 49 + c * 49; + 
const float *filter_ptr2 = + filter_data + (m + 2) * in_channels * 49 + c * 49; + const float *filter_ptr3 = + filter_data + (m + 3) * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset index_t in_h = h * 2; index_t in_w = w * 2; index_t in_offset = in_h * in_width + in_w; - // output (1 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo0; - // load ouput + // output (4 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0, vo1, vo2, vo3; + // load output index_t out_offset = h * out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); + vo1 = vld1q_f32(out_ptr1_base + out_offset); + vo2 = vld1q_f32(out_ptr2_base + out_offset); + vo3 = vld1q_f32(out_ptr3_base + out_offset); for (index_t r = 0; r < 7; ++r) { // input (3 slide) float32x4x2_t vvi0, vvi1; // to de-interleave @@ -488,24 +442,87 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, vi6 = vextq_f32(vi0, vvi1.val[0], 3); // [6.8.10.12] #if defined(__aarch64__) - MACE_Conv2dArmv8NeonK7x7SnLoadCalc1; + MACE_Conv2dArmv8NeonK7x7SnLoadCalc4; #else - MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; + MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; #endif in_offset += in_width; filter_ptr0 += 7; + filter_ptr1 += 7; + filter_ptr2 += 7; + filter_ptr3 += 7; } // r vst1q_f32(out_ptr0_base + out_offset, vo0); + vst1q_f32(out_ptr1_base + out_offset, vo1); + vst1q_f32(out_ptr2_base + out_offset, vo2); + vst1q_f32(out_ptr3_base + out_offset, vo3); + filter_ptr0 -= 49; + filter_ptr1 -= 49; + filter_ptr2 -= 49; + filter_ptr3 -= 49; } // w } // h } // c - } // mm - } // if - } // m - } // b + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t in_h = h * 2; + index_t in_w = w * 2; + index_t in_offset = in_h * in_width + in_w; + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0; + // load ouput + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + for (index_t r = 0; r < 7; ++r) { + // input (3 slide) + float32x4x2_t vvi0, vvi1; // to de-interleave + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6; + // load input + // [0.2.4.6, 1.3.5.7] + vvi0 = vld2q_f32(in_ptr_base + in_offset); + // [8.10.12.14, 9.11.13.15] + vvi1 = vld2q_f32(in_ptr_base + in_offset + 8); + vi0 = vvi0.val[0]; // [0.2.4.6] + vi1 = vvi0.val[1]; // [1.3.5.7] + vi2 = vextq_f32(vi0, vvi1.val[0], 1); // [2.4.6.8] + vi3 = vextq_f32(vi1, vvi1.val[1], 1); // [3.5.7.9] + vi4 = vextq_f32(vi0, vvi1.val[0], 2); // [4.6.8.10] + vi5 = vextq_f32(vi1, vvi1.val[1], 2); // [5.7.9.11] + vi6 = vextq_f32(vi0, vvi1.val[0], 3); // [6.8.10.12] + +#if defined(__aarch64__) + MACE_Conv2dArmv8NeonK7x7SnLoadCalc1; +#else + MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; +#endif + + in_offset += in_width; + filter_ptr0 += 7; + } // r + + vst1q_f32(out_ptr0_base + out_offset, vo0); + filter_ptr0 -= 49; + } // w + } // h + } // c + } // mm + } // if + } // m + } // b + }, 0, batch, 1, 0, out_channels, 4); UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; @@ -526,11 +543,11 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, 
&padded_input, &padded_output); const Tensor *in_tensor = input; - if (padded_input.get() != nullptr) { + if (padded_input != nullptr) { in_tensor = padded_input.get(); } Tensor *out_tensor = output; - if (padded_output.get() != nullptr) { + if (padded_output != nullptr) { out_tensor = padded_output.get(); } out_tensor->Clear(); @@ -542,118 +559,63 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, auto input_data = in_tensor->data(); auto output_data = out_tensor->mutable_data(); - auto in_shape = in_tensor->shape(); - auto out_shape = out_tensor->shape(); - - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; m += 4) { - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - if (m + 3 < out_channels) { - float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; - float *out_ptr2_base = - output_data + b * out_batch_size + (m + 2) * out_image_size; - float *out_ptr3_base = - output_data + b * out_batch_size + (m + 3) * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; - const float *filter_ptr1 = - filter_data + (m + 1) * in_channels * 49 + c * 49; - const float *filter_ptr2 = - filter_data + (m + 2) * in_channels * 49 + c * 49; - const float *filter_ptr3 = - filter_data + (m + 3) * in_channels * 49 + c * 49; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input offset - index_t in_h = h * 3; - index_t in_w = w * 3; - index_t in_offset = in_h * in_width + in_w; - // output (4 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo0, vo1, vo2, vo3; - // load output - index_t out_offset = h * out_width + w; - vo0 = vld1q_f32(out_ptr0_base + out_offset); - vo1 = vld1q_f32(out_ptr1_base + out_offset); - vo2 = vld1q_f32(out_ptr2_base + out_offset); - vo3 = vld1q_f32(out_ptr3_base + out_offset); - for (index_t r = 0; r < 7; ++r) { - // input (3 slide) - float32x4x3_t vvi0, vvi1; // to de-interleave - float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6; - // load input - // [0.3.6.9, 1.4.7.10, 2.5.8.11] - vvi0 = vld3q_f32(in_ptr_base + in_offset); - // [12.15.xx.xx, 13.xx.xx.xx, 14.xx.xx.xx] - vvi1 = vld3q_f32(in_ptr_base + in_offset + 12); - vi0 = vvi0.val[0]; // [0.3.6.9] - vi1 = vvi0.val[1]; // [1.4.7.10] - vi2 = vvi0.val[2]; // [2.5.8.11] - vi3 = vextq_f32(vi0, vvi1.val[0], 1); // [3.6.9.12] - vi4 = vextq_f32(vi1, vvi1.val[1], 1); // [4.7.10.13] - vi5 = vextq_f32(vi2, vvi1.val[2], 1); // [5.8.11.14] - vi6 = vextq_f32(vi0, vvi1.val[0], 2); // [6.9.12.15] - -#if defined(__aarch64__) - MACE_Conv2dArmv8NeonK7x7SnLoadCalc4; -#else - MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; -#endif - - in_offset += in_width; - filter_ptr0 += 7; - filter_ptr1 += 7; - filter_ptr2 += 7; - filter_ptr3 += 7; - } // r - - vst1q_f32(out_ptr0_base + out_offset, vo0); - vst1q_f32(out_ptr1_base + 
out_offset, vo1); - vst1q_f32(out_ptr2_base + out_offset, vo2); - vst1q_f32(out_ptr3_base + out_offset, vo3); - - filter_ptr0 -= 49; - filter_ptr1 -= 49; - filter_ptr2 -= 49; - filter_ptr3 -= 49; - } // w - } // h - } // c - } else { - for (index_t mm = m; mm < out_channels; ++mm) { + auto &in_shape = in_tensor->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + if (m + 3 < out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = + output_data + b * out_batch_size + (m + 1) * out_image_size; + float *out_ptr2_base = + output_data + b * out_batch_size + (m + 2) * out_image_size; + float *out_ptr3_base = + output_data + b * out_batch_size + (m + 3) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = input_data + b * in_batch_size + c * in_image_size; const float - *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; + *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; + const float *filter_ptr1 = + filter_data + (m + 1) * in_channels * 49 + c * 49; + const float *filter_ptr2 = + filter_data + (m + 2) * in_channels * 49 + c * 49; + const float *filter_ptr3 = + filter_data + (m + 3) * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset index_t in_h = h * 3; index_t in_w = w * 3; index_t in_offset = in_h * in_width + in_w; - // output (1 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo0; + // output (4 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0, vo1, vo2, vo3; // load output index_t out_offset = h * out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); + vo1 = vld1q_f32(out_ptr1_base + out_offset); + vo2 = vld1q_f32(out_ptr2_base + out_offset); + vo3 = vld1q_f32(out_ptr3_base + out_offset); for (index_t r = 0; r < 7; ++r) { // input (3 slide) float32x4x3_t vvi0, vvi1; // to de-interleave @@ -672,24 +634,87 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, vi6 = vextq_f32(vi0, vvi1.val[0], 2); // [6.9.12.15] #if defined(__aarch64__) - MACE_Conv2dArmv8NeonK7x7SnLoadCalc1; + MACE_Conv2dArmv8NeonK7x7SnLoadCalc4; #else - MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; + MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; #endif in_offset += in_width; filter_ptr0 += 7; + filter_ptr1 += 7; + filter_ptr2 += 7; + filter_ptr3 += 7; } // r vst1q_f32(out_ptr0_base + out_offset, vo0); + vst1q_f32(out_ptr1_base + out_offset, vo1); + vst1q_f32(out_ptr2_base + out_offset, vo2); + vst1q_f32(out_ptr3_base + out_offset, vo3); + filter_ptr0 -= 49; + filter_ptr1 -= 49; + filter_ptr2 -= 49; + filter_ptr3 -= 
49; } // w } // h } // c - } // mm - } // if - } // m - } // b + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t in_h = h * 3; + index_t in_w = w * 3; + index_t in_offset = in_h * in_width + in_w; + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0; + // load output + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + for (index_t r = 0; r < 7; ++r) { + // input (3 slide) + float32x4x3_t vvi0, vvi1; // to de-interleave + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6; + // load input + // [0.3.6.9, 1.4.7.10, 2.5.8.11] + vvi0 = vld3q_f32(in_ptr_base + in_offset); + // [12.15.xx.xx, 13.xx.xx.xx, 14.xx.xx.xx] + vvi1 = vld3q_f32(in_ptr_base + in_offset + 12); + vi0 = vvi0.val[0]; // [0.3.6.9] + vi1 = vvi0.val[1]; // [1.4.7.10] + vi2 = vvi0.val[2]; // [2.5.8.11] + vi3 = vextq_f32(vi0, vvi1.val[0], 1); // [3.6.9.12] + vi4 = vextq_f32(vi1, vvi1.val[1], 1); // [4.7.10.13] + vi5 = vextq_f32(vi2, vvi1.val[2], 1); // [5.8.11.14] + vi6 = vextq_f32(vi0, vvi1.val[0], 2); // [6.9.12.15] + +#if defined(__aarch64__) + MACE_Conv2dArmv8NeonK7x7SnLoadCalc1; +#else + MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; +#endif + + in_offset += in_width; + filter_ptr0 += 7; + } // r + + vst1q_f32(out_ptr0_base + out_offset, vo0); + filter_ptr0 -= 49; + } // w + } // h + } // c + } // mm + } // if + } // m + } // b + }, 0, batch, 1, 0, out_channels, 4); UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; diff --git a/mace/ops/arm/fp32/conv_2d_7x7.h b/mace/ops/arm/fp32/conv_2d_7x7.h index e64780bab2bb4c22c2107da29d85b9040ef86460..9324f4daac2392cb069935d3d46fc36274e8b8ea 100644 --- a/mace/ops/arm/fp32/conv_2d_7x7.h +++ b/mace/ops/arm/fp32/conv_2d_7x7.h @@ -28,7 +28,7 @@ namespace fp32 { class Conv2dK7x7S1 : public Conv2dBase { public: - Conv2dK7x7S1(const std::vector paddings, const Padding padding_type) + Conv2dK7x7S1(const std::vector &paddings, const Padding padding_type) : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} virtual ~Conv2dK7x7S1() {} @@ -36,12 +36,12 @@ class Conv2dK7x7S1 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; class Conv2dK7x7S2 : public Conv2dBase { public: - Conv2dK7x7S2(const std::vector paddings, const Padding padding_type) + Conv2dK7x7S2(const std::vector &paddings, const Padding padding_type) : Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {} virtual ~Conv2dK7x7S2() {} @@ -49,12 +49,12 @@ class Conv2dK7x7S2 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; class Conv2dK7x7S3 : public Conv2dBase { public: - Conv2dK7x7S3(const std::vector paddings, const Padding padding_type) + Conv2dK7x7S3(const std::vector &paddings, const Padding padding_type) : Conv2dBase({3, 3}, {1, 1}, paddings, padding_type) {} virtual ~Conv2dK7x7S3() {} @@ -62,7 +62,7 @@ class Conv2dK7x7S3 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; } // namespace 
fp32 diff --git a/mace/ops/arm/fp32/conv_general.cc b/mace/ops/arm/fp32/conv_general.cc index a12c5d53b83c275a470f04accdeee07d65317330..25fb2441481cb5ac55da78e44327478b513de018 100644 --- a/mace/ops/arm/fp32/conv_general.cc +++ b/mace/ops/arm/fp32/conv_general.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include #include "mace/ops/arm/fp32/conv_general.h" +#include + namespace mace { namespace ops { namespace arm { @@ -37,11 +38,11 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, &padded_output); const Tensor *in_tensor = input; - if (padded_input.get() != nullptr) { + if (padded_input != nullptr) { in_tensor = padded_input.get(); } Tensor *out_tensor = output; - if (padded_output.get() != nullptr) { + if (padded_output != nullptr) { out_tensor = padded_output.get(); } out_tensor->Clear(); @@ -53,148 +54,70 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, auto input_data = in_tensor->data(); auto output_data = out_tensor->mutable_data(); - auto in_shape = in_tensor->shape(); - auto out_shape = out_tensor->shape(); - auto filter_shape = filter->shape(); - - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = filter_shape[1] * in_image_size; - const index_t out_batch_size = filter_shape[0] * out_image_size; - const index_t filter_size = filter_shape[2] * filter_shape[3]; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < in_shape[0]; b++) { - for (index_t m = 0; m < filter_shape[0]; m += 4) { - const index_t in_width = in_shape[3]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t out_channels = filter_shape[0]; - const index_t in_channels = filter_shape[1]; - - const int stride_h = strides_[0]; - const int stride_w = strides_[1]; - const int dilation_h = dilations_[0]; - const int dilation_w = dilations_[1]; - if (m + 3 < out_channels) { - float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = out_ptr0_base + out_image_size; - float *out_ptr2_base = out_ptr1_base + out_image_size; - float *out_ptr3_base = out_ptr2_base + out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = - filter_data + m * in_channels * filter_size + c * filter_size; - const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size; - const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size; - const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input offset - index_t ih = h * stride_h; - index_t iw = w * stride_w; - index_t in_offset = ih * in_width + iw; - // output (4 outch x 1 height x 4 width): vo_outch_height - float vo0[4], vo1[4], vo2[4], vo3[4]; - // load output - index_t out_offset = h * out_width + w; - for (index_t ow = 0; ow < 4; ++ow) { - vo0[ow] = out_ptr0_base[out_offset + ow]; - vo1[ow] = out_ptr1_base[out_offset + ow]; - vo2[ow] = out_ptr2_base[out_offset + ow]; - vo3[ow] = out_ptr3_base[out_offset + ow]; - } - // calc by row - for (index_t kh = 0; kh < filter_shape[2]; ++kh) { - for (index_t kw = 0; kw < filter_shape[3]; ++kw) { - // outch 0 - vo0[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr0[kw]; - 
vo0[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - // outch 1 - vo1[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr1[kw]; - vo1[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr1[kw]; - vo1[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr1[kw]; - vo1[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr1[kw]; - // outch 2 - vo2[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr2[kw]; - vo2[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr2[kw]; - vo2[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr2[kw]; - vo2[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr2[kw]; - // outch 3 - vo3[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr3[kw]; - vo3[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr3[kw]; - vo3[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr3[kw]; - vo3[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr3[kw]; - } // kw - - in_offset += dilation_h * in_width; - filter_ptr0 += filter_shape[3]; - filter_ptr1 += filter_shape[3]; - filter_ptr2 += filter_shape[3]; - filter_ptr3 += filter_shape[3]; - } // kh - - for (index_t ow = 0; ow < 4; ++ow) { - out_ptr0_base[out_offset + ow] = vo0[ow]; - out_ptr1_base[out_offset + ow] = vo1[ow]; - out_ptr2_base[out_offset + ow] = vo2[ow]; - out_ptr3_base[out_offset + ow] = vo3[ow]; - } - - filter_ptr0 -= filter_size; - filter_ptr1 -= filter_size; - filter_ptr2 -= filter_size; - filter_ptr3 -= filter_size; - } // w - } // h - } // c - } else { - for (index_t mm = m; mm < out_channels; ++mm) { + auto &in_shape = in_tensor->shape(); + auto &out_shape = out_tensor->shape(); + auto &filter_shape = filter->shape(); + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t filter_height = filter_shape[2]; + const index_t filter_width = filter_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + const index_t filter_size = filter_height * filter_width; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + const int stride_h = strides_[0]; + const int stride_w = strides_[1]; + const int dilation_h = dilations_[0]; + const int dilation_w = dilations_[1]; + if (m + 3 < out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = out_ptr0_base + out_image_size; + float *out_ptr2_base = out_ptr1_base + out_image_size; + float *out_ptr3_base = out_ptr2_base + out_image_size; for 
(index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = input_data + b * in_batch_size + c * in_image_size; const float *filter_ptr0 = - filter_data + mm * in_channels * filter_size + c * filter_size; - + filter_data + m * in_channels * filter_size + c * filter_size; + const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size; + const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size; + const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset index_t ih = h * stride_h; index_t iw = w * stride_w; index_t in_offset = ih * in_width + iw; - // output (1 outch x 1 height x 4 width): vo_outch_height - float vo0[4]; + // output (4 outch x 1 height x 4 width): vo_outch_height + float vo0[4], vo1[4], vo2[4], vo3[4]; // load output index_t out_offset = h * out_width + w; for (index_t ow = 0; ow < 4; ++ow) { vo0[ow] = out_ptr0_base[out_offset + ow]; + vo1[ow] = out_ptr1_base[out_offset + ow]; + vo2[ow] = out_ptr2_base[out_offset + ow]; + vo3[ow] = out_ptr3_base[out_offset + ow]; } - // calc by row - for (index_t kh = 0; kh < filter_shape[2]; ++kh) { - for (index_t kw = 0; kw < filter_shape[3]; ++kw) { + for (index_t kh = 0; kh < filter_height; ++kh) { + for (index_t kw = 0; kw < filter_width; ++kw) { // outch 0 vo0[0] += in_ptr_base[in_offset + kw * dilation_w] * filter_ptr0[kw]; @@ -204,23 +127,111 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, + kw * dilation_w] * filter_ptr0[kw]; vo0[3] += in_ptr_base[in_offset + 3 * stride_w + kw * dilation_w] * filter_ptr0[kw]; + // outch 1 + vo1[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr1[kw]; + vo1[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr1[kw]; + vo1[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr1[kw]; + vo1[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr1[kw]; + // outch 2 + vo2[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr2[kw]; + vo2[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr2[kw]; + vo2[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr2[kw]; + vo2[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr2[kw]; + // outch 3 + vo3[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr3[kw]; + vo3[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr3[kw]; + vo3[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr3[kw]; + vo3[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr3[kw]; } // kw in_offset += dilation_h * in_width; - filter_ptr0 += filter_shape[3]; + filter_ptr0 += filter_width; + filter_ptr1 += filter_width; + filter_ptr2 += filter_width; + filter_ptr3 += filter_width; } // kh for (index_t ow = 0; ow < 4; ++ow) { out_ptr0_base[out_offset + ow] = vo0[ow]; + out_ptr1_base[out_offset + ow] = vo1[ow]; + out_ptr2_base[out_offset + ow] = vo2[ow]; + out_ptr3_base[out_offset + ow] = vo3[ow]; } + filter_ptr0 -= filter_size; + filter_ptr1 -= filter_size; + filter_ptr2 -= filter_size; + filter_ptr3 -= filter_size; } // w } // h } // c - } // mm - } // if - } // m - } // b + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c 
* in_image_size; + const float *filter_ptr0 = + filter_data + mm * in_channels * filter_size + + c * filter_size; + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t ih = h * stride_h; + index_t iw = w * stride_w; + index_t in_offset = ih * in_width + iw; + // output (1 outch x 1 height x 4 width): vo_outch_height + float vo0[4]; + // load output + index_t out_offset = h * out_width + w; + for (index_t ow = 0; ow < 4; ++ow) { + vo0[ow] = out_ptr0_base[out_offset + ow]; + } + + // calc by row + for (index_t kh = 0; kh < filter_height; ++kh) { + for (index_t kw = 0; kw < filter_width; ++kw) { + // outch 0 + vo0[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr0[kw]; + vo0[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr0[kw]; + vo0[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr0[kw]; + vo0[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr0[kw]; + } // kw + + in_offset += dilation_h * in_width; + filter_ptr0 += filter_width; + } // kh + + for (index_t ow = 0; ow < 4; ++ow) { + out_ptr0_base[out_offset + ow] = vo0[ow]; + } + filter_ptr0 -= filter_size; + } // w + } // h + } // c + } // mm + } // if + } // m + } // b + }, 0, batch, 1, 0, out_channels, 4); UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; diff --git a/mace/ops/arm/fp32/conv_general.h b/mace/ops/arm/fp32/conv_general.h index 01d019548a19fee9c79deb6d918dac9431110fac..115acdb3fe83cb80e1e20e7939c5fe03eed7c6da 100644 --- a/mace/ops/arm/fp32/conv_general.h +++ b/mace/ops/arm/fp32/conv_general.h @@ -28,9 +28,9 @@ namespace fp32 { class Conv2dGeneral : public Conv2dBase { public: - Conv2dGeneral(const std::vector strides, - const std::vector dilations, - const std::vector paddings, + Conv2dGeneral(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, const Padding padding_type) : Conv2dBase(strides, dilations, paddings, padding_type) {} virtual ~Conv2dGeneral() {} @@ -39,7 +39,7 @@ class Conv2dGeneral : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; } // namespace fp32 diff --git a/mace/ops/arm/fp32/deconv_2d.cc b/mace/ops/arm/fp32/deconv_2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..a80d6d645b15720a4210de9c9cdab3fc9c8401b9 --- /dev/null +++ b/mace/ops/arm/fp32/deconv_2d.cc @@ -0,0 +1,120 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/ops/arm/fp32/deconv_2d.h" + +#include +#include +#include "mace/utils/memory.h" +#include "mace/ops/common/conv_pool_2d_util.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus Deconv2dBase::ResizeOutAndPadOut( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output, + std::vector *out_pad_size, + std::unique_ptr *padded_output) { + std::vector out_shape; + if (output_shape) { + Tensor::MappingGuard out_shape_guard(output_shape); + MACE_CHECK(output_shape->size() == 4, "output shape should be 4-dims"); + out_shape = + std::vector(output_shape->data(), + output_shape->data() + 4); + } + + std::vector padded_out_shape; + + CalDeconvOutputShapeAndPadSize(input->shape(), + filter->shape(), + strides_, + padding_type_, + paddings_, + group_, + &out_shape, + nullptr, + out_pad_size, + &padded_out_shape, + framework_type_, + NCHW); + + MACE_RETURN_IF_ERROR(output->Resize(out_shape)); + + const bool is_out_padded = + padded_out_shape[2] != out_shape[2] + || padded_out_shape[3] != out_shape[3]; + + if (is_out_padded) { + index_t padded_out_size = + std::accumulate(padded_out_shape.begin(), + padded_out_shape.end(), + 1, + std::multiplies()) * sizeof(float); + ScratchBuffer *scratch = context->device()->scratch_buffer(); + scratch->Rewind(); + index_t scratch_size = PadAlignSize(padded_out_size); + scratch->GrowSize(scratch_size); + + std::unique_ptr + padded_out + (make_unique(scratch->Scratch(scratch_size), DT_FLOAT)); + padded_out->Reshape(padded_out_shape); + *padded_output = std::move(padded_out); + } + + return MaceStatus::MACE_SUCCESS; +} + +void Deconv2dBase::UnPadOutput(const Tensor &src, + const std::vector &out_pad_size, + Tensor *dst) { + if (dst == &src) return; + const index_t pad_h = out_pad_size[0] / 2; + const index_t pad_w = out_pad_size[1] / 2; + + const index_t batch = dst->dim(0); + const index_t channels = dst->dim(1); + const index_t height = dst->dim(2); + const index_t width = dst->dim(3); + const index_t padded_height = src.dim(2); + const index_t padded_width = src.dim(3); + + auto padded_out_data = src.data(); + auto out_data = dst->mutable_data(); + + for (index_t i = 0; i < batch; ++i) { + for (index_t j = 0; j < channels; ++j) { + for (index_t k = 0; k < height; ++k) { + const float *input_base = + padded_out_data + ((i * channels + j) * padded_height + + (k + pad_h)) * padded_width; + float *output_base = + out_data + ((i * channels + j) * height + k) * width; + memcpy(output_base, input_base + pad_w, width * sizeof(float)); + } + } + } +} + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/deconv_2d.h b/mace/ops/arm/fp32/deconv_2d.h new file mode 100644 index 0000000000000000000000000000000000000000..554f2935992d0a6f901bbb7b40aab4b048d63616 --- /dev/null +++ b/mace/ops/arm/fp32/deconv_2d.h @@ -0,0 +1,95 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_FP32_DECONV_2D_H_
+#define MACE_OPS_ARM_FP32_DECONV_2D_H_
+
+#include
+#include
+
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/types.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/gemm.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Deconv2dBase {
+ public:
+  Deconv2dBase(const std::vector<int> &strides,
+               const std::vector<int> &dilations,
+               const std::vector<int> &paddings,
+               const Padding padding_type,
+               const index_t group,
+               const FrameworkType framework_type)
+      : strides_(strides),
+        dilations_(dilations),
+        paddings_(paddings),
+        padding_type_(padding_type),
+        group_(group),
+        framework_type_(framework_type) {}
+
+  Deconv2dBase(const std::vector<int> &strides,
+               const std::vector<int> &dilations,
+               const std::vector<int> &paddings,
+               const Padding padding_type,
+               const FrameworkType framework_type)
+      : Deconv2dBase(strides,
+                     dilations,
+                     paddings,
+                     padding_type,
+                     1,
+                     framework_type) {}
+
+  virtual ~Deconv2dBase() = default;
+
+  virtual MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) = 0;
+
+ protected:
+  MaceStatus ResizeOutAndPadOut(const OpContext *context,
+                                const Tensor *input,
+                                const Tensor *filter,
+                                const Tensor *output_shape,
+                                Tensor *output,
+                                std::vector<int> *out_pad_size,
+                                std::unique_ptr<Tensor> *padded_output);
+
+  void UnPadOutput(const Tensor &src,
+                   const std::vector<int> &out_pad_size,
+                   Tensor *dst);
+
+  const std::vector<int> strides_;
+  const std::vector<int> dilations_;
+  const std::vector<int> paddings_;
+  const Padding padding_type_;
+  index_t group_;
+  const FrameworkType framework_type_;
+};
+
+} // namespace fp32
+} // namespace arm
+} // namespace ops
+} // namespace mace
+
+#endif // MACE_OPS_ARM_FP32_DECONV_2D_H_
diff --git a/mace/ops/arm/fp32/deconv_2d_2x2.cc b/mace/ops/arm/fp32/deconv_2d_2x2.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c9d630bbb63c66d72684663659965e32b2be6b60
--- /dev/null
+++ b/mace/ops/arm/fp32/deconv_2d_2x2.cc
@@ -0,0 +1,342 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "mace/ops/arm/fp32/deconv_2d_2x2.h" + +#include +#include "mace/ops/arm/fp32/common_neon.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + + const index_t out_img_size = outh * outw; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t oc = start1; oc < end1; oc += step1) { + if (oc + 1 < outch) { + float *out_base0 = padded_out_data + (b * outch + oc) * out_img_size; + float *out_base1 = out_base0 + out_img_size; + for (index_t ic = 0; ic < inch; ++ic) { + const float *input_base = input_data + (b * inch + ic) * h * w; + const float *kernel_base0 = filter_data + (oc * inch + ic) * 4; + const float *kernel_base1 = kernel_base0 + inch * 4; + const float *in = input_base; + // output channel 0 + const float *k0 = kernel_base0; + // output channel 1 + const float *k1 = kernel_base1; + // load filter + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + + for (index_t i = 0; i < h; ++i) { + float *out_row_base0 = out_base0 + i * outw; + float *out_row0_0 = out_row_base0; + float *out_row0_1 = out_row_base0 + outw; + + float *out_row_base1 = out_base1 + i * outw; + float *out_row1_0 = out_row_base1; + float *out_row1_1 = out_row_base1 + outw; + + index_t j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00, out01, out02, out03; + float32x4_t out10, out11, out12, out13; + + out00 = vld1q_f32(out_row0_0); + out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); + vst1q_f32(out_row0_0, out00); + + out01 = vld1q_f32(out_row0_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); + vst1q_f32(out_row0_0 + 1, out01); + + out02 = vld1q_f32(out_row0_1); + out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); + vst1q_f32(out_row0_1, out02); + + out03 = vld1q_f32(out_row0_1 + 1); + out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); + vst1q_f32(out_row0_1 + 1, out03); + + out10 = vld1q_f32(out_row1_0); + out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); + vst1q_f32(out_row1_0, out10); + + out11 = vld1q_f32(out_row1_0 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); + vst1q_f32(out_row1_0 + 1, out11); + + out12 = vld1q_f32(out_row1_1); + out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); + vst1q_f32(out_row1_1, out12); + + out13 = vld1q_f32(out_row1_1 + 
1); + out13 = neon_vfma_lane_3(out13, in_vec, k1_vec); + vst1q_f32(out_row1_1 + 1, out13); + + in += 4; + out_row0_0 += 4; + out_row0_1 += 4; + out_row1_0 += 4; + out_row1_1 += 4; + } + + for (; j < w; ++j) { + float val = in[0]; + for (int k = 0; k < 2; ++k) { + out_row0_0[k] += val * k0[k]; + out_row0_1[k] += val * k0[k + 2]; + out_row1_0[k] += val * k1[k]; + out_row1_1[k] += val * k1[k + 2]; + } + in++; + out_row0_0++; + out_row0_1++; + out_row1_0++; + out_row1_1++; + } + } + } + } else { + float *out_base0 = padded_out_data + (b * outch + oc) * outh * outw; + for (index_t ic = 0; ic < inch; ++ic) { + const float *input_base = input_data + (b * inch + ic) * h * w; + const float *kernel_base0 = filter_data + (oc * inch + ic) * 4; + const float *in = input_base; + const float *k0 = kernel_base0; + + // load filter + float32x4_t k0_vec = vld1q_f32(k0); + + for (index_t i = 0; i < h; ++i) { + float *out_row_base0 = out_base0 + i * outw; + float *out_row0_0 = out_row_base0; + float *out_row0_1 = out_row_base0 + outw; + index_t j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + float32x4_t out00, out01, out02, out03; + + out00 = vld1q_f32(out_row0_0); + out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); + vst1q_f32(out_row0_0, out00); + + out01 = vld1q_f32(out_row0_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); + vst1q_f32(out_row0_0 + 1, out01); + + out02 = vld1q_f32(out_row0_1); + out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); + vst1q_f32(out_row0_1, out02); + + out03 = vld1q_f32(out_row0_1 + 1); + out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); + vst1q_f32(out_row0_1 + 1, out03); + + in += 4; + out_row0_0 += 4; + out_row0_1 += 4; + } + + for (; j < w; ++j) { + float val = in[0]; + for (int k = 0; k < 2; ++k) { + out_row0_0[k] += val * k0[k]; + out_row0_1[k] += val * k0[k + 2]; + } + in++; + out_row0_0++; + out_row0_1++; + } + } + } + } + } + } + }, 0, batch, 1, 0, outch, 2); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + const index_t out_img_size = outh * outw; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t oc = start1; oc < end1; oc += step1) { + float *out_base = padded_out_data + (b * outch + oc) * out_img_size; + for (index_t ic = 0; ic < 
inch; ++ic) { + const float *input_base = input_data + (b * inch + ic) * h * w; + const float *kernel_base = filter_data + (oc * inch + ic) * 4; + const float *in = input_base; + const float *k0 = kernel_base; + float32x4_t k0_vec = vld1q_f32(k0); + + for (index_t i = 0; i < h; ++i) { + float *out_row_base = out_base + i * 2 * outw; + float *out_row_0 = out_row_base; + float *out_row_1 = out_row_0 + outw; + + index_t j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + // out row 0 + float32x4x2_t out00 = vld2q_f32(out_row_0); + out00.val[0] = + neon_vfma_lane_0(out00.val[0], in_vec, k0_vec); + out00.val[1] = + neon_vfma_lane_1(out00.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0, out00); + + // out row 1 + float32x4x2_t out10 = vld2q_f32(out_row_1); + out10.val[0] = + neon_vfma_lane_2(out10.val[0], in_vec, k0_vec); + out10.val[1] = + neon_vfma_lane_3(out10.val[1], in_vec, k0_vec); + vst2q_f32(out_row_1, out10); + + in += 4; + out_row_0 += 8; + out_row_1 += 8; + } + + for (; j < w; ++j) { + float val = in[0]; + for (int k = 0; k < 2; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k0[k + 2]; + } + in++; + out_row_0 += 2; + out_row_1 += 2; + } + } + } + } + } + }, 0, batch, 1, 0, outch, 1); + + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/deconv_2d_2x2.h b/mace/ops/arm/fp32/deconv_2d_2x2.h new file mode 100644 index 0000000000000000000000000000000000000000..05f80dece27fd6cf20d87861e04a512b94706939 --- /dev/null +++ b/mace/ops/arm/fp32/deconv_2d_2x2.h @@ -0,0 +1,70 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#ifndef MACE_OPS_ARM_FP32_DECONV_2D_2X2_H_
+#define MACE_OPS_ARM_FP32_DECONV_2D_2X2_H_
+
+#include
+#include
+
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/types.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/deconv_2d.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Deconv2dK2x2S1 : public Deconv2dBase {
+ public:
+  Deconv2dK2x2S1(const std::vector<int> &paddings,
+                 const Padding padding_type,
+                 const FrameworkType framework_type)
+      : Deconv2dBase({1, 1}, {1, 1}, paddings, padding_type, framework_type) {}
+  virtual ~Deconv2dK2x2S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+class Deconv2dK2x2S2 : public Deconv2dBase {
+ public:
+  Deconv2dK2x2S2(const std::vector<int> &paddings,
+                 const Padding padding_type,
+                 const FrameworkType framework_type)
+      : Deconv2dBase({2, 2}, {1, 1}, paddings, padding_type, framework_type) {}
+  virtual ~Deconv2dK2x2S2() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+} // namespace fp32
+} // namespace arm
+} // namespace ops
+} // namespace mace
+
+#endif // MACE_OPS_ARM_FP32_DECONV_2D_2X2_H_
diff --git a/mace/ops/arm/fp32/deconv_2d_3x3.cc b/mace/ops/arm/fp32/deconv_2d_3x3.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b2ef6eae269316c9169e33bbb753606d8572c1ff
--- /dev/null
+++ b/mace/ops/arm/fp32/deconv_2d_3x3.cc
@@ -0,0 +1,470 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "mace/ops/arm/fp32/deconv_2d_3x3.h" + +#include +#include "mace/ops/arm/fp32/common_neon.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = out_shape[0]; + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + + const index_t out_img_size = outh * outw; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t oc = start1; oc < end1; oc += step1) { + if (oc + 1 < outch) { + float *out_base0 = padded_out_data + (b * outch + oc) * out_img_size; + float *out_base1 = out_base0 + out_img_size; + for (index_t ic = 0; ic < inch; ++ic) { + const float *input_base = input_data + (b * inch + ic) * h * w; + const float *kernel_base0 = filter_data + (oc * inch + ic) * 9; + const float *kernel_base1 = kernel_base0 + inch * 9; + const float *in = input_base; + + // output channel 0 + const float *k0_0 = kernel_base0; + const float *k0_1 = kernel_base0 + 3; + const float *k0_2 = kernel_base0 + 5; + // output channel 1 + const float *k1_0 = kernel_base1; + const float *k1_1 = kernel_base1 + 3; + const float *k1_2 = kernel_base1 + 5; + + // load filter + float32x4_t k00_vec, k01_vec, k02_vec; + float32x4_t k10_vec, k11_vec, k12_vec; + + k00_vec = vld1q_f32(k0_0); + k01_vec = vld1q_f32(k0_1); + k02_vec = vld1q_f32(k0_2); + + k10_vec = vld1q_f32(k1_0); + k11_vec = vld1q_f32(k1_1); + k12_vec = vld1q_f32(k1_2); + + for (index_t i = 0; i < h; ++i) { + float *out_row_base0 = out_base0 + i * outw; + float *out_row0_0 = out_row_base0; + float *out_row0_1 = out_row_base0 + outw; + float *out_row0_2 = out_row_base0 + 2 * outw; + + float *out_row_base1 = out_base1 + i * outw; + float *out_row1_0 = out_row_base1; + float *out_row1_1 = out_row_base1 + outw; + float *out_row1_2 = out_row_base1 + 2 * outw; + + index_t j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00, out01, out02; + float32x4_t out10, out11, out12; + float32x4_t out20, out21, out22; + + out00 = vld1q_f32(out_row0_0); + out00 = neon_vfma_lane_0(out00, in_vec, k00_vec); + vst1q_f32(out_row0_0, out00); + + out01 = vld1q_f32(out_row0_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 1, out01); + + out02 = vld1q_f32(out_row0_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 2, out02); + + out10 = vld1q_f32(out_row0_1 + 0); + 
out10 = neon_vfma_lane_0(out10, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 0, out10); + + out11 = vld1q_f32(out_row0_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 1, out11); + + out12 = vld1q_f32(out_row0_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 2, out12); + + out20 = vld1q_f32(out_row0_2 + 0); + out20 = neon_vfma_lane_1(out20, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 0, out20); + + out21 = vld1q_f32(out_row0_2 + 1); + out21 = neon_vfma_lane_2(out21, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 1, out21); + + out22 = vld1q_f32(out_row0_2 + 2); + out22 = neon_vfma_lane_3(out22, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 2, out22); + + out00 = vld1q_f32(out_row1_0 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 0, out00); + + out01 = vld1q_f32(out_row1_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 1, out01); + + out02 = vld1q_f32(out_row1_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 2, out02); + + out10 = vld1q_f32(out_row1_1 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 0, out10); + + out11 = vld1q_f32(out_row1_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 1, out11); + + out12 = vld1q_f32(out_row1_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 2, out12); + + out20 = vld1q_f32(out_row1_2 + 0); + out20 = neon_vfma_lane_1(out20, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 0, out20); + + out21 = vld1q_f32(out_row1_2 + 1); + out21 = neon_vfma_lane_2(out21, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 1, out21); + + out22 = vld1q_f32(out_row1_2 + 2); + out22 = neon_vfma_lane_3(out22, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 2, out22); + + in += 4; + out_row0_0 += 4; + out_row0_1 += 4; + out_row0_2 += 4; + out_row1_0 += 4; + out_row1_1 += 4; + out_row1_2 += 4; + } + + for (; j < w; ++j) { + float val = in[0]; + for (int k = 0; k < 3; ++k) { + out_row0_0[k] += val * k0_0[k]; + out_row0_1[k] += val * k0_1[k]; + out_row0_2[k] += val * k0_2[k + 1]; + out_row1_0[k] += val * k1_0[k]; + out_row1_1[k] += val * k1_1[k]; + out_row1_2[k] += val * k1_2[k + 1]; + } + in++; + out_row0_0++; + out_row0_1++; + out_row0_2++; + out_row1_0++; + out_row1_1++; + out_row1_2++; + } + } + } + } else { + float *out_base0 = padded_out_data + (b * outch + oc) * outh * outw; + for (index_t ic = 0; ic < inch; ++ic) { + const float *input_base = input_data + (b * inch + ic) * h * w; + const float *kernel_base0 = filter_data + (oc * inch + ic) * 9; + const float *in = input_base; + const float *k0_0 = kernel_base0; + const float *k0_1 = kernel_base0 + 3; + const float *k0_2 = kernel_base0 + 5; + + // load filter + float32x4_t k00_vec = vld1q_f32(k0_0); + float32x4_t k01_vec = vld1q_f32(k0_1); + float32x4_t k02_vec = vld1q_f32(k0_2); + + for (index_t i = 0; i < h; ++i) { + float *out_row_base0 = out_base0 + i * outw; + float *out_row0_0 = out_row_base0; + float *out_row0_1 = out_row_base0 + outw; + float *out_row0_2 = out_row_base0 + 2 * outw; + index_t j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00, out01, out02; + float32x4_t out10, out11, out12; + float32x4_t out20, out21, out22; + + out00 = vld1q_f32(out_row0_0 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 0, out00); + + out01 = vld1q_f32(out_row0_0 + 1); + out01 = 
neon_vfma_lane_1(out01, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 1, out01); + + out02 = vld1q_f32(out_row0_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 2, out02); + + out10 = vld1q_f32(out_row0_1 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 0, out10); + + out11 = vld1q_f32(out_row0_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 1, out11); + + out12 = vld1q_f32(out_row0_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 2, out12); + + out20 = vld1q_f32(out_row0_2 + 0); + out20 = neon_vfma_lane_1(out20, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 0, out20); + + out21 = vld1q_f32(out_row0_2 + 1); + out21 = neon_vfma_lane_2(out21, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 1, out21); + + out22 = vld1q_f32(out_row0_2 + 2); + out22 = neon_vfma_lane_3(out22, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 2, out22); + + in += 4; + out_row0_0 += 4; + out_row0_1 += 4; + out_row0_2 += 4; + } + + for (; j < w; ++j) { + float val = in[0]; + for (int k = 0; k < 3; ++k) { + out_row0_0[k] += val * k0_0[k]; + out_row0_1[k] += val * k0_1[k]; + out_row0_2[k] += val * k0_2[k + 1]; + } + in++; + out_row0_0++; + out_row0_1++; + out_row0_2++; + } + } + } + } + } + } + }, 0, batch, 1, 0, outch, 2); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + const index_t out_img_size = outh * outw; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t oc = start1; oc < end1; oc += step1) { + float *out_base = padded_out_data + (b * outch + oc) * out_img_size; + for (index_t ic = 0; ic < inch; ++ic) { + const float *input_base = input_data + (b * inch + ic) * h * w; + const float *kernel_base = filter_data + (oc * inch + ic) * 9; + const float *in = input_base; + + const float *k0 = kernel_base; + const float *k1 = kernel_base + 3; + const float *k2 = kernel_base + 5; + + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + + for (index_t i = 0; i < h; ++i) { + float *out_row_base = out_base + i * 2 * outw; + float *out_row_0 = out_row_base; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + + 
index_t j = 0; + + for (index_t n = 0; n + 9 < outw; n += 8) { + float32x4_t in_vec = vld1q_f32(in); + + // out row 0 + float32x4x2_t out00 = vld2q_f32(out_row_0); + out00.val[0] = + neon_vfma_lane_0(out00.val[0], in_vec, k0_vec); + out00.val[1] = + neon_vfma_lane_1(out00.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0, out00); + + float32x4x2_t out01 = vld2q_f32(out_row_0 + 2); + out01.val[0] = + neon_vfma_lane_2(out01.val[0], in_vec, k0_vec); + vst2q_f32(out_row_0 + 2, out01); + + // out row 1 + float32x4x2_t out10 = vld2q_f32(out_row_1); + out10.val[0] = + neon_vfma_lane_0(out10.val[0], in_vec, k1_vec); + out10.val[1] = + neon_vfma_lane_1(out10.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1, out10); + + float32x4x2_t out11 = vld2q_f32(out_row_1 + 2); + out11.val[0] = + neon_vfma_lane_2(out11.val[0], in_vec, k1_vec); + vst2q_f32(out_row_1 + 2, out11); + + // out row 2 + float32x4x2_t out20 = vld2q_f32(out_row_2); + out20.val[0] = + neon_vfma_lane_1(out20.val[0], in_vec, k2_vec); + out20.val[1] = + neon_vfma_lane_2(out20.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2, out20); + + float32x4x2_t out21 = vld2q_f32(out_row_2 + 2); + out21.val[0] = + neon_vfma_lane_3(out21.val[0], in_vec, k2_vec); + vst2q_f32(out_row_2 + 2, out21); + + in += 4; + out_row_0 += 8; + out_row_1 += 8; + out_row_2 += 8; + j += 4; + } + + for (; j < w; ++j) { + float val = in[0]; + + for (int k = 0; k < 3; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k + 1]; + } + + in++; + out_row_0 += 2; + out_row_1 += 2; + out_row_2 += 2; + } + } + } + } + } + }, 0, batch, 1, 0, outch, 1); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/deconv_2d_3x3.h b/mace/ops/arm/fp32/deconv_2d_3x3.h new file mode 100644 index 0000000000000000000000000000000000000000..4495cbe8e4ef5fa3b05c72e9970fa05fb67a7fbb --- /dev/null +++ b/mace/ops/arm/fp32/deconv_2d_3x3.h @@ -0,0 +1,70 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
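The 3x3 kernels above load the three filter rows at offsets 0, 3, and 5 rather than 0, 3, 6, so every vld1q_f32 stays inside the 9-float filter; that is why the third row is accumulated with lane indices 1-3 and why the scalar tails read k*_2[k + 1]. A minimal standalone sketch of that lane mapping (the helper name is illustrative, not taken from the patch):

#include <arm_neon.h>

// Loads a row-major 3x3 filter (9 floats) into three 4-lane registers without
// reading past the end of the array.
static inline void LoadFilter3x3(const float *filter,
                                 float32x4_t *row0,
                                 float32x4_t *row1,
                                 float32x4_t *row2) {
  *row0 = vld1q_f32(filter);      // lanes 0..2 = filter[0..2]
  *row1 = vld1q_f32(filter + 3);  // lanes 0..2 = filter[3..5]
  // A load at +6 would touch filter[9], one float past the buffer, so the
  // kernels load at +5 instead: lanes 1..3 = filter[6..8].
  *row2 = vld1q_f32(filter + 5);
}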
+
+#ifndef MACE_OPS_ARM_FP32_DECONV_2D_3X3_H_
+#define MACE_OPS_ARM_FP32_DECONV_2D_3X3_H_
+
+#include <memory>
+#include <vector>
+
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/types.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/deconv_2d.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Deconv2dK3x3S1 : public Deconv2dBase {
+ public:
+  Deconv2dK3x3S1(const std::vector<int> &paddings,
+                 const Padding padding_type,
+                 const FrameworkType framework_type)
+      : Deconv2dBase({1, 1}, {1, 1}, paddings, padding_type, framework_type) {}
+  virtual ~Deconv2dK3x3S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+class Deconv2dK3x3S2 : public Deconv2dBase {
+ public:
+  Deconv2dK3x3S2(const std::vector<int> &paddings,
+                 const Padding padding_type,
+                 const FrameworkType framework_type)
+      : Deconv2dBase({2, 2}, {1, 1}, paddings, padding_type, framework_type) {}
+  virtual ~Deconv2dK3x3S2() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_DECONV_2D_3X3_H_
diff --git a/mace/ops/arm/fp32/deconv_2d_4x4.cc b/mace/ops/arm/fp32/deconv_2d_4x4.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3c47ecff71bc46ea02aa73cb49d511a22c61ba27
--- /dev/null
+++ b/mace/ops/arm/fp32/deconv_2d_4x4.cc
@@ -0,0 +1,581 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
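These kernels lean on the neon_vfma_lane_0..3 helpers from mace/ops/arm/fp32/common_neon.h, which is not shown in this patch. Judging from the equivalent open-coded blocks further down in depthwise_conv_2d_3x3.cc, they presumably wrap vfmaq_laneq_f32 on AArch64 and vmlaq_lane_f32 on 32-bit ARM; a sketch of what lane 0 would look like under that assumption:

#include <arm_neon.h>

// acc += a * b[0]; fused multiply-add on AArch64, multiply-accumulate on ARMv7.
static inline float32x4_t neon_vfma_lane_0(float32x4_t acc,
                                            float32x4_t a,
                                            float32x4_t b) {
#if defined(__aarch64__)
  return vfmaq_laneq_f32(acc, a, b, 0);
#else
  return vmlaq_lane_f32(acc, a, vget_low_f32(b), 0);
#endif
}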
+ +#include "mace/ops/arm/fp32/deconv_2d_4x4.h" + +#include +#include "mace/ops/arm/fp32/common_neon.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + + const index_t out_img_size = outh * outw; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t oc = start1; oc < end1; oc += step1) { + if (oc + 1 < outch) { + float *out_base = padded_out_data + (b * outch + oc) * out_img_size; + float *out_base1 = out_base + out_img_size; + for (index_t q = 0; q < inch; q++) { + const float *input_base = input_data + (b * inch + q) * h * w; + const float *in = input_base; + const float *kernel_base = filter_data + (oc * inch + q) * 16; + const float *k0 = kernel_base; + const float *k1 = kernel_base + 4; + const float *k2 = kernel_base + 8; + const float *k3 = kernel_base + 12; + + const float *kernel_base1 = kernel_base + inch * 16; + const float *k10 = kernel_base1; + const float *k11 = kernel_base1 + 4; + const float *k12 = kernel_base1 + 8; + const float *k13 = kernel_base1 + 12; + + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + float32x4_t k3_vec = vld1q_f32(k3); + + float32x4_t k10_vec = vld1q_f32(k10); + float32x4_t k11_vec = vld1q_f32(k11); + float32x4_t k12_vec = vld1q_f32(k12); + float32x4_t k13_vec = vld1q_f32(k13); + + for (index_t i = 0; i < h; i++) { + float *out_row = out_base + i * outw; + + float *out_row_0 = out_row; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + float *out_row_3 = out_row_2 + outw; + + float *out_row1 = out_base1 + i * outw; + + float *out_row1_0 = out_row1; + float *out_row1_1 = out_row1_0 + outw; + float *out_row1_2 = out_row1_1 + outw; + float *out_row1_3 = out_row1_2 + outw; + + index_t j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + float32x4_t out00, out01, out02, out03; + float32x4_t out10, out11, out12, out13; + + out00 = vld1q_f32(out_row_0); + out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); + vst1q_f32(out_row_0, out00); + + out10 = vld1q_f32(out_row1_0); + out10 = neon_vfma_lane_0(out10, in_vec, k10_vec); + vst1q_f32(out_row1_0, out10); + + out01 = vld1q_f32(out_row_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); + vst1q_f32(out_row_0 + 1, 
out01); + + out11 = vld1q_f32(out_row1_0 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 1, out11); + + out02 = vld1q_f32(out_row_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); + vst1q_f32(out_row_0 + 2, out02); + + out12 = vld1q_f32(out_row1_0 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 2, out12); + + out03 = vld1q_f32(out_row_0 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); + vst1q_f32(out_row_0 + 3, out03); + + out13 = vld1q_f32(out_row1_0 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 3, out13); + + out00 = vld1q_f32(out_row_1); + out00 = neon_vfma_lane_0(out00, in_vec, k1_vec); + vst1q_f32(out_row_1, out00); + + out10 = vld1q_f32(out_row1_1); + out10 = neon_vfma_lane_0(out10, in_vec, k11_vec); + vst1q_f32(out_row1_1, out10); + + out01 = vld1q_f32(out_row_1 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k1_vec); + vst1q_f32(out_row_1 + 1, out01); + + out11 = vld1q_f32(out_row1_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 1, out11); + + out02 = vld1q_f32(out_row_1 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k1_vec); + vst1q_f32(out_row_1 + 2, out02); + + out12 = vld1q_f32(out_row1_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 2, out12); + + out03 = vld1q_f32(out_row_1 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k1_vec); + vst1q_f32(out_row_1 + 3, out03); + + out13 = vld1q_f32(out_row1_1 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 3, out13); + + out00 = vld1q_f32(out_row_2 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k2_vec); + vst1q_f32(out_row_2 + 0, out00); + + out10 = vld1q_f32(out_row1_2 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 0, out10); + + out01 = vld1q_f32(out_row_2 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k2_vec); + vst1q_f32(out_row_2 + 1, out01); + + out11 = vld1q_f32(out_row1_2 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 1, out11); + + out02 = vld1q_f32(out_row_2 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k2_vec); + vst1q_f32(out_row_2 + 2, out02); + + out12 = vld1q_f32(out_row1_2 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 2, out12); + + out03 = vld1q_f32(out_row_2 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k2_vec); + vst1q_f32(out_row_2 + 3, out03); + + out13 = vld1q_f32(out_row1_2 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 3, out13); + + out00 = vld1q_f32(out_row_3 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k3_vec); + vst1q_f32(out_row_3 + 0, out00); + + out10 = vld1q_f32(out_row1_3 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k13_vec); + vst1q_f32(out_row1_3 + 0, out10); + + out01 = vld1q_f32(out_row_3 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k3_vec); + vst1q_f32(out_row_3 + 1, out01); + + out11 = vld1q_f32(out_row1_3 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k13_vec); + vst1q_f32(out_row1_3 + 1, out11); + + out02 = vld1q_f32(out_row_3 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k3_vec); + vst1q_f32(out_row_3 + 2, out02); + + out12 = vld1q_f32(out_row1_3 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k13_vec); + vst1q_f32(out_row1_3 + 2, out12); + + out03 = vld1q_f32(out_row_3 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k3_vec); + vst1q_f32(out_row_3 + 3, out03); + + out13 = vld1q_f32(out_row1_3 + 3); + out13 = 
neon_vfma_lane_3(out13, in_vec, k13_vec); + vst1q_f32(out_row1_3 + 3, out13); + + in += 4; + out_row_0 += 4; + out_row_1 += 4; + out_row_2 += 4; + out_row_3 += 4; + out_row1_0 += 4; + out_row1_1 += 4; + out_row1_2 += 4; + out_row1_3 += 4; + } + + for (; j < w; j++) { + float val = in[0]; + for (int k = 0; k < 4; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k]; + out_row_3[k] += val * k3[k]; + out_row1_0[k] += val * k10[k]; + out_row1_1[k] += val * k11[k]; + out_row1_2[k] += val * k12[k]; + out_row1_3[k] += val * k13[k]; + } + in++; + out_row_0++; + out_row_1++; + out_row_2++; + out_row_3++; + out_row1_0++; + out_row1_1++; + out_row1_2++; + out_row1_3++; + } + } + } + } else { + float *out_base = padded_out_data + (b * outch + oc) * out_img_size; + for (index_t q = 0; q < inch; q++) { + const float *input_base = input_data + (b * inch + q) * h * w; + const float *kernel_base = filter_data + (oc * inch + q) * 16; + const float *in = input_base; + const float *k0 = kernel_base; + const float *k1 = kernel_base + 4; + const float *k2 = kernel_base + 8; + const float *k3 = kernel_base + 12; + + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + float32x4_t k3_vec = vld1q_f32(k3); + + for (index_t i = 0; i < h; i++) { + float *out_row = out_base + i * outw; + float *out_row_0 = out_row; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + float *out_row_3 = out_row_2 + outw; + int j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00 = vld1q_f32(out_row_0); + out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); + vst1q_f32(out_row_0, out00); + + float32x4_t out01 = vld1q_f32(out_row_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); + vst1q_f32(out_row_0 + 1, out01); + + float32x4_t out02 = vld1q_f32(out_row_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); + vst1q_f32(out_row_0 + 2, out02); + + float32x4_t out03 = vld1q_f32(out_row_0 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); + vst1q_f32(out_row_0 + 3, out03); + + float32x4_t out10 = vld1q_f32(out_row_1); + out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); + vst1q_f32(out_row_1, out10); + + float32x4_t out11 = vld1q_f32(out_row_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); + vst1q_f32(out_row_1 + 1, out11); + + float32x4_t out12 = vld1q_f32(out_row_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); + vst1q_f32(out_row_1 + 2, out12); + + float32x4_t out13 = vld1q_f32(out_row_1 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k1_vec); + vst1q_f32(out_row_1 + 3, out13); + + float32x4_t out20 = vld1q_f32(out_row_2 + 0); + out20 = neon_vfma_lane_0(out20, in_vec, k2_vec); + vst1q_f32(out_row_2 + 0, out20); + + float32x4_t out21 = vld1q_f32(out_row_2 + 1); + out21 = neon_vfma_lane_1(out21, in_vec, k2_vec); + vst1q_f32(out_row_2 + 1, out21); + + float32x4_t out22 = vld1q_f32(out_row_2 + 2); + out22 = neon_vfma_lane_2(out22, in_vec, k2_vec); + vst1q_f32(out_row_2 + 2, out22); + + float32x4_t out23 = vld1q_f32(out_row_2 + 3); + out23 = neon_vfma_lane_3(out23, in_vec, k2_vec); + vst1q_f32(out_row_2 + 3, out23); + + float32x4_t out30 = vld1q_f32(out_row_3 + 0); + out30 = neon_vfma_lane_0(out30, in_vec, k3_vec); + vst1q_f32(out_row_3 + 0, out30); + + float32x4_t out31 = vld1q_f32(out_row_3 + 1); + out31 = neon_vfma_lane_1(out31, in_vec, k3_vec); + vst1q_f32(out_row_3 + 1, out31); + + float32x4_t out32 = vld1q_f32(out_row_3 + 2); + 
out32 = neon_vfma_lane_2(out32, in_vec, k3_vec); + vst1q_f32(out_row_3 + 2, out32); + + float32x4_t out33 = vld1q_f32(out_row_3 + 3); + out33 = neon_vfma_lane_3(out33, in_vec, k3_vec); + vst1q_f32(out_row_3 + 3, out33); + + in += 4; + out_row_0 += 4; + out_row_1 += 4; + out_row_2 += 4; + out_row_3 += 4; + } + + for (; j < w; j++) { + float val = in[0]; + for (int k = 0; k < 4; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k]; + out_row_3[k] += val * k3[k]; + } + in++; + out_row_0++; + out_row_1++; + out_row_2++; + out_row_3++; + } + } + } + } + } + } + }, 0, batch, 1, 0, outch, 2); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + const index_t out_img_size = outh * outw; + + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t p = start1; p < end1; p += step1) { + float *out_base = padded_out_data + (b * outch + p) * out_img_size; + for (index_t q = 0; q < inch; q++) { + const float *input_base = input_data + (b * inch + q) * h * w; + const float *kernel_base = filter_data + (p * inch + q) * 16; + const float *in = input_base; + + const float *k0 = kernel_base; + const float *k1 = kernel_base + 4; + const float *k2 = kernel_base + 8; + const float *k3 = kernel_base + 12; + + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + float32x4_t k3_vec = vld1q_f32(k3); + + for (index_t i = 0; i < h; i++) { + float *out_row = out_base + 2 * i * outw; + + float *out_row_0 = out_row; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + float *out_row_3 = out_row_2 + outw; + + index_t j = 0; + + for (index_t n = 0; n + 9 < outw; n += 8) { + float32x4_t in_vec = vld1q_f32(in); + + // row 0 + float32x4x2_t out0 = vld2q_f32(out_row_0); + out0.val[0] = + neon_vfma_lane_0(out0.val[0], in_vec, k0_vec); + out0.val[1] = + neon_vfma_lane_1(out0.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0, out0); + out0 = vld2q_f32(out_row_0 + 2); + out0.val[0] = + neon_vfma_lane_2(out0.val[0], in_vec, k0_vec); + out0.val[1] = + neon_vfma_lane_3(out0.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0 + 2, out0); + + // row 1 + float32x4x2_t out1 = vld2q_f32(out_row_1); + out1.val[0] = + 
neon_vfma_lane_0(out1.val[0], in_vec, k1_vec); + out1.val[1] = + neon_vfma_lane_1(out1.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1, out1); + out1 = vld2q_f32(out_row_1 + 2); + out1.val[0] = + neon_vfma_lane_2(out1.val[0], in_vec, k1_vec); + out1.val[1] = + neon_vfma_lane_3(out1.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1 + 2, out1); + + // row 2 + float32x4x2_t out2 = vld2q_f32(out_row_2); + out2.val[0] = + neon_vfma_lane_0(out2.val[0], in_vec, k2_vec); + out2.val[1] = + neon_vfma_lane_1(out2.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2, out2); + out2 = vld2q_f32(out_row_2 + 2); + out2.val[0] = + neon_vfma_lane_2(out2.val[0], in_vec, k2_vec); + out2.val[1] = + neon_vfma_lane_3(out2.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2 + 2, out2); + + // row 3 + float32x4x2_t out3 = vld2q_f32(out_row_3); + out3.val[0] = + neon_vfma_lane_0(out3.val[0], in_vec, k3_vec); + out3.val[1] = + neon_vfma_lane_1(out3.val[1], in_vec, k3_vec); + vst2q_f32(out_row_3, out3); + out3 = vld2q_f32(out_row_3 + 2); + out3.val[0] = + neon_vfma_lane_2(out3.val[0], in_vec, k3_vec); + out3.val[1] = + neon_vfma_lane_3(out3.val[1], in_vec, k3_vec); + vst2q_f32(out_row_3 + 2, out3); + + in += 4; + out_row_0 += 8; + out_row_1 += 8; + out_row_2 += 8; + out_row_3 += 8; + j += 4; + } + + for (; j < w; j++) { + float val = in[0]; + for (int k = 0; k < 4; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k]; + out_row_3[k] += val * k3[k]; + } + in++; + out_row_0 += 2; + out_row_1 += 2; + out_row_2 += 2; + out_row_3 += 2; + } + } + } + } + } + }, 0, batch, 1, 0, outch, 1); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/deconv_2d_4x4.h b/mace/ops/arm/fp32/deconv_2d_4x4.h new file mode 100644 index 0000000000000000000000000000000000000000..9f09056af0224331fca8815cca18a1f7eecdd1cc --- /dev/null +++ b/mace/ops/arm/fp32/deconv_2d_4x4.h @@ -0,0 +1,70 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
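The stride-2 variants (Deconv2dK3x3S2 and Deconv2dK4x4S2) scatter each input pixel into every other output column, so they access the output through vld2q_f32/vst2q_f32, which de-interleave eight consecutive floats into an even-index and an odd-index vector. A small self-contained sketch of that accumulate pattern (the function name is illustrative, not from the patch):

#include <arm_neon.h>

// Adds one contribution to out[0,2,4,6] and another to out[1,3,5,7] with a
// single de-interleaved load/store pair; out must point at 8 writable floats.
static inline void AccumulateEvenOdd(float *out,
                                     float32x4_t even_contrib,
                                     float32x4_t odd_contrib) {
  float32x4x2_t v = vld2q_f32(out);   // val[0]: even columns, val[1]: odd columns
  v.val[0] = vaddq_f32(v.val[0], even_contrib);
  v.val[1] = vaddq_f32(v.val[1], odd_contrib);
  vst2q_f32(out, v);                  // re-interleave on store
}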
+
+#ifndef MACE_OPS_ARM_FP32_DECONV_2D_4X4_H_
+#define MACE_OPS_ARM_FP32_DECONV_2D_4X4_H_
+
+#include <memory>
+#include <vector>
+
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/types.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/deconv_2d.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Deconv2dK4x4S1 : public Deconv2dBase {
+ public:
+  Deconv2dK4x4S1(const std::vector<int> &paddings,
+                 const Padding padding_type,
+                 const FrameworkType framework_type)
+      : Deconv2dBase({1, 1}, {1, 1}, paddings, padding_type, framework_type) {}
+  virtual ~Deconv2dK4x4S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+class Deconv2dK4x4S2 : public Deconv2dBase {
+ public:
+  Deconv2dK4x4S2(const std::vector<int> &paddings,
+                 const Padding padding_type,
+                 const FrameworkType framework_type)
+      : Deconv2dBase({2, 2}, {1, 1}, paddings, padding_type, framework_type) {}
+  virtual ~Deconv2dK4x4S2() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_DECONV_2D_4X4_H_
diff --git a/mace/ops/arm/fp32/deconv_2d_general.cc b/mace/ops/arm/fp32/deconv_2d_general.cc
new file mode 100644
index 0000000000000000000000000000000000000000..47bfe39cf27adac58b1240afa66390fc23dc8866
--- /dev/null
+++ b/mace/ops/arm/fp32/deconv_2d_general.cc
@@ -0,0 +1,117 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
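Every Compute() in these files first accumulates into a scratch output that ResizeOutAndPadOut may have padded, then calls UnPadOutput to copy the valid window into the caller's tensor. Neither helper appears in this patch; the sketch below only illustrates the cropping idea for NCHW data and is not the real UnPadOutput implementation:

#include <cstddef>

// Copies the inner out_h x out_w window out of each padded_h x padded_w image
// plane, discarding pad_top rows and pad_left columns of padding
// (planes = batch * channels). Sketch only.
void CropPaddedOutput(const float *padded, float *out,
                      std::size_t planes,
                      std::size_t padded_h, std::size_t padded_w,
                      std::size_t out_h, std::size_t out_w,
                      std::size_t pad_top, std::size_t pad_left) {
  for (std::size_t p = 0; p < planes; ++p) {
    const float *src = padded + p * padded_h * padded_w;
    float *dst = out + p * out_h * out_w;
    for (std::size_t i = 0; i < out_h; ++i) {
      const float *src_row = src + (i + pad_top) * padded_w + pad_left;
      for (std::size_t j = 0; j < out_w; ++j) {
        dst[i * out_w + j] = src_row[j];
      }
    }
  }
}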
+ +#include "mace/ops/arm/fp32/deconv_2d_general.h" + +// TODO(liutuo): optimize it + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus Deconv2dGeneral::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_img_size = out_height * out_width; + const index_t in_img_size = in_height * in_width; + const index_t kernel_h = filter->dim(2); + const index_t kernel_w = filter->dim(3); + + const int kernel_size = static_cast(kernel_h * kernel_w); + std::vector index_map(kernel_size, 0); + for (index_t i = 0; i < kernel_h; ++i) { + for (index_t j = 0; j < kernel_w; ++j) { + index_map[i * kernel_w + j] = i * out_width + j; + } + } + + const index_t batch = in_shape[0]; + const index_t out_channels = out_shape[1]; + const index_t in_channels = in_shape[1]; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t oc = start1; oc < end1; oc += step1) { + float *out_base = + padded_out_data + (b * out_channels + oc) * out_img_size; + for (index_t i = 0; i < in_height; ++i) { + for (index_t j = 0; j < in_width; ++j) { + const index_t out_offset = + i * strides_[0] * out_width + j * strides_[1]; + for (int ic = 0; ic < in_channels; ++ic) { + const index_t input_idx = + (b * in_channels + ic) * in_img_size + i * in_width + j; + const float val = input_data[input_idx]; + const index_t kernel_offset = + (oc * in_channels + ic) * kernel_size; + for (int k = 0; k < kernel_size; ++k) { + const index_t out_idx = out_offset + index_map[k]; + const index_t kernel_idx = kernel_offset + k; + out_base[out_idx] += val * filter_data[kernel_idx]; + } + } + } + } + } + } + }, 0, batch, 1, 0, out_channels, 1); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace + diff --git a/mace/ops/arm/fp32/deconv_2d_general.h b/mace/ops/arm/fp32/deconv_2d_general.h new file mode 100644 index 0000000000000000000000000000000000000000..d11ada030c02c4f155aec12e0a162513cdae0c25 --- /dev/null +++ b/mace/ops/arm/fp32/deconv_2d_general.h @@ -0,0 +1,60 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_FP32_DECONV_2D_GENERAL_H_
+#define MACE_OPS_ARM_FP32_DECONV_2D_GENERAL_H_
+
+#include <memory>
+#include <vector>
+
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/types.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/deconv_2d.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Deconv2dGeneral : public Deconv2dBase {
+ public:
+  Deconv2dGeneral(const std::vector<int> &strides,
+                  const std::vector<int> &dilations,
+                  const std::vector<int> &paddings,
+                  const Padding padding_type,
+                  const FrameworkType framework_type)
+      : Deconv2dBase(strides,
+                     dilations,
+                     paddings,
+                     padding_type,
+                     framework_type) {}
+  virtual ~Deconv2dGeneral() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_DECONV_2D_GENERAL_H_
diff --git a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc b/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc
index 3ac8eb5de20503a89b9b25202b91ddbf8e305031..a27827b471818c049a09e532c059b56396e8f452 100644
--- a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc
+++ b/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc
@@ -12,9 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
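The depthwise_conv_2d_3x3.cc changes below replace the `#pragma omp parallel for collapse(2)` loops with utils::ThreadPool::Compute2D. Judging from the call sites in this patch, the trailing arguments are (start, end, step) for each of the two dimensions and the lambda receives the sub-range assigned to a worker; the 3x3/4x4 stride-1 deconv kernels pass step 2 for the channel dimension because they produce two output channels per iteration. A small wrapper showing the call shape (the thread-pool header path and the wrapper name are assumptions, not from the patch):

#include "mace/core/types.h"         // mace::index_t
#include "mace/utils/thread_pool.h"  // assumed location of utils::ThreadPool

// Runs fn(batch_index, channel_index) for every pair on the MACE thread pool,
// the same two-level tiling used by the rewritten kernels below.
template <typename Fn>
void ParallelOverBatchAndChannel(mace::utils::ThreadPool *pool,
                                 mace::index_t batch,
                                 mace::index_t channels,
                                 Fn fn) {
  pool->Compute2D(
      [=](mace::index_t start0, mace::index_t end0, mace::index_t step0,
          mace::index_t start1, mace::index_t end1, mace::index_t step1) {
        for (mace::index_t b = start0; b < end0; b += step0) {
          for (mace::index_t c = start1; c < end1; c += step1) {
            fn(b, c);
          }
        }
      },
      0, batch, 1,       // dimension 0: start, end, step
      0, channels, 1);   // dimension 1: start, end, step
}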
-#include #include "mace/ops/arm/fp32/depthwise_conv_2d_3x3.h" +#include + namespace mace { namespace ops { namespace arm { @@ -64,14 +65,26 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, const int pad_top = paddings[0] / 2; const int pad_left = paddings[1] / 2; - const index_t multiplier = out_shape[1] / in_shape[1]; - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + const index_t multiplier = out_channels / in_channels; std::vector out_bounds; CalOutputBoundaryWithoutUsingInputPad(out_shape, paddings, &out_bounds); + const index_t valid_h_start = out_bounds[0]; + const index_t valid_h_stop = out_bounds[1]; + const index_t valid_w_start = out_bounds[2]; + const index_t valid_w_stop = out_bounds[3]; Tensor::MappingGuard in_guard(input); Tensor::MappingGuard filter_guard(filter); @@ -80,159 +93,211 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, auto input_data = input->data(); auto output_data = output->mutable_data(); -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < in_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; ++m) { - const index_t c = m / multiplier; - const index_t multi_index = m % multiplier; - const float *in_base = input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr = filter_data + multi_index * in_shape[1] * 9 + c * 9; - float *out_base = output_data + b * out_batch_size + m * out_image_size; - index_t h, w; - const index_t out_width = out_shape[3]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - - const index_t valid_h_start = out_bounds[0]; - const index_t valid_h_stop = out_bounds[1]; - const index_t valid_w_start = out_bounds[2]; - const index_t valid_w_stop = out_bounds[3]; - - // top - for (h = 0; h < valid_h_start; ++h) { - for (w = 0; w < out_shape[3]; ++w) { - DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h - pad_top, - w - pad_left, out_width, in_height, in_width, 3, - 3, out_base); - } - } - - // load filter (1 outch x 3 height x 3 width): vf_outch_height - float32x4_t vf00, vf01, vf02; - vf00 = vld1q_f32(filter_ptr); - vf01 = vld1q_f32(filter_ptr + 3); - vf02 = vld1q_f32(filter_ptr + 5); - - for (h = valid_h_start; h + 1 < valid_h_stop; h += 2) { - // left - for (w = 0; w < valid_w_start; ++w) { - DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h - pad_top, - w - pad_left, out_width, in_height, in_width, 3, - 3, out_base); - DepthwiseConv2dPixel(in_base, filter_ptr, h + 1, w, h + 1 - pad_top, - w - pad_left, out_width, in_height, in_width, 3, - 3, out_base); + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = 
start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + const index_t c = m / multiplier; + const index_t multi_index = m % multiplier; + const float + *in_base = input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr = filter_data + multi_index * in_channels * 9 + c * 9; + float *out_base = output_data + b * out_batch_size + m * out_image_size; + index_t h, w; + + // top + for (h = 0; h < valid_h_start; ++h) { + for (w = 0; w < out_width; ++w) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + h, + w, + h - pad_top, + w - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } } - for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) { - // input (4 height x 3 slide): vi_height_slide - float32x4_t vi00, vi01, vi02, vi0n; - float32x4_t vi10, vi11, vi12, vi1n; - float32x4_t vi20, vi21, vi22, vi2n; - float32x4_t vi30, vi31, vi32, vi3n; - - // output (1 outch x 2 height x 4 width): vo_outch_height - float32x4_t vo00, vo01; - - // load input - index_t in_h = h - pad_top; - index_t in_w = w - pad_left; - index_t in_offset = in_h * in_width + in_w; - vi00 = vld1q_f32(in_base + in_offset); - vi0n = vld1q_f32(in_base + in_offset + 4); - vi10 = vld1q_f32(in_base + in_offset + in_width); - vi1n = vld1q_f32(in_base + in_offset + in_width + 4); - vi20 = vld1q_f32(in_base + in_offset + 2 * in_width); - vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 4); - vi30 = vld1q_f32(in_base + in_offset + 3 * in_width); - vi3n = vld1q_f32(in_base + in_offset + 3 * in_width + 4); - - vi01 = vextq_f32(vi00, vi0n, 1); - vi02 = vextq_f32(vi00, vi0n, 2); - vi11 = vextq_f32(vi10, vi1n, 1); - vi12 = vextq_f32(vi10, vi1n, 2); - vi21 = vextq_f32(vi20, vi2n, 1); - vi22 = vextq_f32(vi20, vi2n, 2); - vi31 = vextq_f32(vi30, vi3n, 1); - vi32 = vextq_f32(vi30, vi3n, 2); - - // load ouptut - index_t out_offset = h * out_width + w; - vo00 = vld1q_f32(out_base + out_offset); - vo01 = vld1q_f32(out_base + out_offset + out_width); + // load filter (1 outch x 3 height x 3 width): vf_outch_height + float32x4_t vf00, vf01, vf02; + vf00 = vld1q_f32(filter_ptr); + vf01 = vld1q_f32(filter_ptr + 3); + vf02 = vld1q_f32(filter_ptr + 5); + + for (h = valid_h_start; h + 1 < valid_h_stop; h += 2) { + // left + for (w = 0; w < valid_w_start; ++w) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + h, + w, + h - pad_top, + w - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + DepthwiseConv2dPixel(in_base, + filter_ptr, + h + 1, + w, + h + 1 - pad_top, + w - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } + + for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) { + // input (4 height x 3 slide): vi_height_slide + float32x4_t vi00, vi01, vi02, vi0n; + float32x4_t vi10, vi11, vi12, vi1n; + float32x4_t vi20, vi21, vi22, vi2n; + float32x4_t vi30, vi31, vi32, vi3n; + + // output (1 outch x 2 height x 4 width): vo_outch_height + float32x4_t vo00, vo01; + + // load input + index_t in_h = h - pad_top; + index_t in_w = w - pad_left; + index_t in_offset = in_h * in_width + in_w; + vi00 = vld1q_f32(in_base + in_offset); + vi0n = vld1q_f32(in_base + in_offset + 4); + vi10 = vld1q_f32(in_base + in_offset + in_width); + vi1n = vld1q_f32(in_base + in_offset + in_width + 4); + vi20 = vld1q_f32(in_base + in_offset + 2 * in_width); + vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 4); + vi30 = vld1q_f32(in_base + in_offset + 3 * in_width); + vi3n = vld1q_f32(in_base + in_offset + 3 * in_width + 4); + + vi01 = vextq_f32(vi00, vi0n, 1); 
+ vi02 = vextq_f32(vi00, vi0n, 2); + vi11 = vextq_f32(vi10, vi1n, 1); + vi12 = vextq_f32(vi10, vi1n, 2); + vi21 = vextq_f32(vi20, vi2n, 1); + vi22 = vextq_f32(vi20, vi2n, 2); + vi31 = vextq_f32(vi30, vi3n, 1); + vi32 = vextq_f32(vi30, vi3n, 2); + + // load ouptut + index_t out_offset = h * out_width + w; + vo00 = vld1q_f32(out_base + out_offset); + vo01 = vld1q_f32(out_base + out_offset + out_width); #if defined(__aarch64__) - // outch 0, height 0 - vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); - vo00 = vfmaq_laneq_f32(vo00, vi01, vf00, 1); - vo00 = vfmaq_laneq_f32(vo00, vi02, vf00, 2); - vo00 = vfmaq_laneq_f32(vo00, vi10, vf01, 0); - vo00 = vfmaq_laneq_f32(vo00, vi11, vf01, 1); - vo00 = vfmaq_laneq_f32(vo00, vi12, vf01, 2); - vo00 = vfmaq_laneq_f32(vo00, vi20, vf02, 1); - vo00 = vfmaq_laneq_f32(vo00, vi21, vf02, 2); - vo00 = vfmaq_laneq_f32(vo00, vi22, vf02, 3); - - // outch 0, height 1 - vo01 = vfmaq_laneq_f32(vo01, vi10, vf00, 0); - vo01 = vfmaq_laneq_f32(vo01, vi11, vf00, 1); - vo01 = vfmaq_laneq_f32(vo01, vi12, vf00, 2); - vo01 = vfmaq_laneq_f32(vo01, vi20, vf01, 0); - vo01 = vfmaq_laneq_f32(vo01, vi21, vf01, 1); - vo01 = vfmaq_laneq_f32(vo01, vi22, vf01, 2); - vo01 = vfmaq_laneq_f32(vo01, vi30, vf02, 1); - vo01 = vfmaq_laneq_f32(vo01, vi31, vf02, 2); - vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 3); + // outch 0, height 0 + vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); + vo00 = vfmaq_laneq_f32(vo00, vi01, vf00, 1); + vo00 = vfmaq_laneq_f32(vo00, vi02, vf00, 2); + vo00 = vfmaq_laneq_f32(vo00, vi10, vf01, 0); + vo00 = vfmaq_laneq_f32(vo00, vi11, vf01, 1); + vo00 = vfmaq_laneq_f32(vo00, vi12, vf01, 2); + vo00 = vfmaq_laneq_f32(vo00, vi20, vf02, 1); + vo00 = vfmaq_laneq_f32(vo00, vi21, vf02, 2); + vo00 = vfmaq_laneq_f32(vo00, vi22, vf02, 3); + + // outch 0, height 1 + vo01 = vfmaq_laneq_f32(vo01, vi10, vf00, 0); + vo01 = vfmaq_laneq_f32(vo01, vi11, vf00, 1); + vo01 = vfmaq_laneq_f32(vo01, vi12, vf00, 2); + vo01 = vfmaq_laneq_f32(vo01, vi20, vf01, 0); + vo01 = vfmaq_laneq_f32(vo01, vi21, vf01, 1); + vo01 = vfmaq_laneq_f32(vo01, vi22, vf01, 2); + vo01 = vfmaq_laneq_f32(vo01, vi30, vf02, 1); + vo01 = vfmaq_laneq_f32(vo01, vi31, vf02, 2); + vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 3); #else - // outch 0, height 0 - vo00 = vmlaq_lane_f32(vo00, vi00, vget_low_f32(vf00), 0); - vo00 = vmlaq_lane_f32(vo00, vi01, vget_low_f32(vf00), 1); - vo00 = vmlaq_lane_f32(vo00, vi02, vget_high_f32(vf00), 0); - vo00 = vmlaq_lane_f32(vo00, vi10, vget_low_f32(vf01), 0); - vo00 = vmlaq_lane_f32(vo00, vi11, vget_low_f32(vf01), 1); - vo00 = vmlaq_lane_f32(vo00, vi12, vget_high_f32(vf01), 0); - vo00 = vmlaq_lane_f32(vo00, vi20, vget_low_f32(vf02), 1); - vo00 = vmlaq_lane_f32(vo00, vi21, vget_high_f32(vf02), 0); - vo00 = vmlaq_lane_f32(vo00, vi22, vget_high_f32(vf02), 1); - - // outch 0, height 1 - vo01 = vmlaq_lane_f32(vo01, vi10, vget_low_f32(vf00), 0); - vo01 = vmlaq_lane_f32(vo01, vi11, vget_low_f32(vf00), 1); - vo01 = vmlaq_lane_f32(vo01, vi12, vget_high_f32(vf00), 0); - vo01 = vmlaq_lane_f32(vo01, vi20, vget_low_f32(vf01), 0); - vo01 = vmlaq_lane_f32(vo01, vi21, vget_low_f32(vf01), 1); - vo01 = vmlaq_lane_f32(vo01, vi22, vget_high_f32(vf01), 0); - vo01 = vmlaq_lane_f32(vo01, vi30, vget_low_f32(vf02), 1); - vo01 = vmlaq_lane_f32(vo01, vi31, vget_high_f32(vf02), 0); - vo01 = vmlaq_lane_f32(vo01, vi32, vget_high_f32(vf02), 1); + // outch 0, height 0 + vo00 = vmlaq_lane_f32(vo00, vi00, vget_low_f32(vf00), 0); + vo00 = vmlaq_lane_f32(vo00, vi01, vget_low_f32(vf00), 1); + vo00 = vmlaq_lane_f32(vo00, vi02, 
vget_high_f32(vf00), 0); + vo00 = vmlaq_lane_f32(vo00, vi10, vget_low_f32(vf01), 0); + vo00 = vmlaq_lane_f32(vo00, vi11, vget_low_f32(vf01), 1); + vo00 = vmlaq_lane_f32(vo00, vi12, vget_high_f32(vf01), 0); + vo00 = vmlaq_lane_f32(vo00, vi20, vget_low_f32(vf02), 1); + vo00 = vmlaq_lane_f32(vo00, vi21, vget_high_f32(vf02), 0); + vo00 = vmlaq_lane_f32(vo00, vi22, vget_high_f32(vf02), 1); + + // outch 0, height 1 + vo01 = vmlaq_lane_f32(vo01, vi10, vget_low_f32(vf00), 0); + vo01 = vmlaq_lane_f32(vo01, vi11, vget_low_f32(vf00), 1); + vo01 = vmlaq_lane_f32(vo01, vi12, vget_high_f32(vf00), 0); + vo01 = vmlaq_lane_f32(vo01, vi20, vget_low_f32(vf01), 0); + vo01 = vmlaq_lane_f32(vo01, vi21, vget_low_f32(vf01), 1); + vo01 = vmlaq_lane_f32(vo01, vi22, vget_high_f32(vf01), 0); + vo01 = vmlaq_lane_f32(vo01, vi30, vget_low_f32(vf02), 1); + vo01 = vmlaq_lane_f32(vo01, vi31, vget_high_f32(vf02), 0); + vo01 = vmlaq_lane_f32(vo01, vi32, vget_high_f32(vf02), 1); #endif - vst1q_f32(out_base + out_offset, vo00); - vst1q_f32(out_base + out_offset + out_width, vo01); - } // w - - // right - for (; w < out_width; ++w) { - DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h - pad_top, - w - pad_left, out_width, in_height, in_width, 3, - 3, out_base); - DepthwiseConv2dPixel(in_base, filter_ptr, h + 1, w, h + 1 - pad_top, - w - pad_left, out_width, in_height, in_width, 3, - 3, out_base); + vst1q_f32(out_base + out_offset, vo00); + vst1q_f32(out_base + out_offset + out_width, vo01); + } // w + + // right + for (; w < out_width; ++w) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + h, + w, + h - pad_top, + w - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + DepthwiseConv2dPixel(in_base, + filter_ptr, + h + 1, + w, + h + 1 - pad_top, + w - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } + } // h + + + // bottom + for (; h < out_height; ++h) { + for (w = 0; w < out_width; ++w) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + h, + w, + h - pad_top, + w - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } } - } // h - - - // bottom - for (; h < out_shape[2]; ++h) { - for (w = 0; w < out_shape[3]; ++w) { - DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h - pad_top, - w - pad_left, out_width, in_height, in_width, 3, - 3, out_base); - } - } - } // m - } // b + } // m + } // b + }, 0, batch, 1, 0, out_channels, 1); // threadpool return MaceStatus::MACE_SUCCESS; } @@ -256,14 +321,26 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, const int pad_top = paddings[0] / 2; const int pad_left = paddings[1] / 2; - const index_t multiplier = out_shape[1] / in_shape[1]; - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + const index_t multiplier = out_channels / in_channels; std::vector out_bounds; 
CalOutputBoundaryWithoutUsingInputPad(out_shape, paddings, &out_bounds); + const index_t valid_h_start = out_bounds[0]; + const index_t valid_h_stop = out_bounds[1]; + const index_t valid_w_start = out_bounds[2]; + const index_t valid_w_stop = out_bounds[3]; Tensor::MappingGuard in_guard(input); Tensor::MappingGuard filter_guard(filter); @@ -272,131 +349,165 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, auto input_data = input->data(); auto output_data = output->mutable_data(); -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < in_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; ++m) { - index_t c = m / multiplier; - index_t multi_index = m % multiplier; - const float *in_base = input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr = filter_data + multi_index * in_shape[1] * 9 + c * 9; - float *out_base = output_data + b * out_batch_size + m * out_image_size; - index_t h, w; - const index_t out_width = out_shape[3]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - - const index_t valid_h_start = out_bounds[0]; - const index_t valid_h_stop = out_bounds[1]; - const index_t valid_w_start = out_bounds[2]; - const index_t valid_w_stop = out_bounds[3]; - - // top - for (h = 0; h < valid_h_start; ++h) { - for (w = 0; w < out_width; ++w) { - DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h * 2 - pad_top, - w * 2 - pad_left, out_width, in_height, in_width, - 3, 3, out_base); - } - } - - // load filter (1 outch x 3 height x 3 width): vf_outch_height - float32x4_t vf00, vf01, vf02; - vf00 = vld1q_f32(filter_ptr); - vf01 = vld1q_f32(filter_ptr + 3); - vf02 = vld1q_f32(filter_ptr + 5); - - for (h = valid_h_start; h < valid_h_stop; ++h) { - // left - for (w = 0; w < valid_w_start; ++w) { - DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h * 2 - pad_top, - w * 2 - pad_left, out_width, in_height, in_width, - 3, 3, out_base); + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + index_t c = m / multiplier; + index_t multi_index = m % multiplier; + const float + *in_base = input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr = filter_data + multi_index * in_channels * 9 + c * 9; + float *out_base = output_data + b * out_batch_size + m * out_image_size; + index_t h, w; + + // top + for (h = 0; h < valid_h_start; ++h) { + for (w = 0; w < out_width; ++w) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + h, + w, + h * 2 - pad_top, + w * 2 - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } } - for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) { - float32x4x2_t vi0, vi1, vi2; - float32x4_t vi0n, vi1n, vi2n; - - // input (3 height x 3 slide): vi_height_slide - float32x4_t vi00, vi01, vi02; - float32x4_t vi10, vi11, vi12; - float32x4_t vi20, vi21, vi22; - - // output (1 outch x 1 height x 4 width): vo - float32x4_t vo; - - // load input - index_t in_h = h * 2 - pad_top; - index_t in_w = w * 2 - pad_left; - index_t in_offset = in_h * in_width + in_w; - vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] - vi1 = vld2q_f32(in_base + in_offset + in_width); - vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); - - vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] 
- vi1n = vld1q_f32(in_base + in_offset + in_width + 8); - vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); - - // load ouptut - index_t out_offset = h * out_width + w; - vo = vld1q_f32(out_base + out_offset); - - vi00 = vi0.val[0]; // [0.2.4.6] - vi01 = vi0.val[1]; // [1.3.5.7] - vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8] - vi10 = vi1.val[0]; - vi11 = vi1.val[1]; - vi12 = vextq_f32(vi10, vi1n, 1); - vi20 = vi2.val[0]; - vi21 = vi2.val[1]; - vi22 = vextq_f32(vi20, vi2n, 1); + // load filter (1 outch x 3 height x 3 width): vf_outch_height + float32x4_t vf00, vf01, vf02; + vf00 = vld1q_f32(filter_ptr); + vf01 = vld1q_f32(filter_ptr + 3); + vf02 = vld1q_f32(filter_ptr + 5); + + for (h = valid_h_start; h < valid_h_stop; ++h) { + // left + for (w = 0; w < valid_w_start; ++w) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + h, + w, + h * 2 - pad_top, + w * 2 - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } + + for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) { + float32x4x2_t vi0, vi1, vi2; + float32x4_t vi0n, vi1n, vi2n; + + // input (3 height x 3 slide): vi_height_slide + float32x4_t vi00, vi01, vi02; + float32x4_t vi10, vi11, vi12; + float32x4_t vi20, vi21, vi22; + + // output (1 outch x 1 height x 4 width): vo + float32x4_t vo; + + // load input + index_t in_h = h * 2 - pad_top; + index_t in_w = w * 2 - pad_left; + index_t in_offset = in_h * in_width + in_w; + vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] + vi1 = vld2q_f32(in_base + in_offset + in_width); + vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); + + vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] + vi1n = vld1q_f32(in_base + in_offset + in_width + 8); + vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); + + // load ouptut + index_t out_offset = h * out_width + w; + vo = vld1q_f32(out_base + out_offset); + + vi00 = vi0.val[0]; // [0.2.4.6] + vi01 = vi0.val[1]; // [1.3.5.7] + vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8] + vi10 = vi1.val[0]; + vi11 = vi1.val[1]; + vi12 = vextq_f32(vi10, vi1n, 1); + vi20 = vi2.val[0]; + vi21 = vi2.val[1]; + vi22 = vextq_f32(vi20, vi2n, 1); #if defined(__aarch64__) - // outch 0, height 0 - vo = vfmaq_laneq_f32(vo, vi00, vf00, 0); - vo = vfmaq_laneq_f32(vo, vi01, vf00, 1); - vo = vfmaq_laneq_f32(vo, vi02, vf00, 2); - vo = vfmaq_laneq_f32(vo, vi10, vf01, 0); - vo = vfmaq_laneq_f32(vo, vi11, vf01, 1); - vo = vfmaq_laneq_f32(vo, vi12, vf01, 2); - vo = vfmaq_laneq_f32(vo, vi20, vf02, 1); - vo = vfmaq_laneq_f32(vo, vi21, vf02, 2); - vo = vfmaq_laneq_f32(vo, vi22, vf02, 3); + // outch 0, height 0 + vo = vfmaq_laneq_f32(vo, vi00, vf00, 0); + vo = vfmaq_laneq_f32(vo, vi01, vf00, 1); + vo = vfmaq_laneq_f32(vo, vi02, vf00, 2); + vo = vfmaq_laneq_f32(vo, vi10, vf01, 0); + vo = vfmaq_laneq_f32(vo, vi11, vf01, 1); + vo = vfmaq_laneq_f32(vo, vi12, vf01, 2); + vo = vfmaq_laneq_f32(vo, vi20, vf02, 1); + vo = vfmaq_laneq_f32(vo, vi21, vf02, 2); + vo = vfmaq_laneq_f32(vo, vi22, vf02, 3); #else - // outch 0, height 0 - vo = vmlaq_lane_f32(vo, vi00, vget_low_f32(vf00), 0); - vo = vmlaq_lane_f32(vo, vi01, vget_low_f32(vf00), 1); - vo = vmlaq_lane_f32(vo, vi02, vget_high_f32(vf00), 0); - vo = vmlaq_lane_f32(vo, vi10, vget_low_f32(vf01), 0); - vo = vmlaq_lane_f32(vo, vi11, vget_low_f32(vf01), 1); - vo = vmlaq_lane_f32(vo, vi12, vget_high_f32(vf01), 0); - vo = vmlaq_lane_f32(vo, vi20, vget_low_f32(vf02), 1); - vo = vmlaq_lane_f32(vo, vi21, vget_high_f32(vf02), 0); - vo = vmlaq_lane_f32(vo, vi22, vget_high_f32(vf02), 1); + // outch 0, height 0 + vo 
= vmlaq_lane_f32(vo, vi00, vget_low_f32(vf00), 0); + vo = vmlaq_lane_f32(vo, vi01, vget_low_f32(vf00), 1); + vo = vmlaq_lane_f32(vo, vi02, vget_high_f32(vf00), 0); + vo = vmlaq_lane_f32(vo, vi10, vget_low_f32(vf01), 0); + vo = vmlaq_lane_f32(vo, vi11, vget_low_f32(vf01), 1); + vo = vmlaq_lane_f32(vo, vi12, vget_high_f32(vf01), 0); + vo = vmlaq_lane_f32(vo, vi20, vget_low_f32(vf02), 1); + vo = vmlaq_lane_f32(vo, vi21, vget_high_f32(vf02), 0); + vo = vmlaq_lane_f32(vo, vi22, vget_high_f32(vf02), 1); #endif - vst1q_f32(out_base + out_offset, vo); - } // w - - // right - for (; w < out_width; ++w) { - DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h * 2 - pad_top, - w * 2 - pad_left, out_width, in_height, in_width, - 3, 3, out_base); + vst1q_f32(out_base + out_offset, vo); + } // w + + // right + for (; w < out_width; ++w) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + h, + w, + h * 2 - pad_top, + w * 2 - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } + } // h + + + // bottom + for (; h < out_height; ++h) { + for (w = 0; w < out_width; ++w) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + h, + w, + h * 2 - pad_top, + w * 2 - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } } - } // h - - - // bottom - for (; h < out_shape[2]; ++h) { - for (w = 0; w < out_shape[3]; ++w) { - DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h * 2 - pad_top, - w * 2 - pad_left, out_width, in_height, in_width, - 3, 3, out_base); - } - } - } // m - } // b + } // m + } // b + }, 0, batch, 1, 0, out_channels, 1); return MaceStatus::MACE_SUCCESS; } diff --git a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.h b/mace/ops/arm/fp32/depthwise_conv_2d_3x3.h index e89ae2b969cdbdb7b32f34eeeada62d3ec14af3b..c130fbffd361dfb33be9974b3d603e630cb80979 100644 --- a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.h +++ b/mace/ops/arm/fp32/depthwise_conv_2d_3x3.h @@ -28,7 +28,7 @@ namespace fp32 { class DepthwiseConv2dK3x3S1 : public Conv2dBase { public: - DepthwiseConv2dK3x3S1(const std::vector paddings, + DepthwiseConv2dK3x3S1(const std::vector &paddings, const Padding padding_type) : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} virtual ~DepthwiseConv2dK3x3S1() {} @@ -37,12 +37,12 @@ class DepthwiseConv2dK3x3S1 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; class DepthwiseConv2dK3x3S2 : public Conv2dBase { public: - DepthwiseConv2dK3x3S2(const std::vector paddings, + DepthwiseConv2dK3x3S2(const std::vector &paddings, const Padding padding_type) : Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {} virtual ~DepthwiseConv2dK3x3S2() {} @@ -51,7 +51,7 @@ class DepthwiseConv2dK3x3S2 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; } // namespace fp32 diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.cc b/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.cc new file mode 100644 index 0000000000000000000000000000000000000000..3cd6d527b7f1fa67d053cc96dea8ae6505e32352 --- /dev/null +++ b/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.cc @@ -0,0 +1,782 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
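In the depthwise deconvolution variants below, group_ is set to the number of input channels, so channel c is deconvolved with its own 9-float filter at filter_data + c * 9. A plain scalar reference for what the stride-1 NEON code computes (single batch, NCHW, output pre-zeroed; sizes are the pre-crop padded ones, so out_h = h + 2 and out_w = w + 2 for a 3x3 kernel at stride 1):

// Scalar reference for a stride-1 3x3 depthwise deconvolution; sketch only.
void DepthwiseDeconv3x3S1Ref(const float *in, const float *filter, float *out,
                             int channels, int h, int w, int out_h, int out_w) {
  for (int c = 0; c < channels; ++c) {
    const float *in_c = in + c * h * w;
    const float *k = filter + c * 9;        // one 3x3 filter per channel
    float *out_c = out + c * out_h * out_w;
    for (int i = 0; i < h; ++i) {
      for (int j = 0; j < w; ++j) {
        const float val = in_c[i * w + j];
        for (int ki = 0; ki < 3; ++ki) {
          for (int kj = 0; kj < 3; ++kj) {
            // every input pixel scatters a scaled copy of the filter into a
            // 3x3 window of the output
            out_c[(i + ki) * out_w + (j + kj)] += val * k[ki * 3 + kj];
          }
        }
      }
    }
  }
}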
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h" + +#include +#include "mace/ops/arm/fp32/common_neon.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus DepthwiseDeconv2dK3x3S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + group_ = input->dim(1); + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t channels = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + const index_t in_img_size = h * w; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + const index_t out_img_size = outh * outw; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const index_t offset = b * channels + c; + float *out_base = padded_out_data + offset * out_img_size; + const float *input_base = input_data + offset * in_img_size; + const float *kernel_base = filter_data + c * 9; + const float *in = input_base; + const float *k0 = kernel_base; + const float *k1 = kernel_base + 3; + const float *k2 = kernel_base + 5; + + // load filter + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + + for (index_t i = 0; i < h; ++i) { + float *out_row_base = out_base + i * outw; + float *out_row0 = out_row_base; + float *out_row1 = out_row_base + outw; + float *out_row2 = out_row_base + 2 * outw; + index_t j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00, out01, out02; + float32x4_t out10, out11, out12; + float32x4_t out20, out21, out22; + + out00 = vld1q_f32(out_row0 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); + vst1q_f32(out_row0 + 0, out00); + + out01 = vld1q_f32(out_row0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); + vst1q_f32(out_row0 + 1, out01); + + out02 = vld1q_f32(out_row0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); + vst1q_f32(out_row0 + 2, out02); + + out10 = vld1q_f32(out_row1 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); + vst1q_f32(out_row1 + 0, out10); + + out11 = vld1q_f32(out_row1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); + vst1q_f32(out_row1 + 1, out11); + 
+ out12 = vld1q_f32(out_row1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); + vst1q_f32(out_row1 + 2, out12); + + out20 = vld1q_f32(out_row2 + 0); + out20 = neon_vfma_lane_1(out20, in_vec, k2_vec); + vst1q_f32(out_row2 + 0, out20); + + out21 = vld1q_f32(out_row2 + 1); + out21 = neon_vfma_lane_2(out21, in_vec, k2_vec); + vst1q_f32(out_row2 + 1, out21); + + out22 = vld1q_f32(out_row2 + 2); + out22 = neon_vfma_lane_3(out22, in_vec, k2_vec); + vst1q_f32(out_row2 + 2, out22); + + in += 4; + out_row0 += 4; + out_row1 += 4; + out_row2 += 4; + } + + for (; j < w; ++j) { + float val = in[0]; + for (int k = 0; k < 3; ++k) { + out_row0[k] += val * k0[k]; + out_row1[k] += val * k1[k]; + out_row2[k] += val * k2[k + 1]; + } + in++; + out_row0++; + out_row1++; + out_row2++; + } + } + } + } + }, 0, batch, 1, 0, channels, 1); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus DepthwiseDeconv2dK3x3S2::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + group_ = input->dim(1); + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t channels = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + const index_t in_img_size = h * w; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + const index_t out_img_size = outh * outw; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const index_t offset = b * channels + c; + float *out_base = padded_out_data + offset * out_img_size; + const float *input_base = input_data + offset * in_img_size; + const float *kernel_base = filter_data + c * 9; + const float *in = input_base; + + const float *k0 = kernel_base; + const float *k1 = kernel_base + 3; + const float *k2 = kernel_base + 5; + + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + + for (index_t i = 0; i < h; ++i) { + float *out_row_base = out_base + i * 2 * outw; + float *out_row_0 = out_row_base; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + + index_t j = 0; + + for (index_t n = 0; n + 9 < outw; n += 8) { + float32x4_t in_vec = vld1q_f32(in); + + // out row 0 + float32x4x2_t out00 = vld2q_f32(out_row_0); + out00.val[0] = + neon_vfma_lane_0(out00.val[0], in_vec, k0_vec); + out00.val[1] = + neon_vfma_lane_1(out00.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0, out00); + + float32x4x2_t out01 = vld2q_f32(out_row_0 + 2); + out01.val[0] = + neon_vfma_lane_2(out01.val[0], in_vec, k0_vec); + vst2q_f32(out_row_0 + 2, out01); + + // out row 1 + 
float32x4x2_t out10 = vld2q_f32(out_row_1); + out10.val[0] = + neon_vfma_lane_0(out10.val[0], in_vec, k1_vec); + out10.val[1] = + neon_vfma_lane_1(out10.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1, out10); + + float32x4x2_t out11 = vld2q_f32(out_row_1 + 2); + out11.val[0] = + neon_vfma_lane_2(out11.val[0], in_vec, k1_vec); + vst2q_f32(out_row_1 + 2, out11); + + // out row 2 + float32x4x2_t out20 = vld2q_f32(out_row_2); + out20.val[0] = + neon_vfma_lane_1(out20.val[0], in_vec, k2_vec); + out20.val[1] = + neon_vfma_lane_2(out20.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2, out20); + + float32x4x2_t out21 = vld2q_f32(out_row_2 + 2); + out21.val[0] = + neon_vfma_lane_3(out21.val[0], in_vec, k2_vec); + vst2q_f32(out_row_2 + 2, out21); + + in += 4; + out_row_0 += 8; + out_row_1 += 8; + out_row_2 += 8; + j += 4; + } + + for (; j < w; ++j) { + float val = in[0]; + + for (int k = 0; k < 3; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k + 1]; + } + + in++; + out_row_0 += 2; + out_row_1 += 2; + out_row_2 += 2; + } + } + } + } + }, 0, batch, 1, 0, channels, 1); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + + const index_t in_img_size = h * w; + const index_t out_img_size = outh * outw; + + const index_t inch_g = inch / group_; + const index_t outch_g = outch / group_; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t g = start1; g < end1; g += step1) { + for (index_t oc = start2; oc < end2; oc += step2) { + if (oc + 1 < outch_g) { + const index_t out_offset = b * outch + outch_g * g + oc; + float *out_base0 = padded_out_data + out_offset * out_img_size; + float *out_base1 = out_base0 + out_img_size; + for (index_t ic = 0; ic < inch_g; ++ic) { + const index_t in_offset = b * inch + inch_g * g + ic; + const float *input_base = input_data + in_offset * in_img_size; + const index_t kernel_offset = (oc * group_ + g) * inch_g + ic; + const float *kernel_base0 = filter_data + kernel_offset * 9; + const float *kernel_base1 = kernel_base0 + inch * 9; + const float *in = input_base; + + // output channel 0 + const float *k0_0 = kernel_base0; + const float *k0_1 = kernel_base0 + 3; + const float *k0_2 = 
kernel_base0 + 5; + // output channel 1 + const float *k1_0 = kernel_base1; + const float *k1_1 = kernel_base1 + 3; + const float *k1_2 = kernel_base1 + 5; + + // load filter + float32x4_t k00_vec, k01_vec, k02_vec; + float32x4_t k10_vec, k11_vec, k12_vec; + + k00_vec = vld1q_f32(k0_0); + k01_vec = vld1q_f32(k0_1); + k02_vec = vld1q_f32(k0_2); + + k10_vec = vld1q_f32(k1_0); + k11_vec = vld1q_f32(k1_1); + k12_vec = vld1q_f32(k1_2); + + for (index_t i = 0; i < h; ++i) { + float *out_row_base0 = out_base0 + i * outw; + float *out_row0_0 = out_row_base0; + float *out_row0_1 = out_row_base0 + outw; + float *out_row0_2 = out_row_base0 + 2 * outw; + + float *out_row_base1 = out_base1 + i * outw; + float *out_row1_0 = out_row_base1; + float *out_row1_1 = out_row_base1 + outw; + float *out_row1_2 = out_row_base1 + 2 * outw; + + index_t j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00, out01, out02; + float32x4_t out10, out11, out12; + float32x4_t out20, out21, out22; + + out00 = vld1q_f32(out_row0_0); + out00 = neon_vfma_lane_0(out00, in_vec, k00_vec); + vst1q_f32(out_row0_0, out00); + + out01 = vld1q_f32(out_row0_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 1, out01); + + out02 = vld1q_f32(out_row0_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 2, out02); + + out10 = vld1q_f32(out_row0_1 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 0, out10); + + out11 = vld1q_f32(out_row0_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 1, out11); + + out12 = vld1q_f32(out_row0_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 2, out12); + + out20 = vld1q_f32(out_row0_2 + 0); + out20 = neon_vfma_lane_1(out20, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 0, out20); + + out21 = vld1q_f32(out_row0_2 + 1); + out21 = neon_vfma_lane_2(out21, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 1, out21); + + out22 = vld1q_f32(out_row0_2 + 2); + out22 = neon_vfma_lane_3(out22, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 2, out22); + + out00 = vld1q_f32(out_row1_0 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 0, out00); + + out01 = vld1q_f32(out_row1_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 1, out01); + + out02 = vld1q_f32(out_row1_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 2, out02); + + out10 = vld1q_f32(out_row1_1 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 0, out10); + + out11 = vld1q_f32(out_row1_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 1, out11); + + out12 = vld1q_f32(out_row1_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 2, out12); + + out20 = vld1q_f32(out_row1_2 + 0); + out20 = neon_vfma_lane_1(out20, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 0, out20); + + out21 = vld1q_f32(out_row1_2 + 1); + out21 = neon_vfma_lane_2(out21, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 1, out21); + + out22 = vld1q_f32(out_row1_2 + 2); + out22 = neon_vfma_lane_3(out22, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 2, out22); + + in += 4; + out_row0_0 += 4; + out_row0_1 += 4; + out_row0_2 += 4; + out_row1_0 += 4; + out_row1_1 += 4; + out_row1_2 += 4; + } + + for (; j < w; ++j) { + float val = in[0]; + for (int k = 0; k < 3; ++k) { + out_row0_0[k] += val * k0_0[k]; + 
out_row0_1[k] += val * k0_1[k]; + out_row0_2[k] += val * k0_2[k + 1]; + out_row1_0[k] += val * k1_0[k]; + out_row1_1[k] += val * k1_1[k]; + out_row1_2[k] += val * k1_2[k + 1]; + } + in++; + out_row0_0++; + out_row0_1++; + out_row0_2++; + out_row1_0++; + out_row1_1++; + out_row1_2++; + } + } + } + } else { + const index_t out_offset = b * outch + outch_g * g + oc; + float *out_base0 = padded_out_data + out_offset * out_img_size; + for (index_t ic = 0; ic < inch_g; ++ic) { + const index_t in_offset = (b * group_ + g) * inch_g + ic; + const float *input_base = input_data + in_offset * in_img_size; + const index_t kernel_offset = (oc * group_ + g) * inch_g + ic; + const float *kernel_base0 = filter_data + kernel_offset * 9; + const float *in = input_base; + const float *k0_0 = kernel_base0; + const float *k0_1 = kernel_base0 + 3; + const float *k0_2 = kernel_base0 + 5; + + // load filter + float32x4_t k00_vec = vld1q_f32(k0_0); + float32x4_t k01_vec = vld1q_f32(k0_1); + float32x4_t k02_vec = vld1q_f32(k0_2); + + for (index_t i = 0; i < h; ++i) { + float *out_row_base0 = out_base0 + i * outw; + float *out_row0_0 = out_row_base0; + float *out_row0_1 = out_row_base0 + outw; + float *out_row0_2 = out_row_base0 + 2 * outw; + index_t j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00, out01, out02; + float32x4_t out10, out11, out12; + float32x4_t out20, out21, out22; + + out00 = vld1q_f32(out_row0_0 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 0, out00); + + out01 = vld1q_f32(out_row0_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 1, out01); + + out02 = vld1q_f32(out_row0_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 2, out02); + + out10 = vld1q_f32(out_row0_1 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 0, out10); + + out11 = vld1q_f32(out_row0_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 1, out11); + + out12 = vld1q_f32(out_row0_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 2, out12); + + out20 = vld1q_f32(out_row0_2 + 0); + out20 = neon_vfma_lane_1(out20, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 0, out20); + + out21 = vld1q_f32(out_row0_2 + 1); + out21 = neon_vfma_lane_2(out21, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 1, out21); + + out22 = vld1q_f32(out_row0_2 + 2); + out22 = neon_vfma_lane_3(out22, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 2, out22); + + in += 4; + out_row0_0 += 4; + out_row0_1 += 4; + out_row0_2 += 4; + } + + for (; j < w; ++j) { + float val = in[0]; + for (int k = 0; k < 3; ++k) { + out_row0_0[k] += val * k0_0[k]; + out_row0_1[k] += val * k0_1[k]; + out_row0_2[k] += val * k0_2[k + 1]; + } + in++; + out_row0_0++; + out_row0_1++; + out_row0_2++; + } + } + } + } + } + } + } + }, 0, batch, 1, 0, group_, 1, 0, outch_g, 2); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard 
input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + + const index_t in_img_size = h * w; + const index_t out_img_size = outh * outw; + + const index_t inch_g = inch / group_; + const index_t outch_g = outch / group_; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t g = start1; g < end1; g += step1) { + for (index_t oc = start2; oc < end2; oc += step2) { + const index_t out_offset = b * outch + outch_g * g + oc; + float *out_base = padded_out_data + out_offset * out_img_size; + for (index_t ic = 0; ic < inch_g; ++ic) { + const index_t in_offset = b * inch + inch_g * g + ic; + const float *input_base = input_data + in_offset * in_img_size; + const index_t kernel_offset = (oc * group_ + g) * inch_g + ic; + const float *kernel_base = filter_data + kernel_offset * 9; + const float *in = input_base; + + const float *k0 = kernel_base; + const float *k1 = kernel_base + 3; + const float *k2 = kernel_base + 5; + + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + + for (index_t i = 0; i < h; ++i) { + float *out_row_base = out_base + i * 2 * outw; + float *out_row_0 = out_row_base; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + + index_t j = 0; + + for (index_t n = 0; n + 9 < outw; n += 8) { + float32x4_t in_vec = vld1q_f32(in); + + // out row 0 + float32x4x2_t out00 = vld2q_f32(out_row_0); + out00.val[0] = + neon_vfma_lane_0(out00.val[0], in_vec, k0_vec); + out00.val[1] = + neon_vfma_lane_1(out00.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0, out00); + + float32x4x2_t out01 = vld2q_f32(out_row_0 + 2); + out01.val[0] = + neon_vfma_lane_2(out01.val[0], in_vec, k0_vec); + vst2q_f32(out_row_0 + 2, out01); + + // out row 1 + float32x4x2_t out10 = vld2q_f32(out_row_1); + out10.val[0] = + neon_vfma_lane_0(out10.val[0], in_vec, k1_vec); + out10.val[1] = + neon_vfma_lane_1(out10.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1, out10); + + float32x4x2_t out11 = vld2q_f32(out_row_1 + 2); + out11.val[0] = + neon_vfma_lane_2(out11.val[0], in_vec, k1_vec); + vst2q_f32(out_row_1 + 2, out11); + + // out row 2 + float32x4x2_t out20 = vld2q_f32(out_row_2); + out20.val[0] = + neon_vfma_lane_1(out20.val[0], in_vec, k2_vec); + out20.val[1] = + neon_vfma_lane_2(out20.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2, out20); + + float32x4x2_t out21 = vld2q_f32(out_row_2 + 2); + out21.val[0] = + neon_vfma_lane_3(out21.val[0], in_vec, k2_vec); + vst2q_f32(out_row_2 + 2, out21); + + in += 4; + out_row_0 += 8; + out_row_1 += 8; + out_row_2 += 8; + j += 4; + } + + for (; j < w; ++j) { + float val = in[0]; + + for (int k = 0; k < 3; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k + 1]; + } + + in++; + 
out_row_0 += 2; + out_row_1 += 2; + out_row_2 += 2; + } + } + } + } + } + } + }, 0, batch, 1, 0, group_, 1, 0, outch_g, 1); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h b/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h new file mode 100644 index 0000000000000000000000000000000000000000..5dd315a47ad5e0c9a815b64ca3c5c0de63faf25e --- /dev/null +++ b/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h @@ -0,0 +1,122 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_3X3_H_ +#define MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_3X3_H_ + +#include +#include + +#include "mace/public/mace.h" +#include "mace/core/tensor.h" +#include "mace/core/types.h" +#include "mace/core/op_context.h" +#include "mace/ops/arm/fp32/deconv_2d.h" +#include "mace/ops/common/conv_pool_2d_util.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +class DepthwiseDeconv2dK3x3S1 : public Deconv2dBase { + public: + DepthwiseDeconv2dK3x3S1(const std::vector &paddings, + const Padding padding_type, + const FrameworkType framework_type) + : Deconv2dBase({1, 1}, + {1, 1}, + paddings, + padding_type, + framework_type) {} + virtual ~DepthwiseDeconv2dK3x3S1() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) override; +}; + +class DepthwiseDeconv2dK3x3S2 : public Deconv2dBase { + public: + DepthwiseDeconv2dK3x3S2(const std::vector &paddings, + const Padding padding_type, + const FrameworkType framework_type) + : Deconv2dBase({2, 2}, + {1, 1}, + paddings, + padding_type, + framework_type) {} + virtual ~DepthwiseDeconv2dK3x3S2() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) override; +}; + +class GroupDeconv2dK3x3S1 : public Deconv2dBase { + public: + GroupDeconv2dK3x3S1(const std::vector &paddings, + const Padding padding_type, + const int group, + const FrameworkType framework_type) + : Deconv2dBase({1, 1}, + {1, 1}, + paddings, + padding_type, + group, + framework_type) {} + virtual ~GroupDeconv2dK3x3S1() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) override; +}; + +class GroupDeconv2dK3x3S2 : public Deconv2dBase { + public: + GroupDeconv2dK3x3S2(const std::vector &paddings, + const Padding padding_type, + const int group, + const FrameworkType framework_type) + : Deconv2dBase({2, 2}, + {1, 1}, + paddings, + padding_type, + group, + framework_type) {} + virtual ~GroupDeconv2dK3x3S2() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor 
*output) override; +}; + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_3X3_H_ diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.cc b/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.cc new file mode 100644 index 0000000000000000000000000000000000000000..85c93b0cef7b53dc170d48eeaa6c65154f85c8e8 --- /dev/null +++ b/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.cc @@ -0,0 +1,966 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h" + +#include +#include "mace/ops/arm/fp32/common_neon.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus DepthwiseDeconv2dK4x4S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + group_ = input->dim(1); + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t channels = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + const index_t in_img_size = h * w; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + const index_t out_img_size = outh * outw; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const index_t offset = b * channels + c; + float *out_base = padded_out_data + offset * out_img_size; + const float *input_base = input_data + offset * in_img_size; + const float *kernel_base = filter_data + c * 16; + const float *in = input_base; + const float *k0 = kernel_base; + const float *k1 = kernel_base + 4; + const float *k2 = kernel_base + 8; + const float *k3 = kernel_base + 12; + + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + float32x4_t k3_vec = vld1q_f32(k3); + + for (index_t i = 0; i < h; i++) { + float *out_row = out_base + i * outw; + float *out_row_0 = out_row; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + float *out_row_3 = out_row_2 + outw; + index_t j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t 
in_vec = vld1q_f32(in); + + float32x4_t out00 = vld1q_f32(out_row_0); + out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); + vst1q_f32(out_row_0, out00); + + float32x4_t out01 = vld1q_f32(out_row_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); + vst1q_f32(out_row_0 + 1, out01); + + float32x4_t out02 = vld1q_f32(out_row_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); + vst1q_f32(out_row_0 + 2, out02); + + float32x4_t out03 = vld1q_f32(out_row_0 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); + vst1q_f32(out_row_0 + 3, out03); + + // + float32x4_t out10 = vld1q_f32(out_row_1); + out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); + vst1q_f32(out_row_1, out10); + + float32x4_t out11 = vld1q_f32(out_row_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); + vst1q_f32(out_row_1 + 1, out11); + + float32x4_t out12 = vld1q_f32(out_row_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); + vst1q_f32(out_row_1 + 2, out12); + + float32x4_t out13 = vld1q_f32(out_row_1 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k1_vec); + vst1q_f32(out_row_1 + 3, out13); + + // + float32x4_t out20 = vld1q_f32(out_row_2 + 0); + out20 = neon_vfma_lane_0(out20, in_vec, k2_vec); + vst1q_f32(out_row_2 + 0, out20); + + float32x4_t out21 = vld1q_f32(out_row_2 + 1); + out21 = neon_vfma_lane_1(out21, in_vec, k2_vec); + vst1q_f32(out_row_2 + 1, out21); + + float32x4_t out22 = vld1q_f32(out_row_2 + 2); + out22 = neon_vfma_lane_2(out22, in_vec, k2_vec); + vst1q_f32(out_row_2 + 2, out22); + + float32x4_t out23 = vld1q_f32(out_row_2 + 3); + out23 = neon_vfma_lane_3(out23, in_vec, k2_vec); + vst1q_f32(out_row_2 + 3, out23); + + // + float32x4_t out30 = vld1q_f32(out_row_3 + 0); + out30 = neon_vfma_lane_0(out30, in_vec, k3_vec); + vst1q_f32(out_row_3 + 0, out30); + + float32x4_t out31 = vld1q_f32(out_row_3 + 1); + out31 = neon_vfma_lane_1(out31, in_vec, k3_vec); + vst1q_f32(out_row_3 + 1, out31); + + float32x4_t out32 = vld1q_f32(out_row_3 + 2); + out32 = neon_vfma_lane_2(out32, in_vec, k3_vec); + vst1q_f32(out_row_3 + 2, out32); + + float32x4_t out33 = vld1q_f32(out_row_3 + 3); + out33 = neon_vfma_lane_3(out33, in_vec, k3_vec); + vst1q_f32(out_row_3 + 3, out33); + + in += 4; + out_row_0 += 4; + out_row_1 += 4; + out_row_2 += 4; + out_row_3 += 4; + } + + for (; j < w; j++) { + float val = in[0]; + for (int k = 0; k < 4; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k]; + out_row_3[k] += val * k3[k]; + } + in++; + out_row_0++; + out_row_1++; + out_row_2++; + out_row_3++; + } + } + } + } + }, 0, batch, 1, 0, channels, 1); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus DepthwiseDeconv2dK4x4S2::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + group_ = input->dim(1); + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t 
batch = in_shape[0]; + const index_t channels = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + const index_t in_img_size = h * w; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + const index_t out_img_size = outh * outw; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const index_t offset = b * channels + c; + float *out_base = padded_out_data + offset * out_img_size; + const float *input_base = input_data + offset * in_img_size; + const float *kernel_base = filter_data + c * 16; + const float *in = input_base; + + const float *k0 = kernel_base; + const float *k1 = kernel_base + 4; + const float *k2 = kernel_base + 8; + const float *k3 = kernel_base + 12; + + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + float32x4_t k3_vec = vld1q_f32(k3); + + for (index_t i = 0; i < h; i++) { + float *out_row = out_base + 2 * i * outw; + + float *out_row_0 = out_row; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + float *out_row_3 = out_row_2 + outw; + + index_t j = 0; + + for (index_t n = 0; n + 9 < outw; n += 8) { + float32x4_t in_vec = vld1q_f32(in); + + // row 0 + float32x4x2_t out0 = vld2q_f32(out_row_0); + out0.val[0] = + neon_vfma_lane_0(out0.val[0], in_vec, k0_vec); + out0.val[1] = + neon_vfma_lane_1(out0.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0, out0); + out0 = vld2q_f32(out_row_0 + 2); + out0.val[0] = + neon_vfma_lane_2(out0.val[0], in_vec, k0_vec); + out0.val[1] = + neon_vfma_lane_3(out0.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0 + 2, out0); + + // row 1 + float32x4x2_t out1 = vld2q_f32(out_row_1); + out1.val[0] = + neon_vfma_lane_0(out1.val[0], in_vec, k1_vec); + out1.val[1] = + neon_vfma_lane_1(out1.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1, out1); + out1 = vld2q_f32(out_row_1 + 2); + out1.val[0] = + neon_vfma_lane_2(out1.val[0], in_vec, k1_vec); + out1.val[1] = + neon_vfma_lane_3(out1.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1 + 2, out1); + + // row 2 + float32x4x2_t out2 = vld2q_f32(out_row_2); + out2.val[0] = + neon_vfma_lane_0(out2.val[0], in_vec, k2_vec); + out2.val[1] = + neon_vfma_lane_1(out2.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2, out2); + out2 = vld2q_f32(out_row_2 + 2); + out2.val[0] = + neon_vfma_lane_2(out2.val[0], in_vec, k2_vec); + out2.val[1] = + neon_vfma_lane_3(out2.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2 + 2, out2); + + // row 3 + float32x4x2_t out3 = vld2q_f32(out_row_3); + out3.val[0] = + neon_vfma_lane_0(out3.val[0], in_vec, k3_vec); + out3.val[1] = + neon_vfma_lane_1(out3.val[1], in_vec, k3_vec); + vst2q_f32(out_row_3, out3); + out3 = vld2q_f32(out_row_3 + 2); + out3.val[0] = + neon_vfma_lane_2(out3.val[0], in_vec, k3_vec); + out3.val[1] = + neon_vfma_lane_3(out3.val[1], in_vec, k3_vec); + vst2q_f32(out_row_3 + 2, out3); + + in += 4; + out_row_0 += 8; + out_row_1 += 8; + out_row_2 += 8; + out_row_3 += 8; + j += 4; + } + + for (; j < w; j++) { + float val = in[0]; + for (int k = 0; k < 4; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k]; + out_row_3[k] += val * k3[k]; + } + in++; + out_row_0 += 2; + out_row_1 += 2; + out_row_2 += 2; + out_row_3 += 2; + } + } + } + } + 
}, 0, batch, 1, 0, channels, 1); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + + const index_t in_img_size = h * w; + const index_t out_img_size = outh * outw; + + const index_t inch_g = inch / group_; + const index_t outch_g = outch / group_; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t g = start1; g < end1; g += step1) { + for (index_t oc = start2; oc < end2; oc += step2) { + if (oc + 1 < outch_g) { + const index_t out_offset = + (b * outch + outch_g * g + oc) * out_img_size; + float *out_base = padded_out_data + out_offset; + float *out_base1 = out_base + out_img_size; + for (index_t ic = 0; ic < inch_g; ic++) { + const index_t in_offset = + (b * inch + inch_g * g + ic) * in_img_size; + const float *input_base = input_data + in_offset; + const float *in = input_base; + const index_t kernel_offset = + ((oc * group_ + g) * inch_g + ic) * 16; + const float *kernel_base = filter_data + kernel_offset; + const float *k0 = kernel_base; + const float *k1 = kernel_base + 4; + const float *k2 = kernel_base + 8; + const float *k3 = kernel_base + 12; + + const float *kernel_base1 = kernel_base + inch * 16; + const float *k10 = kernel_base1; + const float *k11 = kernel_base1 + 4; + const float *k12 = kernel_base1 + 8; + const float *k13 = kernel_base1 + 12; + + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + float32x4_t k3_vec = vld1q_f32(k3); + + float32x4_t k10_vec = vld1q_f32(k10); + float32x4_t k11_vec = vld1q_f32(k11); + float32x4_t k12_vec = vld1q_f32(k12); + float32x4_t k13_vec = vld1q_f32(k13); + + for (index_t i = 0; i < h; i++) { + float *out_row = out_base + i * outw; + + float *out_row_0 = out_row; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + float *out_row_3 = out_row_2 + outw; + + float *out_row1 = out_base1 + i * outw; + + float *out_row1_0 = out_row1; + float *out_row1_1 = out_row1_0 + outw; + float *out_row1_2 = out_row1_1 + outw; + float *out_row1_3 = out_row1_2 + outw; + + index_t j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + float32x4_t out00, out01, out02, out03; + float32x4_t out10, out11, 
out12, out13; + + out00 = vld1q_f32(out_row_0); + out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); + vst1q_f32(out_row_0, out00); + + out10 = vld1q_f32(out_row1_0); + out10 = neon_vfma_lane_0(out10, in_vec, k10_vec); + vst1q_f32(out_row1_0, out10); + + out01 = vld1q_f32(out_row_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); + vst1q_f32(out_row_0 + 1, out01); + + out11 = vld1q_f32(out_row1_0 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 1, out11); + + out02 = vld1q_f32(out_row_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); + vst1q_f32(out_row_0 + 2, out02); + + out12 = vld1q_f32(out_row1_0 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 2, out12); + + out03 = vld1q_f32(out_row_0 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); + vst1q_f32(out_row_0 + 3, out03); + + out13 = vld1q_f32(out_row1_0 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 3, out13); + + // + out00 = vld1q_f32(out_row_1); + out00 = neon_vfma_lane_0(out00, in_vec, k1_vec); + vst1q_f32(out_row_1, out00); + + out10 = vld1q_f32(out_row1_1); + out10 = neon_vfma_lane_0(out10, in_vec, k11_vec); + vst1q_f32(out_row1_1, out10); + + out01 = vld1q_f32(out_row_1 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k1_vec); + vst1q_f32(out_row_1 + 1, out01); + + out11 = vld1q_f32(out_row1_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 1, out11); + + out02 = vld1q_f32(out_row_1 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k1_vec); + vst1q_f32(out_row_1 + 2, out02); + + out12 = vld1q_f32(out_row1_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 2, out12); + + out03 = vld1q_f32(out_row_1 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k1_vec); + vst1q_f32(out_row_1 + 3, out03); + + out13 = vld1q_f32(out_row1_1 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 3, out13); + + // + out00 = vld1q_f32(out_row_2 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k2_vec); + vst1q_f32(out_row_2 + 0, out00); + + out10 = vld1q_f32(out_row1_2 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 0, out10); + + out01 = vld1q_f32(out_row_2 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k2_vec); + vst1q_f32(out_row_2 + 1, out01); + + out11 = vld1q_f32(out_row1_2 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 1, out11); + + out02 = vld1q_f32(out_row_2 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k2_vec); + vst1q_f32(out_row_2 + 2, out02); + + out12 = vld1q_f32(out_row1_2 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 2, out12); + + out03 = vld1q_f32(out_row_2 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k2_vec); + vst1q_f32(out_row_2 + 3, out03); + + out13 = vld1q_f32(out_row1_2 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 3, out13); + + // + out00 = vld1q_f32(out_row_3 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k3_vec); + vst1q_f32(out_row_3 + 0, out00); + + out10 = vld1q_f32(out_row1_3 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k13_vec); + vst1q_f32(out_row1_3 + 0, out10); + + out01 = vld1q_f32(out_row_3 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k3_vec); + vst1q_f32(out_row_3 + 1, out01); + + out11 = vld1q_f32(out_row1_3 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k13_vec); + vst1q_f32(out_row1_3 + 1, out11); + + out02 = vld1q_f32(out_row_3 + 2); + 
out02 = neon_vfma_lane_2(out02, in_vec, k3_vec); + vst1q_f32(out_row_3 + 2, out02); + + out12 = vld1q_f32(out_row1_3 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k13_vec); + vst1q_f32(out_row1_3 + 2, out12); + + out03 = vld1q_f32(out_row_3 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k3_vec); + vst1q_f32(out_row_3 + 3, out03); + + out13 = vld1q_f32(out_row1_3 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k13_vec); + vst1q_f32(out_row1_3 + 3, out13); + + in += 4; + out_row_0 += 4; + out_row_1 += 4; + out_row_2 += 4; + out_row_3 += 4; + out_row1_0 += 4; + out_row1_1 += 4; + out_row1_2 += 4; + out_row1_3 += 4; + } + + for (; j < w; j++) { + float val = in[0]; + for (int k = 0; k < 4; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k]; + out_row_3[k] += val * k3[k]; + out_row1_0[k] += val * k10[k]; + out_row1_1[k] += val * k11[k]; + out_row1_2[k] += val * k12[k]; + out_row1_3[k] += val * k13[k]; + } + in++; + out_row_0++; + out_row_1++; + out_row_2++; + out_row_3++; + out_row1_0++; + out_row1_1++; + out_row1_2++; + out_row1_3++; + } + } + } + } else { + const index_t out_offset = + (b * outch + outch_g * g + oc) * out_img_size; + float *out_base = padded_out_data + out_offset; + for (index_t ic = 0; ic < inch_g; ++ic) { + const index_t in_offset = + (b * inch + inch_g * g + ic) * in_img_size; + const index_t kernel_offset = + ((oc * group_ + g) * inch_g + ic) * 16; + + const float *input_base = input_data + in_offset; + const float *kernel_base = filter_data + kernel_offset; + const float *in = input_base; + const float *k0 = kernel_base; + const float *k1 = kernel_base + 4; + const float *k2 = kernel_base + 8; + const float *k3 = kernel_base + 12; + + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + float32x4_t k3_vec = vld1q_f32(k3); + + for (index_t i = 0; i < h; i++) { + float *out_row = out_base + i * outw; + float *out_row_0 = out_row; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + float *out_row_3 = out_row_2 + outw; + index_t j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00 = vld1q_f32(out_row_0); + out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); + vst1q_f32(out_row_0, out00); + + float32x4_t out01 = vld1q_f32(out_row_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); + vst1q_f32(out_row_0 + 1, out01); + + float32x4_t out02 = vld1q_f32(out_row_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); + vst1q_f32(out_row_0 + 2, out02); + + float32x4_t out03 = vld1q_f32(out_row_0 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); + vst1q_f32(out_row_0 + 3, out03); + + // + float32x4_t out10 = vld1q_f32(out_row_1); + out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); + vst1q_f32(out_row_1, out10); + + float32x4_t out11 = vld1q_f32(out_row_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); + vst1q_f32(out_row_1 + 1, out11); + + float32x4_t out12 = vld1q_f32(out_row_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); + vst1q_f32(out_row_1 + 2, out12); + + float32x4_t out13 = vld1q_f32(out_row_1 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k1_vec); + vst1q_f32(out_row_1 + 3, out13); + + // + float32x4_t out20 = vld1q_f32(out_row_2 + 0); + out20 = neon_vfma_lane_0(out20, in_vec, k2_vec); + vst1q_f32(out_row_2 + 0, out20); + + float32x4_t out21 = vld1q_f32(out_row_2 + 1); + out21 = neon_vfma_lane_1(out21, in_vec, k2_vec); + vst1q_f32(out_row_2 + 1, out21); + + 
float32x4_t out22 = vld1q_f32(out_row_2 + 2); + out22 = neon_vfma_lane_2(out22, in_vec, k2_vec); + vst1q_f32(out_row_2 + 2, out22); + + float32x4_t out23 = vld1q_f32(out_row_2 + 3); + out23 = neon_vfma_lane_3(out23, in_vec, k2_vec); + vst1q_f32(out_row_2 + 3, out23); + + // + float32x4_t out30 = vld1q_f32(out_row_3 + 0); + out30 = neon_vfma_lane_0(out30, in_vec, k3_vec); + vst1q_f32(out_row_3 + 0, out30); + + float32x4_t out31 = vld1q_f32(out_row_3 + 1); + out31 = neon_vfma_lane_1(out31, in_vec, k3_vec); + vst1q_f32(out_row_3 + 1, out31); + + float32x4_t out32 = vld1q_f32(out_row_3 + 2); + out32 = neon_vfma_lane_2(out32, in_vec, k3_vec); + vst1q_f32(out_row_3 + 2, out32); + + float32x4_t out33 = vld1q_f32(out_row_3 + 3); + out33 = neon_vfma_lane_3(out33, in_vec, k3_vec); + vst1q_f32(out_row_3 + 3, out33); + + in += 4; + out_row_0 += 4; + out_row_1 += 4; + out_row_2 += 4; + out_row_3 += 4; + } + + for (; j < w; j++) { + float val = in[0]; + for (int k = 0; k < 4; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k]; + out_row_3[k] += val * k3[k]; + } + in++; + out_row_0++; + out_row_1++; + out_row_2++; + out_row_3++; + } + } + } + } + } + } + } + }, 0, batch, 1, 0, group_, 1, 0, outch_g, 2); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + + const index_t in_img_size = h * w; + const index_t out_img_size = outh * outw; + + const index_t inch_g = inch / group_; + const index_t outch_g = outch / group_; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t g = start1; g < end1; g += step1) { + for (index_t oc = start2; oc < end2; oc += step2) { + const index_t out_offset = + (b * outch + outch_g * g + oc) * out_img_size; + float *out_base = padded_out_data + out_offset; + for (index_t ic = 0; ic < inch_g; ic++) { + const index_t in_offset = + (b * inch + inch_g * g + ic) * in_img_size; + const index_t kernel_offset = + ((oc * group_ + g) * inch_g + ic) * 16; + const float *input_base = input_data + in_offset; + const float *kernel_base = filter_data + kernel_offset; + const float *in = input_base; + + const float *k0 = kernel_base; + const float *k1 = kernel_base + 4; + const float *k2 = kernel_base + 
8; + const float *k3 = kernel_base + 12; + + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + float32x4_t k3_vec = vld1q_f32(k3); + + for (index_t i = 0; i < h; i++) { + float *out_row = out_base + 2 * i * outw; + + float *out_row_0 = out_row; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + float *out_row_3 = out_row_2 + outw; + + index_t j = 0; + + for (index_t n = 0; n + 9 < outw; n += 8) { + float32x4_t in_vec = vld1q_f32(in); + + // row 0 + float32x4x2_t out0 = vld2q_f32(out_row_0); + out0.val[0] = + neon_vfma_lane_0(out0.val[0], in_vec, k0_vec); + out0.val[1] = + neon_vfma_lane_1(out0.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0, out0); + out0 = vld2q_f32(out_row_0 + 2); + out0.val[0] = + neon_vfma_lane_2(out0.val[0], in_vec, k0_vec); + out0.val[1] = + neon_vfma_lane_3(out0.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0 + 2, out0); + + // row 1 + float32x4x2_t out1 = vld2q_f32(out_row_1); + out1.val[0] = + neon_vfma_lane_0(out1.val[0], in_vec, k1_vec); + out1.val[1] = + neon_vfma_lane_1(out1.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1, out1); + out1 = vld2q_f32(out_row_1 + 2); + out1.val[0] = + neon_vfma_lane_2(out1.val[0], in_vec, k1_vec); + out1.val[1] = + neon_vfma_lane_3(out1.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1 + 2, out1); + + // row 2 + float32x4x2_t out2 = vld2q_f32(out_row_2); + out2.val[0] = + neon_vfma_lane_0(out2.val[0], in_vec, k2_vec); + out2.val[1] = + neon_vfma_lane_1(out2.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2, out2); + out2 = vld2q_f32(out_row_2 + 2); + out2.val[0] = + neon_vfma_lane_2(out2.val[0], in_vec, k2_vec); + out2.val[1] = + neon_vfma_lane_3(out2.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2 + 2, out2); + + // row 3 + float32x4x2_t out3 = vld2q_f32(out_row_3); + out3.val[0] = + neon_vfma_lane_0(out3.val[0], in_vec, k3_vec); + out3.val[1] = + neon_vfma_lane_1(out3.val[1], in_vec, k3_vec); + vst2q_f32(out_row_3, out3); + out3 = vld2q_f32(out_row_3 + 2); + out3.val[0] = + neon_vfma_lane_2(out3.val[0], in_vec, k3_vec); + out3.val[1] = + neon_vfma_lane_3(out3.val[1], in_vec, k3_vec); + vst2q_f32(out_row_3 + 2, out3); + + in += 4; + out_row_0 += 8; + out_row_1 += 8; + out_row_2 += 8; + out_row_3 += 8; + j += 4; + } + + for (; j < w; j++) { + float val = in[0]; + for (int k = 0; k < 4; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k]; + out_row_3[k] += val * k3[k]; + } + in++; + out_row_0 += 2; + out_row_1 += 2; + out_row_2 += 2; + out_row_3 += 2; + } + } + } + } + } + } + }, 0, batch, 1, 0, group_, 1, 0, outch_g, 1); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace + diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h b/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h new file mode 100644 index 0000000000000000000000000000000000000000..4b73ed010afdd783f45e39d638db01427070e717 --- /dev/null +++ b/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h @@ -0,0 +1,122 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_4X4_H_
+#define MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_4X4_H_
+
+#include <memory>
+#include <vector>
+
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/types.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/deconv_2d.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class DepthwiseDeconv2dK4x4S1 : public Deconv2dBase {
+ public:
+  DepthwiseDeconv2dK4x4S1(const std::vector<int> &paddings,
+                          const Padding padding_type,
+                          const FrameworkType framework_type)
+      : Deconv2dBase({1, 1},
+                     {1, 1},
+                     paddings,
+                     padding_type,
+                     framework_type) {}
+  virtual ~DepthwiseDeconv2dK4x4S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+class DepthwiseDeconv2dK4x4S2 : public Deconv2dBase {
+ public:
+  DepthwiseDeconv2dK4x4S2(const std::vector<int> &paddings,
+                          const Padding padding_type,
+                          const FrameworkType framework_type)
+      : Deconv2dBase({2, 2},
+                     {1, 1},
+                     paddings,
+                     padding_type,
+                     framework_type) {}
+  virtual ~DepthwiseDeconv2dK4x4S2() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+class GroupDeconv2dK4x4S1 : public Deconv2dBase {
+ public:
+  GroupDeconv2dK4x4S1(const std::vector<int> &paddings,
+                      const Padding padding_type,
+                      const int group,
+                      const FrameworkType framework_type)
+      : Deconv2dBase({1, 1},
+                     {1, 1},
+                     paddings,
+                     padding_type,
+                     group,
+                     framework_type) {}
+  virtual ~GroupDeconv2dK4x4S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+class GroupDeconv2dK4x4S2 : public Deconv2dBase {
+ public:
+  GroupDeconv2dK4x4S2(const std::vector<int> &paddings,
+                      const Padding padding_type,
+                      const int group,
+                      const FrameworkType framework_type)
+      : Deconv2dBase({2, 2},
+                     {1, 1},
+                     paddings,
+                     padding_type,
+                     group,
+                     framework_type) {}
+  virtual ~GroupDeconv2dK4x4S2() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_4X4_H_
diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_general.cc b/mace/ops/arm/fp32/depthwise_deconv_2d_general.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a45d5acc6a663d370f1b741b5b15598c9fd40e22
--- /dev/null
+++ b/mace/ops/arm/fp32/depthwise_deconv_2d_general.cc
@@ -0,0 +1,213 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/arm/fp32/depthwise_deconv_2d_general.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + group_ = input->dim(1); + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_img_size = out_height * out_width; + const index_t in_img_size = in_height * in_width; + const index_t kernel_h = filter->dim(2); + const index_t kernel_w = filter->dim(3); + const int kernel_size = kernel_h * kernel_w; + + std::vector index_map(kernel_size, 0); + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + index_map[i * kernel_w + j] = i * out_width + j; + } + } + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + float *out_base = + padded_out_data + (b * channels + c) * out_img_size; + for (index_t i = 0; i < in_height; ++i) { + for (index_t j = 0; j < in_width; ++j) { + const index_t out_offset = + i * strides_[0] * out_width + j * strides_[1]; + const index_t input_idx = + (b * channels + c) * in_img_size + i * in_width + j; + const float val = input_data[input_idx]; + const index_t kernel_offset = c * kernel_size; + for (int k = 0; k < kernel_size; ++k) { + const index_t out_idx = out_offset + index_map[k]; + const index_t kernel_idx = kernel_offset + k; + out_base[out_idx] += val * filter_data[kernel_idx]; + } + } + } + } + } + }, 0, batch, 1, 0, channels, 1); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = 
padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + + MACE_CHECK(in_channels % group_ == 0 && out_channels % group_ == 0, + "invalid input/output channel and group."); + + const index_t out_img_size = out_height * out_width; + const index_t in_img_size = in_height * in_width; + const index_t kernel_h = filter->dim(2); + const index_t kernel_w = filter->dim(3); + + const int kernel_size = kernel_h * kernel_w; + std::vector index_map(kernel_size, 0); + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + index_map[i * kernel_w + j] = i * out_width + j; + } + } + + const int in_channels_g = in_channels / group_; + const int out_channels_g = out_channels / group_; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t g = start1; g < end1; g += step1) { + for (index_t p = start2; p < end2; p += step2) { + const index_t out_base = + ((b * group_ + g) * out_channels_g + p) * out_img_size; + for (index_t i = 0; i < in_height; ++i) { + for (index_t j = 0; j < in_width; ++j) { + const index_t out_offset = + i * strides_[0] * out_width + j * strides_[1]; + for (int q = 0; q < in_channels_g; ++q) { + const index_t in_base = + ((b * group_ + g) * in_channels_g + q) * in_img_size; + const index_t in_offset = + in_base + i * in_width + j; + const float val = input_data[in_offset]; + const index_t k_offset = + ((p * group_ + g) * in_channels_g + q) * kernel_size; + for (int k = 0; k < kernel_size; ++k) { + const index_t out_idx = out_base + out_offset + index_map[k]; + const float w = filter_data[k_offset + k]; + padded_out_data[out_idx] += val * w; + } + } + } + } + } + } + } + }, 0, batch, 1, 0, group_, 1, 0, out_channels_g, 1); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_general.h b/mace/ops/arm/fp32/depthwise_deconv_2d_general.h new file mode 100644 index 0000000000000000000000000000000000000000..d73480c5ea1a4fff7aa06656efb9a964acc1b01d --- /dev/null +++ b/mace/ops/arm/fp32/depthwise_deconv_2d_general.h @@ -0,0 +1,84 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_GENERAL_H_
+#define MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_GENERAL_H_
+
+#include <vector>
+#include <memory>
+
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/types.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/deconv_2d.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class DepthwiseDeconv2dGeneral : public Deconv2dBase {
+ public:
+  DepthwiseDeconv2dGeneral(const std::vector<int> &strides,
+                           const std::vector<int> &dilations,
+                           const std::vector<int> &paddings,
+                           const Padding padding_type,
+                           const FrameworkType framework_type)
+      : Deconv2dBase(strides,
+                     dilations,
+                     paddings,
+                     padding_type,
+                     framework_type) {}
+  virtual ~DepthwiseDeconv2dGeneral() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+class GroupDeconv2dGeneral : public Deconv2dBase {
+ public:
+  GroupDeconv2dGeneral(const std::vector<int> &strides,
+                       const std::vector<int> &dilations,
+                       const std::vector<int> &paddings,
+                       const Padding padding_type,
+                       const int group,
+                       const FrameworkType framework_type)
+      : Deconv2dBase(strides,
+                     dilations,
+                     paddings,
+                     padding_type,
+                     group,
+                     framework_type) {}
+  virtual ~GroupDeconv2dGeneral() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_GENERAL_H_
diff --git a/mace/ops/arm/fp32/gemm.cc b/mace/ops/arm/fp32/gemm.cc
index 8acde2d17a81602b2c7c667a5aef52573eb31977..aacb6636adf2bb9efb75879db7ca545c9e3a4daf 100644
--- a/mace/ops/arm/fp32/gemm.cc
+++ b/mace/ops/arm/fp32/gemm.cc
@@ -39,8 +39,6 @@ MaceStatus Gemm::Compute(const OpContext *context,
                          const bool lhs_batched,
                          const bool rhs_batched,
                          Tensor *output) {
-  MACE_UNUSED(context);
-
   MACE_CHECK(output->size() == batch * rows * cols,
              "Need resize output tensor before call gemm.");
   Tensor::MappingGuard lhs_guard(lhs);
@@ -63,10 +61,8 @@ MaceStatus Gemm::Compute(const OpContext *context,
   const index_t cols_padded = RoundUp(cols, col_block_size);
   const index_t depth_padded = RoundUp(depth, depth_block_size);
 
-  ScratchBuffer *scratch = &tmp_scratch_buffer_;
-  if (context != nullptr && context->device()->scratch_buffer() != nullptr) {
-    scratch = context->device()->scratch_buffer();
-  }
+  ScratchBuffer *scratch = context->device()->scratch_buffer();
+
   index_t packed_lhs_size =
       PadAlignSize(sizeof(float) * rows_padded * depth_padded);
   index_t packed_rhs_size =
@@ -101,6 +97,9 @@ MaceStatus Gemm::Compute(const OpContext *context,
     }
   }
 
+  utils::ThreadPool
+      &thread_pool = context->device()->cpu_runtime()->thread_pool();
+
   for (index_t b = 0; b < batch; ++b) {
     MatrixMap<const float> lhs_matrix
@@ -119,17 +118,21 @@ MaceStatus Gemm::Compute(const OpContext *context,
 
     // pack lhs
     if (cached_ != kCacheLhs) {
-#pragma omp parallel for schedule(runtime)
- for (index_t row_block_idx = 0; row_block_idx < row_block_count; - ++row_block_idx) { - const index_t start_row = row_block_idx * row_block_size; - const index_t - row_block_len = std::min(row_block_size, rows - start_row); - float *packed_lhs_data_block = - packed_lhs_data + row_block_idx * row_block_size * depth_padded; - PackLhs(lhs_matrix.block(start_row, 0, row_block_len, depth), - packed_lhs_data_block); - } + thread_pool.Compute1D([=, &lhs_matrix](index_t start, + index_t end, + index_t step) { + for (index_t row_block_idx = start; row_block_idx < end; + row_block_idx += step) { + const index_t start_row = row_block_idx * row_block_size; + const index_t + row_block_len = std::min(row_block_size, rows - start_row); + float *packed_lhs_data_block = + packed_lhs_data + row_block_idx * row_block_size * depth_padded; + PackLhs(lhs_matrix.block(start_row, 0, row_block_len, depth), + packed_lhs_data_block); + } + }, 0, row_block_count, 1); + if (cache_side == kCacheLhs) { cached_ = kCacheLhs; if (lhs->UnderlyingBuffer()->OnHost()) { @@ -142,17 +145,21 @@ MaceStatus Gemm::Compute(const OpContext *context, // pack rhs if (cached_ != kCacheRhs) { -#pragma omp parallel for schedule(runtime) - for (index_t col_block_idx = 0; col_block_idx < col_block_count; - ++col_block_idx) { - const index_t start_col = col_block_idx * col_block_size; - const index_t - col_block_len = std::min(col_block_size, cols - start_col); - float *packed_rhs_data_block = - packed_rhs_data + col_block_idx * col_block_size * depth_padded; - PackRhs(rhs_matrix.block(0, start_col, depth, col_block_len), - packed_rhs_data_block); - } + thread_pool.Compute1D([=, &rhs_matrix](index_t start, + index_t end, + index_t step) { + for (index_t col_block_idx = start; col_block_idx < end; + col_block_idx += step) { + const index_t start_col = col_block_idx * col_block_size; + const index_t + col_block_len = std::min(col_block_size, cols - start_col); + float *packed_rhs_data_block = + packed_rhs_data + col_block_idx * col_block_size * depth_padded; + PackRhs(rhs_matrix.block(0, start_col, depth, col_block_len), + packed_rhs_data_block); + } + }, 0, col_block_count, 1); + if (cache_side == kCacheRhs) { cached_ = kCacheRhs; if (rhs->UnderlyingBuffer()->OnHost()) { @@ -164,35 +171,39 @@ MaceStatus Gemm::Compute(const OpContext *context, } // multiply lhs and rhs -#pragma omp parallel for schedule(runtime) - for (index_t row_block_idx = 0; row_block_idx < row_block_count; - ++row_block_idx) { - const index_t start_row = row_block_idx * row_block_size; - const index_t row_block_len = std::min(row_block_size, rows - start_row); - const float *packed_lhs_data_block = - packed_lhs_data + row_block_idx * row_block_size * depth_padded; - - for (index_t col_block_idx = 0; col_block_idx < col_block_count; - ++col_block_idx) { - const index_t start_col = col_block_idx * col_block_size; + thread_pool.Compute1D([=, &output_matrix](index_t start, + index_t end, + index_t step) { + for (index_t row_block_idx = start; row_block_idx < end; + row_block_idx += step) { + const index_t start_row = row_block_idx * row_block_size; const index_t - col_block_len = std::min(col_block_size, cols - start_col); - const float *packed_rhs_data_block = - packed_rhs_data + col_block_idx * col_block_size * depth_padded; - float *packed_output_data_block = - packed_output_data + row_block_idx * row_block_size * cols_padded - + col_block_idx * col_block_size; - ComputeBlock(packed_lhs_data_block, - packed_rhs_data_block, - depth_padded, - packed_output_data_block); - 
MatrixMap output_block = output_matrix.block(start_row, - start_col, - row_block_len, - col_block_len); - UnpackOutput(packed_output_data_block, &output_block); - } // col_block_idx - } // row_block_idx + row_block_len = std::min(row_block_size, rows - start_row); + const float *packed_lhs_data_block = + packed_lhs_data + row_block_idx * row_block_size * depth_padded; + + for (index_t col_block_idx = 0; col_block_idx < col_block_count; + ++col_block_idx) { + const index_t start_col = col_block_idx * col_block_size; + const index_t + col_block_len = std::min(col_block_size, cols - start_col); + const float *packed_rhs_data_block = + packed_rhs_data + col_block_idx * col_block_size * depth_padded; + float *packed_output_data_block = + packed_output_data + row_block_idx * row_block_size * cols_padded + + col_block_idx * col_block_size; + ComputeBlock(packed_lhs_data_block, + packed_rhs_data_block, + depth_padded, + packed_output_data_block); + MatrixMap output_block = output_matrix.block(start_row, + start_col, + row_block_len, + col_block_len); + UnpackOutput(packed_output_data_block, &output_block); + } // col_block_idx + } // row_block_idx + }, 0, row_block_count, 1); } // b return MaceStatus::MACE_SUCCESS; @@ -530,140 +541,140 @@ void Gemm::ComputeBlock(const float *packed_lhs_data, MACE_UNUSED(r_depth_block_count); asm volatile( - "mov r0, #0\n" - "vdup.f32 q8, r0 \n" - "vdup.f32 q9, r0 \n" - "vdup.f32 q10, r0 \n" - "vdup.f32 q11, r0 \n" - "vdup.f32 q12, r0 \n" - "vdup.f32 q13, r0 \n" - "vdup.f32 q14, r0 \n" - "vdup.f32 q15, r0 \n" - - // prelogue - "vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n" - "vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n" - "vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n" - "vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n" - - "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" - "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" - "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" - "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" - - "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" - "beq 1f\n" - - "0: \n" - - "vmla.f32 q8, q4, d0[0] \n" - "vmla.f32 q9, q5, d0[0] \n" - "vmla.f32 q10, q4, d0[1] \n" - "vmla.f32 q11, q5, d0[1] \n" - "vmla.f32 q12, q4, d1[0] \n" - "vmla.f32 q13, q5, d1[0] \n" - "vmla.f32 q14, q4, d1[1] \n" - "vmla.f32 q15, q5, d1[1] \n" - - "vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n" - "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" - "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" - - "vmla.f32 q8, q6, d2[0] \n" - "vmla.f32 q9, q7, d2[0] \n" - "vmla.f32 q10, q6, d2[1] \n" - "vmla.f32 q11, q7, d2[1] \n" - "vmla.f32 q12, q6, d3[0] \n" - "vmla.f32 q13, q7, d3[0] \n" - "vmla.f32 q14, q6, d3[1] \n" - "vmla.f32 q15, q7, d3[1] \n" - - "vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n" - "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" - "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" - - "vmla.f32 q8, q4, d4[0] \n" - "vmla.f32 q9, q5, d4[0] \n" - "vmla.f32 q10, q4, d4[1] \n" - "vmla.f32 q11, q5, d4[1] \n" - "vmla.f32 q12, q4, d5[0] \n" - "vmla.f32 q13, q5, d5[0] \n" - "vmla.f32 q14, q4, d5[1] \n" - "vmla.f32 q15, q5, d5[1] \n" - - "vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n" - "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" - "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" - - "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" - - "vmla.f32 q8, q6, d6[0] \n" - "vmla.f32 q9, q7, d6[0] \n" - "vmla.f32 q10, q6, d6[1] \n" - "vmla.f32 q11, q7, d6[1] \n" - "vmla.f32 q12, q6, d7[0] \n" - "vmla.f32 q13, q7, d7[0] \n" - "vmla.f32 q14, q6, d7[1] \n" - "vmla.f32 q15, q7, d7[1] \n" - - "vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n" - "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" - "vld1.f32 {d14-d15}, [%[rhs_ptr]]! 
\n" - - "bne 0b \n" - - // prologue - "1:\n" - "vmla.f32 q8, q4, d0[0] \n" - "vmla.f32 q9, q5, d0[0] \n" - "vmla.f32 q10, q4, d0[1] \n" - "vmla.f32 q11, q5, d0[1] \n" - "vmla.f32 q12, q4, d1[0] \n" - "vmla.f32 q13, q5, d1[0] \n" - "vmla.f32 q14, q4, d1[1] \n" - "vmla.f32 q15, q5, d1[1] \n" - - "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" - "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" - - "vmla.f32 q8, q6, d2[0] \n" - "vmla.f32 q9, q7, d2[0] \n" - "vmla.f32 q10, q6, d2[1] \n" - "vmla.f32 q11, q7, d2[1] \n" - "vmla.f32 q12, q6, d3[0] \n" - "vmla.f32 q13, q7, d3[0] \n" - "vmla.f32 q14, q6, d3[1] \n" - "vmla.f32 q15, q7, d3[1] \n" - - "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" - "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" - - "vmla.f32 q8, q4, d4[0] \n" - "vmla.f32 q9, q5, d4[0] \n" - "vmla.f32 q10, q4, d4[1] \n" - "vmla.f32 q11, q5, d4[1] \n" - "vmla.f32 q12, q4, d5[0] \n" - "vmla.f32 q13, q5, d5[0] \n" - "vmla.f32 q14, q4, d5[1] \n" - "vmla.f32 q15, q5, d5[1] \n" - - "vmla.f32 q8, q6, d6[0] \n" - "vmla.f32 q9, q7, d6[0] \n" - "vmla.f32 q10, q6, d6[1] \n" - "vmla.f32 q11, q7, d6[1] \n" - "vmla.f32 q12, q6, d7[0] \n" - "vmla.f32 q13, q7, d7[0] \n" - "vmla.f32 q14, q6, d7[1] \n" - "vmla.f32 q15, q7, d7[1] \n" - - "vst1.f32 {d16-d17}, [%[packed_output_data]]! \n" - "vst1.f32 {d18-d19}, [%[packed_output_data]]! \n" - "vst1.f32 {d20-d21}, [%[packed_output_data]]! \n" - "vst1.f32 {d22-d23}, [%[packed_output_data]]! \n" - "vst1.f32 {d24-d25}, [%[packed_output_data]]! \n" - "vst1.f32 {d26-d27}, [%[packed_output_data]]! \n" - "vst1.f32 {d28-d29}, [%[packed_output_data]]! \n" - "vst1.f32 {d30-d31}, [%[packed_output_data]]! \n" + "mov r0, #0\n" + "vdup.f32 q8, r0 \n" + "vdup.f32 q9, r0 \n" + "vdup.f32 q10, r0 \n" + "vdup.f32 q11, r0 \n" + "vdup.f32 q12, r0 \n" + "vdup.f32 q13, r0 \n" + "vdup.f32 q14, r0 \n" + "vdup.f32 q15, r0 \n" + + // prelogue + "vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n" + "vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n" + "vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n" + "vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n" + + "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" + "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" + "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" + "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" + + "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" + "beq 1f\n" + + "0: \n" + + "vmla.f32 q8, q4, d0[0] \n" + "vmla.f32 q9, q5, d0[0] \n" + "vmla.f32 q10, q4, d0[1] \n" + "vmla.f32 q11, q5, d0[1] \n" + "vmla.f32 q12, q4, d1[0] \n" + "vmla.f32 q13, q5, d1[0] \n" + "vmla.f32 q14, q4, d1[1] \n" + "vmla.f32 q15, q5, d1[1] \n" + + "vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n" + "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" + "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" + + "vmla.f32 q8, q6, d2[0] \n" + "vmla.f32 q9, q7, d2[0] \n" + "vmla.f32 q10, q6, d2[1] \n" + "vmla.f32 q11, q7, d2[1] \n" + "vmla.f32 q12, q6, d3[0] \n" + "vmla.f32 q13, q7, d3[0] \n" + "vmla.f32 q14, q6, d3[1] \n" + "vmla.f32 q15, q7, d3[1] \n" + + "vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n" + "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" + "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" + + "vmla.f32 q8, q4, d4[0] \n" + "vmla.f32 q9, q5, d4[0] \n" + "vmla.f32 q10, q4, d4[1] \n" + "vmla.f32 q11, q5, d4[1] \n" + "vmla.f32 q12, q4, d5[0] \n" + "vmla.f32 q13, q5, d5[0] \n" + "vmla.f32 q14, q4, d5[1] \n" + "vmla.f32 q15, q5, d5[1] \n" + + "vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n" + "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" + "vld1.f32 {d10-d11}, [%[rhs_ptr]]! 
\n" + + "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" + + "vmla.f32 q8, q6, d6[0] \n" + "vmla.f32 q9, q7, d6[0] \n" + "vmla.f32 q10, q6, d6[1] \n" + "vmla.f32 q11, q7, d6[1] \n" + "vmla.f32 q12, q6, d7[0] \n" + "vmla.f32 q13, q7, d7[0] \n" + "vmla.f32 q14, q6, d7[1] \n" + "vmla.f32 q15, q7, d7[1] \n" + + "vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n" + "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" + "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" + + "bne 0b \n" + + // prologue + "1:\n" + "vmla.f32 q8, q4, d0[0] \n" + "vmla.f32 q9, q5, d0[0] \n" + "vmla.f32 q10, q4, d0[1] \n" + "vmla.f32 q11, q5, d0[1] \n" + "vmla.f32 q12, q4, d1[0] \n" + "vmla.f32 q13, q5, d1[0] \n" + "vmla.f32 q14, q4, d1[1] \n" + "vmla.f32 q15, q5, d1[1] \n" + + "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" + "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" + + "vmla.f32 q8, q6, d2[0] \n" + "vmla.f32 q9, q7, d2[0] \n" + "vmla.f32 q10, q6, d2[1] \n" + "vmla.f32 q11, q7, d2[1] \n" + "vmla.f32 q12, q6, d3[0] \n" + "vmla.f32 q13, q7, d3[0] \n" + "vmla.f32 q14, q6, d3[1] \n" + "vmla.f32 q15, q7, d3[1] \n" + + "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" + "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" + + "vmla.f32 q8, q4, d4[0] \n" + "vmla.f32 q9, q5, d4[0] \n" + "vmla.f32 q10, q4, d4[1] \n" + "vmla.f32 q11, q5, d4[1] \n" + "vmla.f32 q12, q4, d5[0] \n" + "vmla.f32 q13, q5, d5[0] \n" + "vmla.f32 q14, q4, d5[1] \n" + "vmla.f32 q15, q5, d5[1] \n" + + "vmla.f32 q8, q6, d6[0] \n" + "vmla.f32 q9, q7, d6[0] \n" + "vmla.f32 q10, q6, d6[1] \n" + "vmla.f32 q11, q7, d6[1] \n" + "vmla.f32 q12, q6, d7[0] \n" + "vmla.f32 q13, q7, d7[0] \n" + "vmla.f32 q14, q6, d7[1] \n" + "vmla.f32 q15, q7, d7[1] \n" + + "vst1.f32 {d16-d17}, [%[packed_output_data]]! \n" + "vst1.f32 {d18-d19}, [%[packed_output_data]]! \n" + "vst1.f32 {d20-d21}, [%[packed_output_data]]! \n" + "vst1.f32 {d22-d23}, [%[packed_output_data]]! \n" + "vst1.f32 {d24-d25}, [%[packed_output_data]]! \n" + "vst1.f32 {d26-d27}, [%[packed_output_data]]! \n" + "vst1.f32 {d28-d29}, [%[packed_output_data]]! \n" + "vst1.f32 {d30-d31}, [%[packed_output_data]]! 
\n" : // outputs [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), diff --git a/mace/ops/arm/fp32/gemm.h b/mace/ops/arm/fp32/gemm.h index ce226c1a341d76d7f873cb527408688c2e538a8c..00b4d80eef4bf27f98c54f1c77a51765cc7f530d 100644 --- a/mace/ops/arm/fp32/gemm.h +++ b/mace/ops/arm/fp32/gemm.h @@ -32,8 +32,7 @@ namespace fp32 { class Gemm { public: explicit Gemm(const bool should_cache_pack) - : tmp_scratch_buffer_(GetCPUAllocator()), - pack_cache_(GetCPUAllocator()), + : pack_cache_(GetCPUAllocator()), should_cache_pack_(should_cache_pack), cached_(0) {} Gemm() : Gemm(false) {} @@ -126,7 +125,6 @@ class Gemm { } } - ScratchBuffer tmp_scratch_buffer_; Buffer pack_cache_; bool should_cache_pack_; diff --git a/mace/ops/arm/fp32/gemm_test.cc b/mace/ops/arm/fp32/gemm_test.cc index 372b3eb6e2580c875285c260c8c43c8fc6f0bc51..805720331b193895301b40b408b4eac0b384104c 100644 --- a/mace/ops/arm/fp32/gemm_test.cc +++ b/mace/ops/arm/fp32/gemm_test.cc @@ -51,7 +51,11 @@ void TestGemmFloat32(const index_t batch, GenerateRandomRealTypeData(output.shape(), output_data); } ::mace::ops::arm::fp32::Gemm gemm; - gemm.Compute(nullptr, + utils::ThreadPool thread_pool(1, AFFINITY_NONE); + thread_pool.Init(); + CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool); + OpContext context(nullptr, &cpu_device); + gemm.Compute(&context, &lhs, &rhs, batch, diff --git a/mace/ops/arm/fp32/gemv.cc b/mace/ops/arm/fp32/gemv.cc index 7caa0b5b23d1a9b30d81ce94126bfc2a1a5b82d6..2f2866cf0da86dd70402d28810247821f229d85b 100644 --- a/mace/ops/arm/fp32/gemv.cc +++ b/mace/ops/arm/fp32/gemv.cc @@ -48,8 +48,8 @@ MaceStatus Gemv::Compute(const OpContext *context, Tensor *output) { MACE_UNUSED(context); - MACE_CHECK(output->size() >= batch * lhs_height, - "Output buffer is not large enough for computing gemv."); + MACE_CHECK(output->size() == batch * lhs_height, + "Need resize output tensor before call gemv."); Tensor::MappingGuard lhs_guard(lhs); Tensor::MappingGuard rhs_guard(rhs); @@ -70,24 +70,29 @@ MaceStatus Gemv::Compute(const OpContext *context, const index_t w_block_count = lhs_width / w_block_size; const index_t w_remain = lhs_width - w_block_size * w_block_count; -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch; ++b) { - for (index_t h_block_idx = 0; h_block_idx < h_block_count; ++h_block_idx) { - const index_t h_start = h_block_idx * h_block_size; - const float - *lhs_ptr = lhs_data - + static_cast(lhs_batched) * b * lhs_height * lhs_width - + lhs_width * h_start; - const float *rhs_ptr = - rhs_data + static_cast(rhs_batched) * b * lhs_width; - float - *ret_ptr = output_data + b * lhs_height + h_start; - - const index_t h_block_len = - std::min(h_block_size, lhs_height - h_start); + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t h_block_idx = start1; h_block_idx < end1; + h_block_idx += step1) { + const index_t h_start = h_block_idx * h_block_size; + const float + *lhs_ptr = lhs_data + + static_cast(lhs_batched) * b * lhs_height * lhs_width + + lhs_width * h_start; + const float *rhs_ptr = + rhs_data + static_cast(rhs_batched) * b * lhs_width; + float + *ret_ptr = output_data + b * lhs_height + h_start; + + const index_t h_block_len = + std::min(h_block_size, lhs_height - h_start); #ifdef MACE_GEMV_UNROLL - if (h_block_len == 4) { + if (h_block_len == 4) { float32x4_t 
vo0 = vdupq_n_f32(0); float32x4_t vo1 = vdupq_n_f32(0); float32x4_t vo2 = vdupq_n_f32(0); @@ -360,10 +365,11 @@ MaceStatus Gemv::Compute(const OpContext *context, } // h #ifdef MACE_GEMV_UNROLL - } // if + } // if #endif // MACE_GEMV_UNROLL - } // h_block_idx - } // b + } // h_block_idx + } // b + }, 0, batch, 1, 0, h_block_count, 1); return MaceStatus::MACE_SUCCESS; } diff --git a/mace/ops/arm/fp32/gemv_test.cc b/mace/ops/arm/fp32/gemv_test.cc index b6b69254a5827f399a56100c2cebd47e5812412d..bc97bc3ee8ed9c52f62518830cba2b8775973702 100644 --- a/mace/ops/arm/fp32/gemv_test.cc +++ b/mace/ops/arm/fp32/gemv_test.cc @@ -49,8 +49,12 @@ void TestGemvFloat32(const index_t batch, GenerateRandomRealTypeData(rhs.shape(), rhs_data); GenerateRandomRealTypeData(bias.shape(), bias_data); } + utils::ThreadPool thread_pool(1, AFFINITY_NONE); + thread_pool.Init(); + CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool); + OpContext context(nullptr, &cpu_device); ::mace::ops::arm::fp32::Gemv gemv; - gemv.Compute(nullptr, + gemv.Compute(&context, &lhs, &rhs, &bias, diff --git a/mace/ops/arm/q8/eltwise.cc b/mace/ops/arm/q8/eltwise.cc index f987da81373282f769f660e5f10e7795413b3be4..bdaa57a640ec6e6d66cd080830211b95c4ceb5b5 100644 --- a/mace/ops/arm/q8/eltwise.cc +++ b/mace/ops/arm/q8/eltwise.cc @@ -46,15 +46,9 @@ MaceStatus Eltwise::Compute(const OpContext *context, int32_t input0_shift; int32_t input1_shift; int32_t output_shift; - QuantizeMultiplier(adjusted_input0_scale, - &input0_multiplier, - &input0_shift); - QuantizeMultiplier(adjusted_input1_scale, - &input1_multiplier, - &input1_shift); - QuantizeMultiplier(adjusted_output_scale, - &output_multiplier, - &output_shift); + QuantizeMultiplier(adjusted_input0_scale, &input0_multiplier, &input0_shift); + QuantizeMultiplier(adjusted_input1_scale, &input1_multiplier, &input1_shift); + QuantizeMultiplier(adjusted_output_scale, &output_multiplier, &output_shift); Tensor::MappingGuard input0_guard(input0); Tensor::MappingGuard input1_guard(input1); @@ -64,89 +58,97 @@ MaceStatus Eltwise::Compute(const OpContext *context, auto input1_ptr = input1->data(); auto output_ptr = output->mutable_data(); -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i <= output->size() - 8; i += 8) { - const auto input0_val = vld1_u8(input0_ptr + i); - const auto input1_val = vld1_u8(input1_ptr + i); - const auto input0_val_s16 = - vreinterpretq_s16_u16(vmovl_u8(input0_val)); - const auto input1_val_s16 = - vreinterpretq_s16_u16(vmovl_u8(input1_val)); - const auto offset_input0 = - vaddq_s16(input0_val_s16, vdupq_n_s16(-input0->zero_point())); - const auto offset_input1 = - vaddq_s16(input1_val_s16, vdupq_n_s16(-input1->zero_point())); - auto input0_low_s32 = vmovl_s16(vget_low_s16(offset_input0)); - auto input0_high_s32 = vmovl_s16(vget_high_s16(offset_input0)); - auto input1_low_s32 = vmovl_s16(vget_low_s16(offset_input1)); - auto input1_high_s32 = vmovl_s16(vget_high_s16(offset_input1)); - const auto left_shift_dup = vdupq_n_s32(left_shift); - input0_low_s32 = vshlq_s32(input0_low_s32, left_shift_dup); - input0_high_s32 = vshlq_s32(input0_high_s32, left_shift_dup); - input1_low_s32 = vshlq_s32(input1_low_s32, left_shift_dup); - input1_high_s32 = vshlq_s32(input1_high_s32, left_shift_dup); - input0_low_s32 = vqrdmulhq_n_s32(input0_low_s32, input0_multiplier); - input0_high_s32 = vqrdmulhq_n_s32(input0_high_s32, input0_multiplier); - input1_low_s32 = vqrdmulhq_n_s32(input1_low_s32, input1_multiplier); - input1_high_s32 = vqrdmulhq_n_s32(input1_high_s32, 
input1_multiplier); - const auto input0_shift_dup = vdupq_n_s32(input0_shift); - const auto input1_shift_dup = vdupq_n_s32(input1_shift); - input0_low_s32 = vshlq_s32(input0_low_s32, input0_shift_dup); - input0_high_s32 = vshlq_s32(input0_high_s32, input0_shift_dup); - input1_low_s32 = vshlq_s32(input1_low_s32, input1_shift_dup); - input1_high_s32 = vshlq_s32(input1_high_s32, input1_shift_dup); - int32x4_t res_low, res_high; - if (type_ == SUM) { - res_low = vaddq_s32(input0_low_s32, input1_low_s32); - res_high = vaddq_s32(input0_high_s32, input1_high_s32); - } else { - res_low = vsubq_s32(input0_low_s32, input1_low_s32); - res_high = vsubq_s32(input0_high_s32, input1_high_s32); - } - res_low = vqrdmulhq_n_s32(res_low, output_multiplier); - res_high = vqrdmulhq_n_s32(res_high, output_multiplier); - res_low = gemmlowp::RoundingDivideByPOT(res_low, -output_shift); - res_high = gemmlowp::RoundingDivideByPOT(res_high, -output_shift); - const auto res_low_s16 = vmovn_s32(res_low); - const auto res_high_s16 = vmovn_s32(res_high); - const auto output_val = vaddq_s16(vcombine_s16(res_low_s16, - res_high_s16), - vdupq_n_s16(output->zero_point())); - vst1_u8(output_ptr + i, vqmovun_s16(output_val)); - } + utils::ThreadPool &thread_pool = + context->device()->cpu_runtime()->thread_pool(); + thread_pool.Compute1D( + [=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + const auto input0_val = vld1_u8(input0_ptr + i); + const auto input1_val = vld1_u8(input1_ptr + i); + const auto input0_val_s16 = + vreinterpretq_s16_u16(vmovl_u8(input0_val)); + const auto input1_val_s16 = + vreinterpretq_s16_u16(vmovl_u8(input1_val)); + const auto offset_input0 = + vaddq_s16(input0_val_s16, vdupq_n_s16(-input0->zero_point())); + const auto offset_input1 = + vaddq_s16(input1_val_s16, vdupq_n_s16(-input1->zero_point())); + auto input0_low_s32 = vmovl_s16(vget_low_s16(offset_input0)); + auto input0_high_s32 = vmovl_s16(vget_high_s16(offset_input0)); + auto input1_low_s32 = vmovl_s16(vget_low_s16(offset_input1)); + auto input1_high_s32 = vmovl_s16(vget_high_s16(offset_input1)); + const auto left_shift_dup = vdupq_n_s32(left_shift); + input0_low_s32 = vshlq_s32(input0_low_s32, left_shift_dup); + input0_high_s32 = vshlq_s32(input0_high_s32, left_shift_dup); + input1_low_s32 = vshlq_s32(input1_low_s32, left_shift_dup); + input1_high_s32 = vshlq_s32(input1_high_s32, left_shift_dup); + input0_low_s32 = vqrdmulhq_n_s32(input0_low_s32, input0_multiplier); + input0_high_s32 = vqrdmulhq_n_s32(input0_high_s32, input0_multiplier); + input1_low_s32 = vqrdmulhq_n_s32(input1_low_s32, input1_multiplier); + input1_high_s32 = vqrdmulhq_n_s32(input1_high_s32, input1_multiplier); + const auto input0_shift_dup = vdupq_n_s32(input0_shift); + const auto input1_shift_dup = vdupq_n_s32(input1_shift); + input0_low_s32 = vshlq_s32(input0_low_s32, input0_shift_dup); + input0_high_s32 = vshlq_s32(input0_high_s32, input0_shift_dup); + input1_low_s32 = vshlq_s32(input1_low_s32, input1_shift_dup); + input1_high_s32 = vshlq_s32(input1_high_s32, input1_shift_dup); + int32x4_t res_low, res_high; + if (type_ == SUM) { + res_low = vaddq_s32(input0_low_s32, input1_low_s32); + res_high = vaddq_s32(input0_high_s32, input1_high_s32); + } else { + res_low = vsubq_s32(input0_low_s32, input1_low_s32); + res_high = vsubq_s32(input0_high_s32, input1_high_s32); + } + res_low = vqrdmulhq_n_s32(res_low, output_multiplier); + res_high = vqrdmulhq_n_s32(res_high, output_multiplier); + res_low = 
gemmlowp::RoundingDivideByPOT(res_low, -output_shift); + res_high = gemmlowp::RoundingDivideByPOT(res_high, -output_shift); + const auto res_low_s16 = vmovn_s32(res_low); + const auto res_high_s16 = vmovn_s32(res_high); + const auto output_val = + vaddq_s16(vcombine_s16(res_low_s16, res_high_s16), + vdupq_n_s16(output->zero_point())); + vst1_u8(output_ptr + i, vqmovun_s16(output_val)); + } + }, + 0, output->size() - 7, 8); index_t handled_output_size = output->size() - output->size() % 8; -#pragma omp parallel for schedule(runtime) - for (index_t i = handled_output_size; i < output->size(); ++i) { - const int32_t offset_input0 = input0_ptr[i] - input0->zero_point(); - const int32_t offset_input1 = input1_ptr[i] - input1->zero_point(); - const int32_t shifted_input0 = offset_input0 * (1 << left_shift); - const int32_t shifted_input1 = offset_input1 * (1 << left_shift); - const int32_t multiplied_input0 = - gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input0, - input0_multiplier), - -input0_shift); - const int32_t multiplied_input1 = - gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1, - input1_multiplier), - -input1_shift); - - int32_t res; - if (type_ == SUM) { - res = multiplied_input0 + multiplied_input1; - } else { - res = multiplied_input0 - multiplied_input1; - } - - const int32_t output_val = - gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(res, - output_multiplier), - -output_shift) + output->zero_point(); - output_ptr[i] = Saturate(output_val); - } + + thread_pool.Compute1D( + [=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + const int32_t offset_input0 = input0_ptr[i] - input0->zero_point(); + const int32_t offset_input1 = input1_ptr[i] - input1->zero_point(); + const int32_t shifted_input0 = offset_input0 * (1 << left_shift); + const int32_t shifted_input1 = offset_input1 * (1 << left_shift); + const int32_t multiplied_input0 = gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input0, + input0_multiplier), + -input0_shift); + const int32_t multiplied_input1 = gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1, + input1_multiplier), + -input1_shift); + + int32_t res; + if (type_ == SUM) { + res = multiplied_input0 + multiplied_input1; + } else { + res = multiplied_input0 - multiplied_input1; + } + + const int32_t output_val = + gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul( + res, output_multiplier), + -output_shift) + + output->zero_point(); + output_ptr[i] = Saturate(output_val); + } + }, + handled_output_size, output->size(), 1); return MaceStatus::MACE_SUCCESS; } diff --git a/mace/ops/arm/q8/gemv.cc b/mace/ops/arm/q8/gemv.cc index ce102e7e3171ff3344b4535576c9187866305fcd..388c68147ff305cf603c95a62293024b7b1db03d 100644 --- a/mace/ops/arm/q8/gemv.cc +++ b/mace/ops/arm/q8/gemv.cc @@ -19,7 +19,7 @@ #include #include "mace/utils/math.h" -#include "mace/utils/quantize.h" +#include "mace/core/quantize.h" #if !defined(__aarch64__) @@ -82,91 +82,94 @@ MaceStatus Gemv::Compute(const OpContext *context, sum_rhs += static_cast(rhs_base[i]); } -#pragma omp parallel for schedule(runtime) - for (index_t h = 0; h < lhs_height; ++h) { - const uint8_t *lhs_ptr = lhs_data - + static_cast(lhs_batched) * b * lhs_height * lhs_width - + h * lhs_width; - const uint8_t *rhs_ptr = rhs_base; - OUTPUT_TYPE *output_ptr = output_data + b * 
lhs_height + h; - - uint32_t dot = 0; - uint32_t sum_lhs = 0; - uint32x4_t vo0_high_u32 = vdupq_n_u32(0); - uint32x4_t vo0_low_u32 = vdupq_n_u32(0); - uint32x4_t vo1_high_u32 = vdupq_n_u32(0); - uint32x4_t vo1_low_u32 = vdupq_n_u32(0); - uint32x4_t sum_lhs_low_u32 = vdupq_n_u32(0); - uint32x4_t sum_lhs_high_u32 = vdupq_n_u32(0); - - for (index_t w_block_idx = 0; w_block_idx < w_block_count; - ++w_block_idx) { - uint8x8_t vl0_u8 = vld1_u8(lhs_ptr); - uint8x8_t vl1_u8 = vld1_u8(lhs_ptr + 8); - - uint8x8_t vr0_u8 = vld1_u8(rhs_ptr); - uint8x8_t vr1_u8 = vld1_u8(rhs_ptr + 8); - - uint16x8_t vl0_u16 = vmovl_u8(vl0_u8); - uint16x8_t vl1_u16 = vmovl_u8(vl1_u8); - - uint16x8_t vr0_u16 = vmovl_u8(vr0_u8); - uint16x8_t vr1_u16 = vmovl_u8(vr1_u8); - - vo0_high_u32 = vmlal_u16(vo0_high_u32, - vget_high_u16(vl0_u16), - vget_high_u16(vr0_u16)); - vo0_low_u32 = vmlal_u16(vo0_low_u32, - vget_low_u16(vl0_u16), - vget_low_u16(vr0_u16)); - vo1_high_u32 = vmlal_u16(vo1_high_u32, - vget_high_u16(vl1_u16), - vget_high_u16(vr1_u16)); - vo1_low_u32 = vmlal_u16(vo1_low_u32, - vget_low_u16(vl1_u16), - vget_low_u16(vr1_u16)); - - // It can be precuculated if lhs is const, but for this case - // computation is not bottleneck - sum_lhs_high_u32 += vaddl_u16(vget_high_u16(vl0_u16), - vget_high_u16(vl1_u16)); - sum_lhs_low_u32 += vaddl_u16(vget_low_u16(vl0_u16), - vget_low_u16(vl1_u16)); - - lhs_ptr += 16; - rhs_ptr += 16; - } - - vo0_low_u32 = vaddq_u32(vo0_high_u32, vo0_low_u32); - vo1_low_u32 = vaddq_u32(vo1_high_u32, vo1_low_u32); - vo0_low_u32 = vaddq_u32(vo0_low_u32, vo1_low_u32); - dot += vaddvq_u32(vo0_low_u32); - - sum_lhs_low_u32 = vaddq_u32(sum_lhs_high_u32, sum_lhs_low_u32); - sum_lhs = vaddvq_u32(sum_lhs_low_u32); - - for (index_t w = 0; w < w_block_remain; ++w) { - dot += (*lhs_ptr) * (*rhs_ptr); - sum_lhs += (*lhs_ptr); - ++lhs_ptr; - ++rhs_ptr; - } - - const auto zero_point_dot = - static_cast(lhs_zero_point * rhs_zero_point * lhs_width); - int32_t ret = dot - sum_lhs * rhs_zero_point - sum_rhs * lhs_zero_point - + zero_point_dot; - if (bias) { - ret += bias->data()[h]; - } - - if (is_output_type_uint8_) { - *output_ptr = - Saturate(std::roundf(ret * output_multiplier_float)); - } else { - *output_ptr = ret; - } - } // h + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t h = start; h < end; h += step) { + const uint8_t *lhs_ptr = lhs_data + + static_cast(lhs_batched) * b * lhs_height * lhs_width + + h * lhs_width; + const uint8_t *rhs_ptr = rhs_base; + OUTPUT_TYPE *output_ptr = output_data + b * lhs_height + h; + + uint32_t dot = 0; + uint32_t sum_lhs = 0; + uint32x4_t vo0_high_u32 = vdupq_n_u32(0); + uint32x4_t vo0_low_u32 = vdupq_n_u32(0); + uint32x4_t vo1_high_u32 = vdupq_n_u32(0); + uint32x4_t vo1_low_u32 = vdupq_n_u32(0); + uint32x4_t sum_lhs_low_u32 = vdupq_n_u32(0); + uint32x4_t sum_lhs_high_u32 = vdupq_n_u32(0); + + for (index_t w_block_idx = 0; w_block_idx < w_block_count; + ++w_block_idx) { + uint8x8_t vl0_u8 = vld1_u8(lhs_ptr); + uint8x8_t vl1_u8 = vld1_u8(lhs_ptr + 8); + + uint8x8_t vr0_u8 = vld1_u8(rhs_ptr); + uint8x8_t vr1_u8 = vld1_u8(rhs_ptr + 8); + + uint16x8_t vl0_u16 = vmovl_u8(vl0_u8); + uint16x8_t vl1_u16 = vmovl_u8(vl1_u8); + + uint16x8_t vr0_u16 = vmovl_u8(vr0_u8); + uint16x8_t vr1_u16 = vmovl_u8(vr1_u8); + + vo0_high_u32 = vmlal_u16(vo0_high_u32, + vget_high_u16(vl0_u16), + vget_high_u16(vr0_u16)); + vo0_low_u32 = vmlal_u16(vo0_low_u32, + vget_low_u16(vl0_u16), 
+ vget_low_u16(vr0_u16)); + vo1_high_u32 = vmlal_u16(vo1_high_u32, + vget_high_u16(vl1_u16), + vget_high_u16(vr1_u16)); + vo1_low_u32 = vmlal_u16(vo1_low_u32, + vget_low_u16(vl1_u16), + vget_low_u16(vr1_u16)); + + // It can be precalculated if lhs is const, but for this case + // computation is not bottleneck + sum_lhs_high_u32 += vaddl_u16(vget_high_u16(vl0_u16), + vget_high_u16(vl1_u16)); + sum_lhs_low_u32 += vaddl_u16(vget_low_u16(vl0_u16), + vget_low_u16(vl1_u16)); + + lhs_ptr += 16; + rhs_ptr += 16; + } + + vo0_low_u32 = vaddq_u32(vo0_high_u32, vo0_low_u32); + vo1_low_u32 = vaddq_u32(vo1_high_u32, vo1_low_u32); + vo0_low_u32 = vaddq_u32(vo0_low_u32, vo1_low_u32); + dot += vaddvq_u32(vo0_low_u32); + + sum_lhs_low_u32 = vaddq_u32(sum_lhs_high_u32, sum_lhs_low_u32); + sum_lhs = vaddvq_u32(sum_lhs_low_u32); + + for (index_t w = 0; w < w_block_remain; ++w) { + dot += (*lhs_ptr) * (*rhs_ptr); + sum_lhs += (*lhs_ptr); + ++lhs_ptr; + ++rhs_ptr; + } + + const auto zero_point_dot = + static_cast(lhs_zero_point * rhs_zero_point * lhs_width); + int32_t ret = dot - sum_lhs * rhs_zero_point - sum_rhs * lhs_zero_point + + zero_point_dot; + if (bias) { + ret += bias->data()[h]; + } + + if (is_output_type_uint8_) { + *output_ptr = + Saturate(std::roundf(ret * output_multiplier_float)); + } else { + *output_ptr = ret; + } + } // h + }, 0, lhs_height, 1); } // b diff --git a/mace/ops/arm/q8/gemv_test.cc b/mace/ops/arm/q8/gemv_test.cc index ced75f64716e4feb2f24603eda4883078c8ade94..6216cabaed02bbfc84ebc4b10adc0a012cdece3e 100644 --- a/mace/ops/arm/q8/gemv_test.cc +++ b/mace/ops/arm/q8/gemv_test.cc @@ -54,8 +54,12 @@ void TestGemvInt32(const index_t batch, GenerateRandomIntTypeData(bias.shape(), bias_data); } + utils::ThreadPool thread_pool(1, AFFINITY_NONE); + thread_pool.Init(); + CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool); + OpContext context(nullptr, &cpu_device); mace::ops::arm::q8::Gemv gemv; - gemv.Compute(nullptr, + gemv.Compute(&context, &lhs, &rhs, &bias, @@ -122,8 +126,12 @@ void TestGemvUint8(const index_t batch, GenerateRandomIntTypeData(bias.shape(), bias_data); } + utils::ThreadPool thread_pool(1, AFFINITY_NONE); + thread_pool.Init(); + CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool); + OpContext context(nullptr, &cpu_device); mace::ops::arm::q8::Gemv gemv; - gemv.Compute(nullptr, + gemv.Compute(&context, &lhs, &rhs, &bias, diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc index 469efe2e0c5eaac299d2622931a5e36154973d8e..c6559032973cdc580aa34b6fe53aaae5f8d585b3 100644 --- a/mace/ops/batch_norm.cc +++ b/mace/ops/batch_norm.cc @@ -18,6 +18,13 @@ #include "mace/core/operator.h" #include "mace/ops/activation.h" + +#if defined(MACE_ENABLE_NEON) +#include "mace/ops/arm/fp32/activation.h" +#else +#include "mace/ops/ref/activation.h" +#endif + #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/batch_norm.h" @@ -27,21 +34,22 @@ namespace mace { namespace ops { -template +template class BatchNormOp; -template <> +template<> class BatchNormOp : public Operation { public: explicit BatchNormOp(OpConstructContext *context) : Operation(context), epsilon_(Operation::GetOptionalArg("epsilon", static_cast(1e-4))), - activation_(ops::StringToActivationType( - Operation::GetOptionalArg("activation", "NOOP"))), - relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)), - leakyrelu_coefficient_(Operation::GetOptionalArg( - "leakyrelu_coefficient", 0.0f)) {} + activation_delegator_( + ops::StringToActivationType( + 
Operation::GetOptionalArg("activation", "NOOP")), + Operation::GetOptionalArg("max_limit", 0.0f), + Operation::GetOptionalArg( + "leakyrelu_coefficient", 0.0f)) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -73,74 +81,85 @@ class BatchNormOp : public Operation { const index_t height = input->dim(2); const index_t width = input->dim(3); - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard scale_mapper(scale); - Tensor::MappingGuard offset_mapper(offset); - Tensor::MappingGuard output_mapper(output); - - const float *input_ptr = input->data(); - const float *scale_ptr = scale->data(); - const float *offset_ptr = offset->data(); - float *output_ptr = output->mutable_data(); - - std::vector new_scale; - std::vector new_offset; - if (not_folded) { - const Tensor *mean = this->Input(MEAN); - const Tensor *var = this->Input(VAR); - MACE_CHECK(mean->dim_size() == 1, "mean must be 1-dimensional. ", - mean->dim_size()); - MACE_CHECK(var->dim_size() == 1, "var must be 1-dimensional. ", - var->dim_size()); - new_scale.resize(channels); - new_offset.resize(channels); - Tensor::MappingGuard mean_mapper(mean); - Tensor::MappingGuard var_mapper(var); - const float *mean_ptr = mean->data(); - const float *var_ptr = var->data(); -#pragma omp parallel for - for (index_t c = 0; c < channels; ++c) { - new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + epsilon_); - new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c]; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + { + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard scale_mapper(scale); + Tensor::MappingGuard offset_mapper(offset); + Tensor::MappingGuard output_mapper(output); + + const float *input_ptr = input->data(); + const float *scale_ptr = scale->data(); + const float *offset_ptr = offset->data(); + float *output_ptr = output->mutable_data(); + + std::vector new_scale; + std::vector new_offset; + if (not_folded) { + const Tensor *mean = this->Input(MEAN); + const Tensor *var = this->Input(VAR); + MACE_CHECK(mean->dim_size() == 1, "mean must be 1-dimensional. ", + mean->dim_size()); + MACE_CHECK(var->dim_size() == 1, "var must be 1-dimensional. ", + var->dim_size()); + new_scale.resize(channels); + new_offset.resize(channels); + Tensor::MappingGuard mean_mapper(mean); + Tensor::MappingGuard var_mapper(var); + const float *mean_ptr = mean->data(); + const float *var_ptr = var->data(); + + thread_pool.Compute1D([=, &new_scale, &new_offset](index_t start, + index_t end, + index_t step) { + for (index_t c = start; c < end; c += step) { + new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + epsilon_); + new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c]; + } + }, 0, channels, 1); } - } - const float *scale_data = not_folded ? new_scale.data() : scale_ptr; - const float - *offset_data = not_folded ? new_offset.data() : offset_ptr; - - index_t channel_size = height * width; - index_t batch_size = channels * channel_size; - - // NEON is slower, so stick to the trivial implementaion -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { - index_t offset = b * batch_size + c * channel_size; - for (index_t hw = 0; hw < height * width; ++hw) { - output_ptr[offset + hw] = - scale_data[c] * input_ptr[offset + hw] + offset_data[c]; + const float *scale_data = not_folded ? new_scale.data() : scale_ptr; + const float + *offset_data = not_folded ? 
new_offset.data() : offset_ptr; + + index_t channel_size = height * width; + index_t batch_size = channels * channel_size; + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + index_t offset = b * batch_size + c * channel_size; + for (index_t hw = 0; hw < height * width; ++hw) { + output_ptr[offset + hw] = + scale_data[c] * input_ptr[offset + hw] + offset_data[c]; + } + } } - } + }, 0, batch, 1, 0, channels, 1); } - DoActivation(output_ptr, output_ptr, output->size(), activation_, - relux_max_limit_, leakyrelu_coefficient_); + + activation_delegator_.Compute(context, output, output); return MaceStatus::MACE_SUCCESS; } private: float epsilon_; - const ActivationType activation_; - const float relux_max_limit_; - const float leakyrelu_coefficient_; +#ifdef MACE_ENABLE_NEON + arm::fp32::Activation activation_delegator_; +#else + ref::Activation activation_delegator_; +#endif // MACE_ENABLE_NEON protected: MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR); MACE_OP_OUTPUT_TAGS(OUTPUT); }; - #ifdef MACE_ENABLE_OPENCL template class BatchNormOp : public Operation { @@ -213,7 +232,6 @@ class BatchNormOp : public Operation { }; #endif // MACE_ENABLE_OPENCL - void RegisterBatchNorm(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp, DeviceType::CPU, float); diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc index cfd350d458429ea86a68e9176c41108e2469f392..c44501f12e73a92c942d987ac1e51a0fbd1648c9 100644 --- a/mace/ops/batch_to_space.cc +++ b/mace/ops/batch_to_space.cc @@ -125,7 +125,6 @@ class BatchToSpaceNDOp : public BatchToSpaceOpBase { std::max(static_cast(1), 8 * 1024 / block_shape_w / out_width); // make channel outter loop so we can make best use of cache -#pragma omp parallel for collapse(3) schedule(runtime) for (index_t c = 0; c < channels; ++c) { for (index_t block_h = 0; block_h < in_height; block_h += block_h_size) { @@ -214,7 +213,6 @@ class BatchToSpaceNDOp : public BatchToSpaceOpBase { index_t out_width = space_tensor->dim(2); index_t channels = space_tensor->dim(3); -#pragma omp parallel for schedule(runtime) for (index_t in_b = 0; in_b < in_batches; ++in_b) { const index_t b = in_b % out_batches; const index_t tile_index = in_b / out_batches; diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc index 6606c2c257b2ead3dd756a8477e39f383a25b37c..9351de79518ee71671f7595f39f2c410a7e7b265 100644 --- a/mace/ops/bias_add.cc +++ b/mace/ops/bias_add.cc @@ -18,6 +18,13 @@ #include "mace/core/operator.h" #include "mace/ops/activation.h" + +#ifdef MACE_ENABLE_NEON +#include "mace/ops/arm/fp32/bias_add.h" +#else +#include "mace/ops/ref/bias_add.h" +#endif // MACE_ENABLE_NEON + #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/bias_add.h" @@ -47,36 +54,26 @@ class BiasAddOp : public Operation { bias->dim_size()); Tensor *output = this->Output(0); - MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard bias_mapper(bias); - Tensor::MappingGuard output_mapper(output); + if (input->dim_size() == 4 && has_data_format_) { + bias_add_delegator_.Compute(context, input, bias, output); + } else { + // TODO(liyin): remove it and tranform bias to add (eltwise) + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - const float *input_ptr = input->data(); - const float 
*bias_ptr = bias->data(); - float *output_ptr = output->mutable_data(); + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard bias_mapper(bias); + Tensor::MappingGuard output_mapper(output); - if (input->dim_size() == 4 && has_data_format_) { - const index_t batch = input->dim(0); - const index_t channels = input->dim(1); - const index_t height_width = input->dim(2) * input->dim(3); + const float *input_ptr = input->data(); + const float *bias_ptr = bias->data(); + float *output_ptr = output->mutable_data(); -#pragma omp parallel for collapse(2) - for (index_t n = 0; n < batch; ++n) { - for (index_t c = 0; c < channels; ++c) { - for (index_t hw = 0; hw < height_width; ++hw) { - index_t pos = (n * channels + c) * height_width + hw; - output_ptr[pos] = input_ptr[pos] + bias_ptr[c]; - } - } - } - } else { const std::vector &shape = input->shape(); const index_t fused_batch = std::accumulate( shape.begin(), shape.end() - 1, 1, std::multiplies()); const index_t channels = *shape.rbegin(); -#pragma omp parallel for + for (index_t n = 0; n < fused_batch; ++n) { index_t pos = n * channels; for (index_t c = 0; c < channels; ++c) { @@ -91,6 +88,11 @@ class BiasAddOp : public Operation { private: int has_data_format_; +#ifdef MACE_ENABLE_NEON + arm::fp32::BiasAdd bias_add_delegator_; +#else + ref::BiasAdd bias_add_delegator_; +#endif // MACE_ENABLE_NEON }; #ifdef MACE_ENABLE_OPENCL diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc index 70e1811a07292af8eb0982caf46decb393f28325..966b5d57347b9405d3d43d9c113b00de3d38ce3e 100644 --- a/mace/ops/channel_shuffle.cc +++ b/mace/ops/channel_shuffle.cc @@ -56,7 +56,6 @@ class ChannelShuffleOp : public Operation { index_t batch_size = channels * image_size; index_t channels_per_group = channels / groups_; -#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channels; ++c) { index_t g = c % groups_; diff --git a/mace/ops/common/conv_pool_2d_util.cc b/mace/ops/common/conv_pool_2d_util.cc index ade33c59002d3924123eede8687269de3abb2119..2ca95a7d75986c03c81d80f9ce0365d53df7005b 100644 --- a/mace/ops/common/conv_pool_2d_util.cc +++ b/mace/ops/common/conv_pool_2d_util.cc @@ -76,16 +76,14 @@ void CalcPaddingAndOutputSize(const index_t *input_shape, output_height = (input_height - k_extent_height) / strides[0] + 1; output_width = (input_width - k_extent_width) / strides[1] + 1; break; - case SAME: - output_height = (input_height - 1) / strides[0] + 1; + case SAME:output_height = (input_height - 1) / strides[0] + 1; output_width = (input_width - 1) / strides[1] + 1; break; case FULL: output_height = (input_height + k_extent_height - 2) / strides[0] + 1; output_width = (input_width + k_extent_width - 2) / strides[1] + 1; break; - default: - MACE_CHECK(false, "Unsupported padding type: ", padding); + default:MACE_CHECK(false, "Unsupported padding type: ", padding); } // Note: TensorFlow may padded one more on the right/bottom side @@ -210,20 +208,6 @@ void CalcOutputSize(const index_t *input_shape, } } -void CalcNCHWInputShape(const index_t *output_shape, - const index_t *filter_shape, - const int *strides, - const int *dilations, - index_t *input_shape) { - MACE_CHECK_NOTNULL(input_shape); - input_shape[0] = output_shape[0]; - input_shape[1] = filter_shape[1]; - input_shape[2] = (output_shape[2] - 1) * strides[0] + - (filter_shape[2] - 1) * dilations[0] + 1; - input_shape[3] = (output_shape[3] - 1) * strides[1] + - (filter_shape[3] - 1) * dilations[1] + 1; -} - void 
CalcOutputSize(const index_t *input_shape, // NHWC const index_t *filter_shape, // OIHW const int *padding_size, @@ -236,231 +220,202 @@ void CalcOutputSize(const index_t *input_shape, // NHWC } void CalcNCHWOutputSize(const index_t *input_shape, // NCHW - const index_t *filter_shape, // OIHW - const int *padding_size, - const int *dilations, - const int *strides, - const RoundType round_type, - index_t *output_shape) { + const index_t *filter_shape, // OIHW + const int *padding_size, + const int *dilations, + const int *strides, + const RoundType round_type, + index_t *output_shape) { CalcOutputSize(input_shape, NCHW, filter_shape, OIHW, padding_size, dilations, strides, round_type, output_shape); } -void CalPaddingSize(const index_t *input_shape, // NCHW - const index_t *filter_shape, // OIHW - const int *strides, - const int *dilations, - Padding padding, - int *padding_size) { - MACE_CHECK(dilations[0] > 0 && dilations[1] > 0, - "Invalid dilations, must >= 1"); - MACE_CHECK((dilations[0] == 1 || strides[0] == 1) && - (dilations[1] == 1 || strides[1] == 1), - "If dilations > 1, strides should be 1"); - MACE_CHECK_NOTNULL(padding_size); - - index_t output_height = 0, output_width = 0; - index_t k_extent_height = (filter_shape[2] - 1) * dilations[0] + 1; - index_t k_extent_width = (filter_shape[3] - 1) * dilations[1] + 1; - - switch (padding) { +void CalcDeconvShape_TF(const std::vector &input_shape, + const std::vector &filter_shape, + const std::vector &output_shape, + const std::vector &strides, + Padding padding_type, + const int group, + std::vector *in_pad_size, + std::vector *out_pad_size, + std::vector *padded_out_shape, + DataFormat data_format) { + const index_t + in_height = data_format == NCHW ? input_shape[2] : input_shape[1]; + const index_t + in_width = data_format == NCHW ? input_shape[3] : input_shape[2]; + + const index_t + out_height = data_format == NCHW ? output_shape[2] : output_shape[1]; + const index_t + out_width = data_format == NCHW ? output_shape[3] : output_shape[2]; + + const index_t extended_in_height = (in_height - 1) * strides[0] + 1; + const index_t extended_in_width = (in_width - 1) * strides[1] + 1; + + const index_t kernel_h = filter_shape[2]; + const index_t kernel_w = filter_shape[3]; + + index_t expected_input_height = 0, expected_input_width = 0; + + switch (padding_type) { case VALID: - output_height = (input_shape[2] - k_extent_height) / strides[0] + 1; - output_width = (input_shape[3] - k_extent_width) / strides[1] + 1; + expected_input_height = + (out_height - kernel_h + strides[0]) / strides[0]; + expected_input_width = + (out_width - kernel_w + strides[1]) / strides[1]; break; case SAME: - output_height = (input_shape[2] - 1) / strides[0] + 1; - output_width = (input_shape[3] - 1) / strides[1] + 1; - break; - case FULL: - output_height = (input_shape[2] + k_extent_height - 2) / strides[0] + 1; - output_width = (input_shape[3] + k_extent_width - 2) / strides[1] + 1; + expected_input_height = + (out_height + strides[0] - 1) / strides[0]; + expected_input_width = + (out_width + strides[1] - 1) / strides[1]; break; - default: - MACE_CHECK(false, "Unsupported padding type: ", padding); + default:MACE_CHECK(false, "Unsupported padding type: ", padding_type); } - // Note: TensorFlow may padded one more on the right/bottom side - // TODO(liuqi): may be it's better to also truncate the left/top to - // utilize the more centered features. We need to benchmark - // based on the model accuracy. 
- padding_size[0] = std::max( - 0, (output_height - 1) * strides[0] + k_extent_height - input_shape[2]); - padding_size[1] = std::max( - 0, (output_width - 1) * strides[1] + k_extent_width - input_shape[3]); -} - - -MaceStatus ConstructNCHWInputWithPadding(const Tensor *input_tensor, - const int *paddings, - Tensor *output_tensor, - bool padding_same_value) { - Tensor::MappingGuard input_mapper(input_tensor); - const float *input = input_tensor->data(); - const index_t *input_shape = input_tensor->shape().data(); - - index_t batch = input_shape[0]; - index_t channels = input_shape[1]; - index_t height = input_shape[2]; - index_t width = input_shape[3]; - - std::vector output_shape( - {batch, channels, paddings[0] + height, paddings[1] + width}); - - const index_t output_width = output_shape[3]; - const int padded_top = paddings[0] / 2; - const int padded_left = paddings[1] / 2; - - MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape)); - - Tensor::MappingGuard padded_output_mapper(output_tensor); - float *output_data = output_tensor->mutable_data(); - memset(output_data, 0, output_tensor->size() * sizeof(float)); - - // Skip the padded top rows - if (padding_same_value) { -#define MACE_COPY_INPUT \ - std::fill(output_data, output_data + padded_left, input[0]); \ - output_data += padded_left; \ - memcpy(output_data, input, width * sizeof(float)); \ - output_data += width; \ - std::fill(output_data, output_data + padded_right, input[width - 1]); \ - output_data += padded_right; - - const int padded_bottom = paddings[0] - padded_top; - const int padded_right = paddings[1] - padded_left; - - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - for (int k = 0; k < padded_top; ++k) { - MACE_COPY_INPUT; - } - for (int k = 0; k < height; ++k) { - MACE_COPY_INPUT; - input += width; - } - input -= width; - for (int k = 0; k < padded_bottom; ++k) { - MACE_COPY_INPUT; - } - input += width; - } - } -#undef MACE_COPY_INPUT - } else { - output_data += padded_top * output_width; - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - for (int k = 0; k < height; ++k) { - memcpy(output_data + padded_left, input, width * sizeof(float)); - input += width; - output_data += output_width; - } - // Skip the padded bottom in this channel and top in the next channel - output_data += paddings[0] * output_width; - } - } + MACE_CHECK(expected_input_height == in_height, + expected_input_height, "!=", in_height); + MACE_CHECK(expected_input_width == in_width, + expected_input_width, "!=", in_width); + + const index_t padded_out_height = + (in_height - 1) * strides[0] + kernel_h; + const index_t padded_out_width = + (in_width - 1) * strides[1] + kernel_w; + + if (in_pad_size != nullptr) { + const int p_h = + static_cast(out_height + kernel_h - 1 - extended_in_height); + const int p_w = + static_cast(out_width + kernel_w - 1 - extended_in_width); + in_pad_size->resize(2); + (*in_pad_size)[0] = std::max(0, p_h); + (*in_pad_size)[1] = std::max(0, p_w); } - return MaceStatus::MACE_SUCCESS; -} - -MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor, - const int pad_top, - const int pad_bottom, - const int pad_left, - const int pad_right, - Tensor *output_tensor) { - const float *input = input_tensor->data(); - const index_t *input_shape = input_tensor->shape().data(); - - index_t batch = input_shape[0]; - index_t channels = input_shape[1]; - index_t height = input_shape[2]; - index_t width = input_shape[3]; - - const int pad_height = pad_top + pad_bottom; - 
const int pad_width = pad_left + pad_right; - std::vector output_shape( - {batch, channels, height + pad_height, width + pad_width}); - MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape)); - output_tensor->Clear(); - Tensor::MappingGuard padded_output_mapper(output_tensor); - float *output_data = output_tensor->mutable_data(); - - const index_t output_height = output_shape[2]; - const index_t output_width = output_shape[3]; - const index_t in_image_size = height * width; - const index_t out_image_size = output_height * output_width; - const index_t in_batch_size = channels * in_image_size; - const index_t out_batch_size = channels * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - for (int k = 0; k < height; ++k) { - memcpy(output_data + i * out_batch_size + j * out_image_size - + (pad_top + k) * output_width + pad_left, - input + i * in_batch_size + j * in_image_size + k * width, - width * sizeof(float)); - } - // Skip the padded bottom in this channel and top in the next channel - } + if (out_pad_size != nullptr) { + const int o_p_h = static_cast(padded_out_height - out_height); + const int o_p_w = static_cast(padded_out_width - out_width); + out_pad_size->resize(2); + (*out_pad_size)[0] = std::max(0, o_p_h); + (*out_pad_size)[1] = std::max(0, o_p_w); } - return MaceStatus::MACE_SUCCESS; + if (padded_out_shape != nullptr) { + index_t output_channel = filter_shape[0] * group; + padded_out_shape->resize(4); + (*padded_out_shape)[0] = output_shape[0]; + (*padded_out_shape)[1] = + data_format == NCHW ? output_channel : padded_out_height; + (*padded_out_shape)[2] = + data_format == NCHW ? padded_out_height : padded_out_width; + (*padded_out_shape)[3] = + data_format == NCHW ? padded_out_width : output_channel; + } } +void CalcDeconvShape_Caffe(const std::vector &input_shape, + const std::vector &filter_shape, + const std::vector &strides, + const std::vector &out_pad_size, + const int group, + std::vector *out_shape, + std::vector *in_pad_size, + std::vector *padded_out_shape, + DataFormat data_format) { + const index_t + in_height = data_format == NCHW ? input_shape[2] : input_shape[1]; + const index_t + in_width = data_format == NCHW ? 
input_shape[3] : input_shape[2]; + + const index_t output_channel = filter_shape[0] * group; + + const index_t kernel_h = filter_shape[2]; + const index_t kernel_w = filter_shape[3]; + + index_t padded_out_height = + (in_height - 1) * strides[0] + kernel_h; + index_t padded_out_width = + (in_width - 1) * strides[1] + kernel_w; + + if (in_pad_size != nullptr) { + in_pad_size->resize(2); + (*in_pad_size)[0] = static_cast((kernel_h - 1) * 2 - out_pad_size[0]); + (*in_pad_size)[1] = static_cast((kernel_w - 1) * 2 - out_pad_size[1]); + (*in_pad_size)[0] = std::max(0, (*in_pad_size)[0]); + (*in_pad_size)[1] = std::max(0, (*in_pad_size)[1]); + } -MaceStatus ConstructNHWCInputWithPadding(const Tensor *input_tensor, - const int *paddings, - Tensor *output_tensor, - bool padding_same_value) { - Tensor::MappingGuard input_mapper(input_tensor); - const float *input = input_tensor->data(); - const index_t *input_shape = input_tensor->shape().data(); - - index_t batch = input_shape[0]; - index_t height = input_shape[1]; - index_t width = input_shape[2]; - index_t channels = input_shape[3]; - - std::vector output_shape( - {batch, paddings[0] + height, paddings[1] + width, channels}); - - const int output_height = output_shape[1]; - const int output_width = output_shape[2]; - const int padded_top = paddings[0] / 2; - const int padded_left = paddings[1] / 2; - - MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape)); + if (padded_out_shape != nullptr) { + padded_out_shape->resize(4); + (*padded_out_shape)[0] = input_shape[0]; + (*padded_out_shape)[1] = + data_format == NCHW ? output_channel : padded_out_height; + (*padded_out_shape)[2] = + data_format == NCHW ? padded_out_height : padded_out_width; + (*padded_out_shape)[3] = + data_format == NCHW ? padded_out_width : output_channel; + } - Tensor::MappingGuard padded_output_mapper(output_tensor); - float *output_data = output_tensor->mutable_data(); - memset(output_data, 0, output_tensor->size() * sizeof(float)); + if (out_shape != nullptr) { + index_t out_height = padded_out_height - out_pad_size[0]; + index_t out_width = padded_out_width - out_pad_size[1]; + out_shape->resize(4); + (*out_shape)[0] = input_shape[0]; + (*out_shape)[1] = data_format == NCHW ? output_channel : out_height; + (*out_shape)[2] = data_format == NCHW ? out_height : out_width; + (*out_shape)[3] = data_format == NCHW ? 
out_width : output_channel; + } +} - // Skip the padded top rows - if (padding_same_value) { - LOG(FATAL) << "Not implemented"; - } else { -#pragma omp parallel for collapse(3) schedule(runtime) - for (int n = 0; n < batch; ++n) { - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; ++w) { - const float *input_ptr = - input + ((n * height + h) * width + w) * channels; - float *output_ptr = - output_data + - ((n * output_height + h + padded_top) * output_width + w + - padded_left) * - channels; - memcpy(output_ptr, input_ptr, channels * sizeof(float)); - } - } +void CalDeconvOutputShapeAndPadSize(const std::vector &input_shape, + const std::vector &filter_shape, + const std::vector &strides, + Padding padding_type, + const std::vector &paddings, + int group, + std::vector *output_shape, + std::vector *in_pad_size, + std::vector *out_pad_size, + std::vector *padded_out_shape, + FrameworkType framework_type, + DataFormat data_format) { + if (framework_type == FrameworkType::TENSORFLOW) { + MACE_CHECK(output_shape->size() == 4, + "deconv output shape shoud be 4-dims"); + std::vector &out_shape = *output_shape; + if (data_format == NCHW) { + const index_t t = out_shape[1]; + out_shape[1] = out_shape[3]; + out_shape[3] = out_shape[2]; + out_shape[2] = t; } - } - return MaceStatus::MACE_SUCCESS; + CalcDeconvShape_TF( + input_shape, + filter_shape, + *output_shape, + strides, + padding_type, + group, + in_pad_size, + out_pad_size, + padded_out_shape, + data_format); + } else { // caffe + if (!paddings.empty()) *out_pad_size = paddings; + CalcDeconvShape_Caffe( + input_shape, + filter_shape, + strides, + *out_pad_size, + group, + output_shape, + in_pad_size, + padded_out_shape, + data_format); + } } } // namespace ops diff --git a/mace/ops/common/conv_pool_2d_util.h b/mace/ops/common/conv_pool_2d_util.h index e8d0d335f1e0900cf1c265817cbcd73dd63c66b3..389575d76a78b7154887865f203ee8c29f059a4d 100644 --- a/mace/ops/common/conv_pool_2d_util.h +++ b/mace/ops/common/conv_pool_2d_util.h @@ -15,6 +15,7 @@ #ifndef MACE_OPS_COMMON_CONV_POOL_2D_UTIL_H_ #define MACE_OPS_COMMON_CONV_POOL_2D_UTIL_H_ +#include #include "mace/core/tensor.h" namespace mace { @@ -77,41 +78,25 @@ void CalcOutputSize(const index_t *input_shape, // NHWC index_t *output_shape); void CalcNCHWOutputSize(const index_t *input_shape, - const index_t *filter_shape, - const int *padding_size, - const int *dilations, - const int *strides, - const RoundType round_type, - index_t *output_shape); - -void CalcNCHWInputShape(const index_t *output_shape, const index_t *filter_shape, - const int *strides, + const int *padding_size, const int *dilations, - index_t *input_shape); - -void CalPaddingSize(const index_t *input_shape, // NCHW - const index_t *filter_shape, // OIHW - const int *dilations, - const int *strides, - Padding padding, - int *padding_size); - - -MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input, - const int pad_top, const int pad_bottom, - const int pad_left, const int pad_right, - Tensor *output_tensor); - -MaceStatus ConstructNCHWInputWithPadding(const Tensor *input, - const int *paddings, - Tensor *output_tensor, - bool padding_same_value = false); - -MaceStatus ConstructNHWCInputWithPadding(const Tensor *input, - const int *paddings, - Tensor *output_tensor, - bool padding_same_value = false); + const int *strides, + const RoundType round_type, + index_t *output_shape); + +void CalDeconvOutputShapeAndPadSize(const std::vector &input_shape, + const std::vector &filter_shape, + const std::vector 
&strides, + Padding padding_type, + const std::vector &paddings, + int group, + std::vector *output_shape, + std::vector *in_pad_size, + std::vector *out_pad_size, + std::vector *padded_out_shape, + FrameworkType framework_type, + DataFormat data_format); } // namespace ops } // namespace mace diff --git a/mace/ops/common/gemmlowp_util.h b/mace/ops/common/gemmlowp_util.h index c7eed2ad275c9b51cc5cf55cf2f88f90edf3d500..a01ec82ef68cd84897d1090e7e958d8807fae214 100644 --- a/mace/ops/common/gemmlowp_util.h +++ b/mace/ops/common/gemmlowp_util.h @@ -19,7 +19,7 @@ #include "public/gemmlowp.h" #include "mace/core/types.h" -#include "mace/utils/quantize.h" +#include "mace/core/quantize.h" namespace mace { diff --git a/mace/ops/common/lstm.cc b/mace/ops/common/lstm.cc index beea3f5b8081584b219cd6c662c4451dfe4cc223..cde148e1560168b7ddd9138a7fb4847663bc9de2 100644 --- a/mace/ops/common/lstm.cc +++ b/mace/ops/common/lstm.cc @@ -21,7 +21,8 @@ namespace mace { namespace ops { -void LSTMNonlinearKernel(const float *input_data, +void LSTMNonlinearKernel(const OpContext *context, + const float *input_data, const float *prev_data, const float *scale_data, const float *params_data, @@ -34,41 +35,44 @@ void LSTMNonlinearKernel(const float *input_data, float f_scale = (embed_scales && scale_data) ? scale_data[1] : 1.0f; float o_scale = (embed_scales && scale_data) ? scale_data[2] : 1.0f; - if (prev_data == nullptr) { -#pragma omp parallel for schedule(runtime) - for (int c = 0; c < cell_dim; ++c) { - float i_part = input_data[c]; - float c_part = input_data[c + 2 * cell_dim]; - float o_part = input_data[c + 3 * cell_dim]; - float w_oc = params_data[c + params_stride * 2]; - float i_t = ScalarSigmoid(i_part); - float c_t = i_t * i_scale * std::tanh(c_part); - float o_t = ScalarSigmoid(o_part + w_oc * c_t); - float m_t = o_t * o_scale * std::tanh(c_t); - output_cell[c] = c_t; - output_data[c] = m_t; - } - } else { -#pragma omp parallel for schedule(runtime) - for (int c = 0; c < cell_dim; ++c) { - float i_part = input_data[c]; - float f_part = input_data[c + cell_dim]; - float c_part = input_data[c + 2 * cell_dim]; - float o_part = input_data[c + 3 * cell_dim]; - float c_prev = prev_data[c]; - float w_ic = params_data[c]; - float w_fc = params_data[c + params_stride]; - float w_oc = params_data[c + params_stride * 2]; - float i_t = ScalarSigmoid(i_part + w_ic * c_prev); - float f_t = ScalarSigmoid(f_part + w_fc * c_prev); - float c_t = - f_t * f_scale * c_prev + i_t * i_scale * std::tanh(c_part); - float o_t = ScalarSigmoid(o_part + w_oc * c_t); - float m_t = o_t * o_scale * std::tanh(c_t); - output_cell[c] = c_t; - output_data[c] = m_t; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + if (prev_data == nullptr) { + for (index_t c = start; c < end; c += step) { + float i_part = input_data[c]; + float c_part = input_data[c + 2 * cell_dim]; + float o_part = input_data[c + 3 * cell_dim]; + float w_oc = params_data[c + params_stride * 2]; + float i_t = ScalarSigmoid(i_part); + float c_t = i_t * i_scale * std::tanh(c_part); + float o_t = ScalarSigmoid(o_part + w_oc * c_t); + float m_t = o_t * o_scale * std::tanh(c_t); + output_cell[c] = c_t; + output_data[c] = m_t; + } + } else { + for (index_t c = start; c < end; c += step) { + float i_part = input_data[c]; + float f_part = input_data[c + cell_dim]; + float c_part = input_data[c + 2 * cell_dim]; + float o_part = input_data[c + 3 * cell_dim]; + float c_prev 
= prev_data[c]; + float w_ic = params_data[c]; + float w_fc = params_data[c + params_stride]; + float w_oc = params_data[c + params_stride * 2]; + float i_t = ScalarSigmoid(i_part + w_ic * c_prev); + float f_t = ScalarSigmoid(f_part + w_fc * c_prev); + float c_t = + f_t * f_scale * c_prev + i_t * i_scale * std::tanh(c_part); + float o_t = ScalarSigmoid(o_part + w_oc * c_t); + float m_t = o_t * o_scale * std::tanh(c_t); + output_cell[c] = c_t; + output_data[c] = m_t; + } } - } + }, 0, cell_dim, 1); } } // namespace ops diff --git a/mace/ops/common/lstm.h b/mace/ops/common/lstm.h index b835386041b6ba86f13818fe4f57c1efb1dff15d..d9e4024894dba1a7c3995e8239ef0a9e814a50e9 100644 --- a/mace/ops/common/lstm.h +++ b/mace/ops/common/lstm.h @@ -16,10 +16,13 @@ #define MACE_OPS_COMMON_LSTM_H_ #include "mace/core/types.h" +#include "mace/core/op_context.h" + namespace mace { namespace ops { -void LSTMNonlinearKernel(const float *input_data, +void LSTMNonlinearKernel(const OpContext *opContext, + const float *input_data, const float *prev_data, const float *scale_data, const float *params_data, @@ -29,7 +32,6 @@ void LSTMNonlinearKernel(const float *input_data, float *output_cell, float *output_data); - } // namespace ops } // namespace mace diff --git a/mace/ops/common/transpose.cc b/mace/ops/common/transpose.cc deleted file mode 100644 index 79a7a6be064368f34864fee115af6d7735b50a83..0000000000000000000000000000000000000000 --- a/mace/ops/common/transpose.cc +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
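
The lstm.cc change above is the pattern applied throughout this patch: an `#pragma omp parallel for` loop becomes a `utils::ThreadPool::Compute1D` (or `Compute2D`/`Compute3D`) call, the loop body moves into a lambda over a `[start, end)` range with a `step`, and the original loop bounds are passed as the trailing arguments. A minimal sketch of that conversion, assuming only the `Compute1D(body, start, end, step)` call shape used in this patch; the kernel names here are illustrative and not part of the change:

// Before: OpenMP drives an element-wise loop (illustrative kernel).
void ScaleKernelOmp(const float *in, float *out, index_t n, float alpha) {
#pragma omp parallel for schedule(runtime)
  for (index_t i = 0; i < n; ++i) {
    out[i] = alpha * in[i];
  }
}

// After: the same body runs inside a ThreadPool::Compute1D slice taken
// from the device's CPU runtime, mirroring LSTMNonlinearKernel above.
void ScaleKernelThreadPool(const OpContext *context, const float *in,
                           float *out, index_t n, float alpha) {
  utils::ThreadPool &thread_pool =
      context->device()->cpu_runtime()->thread_pool();
  thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
    for (index_t i = start; i < end; i += step) {
      out[i] = alpha * in[i];
    }
  }, 0, n, 1);
}

The lambda captures by value, so a kernel that previously read its function parameters directly (as LSTMNonlinearKernel did) keeps the same body inside the slice.
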
- -#include "mace/ops/common/transpose.h" - -#if defined(MACE_ENABLE_NEON) -#include -#endif - -namespace mace { -namespace ops { - -namespace transpose { -void TransposeNHWCToNCHWC3(const float *input, - float *output, - const index_t height, - const index_t width) { - index_t image_size = height * width; - -#pragma omp parallel for - for (index_t h = 0; h < height; ++h) { - index_t in_offset = h * width * 3; - index_t out_offset = h * width; - -#if defined(MACE_ENABLE_NEON) - index_t w; - for (w = 0; w + 3 < width; w += 4) { - float32x4x3_t vi = vld3q_f32(input + in_offset); - vst1q_f32(output + out_offset, vi.val[0]); - vst1q_f32(output + out_offset + image_size, vi.val[1]); - vst1q_f32(output + out_offset + image_size * 2, vi.val[2]); - - in_offset += 12; - out_offset += 4; - } - for (; w < width; ++w) { - for (index_t c = 0; c < 3; ++c) { - output[h * width + image_size * c + w] = - input[h * width * 3 + w * 3 + c]; - } - } -#else - for (index_t w = 0; w < width; ++w) { - for (index_t c = 0; c < 3; ++c) { - output[out_offset + c * image_size + w] = input[in_offset + w * 3 + c]; - } - } -#endif - } -} - -void TransposeNCHWToNHWCC2(const float *input, - float *output, - const index_t height, - const index_t width) { - index_t image_size = height * width; -#pragma omp parallel for - for (index_t h = 0; h < height; ++h) { - index_t in_offset = h * width; - index_t out_offset = h * width * 2; - -#if defined(MACE_ENABLE_NEON) - index_t w; - for (w = 0; w + 3 < width; w += 4) { - float32x4_t vi0 = vld1q_f32(input + in_offset); - float32x4_t vi1 = vld1q_f32(input + in_offset + image_size); - float32x4x2_t vi = {vi0, vi1}; - vst2q_f32(output + out_offset, vi); - in_offset += 4; - out_offset += 8; - } - for (; w < width; ++w) { - for (index_t c = 0; c < 2; ++c) { - output[h * width * 2 + w * 2 + c] = - input[h * width + image_size * c + w]; - } - } -#else - for (index_t w = 0; w < width; ++w) { - for (index_t c = 0; c < 2; ++c) { - output[out_offset + w * 2 + c] = input[in_offset + c * image_size + w]; - } - } -#endif - } -} - -void TransposeNHWCToNCHWC3(const int *input, - int *output, - const index_t height, - const index_t width) { - index_t image_size = height * width; - -#pragma omp parallel for - for (index_t h = 0; h < height; ++h) { - index_t in_offset = h * width * 3; - index_t out_offset = h * width; - - for (index_t w = 0; w < width; ++w) { - for (index_t c = 0; c < 3; ++c) { - output[out_offset + c * image_size + w] = input[in_offset + w * 3 + c]; - } - } - } -} - -void TransposeNCHWToNHWCC2(const int *input, - int *output, - const index_t height, - const index_t width) { - index_t image_size = height * width; -#pragma omp parallel for - for (index_t h = 0; h < height; ++h) { - index_t in_offset = h * width; - index_t out_offset = h * width * 2; - - for (index_t w = 0; w < width; ++w) { - for (index_t c = 0; c < 2; ++c) { - output[out_offset + w * 2 + c] = input[in_offset + c * image_size + w]; - } - } - } -} -} // namespace transpose - -} // namespace ops -} // namespace mace diff --git a/mace/ops/common/transpose.h b/mace/ops/common/transpose.h index 4d2e5a519e680276884fb95ad6edf088738c99d0..0c0751851f695ac9974bf3e386b32adf2cf28370 100644 --- a/mace/ops/common/transpose.h +++ b/mace/ops/common/transpose.h @@ -15,43 +15,152 @@ #ifndef MACE_OPS_COMMON_TRANSPOSE_H_ #define MACE_OPS_COMMON_TRANSPOSE_H_ +#if defined(MACE_ENABLE_NEON) +#include +#endif // MACE_ENABLE_NEON #include #include - +#include "mace/core/op_context.h" #include "mace/public/mace.h" -#include "mace/core/tensor.h" 
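
For orientation: the deleted mace/ops/common/transpose.cc above implemented the channel-3 NHWC -> NCHW and channel-2 NCHW -> NHWC fast paths, and the transpose.h hunk below re-creates them as header templates (with float specializations keeping the NEON vld3q_f32/vst2q_f32 paths) driven by the thread pool. The underlying layout transform is a per-image permutation; a scalar sketch with an illustrative name, matching the non-NEON branch of the code:

// Element (h, w, c) of a 3-channel NHWC image moves to plane c of the
// NCHW output; image_size is the size of one output plane.
void TransposeNHWCToNCHWC3Ref(const float *input, float *output,
                              index_t height, index_t width) {
  const index_t image_size = height * width;
  for (index_t h = 0; h < height; ++h) {
    for (index_t w = 0; w < width; ++w) {
      for (index_t c = 0; c < 3; ++c) {
        output[c * image_size + h * width + w] =
            input[(h * width + w) * 3 + c];
      }
    }
  }
}

The batched Transpose() entry point below dispatches to these per-image helpers when dst_dims is {0, 3, 1, 2} with 3 input channels or {0, 2, 3, 1} with 2 input channels, and otherwise falls back to the tiled generic transposes.
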
namespace mace { namespace ops { -namespace transpose { -void TransposeNHWCToNCHWC3(const float *input, - float *output, +template +void TransposeNHWCToNCHWC3(utils::ThreadPool *thread_pool, + const T *input, + T *output, const index_t height, - const index_t width); + const index_t width) { + index_t image_size = height * width; -void TransposeNHWCToNCHWC3(const int *input, - int *output, - const index_t height, - const index_t width); + thread_pool->Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t h = start; h < end; h += step) { + index_t in_offset = h * width * 3; + index_t out_offset = h * width; -void TransposeNCHWToNHWCC2(const float *input, - float *output, - const index_t height, - const index_t width); + for (index_t w = 0; w < width; ++w) { + for (index_t c = 0; c < 3; ++c) { + output[out_offset + c * image_size + w] = + input[in_offset + w * 3 + c]; + } + } + } + }, 0, height, 1); +} + +template<> +inline void TransposeNHWCToNCHWC3(utils::ThreadPool *thread_pool, + const float *input, + float *output, + const index_t height, + const index_t width) { + index_t image_size = height * width; + + thread_pool->Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t h = start; h < end; h += step) { + index_t in_offset = h * width * 3; + index_t out_offset = h * width; -void TransposeNCHWToNHWCC2(const int *input, - int *output, +#if defined(MACE_ENABLE_NEON) + index_t w; + for (w = 0; w + 3 < width; w += 4) { + float32x4x3_t vi = vld3q_f32(input + in_offset); + vst1q_f32(output + out_offset, vi.val[0]); + vst1q_f32(output + out_offset + image_size, vi.val[1]); + vst1q_f32(output + out_offset + image_size * 2, vi.val[2]); + + in_offset += 12; + out_offset += 4; + } + for (; w < width; ++w) { + for (index_t c = 0; c < 3; ++c) { + output[h * width + image_size * c + w] = + input[h * width * 3 + w * 3 + c]; + } + } +#else + for (index_t w = 0; w < width; ++w) { + for (index_t c = 0; c < 3; ++c) { + output[out_offset + c * image_size + w] = + input[in_offset + w * 3 + c]; + } + } +#endif + } + }, 0, height, 1); +} + +template +void TransposeNCHWToNHWCC2(utils::ThreadPool *thread_pool, + const T *input, + T *output, const index_t height, - const index_t width); -} // namespace transpose + const index_t width) { + index_t image_size = height * width; + + thread_pool->Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t h = start; h < end; h += step) { + index_t in_offset = h * width; + index_t out_offset = h * width * 2; + + for (index_t w = 0; w < width; ++w) { + for (index_t c = 0; c < 2; ++c) { + output[out_offset + w * 2 + c] = + input[in_offset + c * image_size + w]; + } + } + } + }, 0, height, 1); +} -template -MaceStatus Transpose(const T *input, +template<> +inline void TransposeNCHWToNHWCC2(utils::ThreadPool *thread_pool, + const float *input, + float *output, + const index_t height, + const index_t width) { + index_t image_size = height * width; + + thread_pool->Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t h = start; h < end; h += step) { + index_t in_offset = h * width; + index_t out_offset = h * width * 2; + +#if defined(MACE_ENABLE_NEON) + index_t w; + for (w = 0; w + 3 < width; w += 4) { + float32x4_t vi0 = vld1q_f32(input + in_offset); + float32x4_t vi1 = vld1q_f32(input + in_offset + image_size); + float32x4x2_t vi = {vi0, vi1}; + vst2q_f32(output + out_offset, vi); + in_offset += 4; + out_offset += 8; + } + for (; w < width; ++w) { + for (index_t c = 0; c < 2; ++c) { + output[h * width * 
2 + w * 2 + c] = + input[h * width + image_size * c + w]; + } + } +#else + for (index_t w = 0; w < width; ++w) { + for (index_t c = 0; c < 2; ++c) { + output[out_offset + w * 2 + c] = + input[in_offset + c * image_size + w]; + } + } +#endif + } + }, 0, height, 1); +} + +template +MaceStatus Transpose(utils::ThreadPool *thread_pool, + const T *input, const std::vector &input_shape, const std::vector &dst_dims, - T *output, - DataType data_type = DataType::DT_FLOAT) { + T *output) { MACE_CHECK((input_shape.size() == 2 && dst_dims.size() == 2) || (input_shape.size() == 4 && dst_dims.size() == 4), "Only support 2D or 4D transpose"); @@ -68,41 +177,43 @@ MaceStatus Transpose(const T *input, index_t stride_i = height; index_t stride_j = width; index_t tile_size = height > 512 || width > 512 ? 64 : 32; -#pragma omp parallel for collapse(2) - for (index_t i = 0; i < height; i += tile_size) { - for (index_t j = 0; j < width; j += tile_size) { - index_t end_i = std::min(i + tile_size, height); - index_t end_j = std::min(j + tile_size, width); - for (index_t tile_i = i; tile_i < end_i; ++tile_i) { - for (index_t tile_j = j; tile_j < end_j; ++tile_j) { - output[tile_j * stride_i + tile_i] = - input[tile_i * stride_j + tile_j]; + + thread_pool->Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + index_t end_i = std::min(i + tile_size, height); + index_t end_j = std::min(j + tile_size, width); + for (index_t tile_i = i; tile_i < end_i; ++tile_i) { + for (index_t tile_j = j; tile_j < end_j; ++tile_j) { + output[tile_j * stride_i + tile_i] = + input[tile_i * stride_j + tile_j]; + } } } } - } + }, 0, height, tile_size, 0, width, tile_size); } else if (input_shape.size() == 4) { std::vector transpose_order_from_NHWC_to_NCHW{0, 3, 1, 2}; std::vector transpose_order_from_NCHW_to_NHWC{0, 2, 3, 1}; index_t batch_size = input_shape[1] * input_shape[2] * input_shape[3]; - bool supported_dt = (data_type == DataType::DT_FLOAT || - data_type == DataType::DT_INT32); - if (dst_dims == transpose_order_from_NHWC_to_NCHW && input_shape[3] == 3 && - supported_dt) { + if (dst_dims == transpose_order_from_NHWC_to_NCHW && input_shape[3] == 3) { for (index_t b = 0; b < input_shape[0]; ++b) { - transpose::TransposeNHWCToNCHWC3(input + b * batch_size, - output + b * batch_size, - input_shape[1], - input_shape[2]); + TransposeNHWCToNCHWC3(thread_pool, + input + b * batch_size, + output + b * batch_size, + input_shape[1], + input_shape[2]); } } else if (dst_dims == transpose_order_from_NCHW_to_NHWC - && input_shape[1] == 2 && supported_dt) { + && input_shape[1] == 2) { for (index_t b = 0; b < input_shape[0]; ++b) { - transpose::TransposeNCHWToNHWCC2(input + b * batch_size, - output + b * batch_size, - input_shape[2], - input_shape[3]); + TransposeNCHWToNHWCC2(thread_pool, + input + b * batch_size, + output + b * batch_size, + input_shape[2], + input_shape[3]); } } else if (dst_dims == std::vector{0, 2, 1, 3}) { index_t height = input_shape[1]; @@ -114,7 +225,6 @@ MaceStatus Transpose(const T *input, index_t tile_size = std::max(static_cast(1), static_cast(std::sqrt( 8 * 1024 / channel))); -#pragma omp parallel for collapse(2) for (index_t i = 0; i < height; i += tile_size) { for (index_t j = 0; j < width; j += tile_size) { index_t end_i = std::min(i + tile_size, height); @@ -163,7 +273,6 @@ MaceStatus Transpose(const T *input, return MaceStatus::MACE_SUCCESS; } - } // 
namespace ops } // namespace mace diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc index 8d4248570f7453002cc68024cd4017208da7e284..1254c643ceee467276d4c3b7af83d6f9f9238458 100644 --- a/mace/ops/concat.cc +++ b/mace/ops/concat.cc @@ -15,7 +15,7 @@ #include #include "mace/core/operator.h" -#include "mace/utils/quantize.h" +#include "mace/core/quantize.h" #include "mace/utils/memory.h" #ifdef MACE_ENABLE_OPENCL diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index a6421f45fed1b0520e468acaae58c5439c8c03e3..5fefeddcd1c523c0da1c3f1c384119f4865b361e 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -41,6 +41,11 @@ #include "mace/ops/arm/fp32/conv_2d_7x7.h" #include "mace/ops/arm/fp32/conv_2d_1xn.h" #include "mace/ops/arm/fp32/conv_general.h" +#include "mace/ops/arm/fp32/bias_add.h" +#include "mace/ops/arm/fp32/activation.h" +#else +#include "mace/ops/ref/activation.h" +#include "mace/ops/ref/bias_add.h" #endif // MACE_ENABLE_NEON #include "mace/ops/ref/conv_2d.h" @@ -67,12 +72,13 @@ class Conv2dOp : public ConvPool2dOpBase { public: explicit Conv2dOp(OpConstructContext *context) : ConvPool2dOpBase(context), - activation_(ops::StringToActivationType( + activation_delegator_(ops::StringToActivationType( Operation::GetOptionalArg("activation", - "NOOP"))), - relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)), - leakyrelu_coefficient_(Operation::GetOptionalArg( - "leakyrelu_coefficient", 0.0f)) {} + "NOOP")), + Operation::GetOptionalArg("max_limit", + 0.0f), + Operation::GetOptionalArg( + "leakyrelu_coefficient", 0.0f)) {} MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(INPUT); @@ -80,8 +86,6 @@ class Conv2dOp : public ConvPool2dOpBase { const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr; Tensor *output = this->Output(OUTPUT); - const index_t channels = filter->dim(0); - #ifdef MACE_ENABLE_NEON // the following params are used to decide which conv delegator to use const index_t stride_h = strides_[0]; @@ -91,11 +95,12 @@ class Conv2dOp : public ConvPool2dOpBase { const index_t filter_h = filter->dim(2); const index_t filter_w = filter->dim(3); const index_t input_channels = input->dim(1); + const index_t channels = filter->dim(0); // NOTE: delegator is fixed after first round of running, // although winograd depends on input params. // We do not support changeable filter for now. - if (conv2d_delegator_.get() == nullptr) { + if (conv2d_delegator_ == nullptr) { if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { conv2d_delegator_ = make_unique( @@ -166,7 +171,7 @@ class Conv2dOp : public ConvPool2dOpBase { conv2d_delegator_->Compute(context, input, filter, output); #else - if (ref_conv2d_delegator_.get() == nullptr) { + if (ref_conv2d_delegator_ == nullptr) { ref_conv2d_delegator_ = make_unique>(strides_, dilations_, paddings_, @@ -175,53 +180,21 @@ class Conv2dOp : public ConvPool2dOpBase { ref_conv2d_delegator_->Compute(context, input, filter, output); #endif - Tensor::MappingGuard bias_guard(bias); - Tensor::MappingGuard output_guard(output); - auto bias_data = bias == nullptr ? 
nullptr : bias->data(); - auto output_data = output->mutable_data(); - if (bias_data != nullptr) { - const index_t batch = input->dim(0); - const index_t height = output->dim(2); - const index_t width = output->dim(3); - const index_t image_size = height * width; -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { - float *output_ptr = output_data + (b * channels + c) * image_size; - const float bias = bias_data[c]; -#if defined(MACE_ENABLE_NEON) - float32x4_t vbias = vdupq_n_f32(bias); - for (index_t i = 0; i <= image_size - 4; i += 4) { - float32x4_t v = vld1q_f32(output_ptr + i); - v = vaddq_f32(v, vbias); - vst1q_f32(output_ptr + i, v); - } - for (index_t i = (image_size >> 2) << 2; i < image_size; ++i) { - output_ptr[i] += bias; - } -#else - for (index_t i = 0; i < image_size; ++i) { - output_ptr[i] += bias; - } -#endif - } - } - } - - DoActivation(output_data, output_data, output->size(), activation_, - relux_max_limit_, leakyrelu_coefficient_); + bias_add_delegator_.Compute(context, output, bias, output); + activation_delegator_.Compute(context, output, output); return MaceStatus::MACE_SUCCESS; } private: - const ActivationType activation_; - const float relux_max_limit_; - const float leakyrelu_coefficient_; #ifdef MACE_ENABLE_NEON std::unique_ptr conv2d_delegator_; + arm::fp32::BiasAdd bias_add_delegator_; + arm::fp32::Activation activation_delegator_; #else std::unique_ptr> ref_conv2d_delegator_; + ref::BiasAdd bias_add_delegator_; + ref::Activation activation_delegator_; #endif // MACE_ENABLE_NEON private: @@ -230,17 +203,17 @@ class Conv2dOp : public ConvPool2dOpBase { }; #ifdef MACE_ENABLE_QUANTIZE -template <> +template<> class Conv2dOp : public ConvPool2dOpBase { public: explicit Conv2dOp(OpConstructContext *context) : ConvPool2dOpBase(context), activation_(ops::StringToActivationType( Operation::GetOptionalArg("activation", - "NOOP"))), + "NOOP"))), relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)), leakyrelu_coefficient_(Operation::GetOptionalArg( - "leakyrelu_coefficient", 0.0f)) {} + "leakyrelu_coefficient", 0.0f)) {} MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(INPUT); @@ -334,7 +307,7 @@ class Conv2dOp : public ConvPool2dOpBase { scratch->GrowSize(im2col_size); im2col = make_unique(scratch->Scratch(im2col_size), DT_UINT8); uint8_t *im2col_data = im2col->mutable_data(); - Im2col(input_data, input->shape(), filter_h, filter_w, stride_h, + Im2col(context, input_data, input->shape(), filter_h, filter_w, stride_h, stride_w, static_cast(input->zero_point()), paddings[0], paddings[1], output->shape(), depth, im2col_data); gemm_input_data = im2col_data; @@ -366,87 +339,98 @@ class Conv2dOp : public ConvPool2dOpBase { } private: - template + template inline void Im2col( + const OpContext *context, const T *in_data, const std::vector &in_shape, const index_t filter_h, const index_t filter_w, const index_t stride_h, const index_t stride_w, const T zero_point, const int pad_height, const int pad_width, const std::vector &out_shape, - const index_t depth, T* im2col_data) { + const index_t depth, T *im2col_data) { const index_t input_row_size = in_shape[2] * in_shape[3]; const index_t patch_row_size = filter_w * in_shape[3]; -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t h = 0; h < out_shape[1]; ++h) { - for (index_t w = 0; w < out_shape[2]; ++w) { - // Reshape a patch of input to 
column, which is corresponding to - // a column of output(:, column). - const index_t ih_begin = h * stride_h - (pad_height >> 1); - const index_t ih_end = ih_begin + filter_h; - const index_t iw_begin = w * stride_w - (pad_width >> 1); - const index_t iw_end = iw_begin + filter_w; - // gate height and width to separate padding - const index_t ih_begin_gated = std::max(0, ih_begin); - const index_t ih_end_gated = std::min(ih_end, in_shape[1]); - const index_t iw_begin_gated = std::max(0, iw_begin); - const index_t iw_end_gated = std::min(iw_end, in_shape[2]); - const index_t pad_top = std::max(0, -ih_begin); - const index_t pad_bottom = ih_end - ih_end_gated; - const index_t pad_left = std::max(0, -iw_begin); - const index_t pad_right = iw_end - iw_end_gated; - index_t im2col_column_offset = - ((b * out_shape[1] + h) * out_shape[2] + w) * depth; - - // fill in padding top - if (pad_top > 0) { - std::fill_n(im2col_data + im2col_column_offset, - pad_top * patch_row_size, zero_point); - } + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t h = start1; h < end1; h += step1) { + for (index_t w = start2; w < end2; w += step2) { + // Reshape a patch of input to column, which is corresponding to + // a column of output(:, column). + const index_t ih_begin = h * stride_h - (pad_height >> 1); + const index_t ih_end = ih_begin + filter_h; + const index_t iw_begin = w * stride_w - (pad_width >> 1); + const index_t iw_end = iw_begin + filter_w; + // gate height and width to separate padding + const index_t ih_begin_gated = std::max(0, ih_begin); + const index_t ih_end_gated = std::min(ih_end, in_shape[1]); + const index_t iw_begin_gated = std::max(0, iw_begin); + const index_t iw_end_gated = std::min(iw_end, in_shape[2]); + const index_t pad_top = std::max(0, -ih_begin); + const index_t pad_bottom = ih_end - ih_end_gated; + const index_t pad_left = std::max(0, -iw_begin); + const index_t pad_right = iw_end - iw_end_gated; + index_t im2col_column_offset = + ((b * out_shape[1] + h) * out_shape[2] + w) * depth; + + // fill in padding top + if (pad_top > 0) { + std::fill_n(im2col_data + im2col_column_offset, + pad_top * patch_row_size, zero_point); + } - const index_t patch_row_size_gated = - std::min(filter_w - pad_left, - in_shape[2] - iw_begin_gated) * in_shape[3]; - MACE_CHECK(patch_row_size_gated == - ((filter_w - (pad_left + pad_right)) * in_shape[3])); - const index_t pad_left_size = pad_left * in_shape[3]; - const index_t pad_right_size = pad_right * in_shape[3]; - index_t im2col_offset = im2col_column_offset + - (pad_top * filter_w + pad_left) * in_shape[3]; - index_t in_offset = ((b * in_shape[1] + ih_begin_gated) * in_shape[2] - + iw_begin_gated) * in_shape[3]; - - // fill in effective rows - for (index_t ih = ih_begin_gated; ih < ih_end_gated; ++ih) { - // fill in padding left - if (pad_left > 0) { - const index_t left_offset = im2col_offset - pad_left_size; - std::fill_n(im2col_data + left_offset, pad_left_size, zero_point); + const index_t patch_row_size_gated = + std::min(filter_w - pad_left, + in_shape[2] - iw_begin_gated) * in_shape[3]; + MACE_CHECK(patch_row_size_gated == + ((filter_w - (pad_left + pad_right)) * in_shape[3])); + const index_t pad_left_size = pad_left * in_shape[3]; + const index_t pad_right_size = 
pad_right * in_shape[3]; + index_t im2col_offset = im2col_column_offset + + (pad_top * filter_w + pad_left) * in_shape[3]; + index_t + in_offset = ((b * in_shape[1] + ih_begin_gated) * in_shape[2] + + iw_begin_gated) * in_shape[3]; + + // fill in effective rows + for (index_t ih = ih_begin_gated; ih < ih_end_gated; ++ih) { + // fill in padding left + if (pad_left > 0) { + const index_t left_offset = im2col_offset - pad_left_size; + std::fill_n(im2col_data + left_offset, + pad_left_size, + zero_point); + } + // copy effective data + std::copy_n(in_data + in_offset, patch_row_size_gated, + im2col_data + im2col_offset); + // fill in padding right + if (pad_right > 0) { + const index_t + right_offset = im2col_offset + patch_row_size_gated; + std::fill_n(im2col_data + right_offset, pad_right_size, + zero_point); + } + in_offset += input_row_size; + im2col_offset += patch_row_size; } - // copy effective data - std::copy_n(in_data + in_offset, patch_row_size_gated, - im2col_data + im2col_offset); - // fill in padding right - if (pad_right > 0) { - const index_t right_offset = im2col_offset + patch_row_size_gated; - std::fill_n(im2col_data + right_offset, pad_right_size, + + // fill in padding bottom + if (pad_bottom > 0) { + const index_t pad_bottom_size = pad_bottom * patch_row_size; + const index_t bottom_offset = + im2col_column_offset + depth - pad_bottom_size; + std::fill_n(im2col_data + bottom_offset, pad_bottom_size, zero_point); } - in_offset += input_row_size; - im2col_offset += patch_row_size; - } - - // fill in padding bottom - if (pad_bottom > 0) { - const index_t pad_bottom_size = pad_bottom * patch_row_size; - const index_t bottom_offset = - im2col_column_offset + depth - pad_bottom_size; - std::fill_n(im2col_data + bottom_offset, pad_bottom_size, - zero_point); } } } - } + }, 0, out_shape[0], 1, 0, out_shape[1], 1, 0, out_shape[2], 1); } private: diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index 49d11700a19668082a43efe8008f07ae8123acb4..7fb854787c032a5106c065d92830729d8243e9a1 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -1172,7 +1172,8 @@ void TestQuant(const index_t batch, auto bias_data = bias->data(); float bias_scale = q_input->scale() * q_filter->scale(); std::vector q_bias(bias->size()); - QuantizeWithScaleAndZeropoint( + QuantizeUtil quantize_util(OpTestContext::Get()->thread_pool()); + quantize_util.QuantizeWithScaleAndZeropoint( bias_data, bias->size(), bias_scale, 0, q_bias.data()); net.AddInputFromArray( "QuantizedBias", {out_channels}, q_bias, true, bias_scale, 0); diff --git a/mace/ops/crop.cc b/mace/ops/crop.cc index 3dda169dd80f02a258d854ce88c7f511beab0167..7265208efdd3d62d682c1689b82049ce2dd42e07 100644 --- a/mace/ops/crop.cc +++ b/mace/ops/crop.cc @@ -90,7 +90,7 @@ class CropOp : public Operation { const index_t in_img_size = input_shape[1] * input_shape[2] * input_shape[3]; const index_t in_hw = input_shape[2] * input_shape[3]; -#pragma omp parallel for collapse(3) + for (int b = 0; b < output_shape[0]; ++b) { for (int c = 0; c < output_shape[1]; ++c) { for (int h = 0; h < output_shape[2]; ++h) { diff --git a/mace/ops/cumsum.cc b/mace/ops/cumsum.cc index f0117270c80ce25bda50ab8e8461302b521c484e..302fdfd585f4a16a7da42ebe1fd495c4f0ce9b6e 100644 --- a/mace/ops/cumsum.cc +++ b/mace/ops/cumsum.cc @@ -78,7 +78,6 @@ class CumsumOp : public Operation { const index_t cum_size = input_shape[axis_]; if (!reverse_) { -#pragma omp parallel for for (index_t outer_idx = 0; outer_idx < outer_size; ++outer_idx) { index_t start_idx = 
outer_idx * cum_size * inner_size; for (index_t cum_idx = 0; cum_idx < cum_size; ++cum_idx) { @@ -105,7 +104,6 @@ class CumsumOp : public Operation { } } } else { -#pragma omp parallel for for (index_t outer_idx = outer_size - 1; outer_idx >= 0; --outer_idx) { index_t start_idx = outer_idx * cum_size * inner_size; for (index_t cum_idx = cum_size - 1; cum_idx >= 0; --cum_idx) { diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc index 6e9a0fa8db36209887f86d0fdc75d5c5d1a5c2bc..5692425ad10ba05f92fdf06c428106bdf15455a9 100644 --- a/mace/ops/deconv_2d.cc +++ b/mace/ops/deconv_2d.cc @@ -16,6 +16,16 @@ #if defined(MACE_ENABLE_NEON) #include +#include "mace/ops/arm/fp32/deconv_2d_2x2.h" +#include "mace/ops/arm/fp32/deconv_2d_3x3.h" +#include "mace/ops/arm/fp32/deconv_2d_4x4.h" +#include "mace/ops/arm/fp32/deconv_2d_general.h" +#include "mace/ops/arm/fp32/bias_add.h" +#include "mace/ops/arm/fp32/activation.h" +#else +#include "mace/ops/ref/bias_add.h" +#include "mace/ops/ref/activation.h" +#include "mace/ops/ref/deconv_2d.h" #endif #include @@ -27,9 +37,10 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/ops/activation.h" -#include "mace/ops/arm/deconv_2d_neon.h" +#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/utils/memory.h" #include "mace/utils/math.h" + #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/deconv_2d.h" @@ -38,21 +49,24 @@ namespace mace { namespace ops { -template +template class Deconv2dOp; -template <> +template<> class Deconv2dOp : public Deconv2dOpBase { public: explicit Deconv2dOp(OpConstructContext *context) - : Deconv2dOpBase(context) {} + : Deconv2dOpBase(context), + activation_delegator_(activation_, + relux_max_limit_, + leakyrelu_coefficient_) {} MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(0); const Tensor *filter = this->Input(1); const Tensor *bias = nullptr; const Tensor *output_shape_tensor = nullptr; - if (model_type_ == ops::CAFFE) { + if (model_type_ == CAFFE) { bias = this->InputSize() >= 3 ? 
this->Input(2) : nullptr; } else { output_shape_tensor = @@ -65,91 +79,9 @@ class Deconv2dOp : public Deconv2dOpBase { MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(output); - std::vector in_paddings(2, 0); - std::vector out_paddings(2, 0); - std::vector out_shape(4, 0); - std::vector padded_out_shape(4, 0); - - if (model_type_ == FrameworkType::TENSORFLOW) { // tensorflow - MACE_CHECK_NOTNULL(output_shape_tensor); - MACE_CHECK(output_shape_tensor->size() == 4); - Tensor::MappingGuard output_shape_mapper(output_shape_tensor); - auto output_shape_data = - output_shape_tensor->data(); - out_shape = - std::vector(output_shape_data, output_shape_data + 4); - - const index_t t = out_shape[1]; - out_shape[1] = out_shape[3]; - out_shape[3] = out_shape[2]; - out_shape[2] = t; - - CalcDeconvShape_TF( - input->shape().data(), - filter->shape().data(), - out_shape.data(), - strides_.data(), - 1, - padding_type_, - in_paddings.data(), - out_paddings.data(), - padded_out_shape.data(), - true); - } else { // caffe - if (!paddings_.empty()) out_paddings = paddings_; - CalcDeconvShape_Caffe( - input->shape().data(), - filter->shape().data(), - strides_.data(), - out_paddings.data(), - 1, - in_paddings.data(), - out_shape.data(), - padded_out_shape.data(), - true); - } - MACE_RETURN_IF_ERROR(output->Resize(out_shape)); - output->Clear(); - index_t kernel_h = filter->dim(2); - index_t kernel_w = filter->dim(3); - const index_t *in_shape = input->shape().data(); - - MACE_CHECK(filter->dim(0) == out_shape[1], filter->dim(0), " != ", - out_shape[1]); - MACE_CHECK(filter->dim(1) == in_shape[1], filter->dim(1), " != ", - in_shape[1]); - MACE_CHECK(in_shape[0] == out_shape[0], - "Input/Output batch size mismatch"); - std::function deconv_func; - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard bias_mapper(bias); - Tensor::MappingGuard output_mapper(output); - auto input_data = input->data(); - auto filter_data = filter->data(); - auto bias_data = bias == nullptr ? 
nullptr : bias->data(); - auto output_data = output->mutable_data(); - - const index_t pad_h = out_paddings[0] / 2; - const index_t pad_w = out_paddings[1] / 2; - - index_t padded_out_size = - std::accumulate(padded_out_shape.begin(), - padded_out_shape.end(), - 1, - std::multiplies()) * sizeof(float); - ScratchBuffer *scratch = context->device()->scratch_buffer(); - scratch->Rewind(); - scratch->GrowSize(padded_out_size); - Tensor padded_out(scratch->Scratch(padded_out_size), DT_FLOAT); - padded_out.Reshape(padded_out_shape); - padded_out.Clear(); - auto *padded_out_data = padded_out.mutable_data(); +#ifdef MACE_ENABLE_NEON + const index_t kernel_h = filter->dim(2); + const index_t kernel_w = filter->dim(3); bool use_neon_2x2_s1 = kernel_h == kernel_w && kernel_h == 2 && strides_[0] == strides_[1] && strides_[0] == 1; @@ -166,197 +98,76 @@ class Deconv2dOp : public Deconv2dOpBase { bool use_neon_4x4_s2 = kernel_h == kernel_w && kernel_h == 4 && strides_[0] == strides_[1] && strides_[0] == 2; - if (use_neon_2x2_s1) { - deconv_func = [=](const float *input, - const float *filter, - const index_t *input_shape, - const index_t *padded_output_shape, - float *padded_output) { - Deconv2dNeonK2x2S1(input, - filter, - input_shape, - padded_output_shape, - padded_output); - }; - } else if (use_neon_2x2_s2) { - deconv_func = [=](const float *input, - const float *filter, - const index_t *input_shape, - const index_t *padded_output_shape, - float *padded_output) { - Deconv2dNeonK2x2S2(input, - filter, - input_shape, - padded_output_shape, - padded_output); - }; - } else if (use_neon_3x3_s1) { - deconv_func = [=](const float *input, - const float *filter, - const index_t *input_shape, - const index_t *padded_output_shape, - float *padded_output) { - Deconv2dNeonK3x3S1(input, - filter, - input_shape, - padded_output_shape, - padded_output); - }; - } else if (use_neon_3x3_s2) { - deconv_func = [=](const float *input, - const float *filter, - const index_t *input_shape, - const index_t *padded_output_shape, - float *padded_output) { - Deconv2dNeonK3x3S2(input, - filter, - input_shape, - padded_output_shape, - padded_output); - }; - } else if (use_neon_4x4_s1) { - deconv_func = [=](const float *input, - const float *filter, - const index_t *input_shape, - const index_t *padded_output_shape, - float *padded_output) { - Deconv2dNeonK4x4S1(input, - filter, - input_shape, - padded_output_shape, - padded_output); - }; - } else if (use_neon_4x4_s2) { - deconv_func = [=](const float *input, - const float *filter, - const index_t *input_shape, - const index_t *padded_output_shape, - float *padded_output) { - Deconv2dNeonK4x4S2(input, - filter, - input_shape, - padded_output_shape, - padded_output); - }; - } else { - deconv_func = [=](const float *input, - const float *filter, - const index_t *input_shape, - const index_t *padded_output_shape, - float *padded_output) { - Deconv2dGeneral(input, - filter, - kernel_h, - kernel_w, - strides_.data(), - input_shape, - padded_output_shape, - padded_output); - }; + if (deconv2d_delegator_ == nullptr) { + if (use_neon_2x2_s1) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, model_type_); + } else if (use_neon_2x2_s2) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, model_type_); + } else if (use_neon_3x3_s1) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, model_type_); + } else if (use_neon_3x3_s2) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, model_type_); + } else if (use_neon_4x4_s1) { + 
deconv2d_delegator_ = make_unique( + paddings_, padding_type_, model_type_); + } else if (use_neon_4x4_s2) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, model_type_); + } else { + deconv2d_delegator_ = + make_unique(strides_, + std::vector{1, 1}, + paddings_, + padding_type_, + model_type_); + } } - - bool no_pad = - (padded_out_shape[2] == out_shape[2]) && - (padded_out_shape[3] == out_shape[3]); - float *out_data = no_pad ? output_data : padded_out_data; - - deconv_func(input_data, - filter_data, - in_shape, - padded_out_shape.data(), - out_data); - if (!no_pad) { - CropPadOut(out_data, - padded_out_shape.data(), - out_shape.data(), - pad_h, - pad_w, - output_data); + deconv2d_delegator_->Compute(context, + input, + filter, + output_shape_tensor, + output); +#else + if (deconv2d_delegator_ == nullptr) { + deconv2d_delegator_ = make_unique>(strides_, + std::vector{ + 1, 1}, + paddings_, + padding_type_, + model_type_); } + deconv2d_delegator_->Compute(context, + input, + filter, + output_shape_tensor, + output); - if (bias_data != nullptr) { - const index_t batch = out_shape[0]; - const index_t channels = out_shape[1]; - const index_t img_size = out_shape[2] * out_shape[3]; -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { - for (index_t i = 0; i < img_size; ++i) { - output_data[(b * channels + c) * img_size + i] += - bias_data[c]; - } - } - } - } +#endif // MACE_ENABLE_NEON - DoActivation(output_data, - output_data, - output->size(), - activation_, - relux_max_limit_, - leakyrelu_coefficient_); + bias_add_delegator_.Compute(context, output, bias, output); + activation_delegator_.Compute(context, output, output); return MaceStatus::MACE_SUCCESS; } private: - void Deconv2dGeneral(const float *input, - const float *filter, - const index_t kernel_h, - const index_t kernel_w, - const int *strides, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_img_size = out_height * out_width; - const index_t in_img_size = in_height * in_width; - - const int kernel_size = static_cast(kernel_h * kernel_w); - std::vector index_map(kernel_size, 0); - for (index_t i = 0; i < kernel_h; ++i) { - for (index_t j = 0; j < kernel_w; ++j) { - index_map[i * kernel_w + j] = i * out_width + j; - } - } - - const index_t batch = in_shape[0]; - const index_t out_channels = out_shape[1]; - const index_t in_channels = in_shape[1]; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (int b = 0; b < batch; ++b) { - for (int oc = 0; oc < out_channels; ++oc) { - float *out_base = - output + (b * out_channels + oc) * out_img_size; - for (int i = 0; i < in_height; ++i) { - for (int j = 0; j < in_width; ++j) { - const index_t out_offset = - i * strides[0] * out_width + j * strides[1]; - for (int ic = 0; ic < in_channels; ++ic) { - const index_t input_idx = - (b * in_channels + ic) * in_img_size + i * in_width + j; - const float val = input[input_idx]; - const index_t kernel_offset = - (oc * in_channels + ic) * kernel_size; - for (int k = 0; k < kernel_size; ++k) { - const index_t out_idx = out_offset + index_map[k]; - const index_t kernel_idx = kernel_offset + k; - out_base[out_idx] += val * filter[kernel_idx]; - } - } - } - } - } - } - } +#ifdef MACE_ENABLE_NEON + std::unique_ptr deconv2d_delegator_; + 
arm::fp32::BiasAdd bias_add_delegator_; + arm::fp32::Activation activation_delegator_; +#else + ref::BiasAdd bias_add_delegator_; + ref::Activation activation_delegator_; + std::unique_ptr> deconv2d_delegator_; +#endif // MACE_ENABLE_NEON }; #ifdef MACE_ENABLE_OPENCL -template +template class Deconv2dOp : public Deconv2dOpBase { public: explicit Deconv2dOp(OpConstructContext *context) @@ -394,7 +205,7 @@ class Deconv2dOp : public Deconv2dOpBase { const Tensor *filter = this->Input(1); const Tensor *bias = nullptr; const Tensor *output_shape_tensor = nullptr; - if (model_type_ == ops::CAFFE) { + if (model_type_ == CAFFE) { bias = this->InputSize() >= 3 ? this->Input(2) : nullptr; } else { output_shape_tensor = @@ -407,41 +218,30 @@ class Deconv2dOp : public Deconv2dOpBase { MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(output); - std::vector in_paddings(2, 0); - std::vector out_shape(4, 0); - - if (model_type_ == FrameworkType::TENSORFLOW) { - MACE_CHECK_NOTNULL(output_shape_tensor); - MACE_CHECK(output_shape_tensor->size() == 4); - Tensor::MappingGuard output_shape_mapper(output_shape_tensor); - auto output_shape_data = - output_shape_tensor->data(); + std::vector out_shape; + if (output_shape_tensor) { + Tensor::MappingGuard out_shape_guard(output_shape_tensor); + MACE_CHECK(output_shape_tensor->size() == 4, + "output shape should be 4-dims"); out_shape = - std::vector(output_shape_data, output_shape_data + 4); - - CalcDeconvShape_TF( - input->shape().data(), - filter->shape().data(), - out_shape.data(), - strides_.data(), - 1, - padding_type_, - in_paddings.data(), - nullptr, - nullptr); - } else { - std::vector out_paddings(2, 0); - if (!paddings_.empty()) out_paddings = paddings_; - CalcDeconvShape_Caffe( - input->shape().data(), - filter->shape().data(), - strides_.data(), - out_paddings.data(), - 1, - in_paddings.data(), - out_shape.data(), - nullptr); + std::vector(output_shape_tensor->data(), + output_shape_tensor->data() + 4); } + std::vector in_paddings; + std::vector out_paddings; + + CalDeconvOutputShapeAndPadSize(input->shape(), + filter->shape(), + strides_, + padding_type_, + paddings_, + 1, + &out_shape, + &in_paddings, + &out_paddings, + nullptr, + model_type_, + NHWC); return kernel_->Compute(context, input, filter, bias, strides_.data(), in_paddings.data(), activation_, @@ -454,7 +254,6 @@ class Deconv2dOp : public Deconv2dOpBase { }; #endif // MACE_ENABLE_OPENCL - void RegisterDeconv2D(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, DeviceType::CPU, float); diff --git a/mace/ops/deconv_2d.h b/mace/ops/deconv_2d.h index 008c6a5b5ea2cb9cc14c7c40940206e81c4f7aed..50a2ecee5e8329ea24aa3fbae419823831d1b370 100644 --- a/mace/ops/deconv_2d.h +++ b/mace/ops/deconv_2d.h @@ -27,11 +27,6 @@ namespace mace { namespace ops { -enum FrameworkType { - TENSORFLOW = 0, - CAFFE = 1, -}; - class Deconv2dOpBase : public Operation { public: explicit Deconv2dOpBase(OpConstructContext *context) @@ -41,7 +36,7 @@ class Deconv2dOpBase : public Operation { "padding", static_cast(SAME)))), paddings_(Operation::GetRepeatedArgs("padding_values")), group_(Operation::GetOptionalArg("group", 1)), - model_type_(static_cast( + model_type_(static_cast( Operation::GetOptionalArg("framework_type", 0))), activation_(ops::StringToActivationType( Operation::GetOptionalArg("activation", @@ -51,140 +46,6 @@ class Deconv2dOpBase : public Operation { leakyrelu_coefficient_( Operation::GetOptionalArg("leakyrelu_coefficient", 0.0f)) {} - static void CalcDeconvShape_Caffe( - const 
index_t *input_shape, // NHWC - const index_t *filter_shape, // OIHW - const int *strides, - const int *out_paddings, - const int group, - int *in_paddings, - index_t *out_shape, - index_t *padded_out_shape, - const bool isNCHW = false) { - MACE_CHECK_NOTNULL(out_paddings); - MACE_CHECK_NOTNULL(input_shape); - MACE_CHECK_NOTNULL(filter_shape); - MACE_CHECK_NOTNULL(strides); - - const index_t in_height = isNCHW ? input_shape[2] : input_shape[1]; - const index_t in_width = isNCHW ? input_shape[3] : input_shape[2]; - - const index_t output_channel = filter_shape[0] * group; - - const index_t kernel_h = filter_shape[2]; - const index_t kernel_w = filter_shape[3]; - - index_t padded_out_height = - (in_height - 1) * strides[0] + kernel_h; - index_t padded_out_width = - (in_width - 1) * strides[1] + kernel_w; - - if (in_paddings != nullptr) { - in_paddings[0] = static_cast((kernel_h - 1) * 2 - out_paddings[0]); - in_paddings[1] = static_cast((kernel_w - 1) * 2 - out_paddings[1]); - in_paddings[0] = std::max(0, in_paddings[0]); - in_paddings[1] = std::max(0, in_paddings[1]); - } - - if (padded_out_shape != nullptr) { - padded_out_shape[0] = input_shape[0]; - padded_out_shape[1] = isNCHW ? output_channel : padded_out_height; - padded_out_shape[2] = isNCHW ? padded_out_height : padded_out_width; - padded_out_shape[3] = isNCHW ? padded_out_width : output_channel; - } - - if (out_shape != nullptr) { - index_t out_height = padded_out_height - out_paddings[0]; - index_t out_width = padded_out_width - out_paddings[1]; - out_shape[0] = input_shape[0]; - out_shape[1] = isNCHW ? output_channel : out_height; - out_shape[2] = isNCHW ? out_height : out_width; - out_shape[3] = isNCHW ? out_width : output_channel; - } - } - - static void CalcDeconvShape_TF( - const index_t *input_shape, // NHWC - const index_t *filter_shape, // OIHW - const index_t *output_shape, - const int *strides, - const int group, - Padding padding_type, - int *in_paddings, - int *out_paddings, - index_t *padded_out_shape, - const bool isNCHW = false) { - MACE_CHECK_NOTNULL(output_shape); - MACE_CHECK_NOTNULL(input_shape); - MACE_CHECK_NOTNULL(filter_shape); - MACE_CHECK_NOTNULL(strides); - - const index_t in_height = isNCHW ? input_shape[2] : input_shape[1]; - const index_t in_width = isNCHW ? input_shape[3] : input_shape[2]; - - const index_t out_height = isNCHW ? output_shape[2] : output_shape[1]; - const index_t out_width = isNCHW ? 
output_shape[3] : output_shape[2]; - - const index_t extended_in_height = (in_height - 1) * strides[0] + 1; - const index_t extended_in_width = (in_width - 1) * strides[1] + 1; - - const index_t kernel_h = filter_shape[2]; - const index_t kernel_w = filter_shape[3]; - - index_t expected_input_height = 0, expected_input_width = 0; - - switch (padding_type) { - case VALID: - expected_input_height = - (out_height - kernel_h + strides[0]) / strides[0]; - expected_input_width = - (out_width - kernel_w + strides[1]) / strides[1]; - break; - case SAME: - expected_input_height = - (out_height + strides[0] - 1) / strides[0]; - expected_input_width = - (out_width + strides[1] - 1) / strides[1]; - break; - default: - MACE_CHECK(false, "Unsupported padding type: ", padding_type); - } - - MACE_CHECK(expected_input_height == in_height, - expected_input_height, "!=", in_height); - MACE_CHECK(expected_input_width == in_width, - expected_input_width, "!=", in_width); - - const index_t padded_out_height = - (in_height - 1) * strides[0] + kernel_h; - const index_t padded_out_width = - (in_width - 1) * strides[1] + kernel_w; - - if (in_paddings != nullptr) { - const int p_h = - static_cast(out_height + kernel_h - 1 - extended_in_height); - const int p_w = - static_cast(out_width + kernel_w - 1 - extended_in_width); - in_paddings[0] = std::max(0, p_h); - in_paddings[1] = std::max(0, p_w); - } - - if (out_paddings != nullptr) { - const int o_p_h = static_cast(padded_out_height - out_height); - const int o_p_w = static_cast(padded_out_width - out_width); - out_paddings[0] = std::max(0, o_p_h); - out_paddings[1] = std::max(0, o_p_w); - } - - if (padded_out_shape != nullptr) { - index_t output_channel = filter_shape[0] * group; - padded_out_shape[0] = output_shape[0]; - padded_out_shape[1] = isNCHW ? output_channel : padded_out_height; - padded_out_shape[2] = isNCHW ? padded_out_height : padded_out_width; - padded_out_shape[3] = isNCHW ? 
padded_out_width : output_channel; - } - } - protected: std::vector strides_; // [stride_h, stride_w] const Padding padding_type_; @@ -196,34 +57,6 @@ class Deconv2dOpBase : public Operation { const float leakyrelu_coefficient_; }; -template -void CropPadOut(const T *input, - const index_t *in_shape, - const index_t *out_shape, - const index_t pad_h, - const index_t pad_w, - T *output) { - const index_t batch = in_shape[0]; - const index_t channel = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; -#pragma omp parallel for collapse(3) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channel; ++j) { - for (int k = 0; k < out_height; ++k) { - const T *input_base = - input + ((i * channel + j) * in_height + (k + pad_h)) * in_width; - T *output_base = - output + ((i * channel + j) * out_height + k)* out_width; - memcpy(output_base, input_base + pad_w, out_width * sizeof(T)); - } - } - } -} - } // namespace ops } // namespace mace diff --git a/mace/ops/deconv_2d_test.cc b/mace/ops/deconv_2d_test.cc index d8a1c621a49656a845319e1c849b9037e618fec4..25aa7eeeeed80e6403c125ec101a95c536eebe2c 100644 --- a/mace/ops/deconv_2d_test.cc +++ b/mace/ops/deconv_2d_test.cc @@ -38,7 +38,7 @@ void RunTestSimple(const std::vector &input_shape, const std::vector &filter_data, const std::vector &expected_shape, const std::vector &expected_data, - ops::FrameworkType model_type) { + FrameworkType model_type) { OpsTestNet net; // Add input data const index_t out_channels = filter_shape[2]; @@ -49,7 +49,7 @@ void RunTestSimple(const std::vector &input_shape, // TODO(liutuo): remove the unused transform net.TransformFilterDataFormat("Filter", HWOI, "FilterOIHW", OIHW); if (D == DeviceType::GPU) { - if (model_type == ops::FrameworkType::CAFFE) { + if (model_type == FrameworkType::CAFFE) { OpDefBuilder("Deconv2D", "Deconv2dTest") .Input("Input") .Input("FilterOIHW") @@ -80,7 +80,7 @@ void RunTestSimple(const std::vector &input_shape, net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - if (model_type == ops::FrameworkType::CAFFE) { + if (model_type == FrameworkType::CAFFE) { OpDefBuilder("Deconv2D", "Deconv2dTest") .Input("InputNCHW") .Input("FilterOIHW") @@ -128,7 +128,7 @@ void TestNHWCSimple3x3SAME_S1() { {4.5, 4.6, 4.7, 6.5, 6.6, 6.7, 4.5, 4.6, 4.7, 6.5, 6.6, 6.7, 9.5, 9.6, 9.7, 6.5, 6.6, 6.7, 4.5, 4.6, 4.7, 6.5, 6.6, 6.7, 4.5, 4.6, 4.7}, - ops::FrameworkType::TENSORFLOW); + FrameworkType::TENSORFLOW); RunTestSimple({1, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}, {0, 0, 0}, 1, Padding::VALID, {2, 2}, {0}, {3, 3, 3, 1}, @@ -137,7 +137,7 @@ void TestNHWCSimple3x3SAME_S1() { {1, 3, 3, 3}, {4, 4, 4, 6, 6, 6, 4, 4, 4, 6, 6, 6, 9, 9, 9, 6, 6, 6, 4, 4, 4, 6, 6, 6, 4, 4, 4}, - ops::FrameworkType::CAFFE); + FrameworkType::CAFFE); RunTestSimple({1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0}, 1, Padding::SAME, {}, {1, 3, 3, 3}, {3, 3, 3, 1}, @@ -147,7 +147,7 @@ void TestNHWCSimple3x3SAME_S1() { {54, 66, 78, 126, 147, 168, 130, 146, 162, 198, 225, 252, 405, 450, 495, 366, 399, 432, 354, 378, 402, 630, 669, 708, 502, 530, 558}, - ops::FrameworkType::TENSORFLOW); + FrameworkType::TENSORFLOW); RunTestSimple({1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0}, 1, Padding::SAME, {2, 2}, {0}, {3, 3, 3, 1}, @@ -157,7 +157,7 @@ void TestNHWCSimple3x3SAME_S1() { {54, 66, 78, 126, 147, 168, 130, 146, 162, 198, 225, 252, 405, 450, 495, 366, 399, 432, 354, 378, 402, 630, 669, 708, 502, 530, 
558}, - ops::FrameworkType::CAFFE); + FrameworkType::CAFFE); } template @@ -175,7 +175,7 @@ void TestNHWCSimple3x3SAME_S2() { 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1}, - ops::FrameworkType::TENSORFLOW); + FrameworkType::TENSORFLOW); RunTestSimple({1, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}, {0, 0, 0}, 2, Padding::SAME, {2, 2}, {0}, {3, 3, 3, 1}, @@ -188,7 +188,7 @@ void TestNHWCSimple3x3SAME_S2() { 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1}, - ops::FrameworkType::CAFFE); + FrameworkType::CAFFE); RunTestSimple({1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0}, 2, Padding::SAME, {}, {1, 6, 6, 3}, {3, 3, 3, 1}, @@ -206,7 +206,7 @@ void TestNHWCSimple3x3SAME_S2() { 83, 94, 105, 116, 127, 138, 252, 276, 300, 142, 155, 168, 304, 332, 360, 168, 183, 198, 70, 77, 84, 91, 98, 105, 192, 207, 222, 104, 112, 120, 218, 235, 252, 117, 126, 135}, - ops::FrameworkType::TENSORFLOW); + FrameworkType::TENSORFLOW); RunTestSimple({1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0}, 2, Padding::SAME, {2, 2}, {0}, {3, 3, 3, 1}, @@ -219,7 +219,7 @@ void TestNHWCSimple3x3SAME_S2() { 140, 151, 162, 78, 84, 90, 116, 127, 138, 252, 276, 300, 142, 155, 168, 304, 332, 360, 168, 183, 198, 91, 98, 105, 192, 207, 222, 104, 112, 120, 218, 235, 252, 117, 126, 135}, - ops::FrameworkType::CAFFE); + FrameworkType::CAFFE); } template @@ -236,7 +236,7 @@ void TestNHWCSimple3x3SAME_S2_1() { 18, 18, 18, 45, 45, 45, 27, 27, 27, 45, 45, 45, 18, 18, 18, 30, 30, 30, 75, 75, 75, 45, 45, 45, 75, 75, 75, 30, 30, 30, 12, 12, 12, 30, 30, 30, 18, 18, 18, 30, 30, 30, 12, 12, 12}, - ops::FrameworkType::TENSORFLOW); + FrameworkType::TENSORFLOW); } template @@ -261,7 +261,7 @@ void TestNHWCSimple3x3VALID_S2() { 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1}, - ops::FrameworkType::TENSORFLOW); + FrameworkType::TENSORFLOW); } template @@ -278,7 +278,7 @@ void TestNHWCSimple3x3VALID_S1() { 366, 399, 432, 234, 252, 270, 146, 157, 168, 354, 378, 402, 630, 669, 708, 502, 530, 558, 294, 309, 324, 133, 140, 147, 306, 321, 336, 522, 546, 570, 398, 415, 432, 225, 234, 243}, - ops::FrameworkType::TENSORFLOW); + FrameworkType::TENSORFLOW); } template @@ -287,7 +287,7 @@ void TestNHWCSimple2x2SAME() { {1, 2, 2, 1}, {3, 3, 1, 1}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, {1, 2, 2, 1}, {4.f, 4.f, 4.f, 4.f}, - ops::FrameworkType::TENSORFLOW); + FrameworkType::TENSORFLOW); } template @@ -298,7 +298,7 @@ void TestNHWCSimple2x2VALID() { {1, 5, 5, 1}, {1.f, 1.f, 2.f, 1.f, 1.f, 1.f, 1.f, 2.f, 1.f, 1.f, 2.f, 2.f, 4.f, 2.f, 2.f, 1.f, 1.f, 2.f, 1.f, 1.f, 1.f, 1.f, 2.f, 1.f, 1.f}, - ops::FrameworkType::TENSORFLOW); + FrameworkType::TENSORFLOW); } } // namespace @@ -388,11 +388,11 @@ void TestComplexDeconvNxN(const int batch, std::vector paddings; std::vector output_shape; - ops::FrameworkType model_type = + FrameworkType model_type = padding < 0 ? 
- ops::FrameworkType::TENSORFLOW : ops::FrameworkType::CAFFE; + FrameworkType::TENSORFLOW : FrameworkType::CAFFE; - if (model_type == ops::FrameworkType::TENSORFLOW) { + if (model_type == FrameworkType::TENSORFLOW) { if (type == Padding::SAME) { out_h = (height - 1) * stride_h + 1; out_w = (width - 1) * stride_w + 1; @@ -410,7 +410,7 @@ void TestComplexDeconvNxN(const int batch, paddings.push_back(padding); } - if (model_type == ops::FrameworkType::CAFFE) { + if (model_type == FrameworkType::CAFFE) { OpDefBuilder("Deconv2D", "Deconv2dTest") .Input("InputNCHW") .Input("Filter") @@ -448,7 +448,7 @@ void TestComplexDeconvNxN(const int batch, expected->Copy(*net.GetOutput("Output")); // run on gpu - if (model_type == ops::FrameworkType::CAFFE) { + if (model_type == FrameworkType::CAFFE) { OpDefBuilder("Deconv2D", "Deconv2dTest") .Input("Input") .Input("Filter") diff --git a/mace/ops/depth_to_space.cc b/mace/ops/depth_to_space.cc index 2460d75a258068c4e0f08576311bf93ace6b3289..09208e7abf1194455450cb038343b0e79c65891f 100644 --- a/mace/ops/depth_to_space.cc +++ b/mace/ops/depth_to_space.cc @@ -58,7 +58,6 @@ class DepthToSpaceOp : public Operation { const T *input_ptr = input->data(); T *output_ptr = output->mutable_data(); -#pragma omp parallel for schedule(runtime) for (index_t b = 0; b < batch_size; ++b) { for (index_t d = 0; d < output_depth; ++d) { for (index_t h = 0; h < output_height; ++h) { diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc index 24b5d4192265a35397c54cc58e009e870943ad64..522a3b357ed24f5804a9bb2d4af41f8605e644a2 100644 --- a/mace/ops/depthwise_conv2d.cc +++ b/mace/ops/depthwise_conv2d.cc @@ -21,6 +21,11 @@ #if defined(MACE_ENABLE_NEON) #include "mace/ops/arm/fp32/depthwise_conv_2d_3x3.h" +#include "mace/ops/arm/fp32/bias_add.h" +#include "mace/ops/arm/fp32/activation.h" +#else +#include "mace/ops/ref/activation.h" +#include "mace/ops/ref/bias_add.h" #endif // MACE_ENABLE_NEON #ifdef MACE_ENABLE_QUANTIZE @@ -36,7 +41,7 @@ #include "mace/ops/conv_pool_2d_base.h" #include "mace/public/mace.h" #include "mace/utils/memory.h" -#include "mace/utils/quantize.h" +#include "mace/core/quantize.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/buffer/depthwise_conv2d.h" @@ -69,7 +74,10 @@ template<> class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { public: explicit DepthwiseConv2dOp(OpConstructContext *context) - : DepthwiseConv2dOpBase(context) {} + : DepthwiseConv2dOpBase(context), + activation_delegator_(activation_, + relux_max_limit_, + leakyrelu_coefficient_) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -129,30 +137,8 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { ref_conv2d_delegator_->Compute(context, input, filter, output); #endif // MACE_ENABLE_NEON - Tensor::MappingGuard bias_guard(bias); - Tensor::MappingGuard output_guard(output); - auto bias_data = bias == nullptr ? 
nullptr : bias->data(); - auto output_data = output->mutable_data(); - - const index_t batch = output->dim(0); - const index_t channels = output->dim(1); - const index_t height = output->dim(2); - const index_t width = output->dim(3); - - if (bias_data != nullptr) { -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { - for (index_t i = 0; i < height * width; ++i) { - output_data[(b * channels + c) * height * width + i] += - bias_data[c]; - } - } - } - } - - DoActivation(output_data, output_data, output->size(), activation_, - relux_max_limit_, leakyrelu_coefficient_); + bias_add_delegator_.Compute(context, output, bias, output); + activation_delegator_.Compute(context, output, output); return MaceStatus::MACE_SUCCESS; } @@ -160,6 +146,11 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { private: #ifdef MACE_ENABLE_NEON std::unique_ptr conv2d_delegator_; + arm::fp32::BiasAdd bias_add_delegator_; + arm::fp32::Activation activation_delegator_; +#else + ref::BiasAdd bias_add_delegator_; + ref::Activation activation_delegator_; #endif // MACE_ENABLE_NEON std::unique_ptr> ref_conv2d_delegator_; @@ -169,7 +160,7 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { }; #ifdef MACE_ENABLE_QUANTIZE -template <> +template<> class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { public: @@ -269,7 +260,7 @@ class DepthwiseConv2dOp float output_multiplier = input->scale() * filter->scale() / output->scale(); const int pad_hw[2] = {pad_top, pad_left}; - DepthwiseConv2dGeneral( + DepthwiseConv2dGeneral(context, input_data, filter_data, bias_data, input->shape().data(), output_shape.data(), filter->shape().data(), input->zero_point(), filter->zero_point(), output->zero_point(), output_multiplier, @@ -279,7 +270,8 @@ class DepthwiseConv2dOp return MaceStatus::MACE_SUCCESS; } private: - void DepthwiseConv2dGeneral(const uint8_t *input, + void DepthwiseConv2dGeneral(const OpContext *context, + const uint8_t *input, const uint8_t *filter, const int32_t *bias, const index_t *in_shape, @@ -293,54 +285,60 @@ class DepthwiseConv2dOp const int *dilation_hw, const int *pad_hw, uint8_t *output) { -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t h = 0; h < out_shape[1]; ++h) { - for (index_t w = 0; w < out_shape[2]; ++w) { - for (index_t m = 0; m < out_shape[3]; ++m) { - const index_t filter_height = filter_shape[0]; - const index_t filter_width = filter_shape[1]; - const index_t in_channels = filter_shape[2]; - const index_t depth_multiplier = filter_shape[3]; - const index_t in_height = in_shape[1]; - const index_t in_width = in_shape[2]; - const index_t out_height = out_shape[1]; - const index_t out_width = out_shape[2]; - const index_t out_channels = out_shape[3]; - index_t out_offset = - ((b * out_height + h) * out_width + w) * out_channels + m; - index_t c = m / depth_multiplier; - index_t o = m % depth_multiplier; - index_t ih_base = h * stride_hw[0] - pad_hw[0]; - index_t iw_base = w * stride_hw[1] - pad_hw[1]; - int32_t sum = 0; - for (index_t kh = 0; kh < filter_height; ++kh) { - const index_t ih = ih_base + kh * dilation_hw[0]; - for (index_t kw = 0; kw < filter_width; ++kw) { - const index_t iw = iw_base + kw * dilation_hw[1]; - if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) { - index_t in_offset = - ((b * in_height + ih) * in_width + iw) * in_channels + c; - index_t filter_offset = - ((kh * filter_width + kw) * in_channels + c) - * depth_multiplier + o; - - 
sum += (input[in_offset] - input_zero) * - (filter[filter_offset] - filter_zero); + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t h = start1; h < end1; h += step1) { + for (index_t w = 0; w < out_shape[2]; ++w) { + for (index_t m = 0; m < out_shape[3]; ++m) { + const index_t filter_height = filter_shape[0]; + const index_t filter_width = filter_shape[1]; + const index_t in_channels = filter_shape[2]; + const index_t depth_multiplier = filter_shape[3]; + const index_t in_height = in_shape[1]; + const index_t in_width = in_shape[2]; + const index_t out_height = out_shape[1]; + const index_t out_width = out_shape[2]; + const index_t out_channels = out_shape[3]; + index_t out_offset = + ((b * out_height + h) * out_width + w) * out_channels + m; + index_t c = m / depth_multiplier; + index_t o = m % depth_multiplier; + index_t ih_base = h * stride_hw[0] - pad_hw[0]; + index_t iw_base = w * stride_hw[1] - pad_hw[1]; + int32_t sum = 0; + for (index_t kh = 0; kh < filter_height; ++kh) { + const index_t ih = ih_base + kh * dilation_hw[0]; + for (index_t kw = 0; kw < filter_width; ++kw) { + const index_t iw = iw_base + kw * dilation_hw[1]; + if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) { + index_t in_offset = + ((b * in_height + ih) * in_width + iw) * in_channels + + c; + index_t filter_offset = + ((kh * filter_width + kw) * in_channels + c) + * depth_multiplier + o; + + sum += (input[in_offset] - input_zero) * + (filter[filter_offset] - filter_zero); + } } } + if (bias) { + sum += bias[m]; + } + sum = static_cast(std::round(sum * output_multiplier)); + sum += output_zero; + output[out_offset] = + static_cast(std::min(255, std::max(0, sum))); } - if (bias) { - sum += bias[m]; - } - sum = static_cast(std::round(sum * output_multiplier)); - sum += output_zero; - output[out_offset] = - static_cast(std::min(255, std::max(0, sum))); } } } - } + }, 0, out_shape[0], 1, 0, out_shape[1], 1); } inline tflite::Dims<4> ShapeToTfliteDims(const std::vector &shape) { diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc index 72a50f24ce868da3ab5344062e3fa5ebeefbda2f..58852a012e84fb6664331708738adcd180519e5d 100644 --- a/mace/ops/depthwise_conv2d_test.cc +++ b/mace/ops/depthwise_conv2d_test.cc @@ -440,7 +440,8 @@ void TestQuant(const index_t batch, auto bias_data = bias->data(); float bias_scale = q_input->scale() * q_filter->scale(); std::vector q_bias(bias->size()); - QuantizeWithScaleAndZeropoint( + QuantizeUtil quantize_util(OpTestContext::Get()->thread_pool()); + quantize_util.QuantizeWithScaleAndZeropoint( bias_data, bias->size(), bias_scale, 0, q_bias.data()); net.AddInputFromArray( "QuantizedBias", {out_channels}, q_bias, true, bias_scale, 0); diff --git a/mace/ops/depthwise_deconv2d.cc b/mace/ops/depthwise_deconv2d.cc index 3d203cfa5678c1ca407b6db2d441890bc00785a5..6111ea3062b241514fccca9167410f6314e4fcaf 100644 --- a/mace/ops/depthwise_deconv2d.cc +++ b/mace/ops/depthwise_deconv2d.cc @@ -16,6 +16,16 @@ #if defined(MACE_ENABLE_NEON) #include +#include "mace/ops/arm/fp32/depthwise_deconv_2d_general.h" +#include "mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h" +#include "mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h" +#include "mace/ops/arm/fp32/bias_add.h" +#include "mace/ops/arm/fp32/activation.h" + +#else +#include 
"mace/ops/ref/depthwise_deconv_2d.h" +#include "mace/ops/ref/bias_add.h" +#include "mace/ops/ref/activation.h" #endif #include @@ -25,10 +35,11 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" -#include "mace/ops/arm/depthwise_deconv2d_neon.h" #include "mace/utils/math.h" #include "mace/public/mace.h" #include "mace/utils/memory.h" +#include "mace/ops/common/conv_pool_2d_util.h" + #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/depthwise_deconv2d.h" @@ -45,7 +56,10 @@ class DepthwiseDeconv2dOp : public Deconv2dOpBase { public: explicit DepthwiseDeconv2dOp(OpConstructContext *context) - : Deconv2dOpBase(context) {} + : Deconv2dOpBase(context), + activation_delegator_(activation_, + relux_max_limit_, + leakyrelu_coefficient_) {} MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(0); @@ -57,60 +71,12 @@ class DepthwiseDeconv2dOp MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(output); - std::vector out_paddings(2, 0); - std::vector out_shape(4, 0); - std::vector padded_out_shape(4, 0); - - if (!paddings_.empty()) out_paddings = paddings_; - CalcDeconvShape_Caffe( - input->shape().data(), - filter->shape().data(), - strides_.data(), - out_paddings.data(), - group_, - nullptr, - out_shape.data(), - padded_out_shape.data(), - true); - - MACE_RETURN_IF_ERROR(output->Resize(out_shape)); - output->Clear(); - index_t kernel_h = filter->dim(2); - index_t kernel_w = filter->dim(3); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard bias_mapper(bias); - Tensor::MappingGuard output_mapper(output); - auto input_data = input->data(); - auto filter_data = filter->data(); - auto bias_data = bias == nullptr ? nullptr : bias->data(); - - auto output_data = output->mutable_data(); - - const index_t pad_left = out_paddings[0] / 2; - const index_t pad_top = out_paddings[1] / 2; - - index_t padded_out_size = - PadAlignSize(std::accumulate(padded_out_shape.begin(), - padded_out_shape.end(), - 1, - std::multiplies()) - * sizeof(float) + MACE_EXTRA_BUFFER_PAD_SIZE); - ScratchBuffer *scratch = context->device()->scratch_buffer(); - scratch->Rewind(); - scratch->GrowSize(padded_out_size); - Tensor padded_out(scratch->Scratch(padded_out_size), DT_FLOAT); - padded_out.Reshape(padded_out_shape); - padded_out.Clear(); - auto *padded_out_data = padded_out.mutable_data(); - const index_t in_channels = input->dim(1); - const index_t out_channels = output->dim(1); - - bool no_pad = paddings_[0] == 0 && paddings_[1] == 0; - float *out_data = no_pad ? 
output_data : padded_out_data; + bool is_depthwise = group_ == in_channels; +#ifdef MACE_ENABLE_NEON + const index_t kernel_h = filter->dim(2); + const index_t kernel_w = filter->dim(3); bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 && strides_[0] == strides_[1] && strides_[0] == 1; bool use_neon_3x3_s2 = kernel_h == kernel_w && kernel_h == 3 && @@ -120,289 +86,101 @@ class DepthwiseDeconv2dOp bool use_neon_4x4_s2 = kernel_h == kernel_w && kernel_h == 4 && strides_[0] == strides_[1] && strides_[0] == 2; - bool is_depthwise = (group_ == in_channels && group_ == out_channels); - - std::function kernel_func; - - if (use_neon_3x3_s1) { - kernel_func = [=](const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *padded_out_shape, - float *padded_output) { - if (is_depthwise) { - DepthwiseDeconv2dNeonK3x3S1(input, - filter, - in_shape, - padded_out_shape, - padded_output); - } else { - GroupDeconv2dNeonK3x3S1(input, - filter, - group, - in_shape, - padded_out_shape, - padded_output); - } - }; - } else if (use_neon_3x3_s2) { - kernel_func = [=](const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *padded_out_shape, - float *padded_output) { - if (is_depthwise) { - DepthwiseDeconv2dNeonK3x3S2(input, - filter, - in_shape, - padded_out_shape, - padded_output); - } else { - GroupDeconv2dNeonK3x3S2(input, - filter, - group, - in_shape, - padded_out_shape, - padded_output); - } - }; - } else if (use_neon_4x4_s1) { - kernel_func = [=](const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *padded_out_shape, - float *padded_output) { - if (is_depthwise) { - DepthwiseDeconv2dNeonK4x4S1(input, - filter, - in_shape, - padded_out_shape, - padded_output); + if (deconv2d_delegator_ == nullptr) { + if (is_depthwise) { + if (use_neon_3x3_s1) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, CAFFE); + } else if (use_neon_3x3_s2) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, CAFFE); + } else if (use_neon_4x4_s1) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, CAFFE); + } else if (use_neon_4x4_s2) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, CAFFE); } else { - GroupDeconv2dNeonK4x4S1(input, - filter, - group, - in_shape, - padded_out_shape, - padded_output); + deconv2d_delegator_ = + make_unique( + strides_, + std::vector{1, 1}, + paddings_, + padding_type_, + CAFFE); } - }; - } else if (use_neon_4x4_s2) { - kernel_func = [=](const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *padded_out_shape, - float *padded_output) { - if (is_depthwise) { - DepthwiseDeconv2dNeonK4x4S2(input, - filter, - in_shape, - padded_out_shape, - padded_output); - } else { - GroupDeconv2dNeonK4x4S2(input, - filter, - group, - in_shape, - padded_out_shape, - padded_output); - } - }; - } else { - kernel_func = [=](const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *padded_out_shape, - float *padded_output) { - if (is_depthwise) { - DepthwiseDeconv2dGeneral(input, - filter, - kernel_h, - kernel_w, - strides_.data(), - in_shape, - padded_out_shape, - padded_output); + } else { + if (use_neon_3x3_s1) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, group_, CAFFE); + } else if (use_neon_3x3_s2) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, group_, CAFFE); + } 
else if (use_neon_4x4_s1) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, group_, CAFFE); + } else if (use_neon_4x4_s2) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, group_, CAFFE); } else { - GroupDeconv2dGeneral(input, - filter, - kernel_h, - kernel_w, - strides_.data(), - group, - in_shape, - padded_out_shape, - padded_output); + deconv2d_delegator_ = make_unique( + strides_, + std::vector{1, 1}, + paddings_, + padding_type_, + group_, + CAFFE); } - }; - } - - kernel_func(input_data, - filter_data, - group_, - input->shape().data(), - padded_out_shape.data(), - out_data); - - if (!no_pad) { - CropPadOut(out_data, - padded_out_shape.data(), - out_shape.data(), - pad_left, - pad_top, - output_data); + } } - if (bias_data != nullptr) { - const index_t batch = out_shape[0]; - const index_t channels = out_shape[1]; - const index_t img_size = out_shape[2] * out_shape[3]; -#pragma omp parallel for collapse(3) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { - for (index_t i = 0; i < img_size; ++i) { - output_data[(b * channels + c) * img_size + i] += - bias_data[c]; - } - } + deconv2d_delegator_->Compute(context, + input, + filter, + nullptr, + output); +#else + if (deconv2d_delegator_ == nullptr) { + if (is_depthwise) { + deconv2d_delegator_ = make_unique>( + strides_, + std::vector{1, 1}, + paddings_, + padding_type_, + CAFFE); + } else { + deconv2d_delegator_ = make_unique>( + strides_, + std::vector{1, 1}, + paddings_, + padding_type_, + group_, + CAFFE); } } + deconv2d_delegator_->Compute(context, + input, + filter, + nullptr, + output); +#endif - DoActivation(output_data, - output_data, - output->size(), - activation_, - relux_max_limit_, - leakyrelu_coefficient_); + bias_add_delegator_.Compute(context, output, bias, output); + activation_delegator_.Compute(context, output, output); return MaceStatus::MACE_SUCCESS; } private: - void DepthwiseDeconv2dGeneral(const float *input, - const float *filter, - const index_t kernel_h, - const index_t kernel_w, - const int *strides, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t batch = in_shape[0]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - - const index_t out_img_size = out_height * out_width; - const index_t in_img_size = in_height * in_width; - - const int kernel_size = kernel_h * kernel_w; - std::vector index_map(kernel_size, 0); - for (int i = 0; i < kernel_h; ++i) { - for (int j = 0; j < kernel_w; ++j) { - index_map[i * kernel_w + j] = i * out_width + j; - } - } - -#pragma omp parallel for collapse(2) - for (int b = 0; b < batch; ++b) { - for (int c = 0; c < channels; ++c) { - float *out_base = - output + (b * channels + c) * out_img_size; - for (int i = 0; i < in_height; ++i) { - for (int j = 0; j < in_width; ++j) { - const index_t out_offset = - i * strides[0] * out_width + j * strides[1]; - const index_t input_idx = - (b * channels + c) * in_img_size + i * in_width + j; - const float val = input[input_idx]; - const index_t kernel_offset = c * kernel_size; - for (int k = 0; k < kernel_size; ++k) { - const index_t out_idx = out_offset + index_map[k]; - const index_t kernel_idx = kernel_offset + k; - out_base[out_idx] += val * filter[kernel_idx]; - } - } - } - } - } - } - - void GroupDeconv2dGeneral(const float *input, - const float *filter, - const index_t 
kernel_h, - const index_t kernel_w, - const int *strides, - const int group, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - - MACE_CHECK(in_channels % group == 0 && out_channels % group == 0, - "invalid input/output channel and group."); - - const index_t out_img_size = out_height * out_width; - const index_t in_img_size = in_height * in_width; - - const int kernel_size = kernel_h * kernel_w; - std::vector index_map(kernel_size, 0); - for (int i = 0; i < kernel_h; ++i) { - for (int j = 0; j < kernel_w; ++j) { - index_map[i * kernel_w + j] = i * out_width + j; - } - } - - const int in_channels_g = in_channels / group; - const int out_channels_g = out_channels / group; -#pragma omp parallel for collapse(3) - for (int b = 0; b < in_shape[0]; ++b) { - for (int g = 0; g < group; ++g) { - for (int p = 0; p < out_channels_g; ++p) { - const index_t out_base = - ((b * group + g) * out_channels_g + p) * out_img_size; - for (int i = 0; i < in_height; ++i) { - for (int j = 0; j < in_width; ++j) { - const index_t out_offset = - i * strides[0] * out_width + j * strides[1]; - for (int q = 0; q < in_channels_g; ++q) { - const index_t in_base = - ((b * group + g) * in_channels_g + q) * in_img_size; - const index_t in_offset = - in_base + i * in_width + j; - const float val = input[in_offset]; - const index_t k_offset = - ((p * group + g) * in_channels_g + q) * kernel_size; - for (int k = 0; k < kernel_size; ++k) { - const index_t out_idx = out_base + out_offset + index_map[k]; - const float w = filter[k_offset + k]; - output[out_idx] += val * w; - } - } - } - } - } - } - } - } +#ifdef MACE_ENABLE_NEON + std::unique_ptr deconv2d_delegator_; + arm::fp32::BiasAdd bias_add_delegator_; + arm::fp32::Activation activation_delegator_; +#else + std::unique_ptr> deconv2d_delegator_; + ref::BiasAdd bias_add_delegator_; + ref::Activation activation_delegator_; +#endif // MACE_ENABLE_NEON }; #ifdef MACE_ENABLE_OPENCL @@ -437,19 +215,22 @@ class DepthwiseDeconv2dOp : public Deconv2dOpBase { MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(output); - std::vector in_paddings(2, 0); - std::vector out_paddings(2, 0); - std::vector out_shape(4, 0); - - if (!paddings_.empty()) out_paddings = paddings_; - CalcDeconvShape_Caffe(input->shape().data(), - filter->shape().data(), - strides_.data(), - out_paddings.data(), - group_, - in_paddings.data(), - out_shape.data(), - nullptr); + std::vector out_shape; + std::vector in_paddings; + std::vector out_paddings; + + CalDeconvOutputShapeAndPadSize(input->shape(), + filter->shape(), + strides_, + padding_type_, + paddings_, + group_, + &out_shape, + &in_paddings, + &out_paddings, + nullptr, + CAFFE, + NHWC); return kernel_->Compute(context, input, diff --git a/mace/ops/depthwise_deconv2d_test.cc b/mace/ops/depthwise_deconv2d_test.cc index 0b81779e4e58bcc3915fa4a972f15607b0e11b95..0cf3de95bf5c2d077e062dcde07a232977ff8ba6 100644 --- a/mace/ops/depthwise_deconv2d_test.cc +++ b/mace/ops/depthwise_deconv2d_test.cc @@ -252,7 +252,7 @@ TEST_F(DepthwiseDeconv2dOpTest, RandomTestFloat) { RandomTest(1, 4, 256, 256, 5, 1, 3); RandomTest(1, 4, 256, 256, 5, 2, 4); } -// + TEST_F(DepthwiseDeconv2dOpTest, RandomTestHalf) { RandomTest(1, 32, 256, 256, 5, 1, 2); RandomTest(1, 3, 256, 256, 5, 1, 1); diff --git 
a/mace/ops/dynamic_lstm.cc b/mace/ops/dynamic_lstm.cc index 7fe93f21d6b7831bfe5fba3d21200a21923cdc2e..7d7014d57a7162184c93d1559dc1f93d0facde8c 100644 --- a/mace/ops/dynamic_lstm.cc +++ b/mace/ops/dynamic_lstm.cc @@ -33,10 +33,10 @@ namespace mace { namespace ops { -template +template class DynamicLSTMOp; -template +template class DynamicLSTMOp : public Operation { public: explicit DynamicLSTMOp(OpConstructContext *context) @@ -58,7 +58,6 @@ class DynamicLSTMOp : public Operation { if (std::abs(scale - 1.f) < 1e-6) return; const index_t rounds = cell_dim / 4; -#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < rounds * 4; i += 4) { #ifdef MACE_ENABLE_NEON float32x4_t in_vec = vld1q_f32(cell_data + i); @@ -86,7 +85,6 @@ class DynamicLSTMOp : public Operation { } const index_t rounds = cell_dim / 4; -#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < rounds * 4; i += 4) { #ifdef MACE_ENABLE_NEON float32x4_t in_vec = vld1q_f32(src_data + i); @@ -156,8 +154,8 @@ class DynamicLSTMOp : public Operation { MACE_CHECK(lstm_params->dim(0) == 3 && params_stride == lstm_cell_dim && lstm_cell_dim == prev_cell_dim_) << "lstm params rows:" << lstm_params->dim(0) - << "params_stride:"<< params_stride - << "!=" << "cell_dim:"<< lstm_cell_dim << std::endl; + << "params_stride:" << params_stride + << "!=" << "cell_dim:" << lstm_cell_dim << std::endl; const index_t affine_b_out_dim = weights_b->dim(0); const index_t affine_b_depth = weights_b->dim(1); const index_t affine_b_in_dim = lstm_cell_dim; @@ -262,7 +260,8 @@ class DynamicLSTMOp : public Operation { float *curr_cell_ptr = prev_cell_data + i % cell_buf_chunk * prev_cell_dim_; // LSTMNonlinear - LSTMNonlinearKernel(affine_a_out_data, + LSTMNonlinearKernel(context, + affine_a_out_data, prev_cell_ptr, nullptr, lstm_params_data, diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc index 92864ae1016fad410ce054887babd09ee2557c59..04c0e10e323a53d9e3efb042366c4ff6cc1b666d 100644 --- a/mace/ops/eltwise.cc +++ b/mace/ops/eltwise.cc @@ -31,7 +31,7 @@ #include "mace/core/operator.h" #include "mace/core/tensor.h" #include "mace/utils/memory.h" -#include "mace/utils/quantize.h" +#include "mace/core/quantize.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/eltwise.h" @@ -40,7 +40,6 @@ namespace mace { namespace ops { - inline index_t GetIndex(const std::vector &shape, const std::vector &index) { index_t idx = 0; @@ -64,8 +63,9 @@ inline void IncreaseIndex(const std::vector &shape, } } -template +template inline void TensorGeneralBroadcastEltwise( + const OpContext *context, const EltwiseType type, const T *input0, const T *input1, @@ -75,6 +75,8 @@ inline void TensorGeneralBroadcastEltwise( const std::vector &input1_shape, const std::vector &output_shape, DstType *output) { + MACE_UNUSED(context); + const index_t output_size = std::accumulate( output_shape.begin(), output_shape.end(), 1, std::multiplies()); std::vector out_index(output_shape.size(), 0); @@ -209,13 +211,13 @@ inline void TensorGeneralBroadcastEltwise( IncreaseIndex(output_shape, &out_index); } break; - default: - LOG(FATAL) << "Eltwise op not support type " << type; + default:LOG(FATAL) << "Eltwise op not support type " << type; } } -template -inline void TensorBroadcastEltwise(const EltwiseType type, +template +inline void TensorBroadcastEltwise(const OpContext *context, + const EltwiseType type, const T *input0, const T *input1, const std::vector &coeff, @@ -223,437 +225,408 @@ inline void 
TensorBroadcastEltwise(const EltwiseType type, const index_t common_size, const bool swapped, DstType *output) { - switch (type) { - case SUM: - if (coeff.empty()) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - input0[i + d * common_size] + input1[i]; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + switch (type) { + case SUM: + if (coeff.empty()) { + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { + output[i + d * common_size] = + input0[i + d * common_size] + input1[i]; + } } - } - } else { - std::vector coeff_copy = coeff; - if (swapped) { - std::swap(coeff_copy[0], coeff_copy[1]); - } -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - input0[i + d * common_size] * coeff_copy[0] + - input1[i] * coeff_copy[1]; + } else { + std::vector coeff_copy = coeff; + if (swapped) { + std::swap(coeff_copy[0], coeff_copy[1]); + } + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { + output[i + d * common_size] = + input0[i + d * common_size] * coeff_copy[0] + + input1[i] * coeff_copy[1]; + } } } - } - break; - case SUB: - if (!swapped) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - input0[i + d * common_size] - input1[i]; + break; + case SUB: + if (!swapped) { + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { + output[i + d * common_size] = + input0[i + d * common_size] - input1[i]; + } + } + } else { + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { + output[i + d * common_size] = + input1[i] - input0[i + d * common_size]; + } } } - } else { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { + break; + case PROD: + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { output[i + d * common_size] = - input1[i] - input0[i + d * common_size]; + input0[i + d * common_size] * input1[i]; } } - } - break; - case PROD: -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = input0[i + d * common_size] * input1[i]; + break; + case DIV: + if (!swapped) { + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { + output[i + d * common_size] = + input0[i + d * common_size] / input1[i]; + } + } + } else { + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { + output[i + d * common_size] = + input1[i] / input0[i + d * common_size]; + } + } } - } - break; - case DIV: - if (!swapped) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - input0[i + d * common_size] / input1[i]; + break; 
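// A minimal sketch of the Compute2D pattern this hunk applies to every case:
// each former "#pragma omp parallel for collapse(2)" pair of loops becomes a
// lambda handed to utils::ThreadPool::Compute2D, which gives each worker a
// [start, end) slice and a step for both loop dimensions. Reduced here to the
// SUM case above; the BroadcastSum name is illustrative only (the patch does
// not add it), and the snippet assumes the declarations already visible in
// this file (utils::ThreadPool, index_t), with the pool obtained from
// context->device()->cpu_runtime()->thread_pool() exactly as the patch does.
inline void BroadcastSum(utils::ThreadPool &thread_pool,
                         const float *input0,       // diff_size x common_size
                         const float *input1,       // common_size
                         const index_t diff_size,
                         const index_t common_size,
                         float *output) {
  thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
                            index_t start1, index_t end1, index_t step1) {
    for (index_t d = start0; d < end0; d += step0) {
      for (index_t i = start1; i < end1; i += step1) {
        output[i + d * common_size] =
            input0[i + d * common_size] + input1[i];
      }
    }
  }, 0, diff_size, 1,      // dimension 0: range [0, diff_size), step 1
     0, common_size, 1);   // dimension 1: range [0, common_size), step 1
}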
+ case FLOOR_DIV: + if (!swapped) { + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { + output[i + d * common_size] = + std::floor(input0[i + d * common_size] / input1[i]); + } + } + } else { + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { + output[i + d * common_size] = + std::floor(input1[i] / input0[i + d * common_size]); + } } } - } else { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { + break; + case MIN: + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { output[i + d * common_size] = - input1[i] / input0[i + d * common_size]; + std::min(input0[i + d * common_size], input1[i]); } } - } - break; - case FLOOR_DIV: - if (!swapped) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { + break; + case MAX: + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { output[i + d * common_size] = - std::floor(input0[i + d * common_size] / input1[i]); + std::max(input0[i + d * common_size], input1[i]); } } - } else { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { + break; + case SQR_DIFF: + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { output[i + d * common_size] = - std::floor(input1[i] / input0[i + d * common_size]); + std::pow(input0[i + d * common_size] - input1[i], 2.f); } } - } - break; - case MIN: -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - std::min(input0[i + d * common_size], input1[i]); - } - } - break; - case MAX: -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - std::max(input0[i + d * common_size], input1[i]); + break; + case POW: + if (!swapped) { + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { + output[i + d * common_size] = + std::pow(input0[i + d * common_size], input1[i]); + } + } + } else { + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { + output[i + d * common_size] = + std::pow(input1[i], input0[i + d * common_size]); + } + } } - } - break; - case SQR_DIFF: -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - std::pow(input0[i + d * common_size] - input1[i], 2.f); + break; + case NEG: + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { + output[i + d * common_size] = -input0[i + d * common_size]; + } } - } - break; - case POW: - if (!swapped) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { + break; + case ABS: + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { output[i + d * common_size] = - std::pow(input0[i + d * common_size], input1[i]); + 
std::fabs(input0[i + d * common_size]); } } - } else { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { + break; + case EQUAL: + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { output[i + d * common_size] = - std::pow(input1[i], input0[i + d * common_size]); + input0[i + d * common_size] == input1[i]; } } - } - break; - case NEG: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < diff_size * common_size; ++i) { - output[i] = -input0[i]; - } - break; - case ABS: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < diff_size * common_size; ++i) { - output[i] = std::fabs(input0[i]); - } - break; - case EQUAL: -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - input0[i + d * common_size] == input1[i]; - } - } - break; - default: - LOG(FATAL) << "Eltwise op not support type " << type; - } + break; + default:LOG(FATAL) << "Eltwise op not support type " << type; + } + }, 0, diff_size, 1, 0, common_size, 1); } // Multiplication is costly, so we specialize the following case. -template -inline void TensorEltwise(const EltwiseType type, +template +inline void TensorEltwise(const OpContext *context, + const EltwiseType type, const T *input0, const T *input1, const std::vector &coeff, const index_t size, const bool swapped, DstType *output) { - switch (type) { - case SUM: - if (coeff.empty()) { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] + input1[i]; - } + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); - } else { - std::vector coeff_copy = coeff; - if (swapped) { - std::swap(coeff_copy[0], coeff_copy[1]); - } -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] * coeff_copy[0] + input1[i] * coeff_copy[1]; - } - } - break; - case SUB: - if (!swapped) { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] - input1[i]; - } + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + switch (type) { + case SUM: + if (coeff.empty()) { + for (index_t i = start; i < end; i += step) { + output[i] = input0[i] + input1[i]; + } - } else { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input1[i] - input0[i]; + } else { + std::vector coeff_copy = coeff; + if (swapped) { + std::swap(coeff_copy[0], coeff_copy[1]); + } + for (index_t i = start; i < end; i += step) { + output[i] = input0[i] * coeff_copy[0] + input1[i] * coeff_copy[1]; + } } - } - break; - case PROD: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] * input1[i]; - } + break; + case SUB: + if (!swapped) { + for (index_t i = start; i < end; i += step) { + output[i] = input0[i] - input1[i]; + } - break; - case DIV: - if (!swapped) { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] / input1[i]; + } else { + for (index_t i = start; i < end; i += step) { + output[i] = input1[i] - input0[i]; + } + } + break; + case PROD: + for (index_t i = start; i < end; i += step) { + output[i] = input0[i] * input1[i]; } - } else { -#pragma omp parallel for schedule(runtime) - for 
(index_t i = 0; i < size; ++i) { - output[i] = input1[i] / input0[i]; + break; + case DIV: + if (!swapped) { + for (index_t i = start; i < end; i += step) { + output[i] = input0[i] / input1[i]; + } + + } else { + for (index_t i = start; i < end; i += step) { + output[i] = input1[i] / input0[i]; + } } - } - break; - case FLOOR_DIV: - if (!swapped) { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::floor(input0[i] / input1[i]); + break; + case FLOOR_DIV: + if (!swapped) { + for (index_t i = start; i < end; i += step) { + output[i] = std::floor(input0[i] / input1[i]); + } + } else { + for (index_t i = start; i < end; i += step) { + output[i] = std::floor(input1[i] / input0[i]); + } } - } else { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::floor(input1[i] / input0[i]); + break; + case MIN: + for (index_t i = start; i < end; i += step) { + output[i] = std::min(input0[i], input1[i]); } - } - break; - case MIN: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::min(input0[i], input1[i]); - } - break; - case MAX: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::max(input0[i], input1[i]); - } + break; + case MAX: + for (index_t i = start; i < end; i += step) { + output[i] = std::max(input0[i], input1[i]); + } - break; - case SQR_DIFF: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::pow(input0[i] - input1[i], 2.f); - } + break; + case SQR_DIFF: + for (index_t i = start; i < end; i += step) { + output[i] = std::pow(input0[i] - input1[i], 2.f); + } - break; - case POW: - if (!swapped) { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::pow(input0[i], input1[i]); + break; + case POW: + if (!swapped) { + for (index_t i = start; i < end; i += step) { + output[i] = std::pow(input0[i], input1[i]); + } + } else { + for (index_t i = start; i < end; i += step) { + output[i] = std::pow(input1[i], input0[i]); + } } - } else { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::pow(input1[i], input0[i]); + break; + case NEG: + for (index_t i = start; i < end; i += step) { + output[i] = -input0[i]; } - } - break; - case NEG: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = -input0[i]; - } - break; - case ABS: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::fabs(input0[i]); - } - break; - case EQUAL: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] == input1[i]; - } - break; - default: - LOG(FATAL) << "Eltwise op not support type " << type; - } + break; + case ABS: + for (index_t i = start; i < end; i += step) { + output[i] = std::fabs(input0[i]); + } + break; + case EQUAL: + for (index_t i = start; i < end; i += step) { + output[i] = input0[i] == input1[i]; + } + break; + default:LOG(FATAL) << "Eltwise op not support type " << type; + } + }, 0, size, 1); } // Multiplication is costly, so we specialize the following case. 
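// A minimal sketch of the one-dimensional form of the same migration, as used
// by TensorEltwise above and TensorScalarEltwise below: the former
// "#pragma omp parallel for schedule(runtime)" loop becomes a lambda that
// utils::ThreadPool::Compute1D invokes with a [start, end) range and a step.
// The ScalarSum name is illustrative only (the patch does not add it), and the
// snippet assumes the declarations already available in this file, with the
// pool taken from context->device()->cpu_runtime()->thread_pool().
inline void ScalarSum(utils::ThreadPool &thread_pool,
                      const float *input0,
                      const float input1,        // a single broadcast scalar
                      const index_t size,
                      float *output) {
  thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
    for (index_t i = start; i < end; i += step) {
      output[i] = input0[i] + input1;  // SUM case, without coefficients
    }
  }, 0, size, 1);                      // range [0, size), step 1
}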
-template -inline void TensorScalarEltwise(const EltwiseType type, +template +inline void TensorScalarEltwise(const OpContext *context, + const EltwiseType type, const T *input0, const T input1, const std::vector &coeff, const index_t size, const bool swapped, DstType *output) { - switch (type) { - case SUM: - if (coeff.empty()) { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] + input1; - } + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); - } else { - std::vector coeff_copy = coeff; - if (swapped) { - std::swap(coeff_copy[0], coeff_copy[1]); - } -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] * coeff_copy[0] + input1 * coeff_copy[1]; - } - } - break; - case SUB: - if (!swapped) { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] - input1; - } + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + switch (type) { + case SUM: + if (coeff.empty()) { + for (index_t i = start; i < end; i += step) { + output[i] = input0[i] + input1; + } - } else { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input1 - input0[i]; + } else { + std::vector coeff_copy = coeff; + if (swapped) { + std::swap(coeff_copy[0], coeff_copy[1]); + } + for (index_t i = start; i < end; i += step) { + output[i] = input0[i] * coeff_copy[0] + input1 * coeff_copy[1]; + } } - } - break; - case PROD: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] * input1; - } + break; + case SUB: + if (!swapped) { + for (index_t i = start; i < end; i += step) { + output[i] = input0[i] - input1; + } - break; - case DIV: - if (!swapped) { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] / input1; + } else { + for (index_t i = start; i < end; i += step) { + output[i] = input1 - input0[i]; + } + } + break; + case PROD: + for (index_t i = start; i < end; i += step) { + output[i] = input0[i] * input1; } - } else { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input1 / input0[i]; + break; + case DIV: + if (!swapped) { + for (index_t i = start; i < end; i += step) { + output[i] = input0[i] / input1; + } + + } else { + for (index_t i = start; i < end; i += step) { + output[i] = input1 / input0[i]; + } } - } - break; - case FLOOR_DIV: - if (!swapped) { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::floor(input0[i] / input1); + break; + case FLOOR_DIV: + if (!swapped) { + for (index_t i = start; i < end; i += step) { + output[i] = std::floor(input0[i] / input1); + } + } else { + for (index_t i = start; i < end; i += step) { + output[i] = std::floor(input1 / input0[i]); + } } - } else { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::floor(input1 / input0[i]); + break; + case MIN: + for (index_t i = start; i < end; i += step) { + output[i] = std::min(input0[i], input1); } - } - break; - case MIN: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::min(input0[i], input1); - } - break; - case MAX: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::max(input0[i], input1); - } + break; + case MAX: + for (index_t i = 
start; i < end; i += step) { + output[i] = std::max(input0[i], input1); + } - break; - case SQR_DIFF: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::pow(input0[i] - input1, 2.f); - } + break; + case SQR_DIFF: + for (index_t i = start; i < end; i += step) { + output[i] = std::pow(input0[i] - input1, 2.f); + } - break; - case POW: - if (!swapped) { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::pow(input0[i], input1); + break; + case POW: + if (!swapped) { + for (index_t i = start; i < end; i += step) { + output[i] = std::pow(input0[i], input1); + } + } else { + for (index_t i = start; i < end; i += step) { + output[i] = std::pow(input1, input0[i]); + } } - } else { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::pow(input1, input0[i]); + break; + case NEG: + for (index_t i = start; i < end; i += step) { + output[i] = -input0[i]; + } + break; + case ABS: + for (index_t i = start; i < end; i += step) { + output[i] = std::fabs(input0[i]); + } + break; + case EQUAL: + for (index_t i = start; i < end; i += step) { + output[i] = input0[i] == input1; } - } - break; - case NEG: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = -input0[i]; - } - break; - case ABS: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::fabs(input0[i]); - } - break; - case EQUAL: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] == input1; - } - break; - default: - LOG(FATAL) << "Eltwise op not support type " << type; - } + break; + default:LOG(FATAL) << "Eltwise op not support type " << type; + } + }, 0, size, 1); } -template -inline void TensorEltwisePerChannel(const EltwiseType type, +template +inline void TensorEltwisePerChannel(const OpContext *context, + const EltwiseType type, const T *input0, const T *input1, const std::vector &coeff, @@ -663,230 +636,227 @@ inline void TensorEltwisePerChannel(const EltwiseType type, const index_t image_size, const bool swapped, DstType *output) { - switch (type) { - case SUM: - if (coeff.empty()) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in0_ptr[i] + in1_ptr[c]; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + switch (type) { + case SUM: + if (coeff.empty()) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = in0_ptr[i] + in1_ptr[c]; + } } } - } - } else { - std::vector coeff_copy = coeff; - if (swapped) { - std::swap(coeff_copy[0], coeff_copy[1]); - } -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = - in0_ptr[i] * coeff_copy[0] + in1_ptr[c] * coeff_copy[1]; + } else { + std::vector coeff_copy = coeff; + if (swapped) { + std::swap(coeff_copy[0], coeff_copy[1]); + } + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = + in0_ptr[i] * coeff_copy[0] + in1_ptr[c] * coeff_copy[1]; + } } } } - } - break; - case SUB: - if (!swapped) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in0_ptr[i] - in1_ptr[c]; + break; + case SUB: + if (!swapped) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = in0_ptr[i] - in1_ptr[c]; + } + } + } + } else { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = in1_ptr[c] - in0_ptr[i]; + } } } } - } else { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { + break; + case PROD: + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); DstType *out_ptr = output + ((b * channel) + c) * image_size; for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in1_ptr[c] - in0_ptr[i]; + out_ptr[i] = in0_ptr[i] * in1_ptr[c]; } } } - } - break; - case PROD: -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in0_ptr[i] * in1_ptr[c]; + break; + case DIV: + if (!swapped) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = in0_ptr[i] / in1_ptr[c]; + } + } + } + } else { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = in1_ptr[c] / in0_ptr[i]; + } + } } } - } - break; - case DIV: - if (!swapped) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in0_ptr[i] / in1_ptr[c]; + break; + case FLOOR_DIV: + if (!swapped) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = std::floor(in0_ptr[i] / in1_ptr[c]); + } + } + } + } else { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = std::floor(in1_ptr[c] / in0_ptr[i]); + } } } } - } else { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { + break; + case MIN: + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); DstType *out_ptr = output + ((b * channel) + c) * image_size; for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in1_ptr[c] / in0_ptr[i]; + out_ptr[i] = std::min(in0_ptr[i], in1_ptr[c]); } } } - } - break; - case FLOOR_DIV: - if (!swapped) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { + break; + case MAX: + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); DstType *out_ptr = output + ((b * channel) + c) * image_size; for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = std::floor(in0_ptr[i] / in1_ptr[c]); + out_ptr[i] = std::max(in0_ptr[i], in1_ptr[c]); } } } - } else { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { + break; + case SQR_DIFF: + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); DstType *out_ptr = output + ((b * channel) + c) * image_size; for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = std::floor(in1_ptr[c] / in0_ptr[i]); + out_ptr[i] = std::pow(in0_ptr[i] - in1_ptr[c], 2.f); } } } - } - break; - case MIN: -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = std::min(in0_ptr[i], in1_ptr[c]); + break; + case POW: + if (!swapped) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = std::pow(in0_ptr[i], in1_ptr[c]); + } + } } - } - } - break; - case MAX: -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = std::max(in0_ptr[i], in1_ptr[c]); + } else { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = std::pow(in1_ptr[c], in0_ptr[i]); + } + } } } - } - break; - case SQR_DIFF: -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = std::pow(in0_ptr[i] - in1_ptr[c], 2.f); + break; + case NEG: + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = -input0[i]; + } } } - } - break; - case POW: - if (!swapped) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; + break; + case ABS: + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = std::pow(in0_ptr[i], in1_ptr[c]); + output[i] = std::fabs(input0[i]); } } } - } else { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { + break; + case EQUAL: + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); DstType *out_ptr = output + ((b * channel) + c) * image_size; for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = std::pow(in1_ptr[c], in0_ptr[i]); + out_ptr[i] = in0_ptr[i] == in1_ptr[c]; } } } - } - break; - case NEG: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < batch0 * channel * image_size; ++i) { - output[i] = -input0[i]; - } - break; - case ABS: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < batch0 * channel * image_size; ++i) { - output[i] = std::fabs(input0[i]); - } - break; - case EQUAL: -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in0_ptr[i] == in1_ptr[c]; - } - } - } - break; - default: - LOG(FATAL) << "Eltwise op not support type " << type; - } + break; + default:LOG(FATAL) << "Eltwise op not support type " << type; + } + }, 0, batch0, 1, 0, channel, 1); } -template +template class EltwiseOp : public Operation { public: explicit EltwiseOp(OpConstructContext *context) @@ -915,15 +885,16 @@ class EltwiseOp : public Operation { if (IsLogicalType(type_)) { // as we do not have bool-type tensor, we use int type - return DoEltwise(input0, input1, output); + return DoEltwise(context, input0, input1, output); } else { - return DoEltwise(input0, input1, output); + return DoEltwise(context, input0, input1, output); } } private: - template - MaceStatus DoEltwise(const Tensor *input0, + template + MaceStatus DoEltwise(const OpContext *context, + const Tensor *input0, const Tensor *input1, Tensor *output) { bool swapped = false; @@ -970,12 +941,20 @@ class EltwiseOp : public Operation { Tensor::MappingGuard output_guard(output); DstType *output_ptr = output->mutable_data(); if (input1->size() < input0->size()) { - TensorEltwisePerChannel( - type_, input0_ptr, input1_ptr, coeff_, input0->dim(0), - input1->dim_size() == 1 ? 1 : input1->dim(0), input0->dim(1), - input0->dim(2) * input0->dim(3), swapped, output_ptr); + TensorEltwisePerChannel(context, + type_, + input0_ptr, + input1_ptr, + coeff_, + input0->dim(0), + input1->dim_size() == 1 ? 1 : input1->dim(0), + input0->dim(1), + input0->dim(2) * input0->dim(3), + swapped, + output_ptr); } else { - TensorEltwise(type_, input0_ptr, input1_ptr, coeff_, input0->size(), + TensorEltwise(context, + type_, input0_ptr, input1_ptr, coeff_, input0->size(), swapped, output_ptr); } } else { @@ -1002,19 +981,23 @@ class EltwiseOp : public Operation { } if (input1->size() == 1) { - TensorScalarEltwise(type_, input0_ptr, input1_ptr[0], coeff_, + TensorScalarEltwise(context, + type_, input0_ptr, input1_ptr[0], coeff_, input0->size(), swapped, output_ptr); } else if (input0_shape == input1_shape) { - TensorEltwise(type_, input0_ptr, input1_ptr, coeff_, input0->size(), + TensorEltwise(context, + type_, input0_ptr, input1_ptr, coeff_, input0->size(), swapped, output_ptr); } else if (need_general_broadcast) { - TensorGeneralBroadcastEltwise(type_, input0_ptr, input1_ptr, coeff_, + TensorGeneralBroadcastEltwise(context, + type_, input0_ptr, input1_ptr, coeff_, swapped, input0_shape, input1_shape, output_shape, output_ptr); } else { index_t common_size = input1->size(); index_t diff_size = input0->size() / common_size; - TensorBroadcastEltwise(type_, input0_ptr, input1_ptr, coeff_, + TensorBroadcastEltwise(context, + type_, input0_ptr, input1_ptr, coeff_, diff_size, common_size, swapped, output_ptr); } } @@ -1096,37 +1079,41 @@ class EltwiseOp : public Operation { auto input0_ptr = input0->data(); auto input1_ptr = input1->data(); auto output_ptr = output->mutable_data(); -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < output->size(); ++i) { - const int32_t offset_input0 = input0_ptr[i] - input0->zero_point(); - const int32_t offset_input1 = input1_ptr[i] - input1->zero_point(); - const int32_t shifted_input0 = offset_input0 * (1 << left_shift); - const int32_t shifted_input1 = offset_input1 * (1 << left_shift); - const int32_t multiplied_input0 = - gemmlowp::RoundingDivideByPOT( - 
gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input0, - input0_multiplier), - -input0_shift); - const int32_t multiplied_input1 = - gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1, - input1_multiplier), - -input1_shift); - - int32_t res; - if (type_ == SUM) { - res = multiplied_input0 + multiplied_input1; - } else { - res = multiplied_input0 - multiplied_input1; - } - const int32_t output_val = - gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(res, - output_multiplier), - -output_shift) + output->zero_point(); - output_ptr[i] = Saturate(output_val); - } + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + const int32_t offset_input0 = input0_ptr[i] - input0->zero_point(); + const int32_t offset_input1 = input1_ptr[i] - input1->zero_point(); + const int32_t shifted_input0 = offset_input0 * (1 << left_shift); + const int32_t shifted_input1 = offset_input1 * (1 << left_shift); + const int32_t multiplied_input0 = + gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input0, + input0_multiplier), + -input0_shift); + const int32_t multiplied_input1 = + gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1, + input1_multiplier), + -input1_shift); + + int32_t res; + if (type_ == SUM) { + res = multiplied_input0 + multiplied_input1; + } else { + res = multiplied_input0 - multiplied_input1; + } + + const int32_t output_val = + gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(res, + output_multiplier), + -output_shift) + output->zero_point(); + output_ptr[i] = Saturate(output_val); + } + }, 0, output->size(), 1); #endif // NEON return MaceStatus::MACE_SUCCESS; @@ -1203,7 +1190,6 @@ class EltwiseOp : public Operation { }; #endif // MACE_ENABLE_OPENCL - void RegisterEltwise(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, DeviceType::CPU, float); diff --git a/mace/ops/expand_dims.cc b/mace/ops/expand_dims.cc index 2d99d7a742659549c750fc1246449f35701f2277..78fed15619553b3903d8c71015b4d4228f6a5c7a 100644 --- a/mace/ops/expand_dims.cc +++ b/mace/ops/expand_dims.cc @@ -20,10 +20,10 @@ namespace mace { namespace ops { -template +template class ExpandDimsOp; -template +template class ExpandDimsOp : public Operation { public: explicit ExpandDimsOp(OpConstructContext *context) @@ -50,14 +50,15 @@ class ExpandDimsOp : public Operation { // only tensorflow support expand dim, so the default format is NHWC // transform NHWC to NCHW auto t_output_shape = TransposeShape(output_shape, - {0, 3, 1, 2}); + {0, 3, 1, 2}); output->Resize(t_output_shape); Tensor::MappingGuard input_guard(input); Tensor::MappingGuard output_guard(output); auto input_data = input->data(); auto output_data = output->mutable_data(); - Transpose(input_data, output_shape, {0, 3, 1, 2}, output_data); + Transpose(&context->device()->cpu_runtime()->thread_pool(), + input_data, output_shape, {0, 3, 1, 2}, output_data); } else { output->Resize(output_shape); Tensor::MappingGuard input_guard(input); diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc index 04745a055cfd519e8df365e430d952b206c843e9..64765d9c99f6a9ade2b8ef7a1a2cdd5874f3c243 100644 --- a/mace/ops/fully_connected.cc +++ b/mace/ops/fully_connected.cc @@ -22,8 +22,8 @@ #include "mace/ops/activation.h" #ifdef 
MACE_ENABLE_NEON - #include "mace/ops/arm/fp32/gemv.h" +#include "mace/ops/arm/fp32/activation.h" #ifdef MACE_ENABLE_QUANTIZE #include "mace/ops/arm/q8/gemv.h" @@ -31,6 +31,7 @@ #else #include "mace/ops/ref/gemv.h" +#include "mace/ops/ref/activation.h" #endif // MACE_ENABLE_NEON #ifdef MACE_ENABLE_OPENCL @@ -69,7 +70,10 @@ template<> class FullyConnectedOp : public FullyConnectedOpBase { public: explicit FullyConnectedOp(OpConstructContext *context) - : FullyConnectedOpBase(context) {} + : FullyConnectedOpBase(context), + activation_delegator_(activation_, + relux_max_limit_, + leakyrelu_coefficient_) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -106,10 +110,8 @@ class FullyConnectedOp : public FullyConnectedOpBase { false, true, output); - Tensor::MappingGuard guard_output(output); - float *output_ptr = output->mutable_data(); - DoActivation(output_ptr, output_ptr, output->size(), activation_, - relux_max_limit_, leakyrelu_coefficient_); + + activation_delegator_.Compute(context, output, output); return MaceStatus::MACE_SUCCESS; } @@ -117,8 +119,10 @@ class FullyConnectedOp : public FullyConnectedOpBase { private: #ifdef MACE_ENABLE_NEON arm::fp32::Gemv gemv_; + arm::fp32::Activation activation_delegator_; #else ref::Gemv gemv_; + ref::Activation activation_delegator_; #endif // MACE_ENABLE_NEON }; diff --git a/mace/ops/fully_connected_test.cc b/mace/ops/fully_connected_test.cc index ce10c97e606c6394fe1046a35b6978099dd313b6..64fead6e05bc4a1d552d20e55a8645b589751968 100644 --- a/mace/ops/fully_connected_test.cc +++ b/mace/ops/fully_connected_test.cc @@ -261,7 +261,9 @@ void QuantRandom(const index_t batch, auto bias_data = bias->data(); float bias_scale = q_input->scale() * q_weight->scale(); std::vector q_bias(bias->size()); - QuantizeWithScaleAndZeropoint( + + QuantizeUtil quantize_util(OpTestContext::Get()->thread_pool()); + quantize_util.QuantizeWithScaleAndZeropoint( bias_data, bias->size(), bias_scale, 0, q_bias.data()); net.AddInputFromArray( "QuantizedBias", {out_channel}, q_bias, true, bias_scale, 0); diff --git a/mace/ops/gather.cc b/mace/ops/gather.cc index 453a201cb71e0fd2aff59accb13b037aa50d1612..0c0551cd396af2279f47b245c371df4989143a98 100644 --- a/mace/ops/gather.cc +++ b/mace/ops/gather.cc @@ -53,16 +53,15 @@ class GatherOp : public Operation { const T *params_data = params->data(); T *output_data = output->mutable_data(); - index_t axis_dim_size = params->dim(axis_); - index_t lhs_size = std::accumulate(params->shape().begin(), + const index_t axis_dim_size = params->dim(axis_); + const index_t lhs_size = std::accumulate(params->shape().begin(), params->shape().begin() + axis_, 1, std::multiplies()); - index_t rhs_size = + const index_t rhs_size = std::accumulate(params->shape().begin() + (axis_ + 1), params->shape().end(), 1, std::multiplies()); - index_t index_size = indices->size(); + const index_t index_size = indices->size(); -#pragma omp parallel for collapse(2) schedule(runtime) for (index_t l = 0; l < lhs_size; ++l) { for (index_t idx = 0; idx < index_size; ++idx) { MACE_ASSERT(indices_data[idx] < axis_dim_size, "idx out of bound: ", diff --git a/mace/ops/local_response_norm.cc b/mace/ops/local_response_norm.cc index f70d8342d12df14f131c910feae95fc10fe5b567..022ee3e7aa979ee36794f0fe6c4888012a0f0cb2 100644 --- a/mace/ops/local_response_norm.cc +++ b/mace/ops/local_response_norm.cc @@ -20,10 +20,10 @@ namespace mace { namespace ops { -template +template class LocalResponseNormOp; -template <> +template<> class LocalResponseNormOp : public 
Operation { public: explicit LocalResponseNormOp(OpConstructContext *context) @@ -51,29 +51,35 @@ class LocalResponseNormOp : public Operation { const float *input_ptr = input->data(); float *output_ptr = output->mutable_data(); - index_t image_size = height * width; - index_t batch_size = channels * image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { - const int begin_input_c = std::max(static_cast(0), - c - depth_radius_); - const int end_input_c = std::min(channels, c + depth_radius_ + 1); - - index_t pos = b * batch_size; - for (index_t hw = 0; hw < height * width; ++hw, ++pos) { - float accum = 0.f; - for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) { - const float input_val = input_ptr[pos + input_c * image_size]; - accum += input_val * input_val; + const index_t image_size = height * width; + const index_t batch_size = channels * image_size; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const index_t begin_input_c = std::max(static_cast(0), + c - depth_radius_); + const index_t end_input_c = std::min(channels, c + depth_radius_ + 1); + + index_t pos = b * batch_size; + for (index_t hw = 0; hw < height * width; ++hw, ++pos) { + float accum = 0.f; + for (index_t input_c = begin_input_c; input_c < end_input_c; + ++input_c) { + const float input_val = input_ptr[pos + input_c * image_size]; + accum += input_val * input_val; + } + const float multiplier = std::pow(bias_ + alpha_ * accum, -beta_); + output_ptr[pos + c * image_size] = + input_ptr[pos + c * image_size] * multiplier; } - const float multiplier = std::pow(bias_ + alpha_ * accum, -beta_); - output_ptr[pos + c * image_size] = - input_ptr[pos + c * image_size] * multiplier; } } - } + }, 0, batch, 1, 0, channels, 1); return MaceStatus::MACE_SUCCESS; } diff --git a/mace/ops/lstm_nonlinear.cc b/mace/ops/lstm_nonlinear.cc index 745c4d79674c6e2becc2eb49b2d855a2819a0e15..596c9ad77bd3add3d99043e5a5f4ebd33db5dade 100644 --- a/mace/ops/lstm_nonlinear.cc +++ b/mace/ops/lstm_nonlinear.cc @@ -24,10 +24,10 @@ namespace mace { namespace ops { -template +template class LSTMNonlinearOp; -template +template class LSTMNonlinearOp : public Operation { public: explicit LSTMNonlinearOp(OpConstructContext *context) @@ -45,12 +45,7 @@ class LSTMNonlinearOp : public Operation { << "The input dim size should >= 2"; MACE_CHECK(params->dim_size() == 2) << "The params dim size should be 2"; - return Compute(input, params, output); - } - MaceStatus Compute(const Tensor *input, - const Tensor *params, - Tensor *output) { const std::vector &input_shape = input->shape(); const std::vector ¶ms_shape = params->shape(); @@ -77,7 +72,7 @@ class LSTMNonlinearOp : public Operation { const float *input_data = input->data(); const float *params_data = params->data(); float *output_data = output->mutable_data(); -#pragma omp parallel for schedule(runtime) + for (int r = 0; r < num_rows; ++r) { const float *input_row = input_data + r * input_cols; const float *prev_row = input_row + 4 * cell_dim; @@ -85,7 +80,8 @@ class LSTMNonlinearOp : public Operation { embed_scales ? 
prev_row + cell_dim : nullptr; float *output_cell = output_data + r * output_dim; float *output_row = output_cell + cell_dim; - LSTMNonlinearKernel(input_row, + LSTMNonlinearKernel(context, + input_row, prev_row, scale_data, params_data, diff --git a/mace/ops/matmul.cc b/mace/ops/matmul.cc index 3b0913de574607660b807ea133f3e797a30aca71..65df7305ea769cbbfd5a6c5ebfa8a779b95fe954 100644 --- a/mace/ops/matmul.cc +++ b/mace/ops/matmul.cc @@ -195,12 +195,18 @@ class MatMulOp : public MatMulOpBase { Tensor::MappingGuard c_guard(C); const float *bias_data = bias->data(); float *c_data = C->mutable_data(); -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t i = 0; i < batch * rows; ++i) { - for (index_t w = 0; w < cols; ++w) { - c_data[i * cols + w] += bias_data[w]; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t w = start1; w < end1; w += step1) { + c_data[i * cols + w] += bias_data[w]; + } } - } + }, 0, batch * rows, 1, 0, cols, 1); } return ret; diff --git a/mace/ops/one_hot.cc b/mace/ops/one_hot.cc index 1d243f202f1fa5ad65c4abd58892df2a31dd9155..1596286af6ae4af96e5e7d01194fa5eff7e235a2 100644 --- a/mace/ops/one_hot.cc +++ b/mace/ops/one_hot.cc @@ -78,7 +78,6 @@ class OneHotOp : public OneHotOpBase { const index_t batch = input->dim(0); if (axis == 1) { -#pragma omp parallel for collapse(2) for (index_t i = 0; i < batch; ++i) { for (index_t j = 0; j < depth_; ++j) { output_ptr[i * depth_ + j] = input_ptr[i] == j ? on_value_ : @@ -86,7 +85,6 @@ class OneHotOp : public OneHotOpBase { } } } else { -#pragma omp parallel for collapse(2) for (index_t i = 0; i < depth_; ++i) { for (index_t j = 0; j < batch; ++j) { output_ptr[i * batch + j] = input_ptr[j] == i ? 
on_value_ : diff --git a/mace/ops/opencl/buffer_transformer.h b/mace/ops/opencl/buffer_transformer.h index dbb6eab64c22f2941c2710f6a2730a527149f6c3..20dc6d1ac9da37ca99bc70eed9905afbfd89ceb7 100644 --- a/mace/ops/opencl/buffer_transformer.h +++ b/mace/ops/opencl/buffer_transformer.h @@ -29,7 +29,7 @@ namespace mace { namespace ops { // Only used for GPU Operation(BufferTransform) -template +template class OpenCLBufferTransformer { public: OpenCLBufferTransformer(const MemoryType in_mem_type, @@ -79,10 +79,12 @@ class OpenCLBufferTransformer { const float *input_ptr = input->data(); Tensor::MappingGuard guard(internal_tensor); float *internal_ptr = internal_tensor->mutable_data(); - MACE_RETURN_IF_ERROR(ops::Transpose(input_ptr, - input->shape(), - dst_dims, - internal_ptr)); + MACE_RETURN_IF_ERROR(ops::Transpose( + &context->device()->cpu_runtime()->thread_pool(), + input_ptr, + input->shape(), + dst_dims, + internal_ptr)); } else { internal_tensor->Resize(input->shape()); const uint8_t *input_ptr = input->data(); @@ -117,7 +119,8 @@ class OpenCLBufferTransformer { const float *internal_ptr = internal_tensor.data(); output->Resize(output_shape); float *output_ptr = output->mutable_data(); - return ops::Transpose(internal_ptr, + return ops::Transpose(&context->device()->cpu_runtime()->thread_pool(), + internal_ptr, internal_tensor.shape(), dst_dims, output_ptr); @@ -147,7 +150,7 @@ class OpenCLBufferTransformer { std::string TransformedFilterName(const std::string &name); -template +template MaceStatus TransformFilter( mace::OpConstructContext *context, OperatorDef *op_def, diff --git a/mace/ops/opencl/image/reduce.h b/mace/ops/opencl/image/reduce.h index 7a4bf2b55934a1880447c6b6c1b5a3be87915ac4..a2bdc65280fd82cdd244c0c949e2753765a3bf6d 100644 --- a/mace/ops/opencl/image/reduce.h +++ b/mace/ops/opencl/image/reduce.h @@ -35,7 +35,7 @@ template class ReduceKernel : public OpenCLReduceKernel { public: ReduceKernel(ReduceType type, - const std::vector axis, + const std::vector &axis, const bool keep_dims) : reduce_type_(type), axis_(axis), keep_dims_(keep_dims) {} diff --git a/mace/ops/opencl/image/winograd_conv2d.cc b/mace/ops/opencl/image/winograd_conv2d.cc index 27a0bc30533f4538a537dc6c3084178ee1d5d3cd..40b83fa62e757b1f13a1e06c6f91b6db1e29ab1b 100644 --- a/mace/ops/opencl/image/winograd_conv2d.cc +++ b/mace/ops/opencl/image/winograd_conv2d.cc @@ -241,7 +241,7 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, bool input_changed = !IsVecEqual(*prev_input_shape, input->shape()); *prev_input_shape = input->shape(); - auto output_shape = output->shape(); + auto &output_shape = output->shape(); const index_t round_h = (output_shape[1] + wino_blk_size - 1) / wino_blk_size; const index_t round_w = diff --git a/mace/ops/ops_test_util.cc b/mace/ops/ops_test_util.cc index c1569204bdc11895ff47392838e9987bdf2ef75b..ab61e8c627fd72d4cb8c2c279f9567e92692df23 100644 --- a/mace/ops/ops_test_util.cc +++ b/mace/ops/ops_test_util.cc @@ -101,55 +101,50 @@ void OpDefBuilder::Finalize(OperatorDef *op_def) const { } namespace { -#ifdef MACE_ENABLE_OPENCL std::string GetStoragePathFromEnv() { char *storage_path_str = getenv("MACE_INTERNAL_STORAGE_PATH"); if (storage_path_str == nullptr) return ""; return storage_path_str; } -#endif } // namespace OpTestContext *OpTestContext::Get(int num_threads, - CPUAffinityPolicy cpu_affinity_policy, - bool use_gemmlowp) { + CPUAffinityPolicy cpu_affinity_policy) { static OpTestContext instance(num_threads, - cpu_affinity_policy, - use_gemmlowp); + cpu_affinity_policy); 
return &instance; } OpTestContext::OpTestContext(int num_threads, - CPUAffinityPolicy cpu_affinity_policy, - -#ifdef MACE_ENABLE_OPENCL - bool use_gemmlowp) + CPUAffinityPolicy cpu_affinity_policy) : gpu_context_(std::make_shared(GetStoragePathFromEnv())), - opencl_mem_types_({MemoryType::GPU_IMAGE}) { -#else - bool use_gemmlowp) { -#endif + opencl_mem_types_({MemoryType::GPU_IMAGE}), + thread_pool_(make_unique(num_threads, + cpu_affinity_policy)) { + thread_pool_->Init(); + device_map_[DeviceType::CPU] = make_unique( - num_threads, cpu_affinity_policy, use_gemmlowp); + num_threads, cpu_affinity_policy, thread_pool_.get()); -#ifdef MACE_ENABLE_OPENCL device_map_[DeviceType::GPU] = make_unique( gpu_context_->opencl_tuner(), gpu_context_->opencl_cache_storage(), GPUPriorityHint::PRIORITY_NORMAL, - GPUPerfHint::PERF_HIGH); -#endif // MACE_ENABLE_OPENCL -} - -Device *OpTestContext::GetDevice(DeviceType device_type) { - return device_map_[device_type].get(); + GPUPerfHint::PERF_HIGH, + nullptr, + num_threads, + cpu_affinity_policy, + thread_pool_.get()); } -#ifdef MACE_ENABLE_OPENCL std::shared_ptr OpTestContext::gpu_context() const { return gpu_context_; } +Device *OpTestContext::GetDevice(DeviceType device_type) { + return device_map_[device_type].get(); +} + std::vector OpTestContext::opencl_mem_types() { return opencl_mem_types_; } @@ -165,7 +160,6 @@ void OpTestContext::SetOCLImageTestFlag() { void OpTestContext::SetOCLImageAndBufferTestFlag() { opencl_mem_types_ = {MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER}; } -#endif // MACE_ENABLE_OPENCL bool OpsTestNet::Setup(mace::DeviceType device) { NetDef net_def; @@ -237,7 +231,6 @@ MaceStatus OpsTestNet::Run() { MaceStatus OpsTestNet::RunOp(mace::DeviceType device) { if (device == DeviceType::GPU) { -#ifdef MACE_ENABLE_OPENCL auto opencl_mem_types = OpTestContext::Get()->opencl_mem_types(); for (auto type : opencl_mem_types) { OpTestContext::Get()->GetDevice(device) @@ -246,9 +239,6 @@ MaceStatus OpsTestNet::RunOp(mace::DeviceType device) { MACE_RETURN_IF_ERROR(Run()); } return MaceStatus::MACE_SUCCESS; -#else - return MaceStatus::MACE_UNSUPPORTED; -#endif } else { Setup(device); return Run(); diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index 871803234236de5c3833468dfa785dd339e3ee16..e9ef4d90f89807f8b123b5e3cba75c075ab52657 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -26,22 +26,20 @@ #include #include "gtest/gtest.h" +#include "mace/core/types.h" #include "mace/core/net.h" #include "mace/core/device_context.h" +#include "mace/core/runtime/opencl/gpu_device.h" +#include "mace/core/runtime/opencl/opencl_util.h" #include "mace/core/tensor.h" #include "mace/core/workspace.h" #include "mace/ops/ops_registry.h" #include "mace/public/mace.h" #include "mace/utils/memory.h" #include "mace/utils/math.h" -#include "mace/utils/quantize.h" +#include "mace/core/quantize.h" #include "mace/ops/testing/test_utils.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/gpu_device.h" -#include "mace/core/runtime/opencl/opencl_util.h" -#endif - namespace mace { namespace ops { namespace test { @@ -79,30 +77,26 @@ class OpTestContext { public: static OpTestContext *Get( int num_threads = -1, - CPUAffinityPolicy cpu_affinity_policy = AFFINITY_BIG_ONLY, - bool use_gemmlowp = true); - Device *GetDevice(DeviceType device_type); - -#ifdef MACE_ENABLE_OPENCL + CPUAffinityPolicy cpu_affinity_policy = AFFINITY_BIG_ONLY); std::shared_ptr gpu_context() const; + Device *GetDevice(DeviceType device_type); std::vector 
opencl_mem_types(); void SetOCLBufferTestFlag(); void SetOCLImageTestFlag(); void SetOCLImageAndBufferTestFlag(); -#endif + utils::ThreadPool *thread_pool() { + return thread_pool_.get(); + } private: OpTestContext(int num_threads, - CPUAffinityPolicy cpu_affinity_policy, - bool use_gemmlowp); + CPUAffinityPolicy cpu_affinity_policy); MACE_DISABLE_COPY_AND_ASSIGN(OpTestContext); - std::map> device_map_; - -#ifdef MACE_ENABLE_OPENCL std::shared_ptr gpu_context_; std::vector opencl_mem_types_; -#endif + std::map> device_map_; + std::unique_ptr thread_pool_; }; class OpsTestNet { @@ -430,9 +424,7 @@ class OpsTestBase : public ::testing::Test { } virtual void TearDown() { -#ifdef MACE_ENABLE_OPENCL OpTestContext::Get()->SetOCLImageTestFlag(); -#endif } }; diff --git a/mace/ops/pad.cc b/mace/ops/pad.cc index aaa6b230f4b5237dc88d16e369dcf289a8fe9df6..e0a94f4a7f5b2f6a00eddd816b3b92ae9da816d1 100644 --- a/mace/ops/pad.cc +++ b/mace/ops/pad.cc @@ -84,7 +84,6 @@ class PadOp : public Operation { if (type_ == PadType::CONSTANT) { std::fill(output_ptr, output_ptr + output->size(), this->constant_value_); -#pragma omp parallel for collapse(3) for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channel; ++c) { for (index_t h = 0; h < height; ++h) { @@ -109,7 +108,6 @@ class PadOp : public Operation { const int l_add = type_ == PadType::REFLECT ? 0 : -1; const int r_add = type_ == PadType::REFLECT ? -2 : -1; -#pragma omp parallel for collapse(1) for (index_t h = 0; h < o_height; ++h) { index_t h_in = get_src_idx(h, height, paddings_[4], l_add, r_add); diff --git a/mace/ops/pad_context.cc b/mace/ops/pad_context.cc index 6c463ec9830b2e22e234cef6e4ec7eddc61d9906..8370f9f56d03056b6e9c905771abfbcadbf2c1b9 100644 --- a/mace/ops/pad_context.cc +++ b/mace/ops/pad_context.cc @@ -63,7 +63,6 @@ class PadContextOp : public Operation { for (index_t i = 0; i < batch; ++i) { T *out_base = output_data + i * output_chunk * dim; const T *in_base = input_data + i * chunk * dim; -#pragma omp parallel for schedule(runtime) for (index_t j = 0; j < left_context_; ++j) { memcpy(out_base + j * dim, in_base, dim * sizeof(T)); } @@ -71,7 +70,6 @@ class PadContextOp : public Operation { memcpy(out_base, in_base, chunk * dim * sizeof(T)); out_base = out_base + chunk * dim; in_base = in_base + (chunk -1) * dim; -#pragma omp parallel for schedule(runtime) for (index_t j = 0; j < right_context_; ++j) { memcpy(out_base + j * dim, in_base, dim * sizeof(T)); } diff --git a/mace/ops/pnorm.cc b/mace/ops/pnorm.cc index 6964c6810bac50e59350410f009ac85c85f44ed6..1d0d6698604834fdd58fb390171d21d0976780ec 100644 --- a/mace/ops/pnorm.cc +++ b/mace/ops/pnorm.cc @@ -28,14 +28,13 @@ #include "mace/core/operator.h" - namespace mace { namespace ops { -template +template class PNormOp; -template +template class PNormOp : public Operation { public: explicit PNormOp(OpConstructContext *context) @@ -52,7 +51,7 @@ class PNormOp : public Operation { const index_t dim_size = input_shape.size(); MACE_CHECK(dim_size >= 1, "PNorm only supports input dim size >= 1"); std::vector output_shape(input_shape); - const index_t input_dim = input_shape[dim_size -1]; + const index_t input_dim = input_shape[dim_size - 1]; MACE_CHECK(output_dim_ > 0, "Output dim should be greater than zero."); MACE_CHECK(input_dim % output_dim_ == 0 && output_dim_ < input_dim, @@ -69,48 +68,59 @@ class PNormOp : public Operation { const index_t bh = std::accumulate(input->shape().begin(), input->shape().end() - 1, 1, std::multiplies()); + + utils::ThreadPool + &thread_pool = 
context->device()->cpu_runtime()->thread_pool(); + if (p_ == 0) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t i = 0; i < bh; ++i) { - for (index_t j = 0; j < output_dim_; ++j) { - const T *in_base = input_data + i * input_dim + j * group_size; - T *out_base = output_data + i * output_dim_; - T temp_result = 0; - for (index_t g = 0; g < group_size; ++g) { - T value = - (std::fabs(in_base[g]) - > std::numeric_limits::epsilon()) ? 1.0f : 0.0f; - temp_result += value; + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + const T *in_base = input_data + i * input_dim + j * group_size; + T *out_base = output_data + i * output_dim_; + T temp_result = 0; + for (index_t g = 0; g < group_size; ++g) { + T value = + (std::fabs(in_base[g]) + > std::numeric_limits::epsilon()) ? 1.0f : 0.0f; + temp_result += value; + } + out_base[j] = temp_result; } - out_base[j] = temp_result; } - } + }, 0, bh, 1, 0, output_dim_, 1); + } else if (p_ == 1) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t i = 0; i < bh; ++i) { - for (index_t j = 0; j < output_dim_; ++j) { - const T *in_base = input_data + i * input_dim + j * group_size; - T *out_base = output_data + i * output_dim_; - T temp_result = 0; - for (index_t g = 0; g < group_size; ++g) { - temp_result += std::abs(in_base[g]);; + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + const T *in_base = input_data + i * input_dim + j * group_size; + T *out_base = output_data + i * output_dim_; + T temp_result = 0; + for (index_t g = 0; g < group_size; ++g) { + temp_result += std::abs(in_base[g]);; + } + out_base[j] = temp_result; } - out_base[j] = temp_result; } - } + }, 0, bh, 1, 0, output_dim_, 1); } else if (p_ == 2) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t i = 0; i < bh; ++i) { - for (index_t j = 0; j < output_dim_; ++j) { - const T *in_base = input_data + i * input_dim + j * group_size; - T *out_base = output_data + i * output_dim_; - T temp_result = 0; - for (index_t g = 0; g < group_size; ++g) { - temp_result += in_base[g] * in_base[g]; + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + const T *in_base = input_data + i * input_dim + j * group_size; + T *out_base = output_data + i * output_dim_; + T temp_result = 0; + for (index_t g = 0; g < group_size; ++g) { + temp_result += in_base[g] * in_base[g]; + } + out_base[j] = std::sqrt(temp_result); } - out_base[j] = std::sqrt(temp_result); } - } + }, 0, bh, 1, 0, output_dim_, 1); } else { LOG(FATAL) << "PNorm's p should be 0, 1 or 2, here p is: " << p_; } diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc index 969f2774e3bb5a5fcf35e37e5f613f2f87b9f19b..52842c5230a299ade8af2d85e24ba23f00052e30 100644 --- a/mace/ops/pooling.cc +++ b/mace/ops/pooling.cc @@ -57,10 +57,10 @@ class PoolingOpBase : public ConvPool2dOpBase { MACE_OP_OUTPUT_TAGS(OUTPUT); }; -template +template class PoolingOp; -template <> +template<> class PoolingOp : public PoolingOpBase { public: explicit PoolingOp(OpConstructContext *context) @@ -99,7 +99,8 
@@ class PoolingOp : public PoolingOpBase { int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2}; if (pooling_type_ == PoolingType::MAX) { - MaxPooling(input, + MaxPooling(context, + input, input_shape, output_shape.data(), kernels_.data(), @@ -108,7 +109,8 @@ class PoolingOp : public PoolingOpBase { pad_hw, output); } else if (pooling_type_ == PoolingType::AVG) { - AvgPooling(input, + AvgPooling(context, + input, input_shape, output_shape.data(), kernels_.data(), @@ -124,7 +126,8 @@ class PoolingOp : public PoolingOpBase { } private: - void MaxPooling(const float *input, + void MaxPooling(const OpContext *context, + const float *input, const index_t *in_shape, const index_t *out_shape, const int *filter_hw, @@ -132,45 +135,56 @@ class PoolingOp : public PoolingOpBase { const int *dilation_hw, const int *pad_hw, float *output) { - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t c = 0; c < out_shape[1]; ++c) { - const index_t out_base = b * out_batch_size + c * out_image_size; - const index_t in_base = b * in_batch_size + c * in_image_size; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w < out_width; ++w) { - const index_t out_offset = out_base + h * out_width + w; - float res = std::numeric_limits::lowest(); - for (int fh = 0; fh < filter_hw[0]; ++fh) { - for (int fw = 0; fw < filter_hw[1]; ++fw) { - index_t inh = - h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0]; - index_t inw = - w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1]; - if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) { - index_t input_offset = in_base + inh * in_width + inw; - res = std::max(res, input[input_offset]); + const index_t batch = out_shape[0]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const index_t out_base = b * out_batch_size + c * out_image_size; + const index_t in_base = b * in_batch_size + c * in_image_size; + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w < out_width; ++w) { + const index_t out_offset = out_base + h * out_width + w; + float res = std::numeric_limits::lowest(); + for (int fh = 0; fh < filter_hw[0]; ++fh) { + for (int fw = 0; fw < filter_hw[1]; ++fw) { + index_t inh = + h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0]; + index_t inw = + w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1]; + if (inh >= 0 && 
inh < in_height && inw >= 0 + && inw < in_width) { + index_t input_offset = in_base + inh * in_width + inw; + res = std::max(res, input[input_offset]); + } } } + output[out_offset] = res; } - output[out_offset] = res; } } } - } + }, 0, batch, 1, 0, out_channels, 1); } - void AvgPooling(const float *input, + void AvgPooling(const OpContext *context, + const float *input, const index_t *in_shape, const index_t *out_shape, const int *filter_hw, @@ -178,48 +192,62 @@ class PoolingOp : public PoolingOpBase { const int *dilation_hw, const int *pad_hw, float *output) { - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t c = 0; c < out_shape[1]; ++c) { - const index_t out_base = b * out_batch_size + c * out_image_size; - const index_t in_base = b * in_batch_size + c * in_image_size; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w < out_width; ++w) { - const index_t out_offset = out_base + h * out_width + w; - float res = 0; - int block_size = 0; - for (int fh = 0; fh < filter_hw[0]; ++fh) { - for (int fw = 0; fw < filter_hw[1]; ++fw) { - index_t inh = - h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0]; - index_t inw = - w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1]; - if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) { - index_t input_offset = in_base + inh * in_width + inw; - res += input[input_offset]; - ++block_size; + const index_t batch = out_shape[0]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const index_t out_base = b * out_batch_size + c * out_image_size; + const index_t in_base = b * in_batch_size + c * in_image_size; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w < out_width; ++w) { + const index_t out_offset = out_base + h * out_width + w; + float res = 0; + int block_size = 0; + for (int fh = 0; fh < filter_hw[0]; ++fh) { + for (int fw = 0; fw < filter_hw[1]; ++fw) { + index_t inh = + h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0]; + index_t inw = + w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1]; + if (inh >= 0 && inh < in_height && inw >= 0 + && inw < in_width) { + index_t input_offset = in_base + inh * in_width + inw; + res += 
input[input_offset]; + ++block_size; + } } } + output[out_offset] = res / block_size; } - output[out_offset] = res / block_size; } } } - } + }, 0, batch, 1, 0, out_channels, 1); } }; #ifdef MACE_ENABLE_QUANTIZE -template <> +template<> class PoolingOp : public PoolingOpBase { public: explicit PoolingOp(OpConstructContext *context) @@ -275,7 +303,8 @@ class PoolingOp : public PoolingOpBase { int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2}; if (pooling_type_ == PoolingType::MAX) { - MaxPooling(input, + MaxPooling(context, + input, input_tensor->shape().data(), output_shape.data(), kernels_.data(), @@ -283,7 +312,8 @@ class PoolingOp : public PoolingOpBase { pad_hw, output); } else if (pooling_type_ == PoolingType::AVG) { - AvgPooling(input, + AvgPooling(context, + input, input_tensor->shape().data(), output_shape.data(), kernels_.data(), @@ -298,131 +328,145 @@ class PoolingOp : public PoolingOpBase { } private: - void MaxPooling(const uint8_t *input, + void MaxPooling(const OpContext *context, + const uint8_t *input, const index_t *in_shape, const index_t *out_shape, const int *filter_hw, const int *stride_hw, const int *pad_hw, uint8_t *output) { -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t h = 0; h < out_shape[1]; ++h) { - for (index_t w = 0; w < out_shape[2]; ++w) { - const index_t out_height = out_shape[1]; - const index_t out_width = out_shape[2]; - const index_t channels = out_shape[3]; - const index_t in_height = in_shape[1]; - const index_t in_width = in_shape[2]; - const index_t in_h_base = h * stride_hw[0] - pad_hw[0]; - const index_t in_w_base = w * stride_hw[1] - pad_hw[1]; - const index_t in_h_begin = std::max(0, in_h_base); - const index_t in_w_begin = std::max(0, in_w_base); - const index_t in_h_end = - std::min(in_height, in_h_base + filter_hw[0]); - const index_t in_w_end = - std::min(in_width, in_w_base + filter_hw[1]); - - uint8_t *out_ptr = - output + ((b * out_height + h) * out_width + w) * channels; - std::fill_n(out_ptr, channels, 0); - for (index_t ih = in_h_begin; ih < in_h_end; ++ih) { - for (index_t iw = in_w_begin; iw < in_w_end; ++iw) { - const uint8_t *in_ptr = input + - ((b * in_height + ih) * in_width + iw) * channels; - index_t c = 0; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t h = start1; h < end1; h += step1) { + for (index_t w = start2; w < end2; w += step2) { + const index_t out_height = out_shape[1]; + const index_t out_width = out_shape[2]; + const index_t channels = out_shape[3]; + const index_t in_height = in_shape[1]; + const index_t in_width = in_shape[2]; + const index_t in_h_base = h * stride_hw[0] - pad_hw[0]; + const index_t in_w_base = w * stride_hw[1] - pad_hw[1]; + const index_t in_h_begin = std::max(0, in_h_base); + const index_t in_w_begin = std::max(0, in_w_base); + const index_t in_h_end = + std::min(in_height, in_h_base + filter_hw[0]); + const index_t in_w_end = + std::min(in_width, in_w_base + filter_hw[1]); + + uint8_t *out_ptr = + output + ((b * out_height + h) * out_width + w) * channels; + std::fill_n(out_ptr, channels, 0); + for (index_t ih = in_h_begin; ih < in_h_end; ++ih) { + for (index_t iw = in_w_begin; iw < in_w_end; ++iw) { + const uint8_t *in_ptr = input + + ((b * in_height 
+ ih) * in_width + iw) * channels; + index_t c = 0; #if defined(MACE_ENABLE_NEON) - for (; c <= channels - 16; c += 16) { - uint8x16_t out_vec = vld1q_u8(out_ptr + c); - uint8x16_t in_vec = vld1q_u8(in_ptr + c); - out_vec = vmaxq_u8(out_vec, in_vec); - vst1q_u8(out_ptr + c, out_vec); - } - for (; c <= channels - 8; c += 8) { - uint8x8_t out_vec = vld1_u8(out_ptr + c); - uint8x8_t in_vec = vld1_u8(in_ptr + c); - out_vec = vmax_u8(out_vec, in_vec); - vst1_u8(out_ptr + c, out_vec); - } + for (; c <= channels - 16; c += 16) { + uint8x16_t out_vec = vld1q_u8(out_ptr + c); + uint8x16_t in_vec = vld1q_u8(in_ptr + c); + out_vec = vmaxq_u8(out_vec, in_vec); + vst1q_u8(out_ptr + c, out_vec); + } + for (; c <= channels - 8; c += 8) { + uint8x8_t out_vec = vld1_u8(out_ptr + c); + uint8x8_t in_vec = vld1_u8(in_ptr + c); + out_vec = vmax_u8(out_vec, in_vec); + vst1_u8(out_ptr + c, out_vec); + } #endif - for (; c < channels; ++c) { - out_ptr[c] = std::max(out_ptr[c], in_ptr[c]); + for (; c < channels; ++c) { + out_ptr[c] = std::max(out_ptr[c], in_ptr[c]); + } } } } } } - } + }, 0, out_shape[0], 1, 0, out_shape[1], 1, 0, out_shape[2], 1); } - void AvgPooling(const uint8_t *input, + void AvgPooling(const OpContext *context, + const uint8_t *input, const index_t *in_shape, const index_t *out_shape, const int *filter_hw, const int *stride_hw, const int *pad_hw, uint8_t *output) { -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t h = 0; h < out_shape[1]; ++h) { - for (index_t w = 0; w < out_shape[2]; ++w) { - const index_t out_height = out_shape[1]; - const index_t out_width = out_shape[2]; - const index_t channels = out_shape[3]; - const index_t in_height = in_shape[1]; - const index_t in_width = in_shape[2]; - const index_t in_h_base = h * stride_hw[0] - pad_hw[0]; - const index_t in_w_base = w * stride_hw[1] - pad_hw[1]; - const index_t in_h_begin = std::max(0, in_h_base); - const index_t in_w_begin = std::max(0, in_w_base); - const index_t in_h_end = - std::min(in_height, in_h_base + filter_hw[0]); - const index_t in_w_end = - std::min(in_width, in_w_base + filter_hw[1]); - const index_t block_size = - (in_h_end - in_h_begin) * (in_w_end - in_w_begin); - MACE_CHECK(block_size > 0); - - std::vector average_buffer(channels); - uint16_t *avg_buffer = average_buffer.data(); - std::fill_n(avg_buffer, channels, 0); - for (index_t ih = in_h_begin; ih < in_h_end; ++ih) { - for (index_t iw = in_w_begin; iw < in_w_end; ++iw) { - const uint8_t *in_ptr = input + - ((b * in_height + ih) * in_width + iw) * channels; - index_t c = 0; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t h = start1; h < end1; h += step1) { + for (index_t w = start2; w < end2; w += step2) { + const index_t out_height = out_shape[1]; + const index_t out_width = out_shape[2]; + const index_t channels = out_shape[3]; + const index_t in_height = in_shape[1]; + const index_t in_width = in_shape[2]; + const index_t in_h_base = h * stride_hw[0] - pad_hw[0]; + const index_t in_w_base = w * stride_hw[1] - pad_hw[1]; + const index_t in_h_begin = std::max(0, in_h_base); + const index_t in_w_begin = std::max(0, in_w_base); + const index_t in_h_end = + std::min(in_height, in_h_base + filter_hw[0]); + const index_t 
in_w_end = + std::min(in_width, in_w_base + filter_hw[1]); + const index_t block_size = + (in_h_end - in_h_begin) * (in_w_end - in_w_begin); + MACE_CHECK(block_size > 0); + + std::vector average_buffer(channels); + uint16_t *avg_buffer = average_buffer.data(); + std::fill_n(avg_buffer, channels, 0); + for (index_t ih = in_h_begin; ih < in_h_end; ++ih) { + for (index_t iw = in_w_begin; iw < in_w_end; ++iw) { + const uint8_t *in_ptr = input + + ((b * in_height + ih) * in_width + iw) * channels; + index_t c = 0; #if defined(MACE_ENABLE_NEON) - for (; c <= channels - 16; c += 16) { - uint16x8_t avg_vec[2]; - avg_vec[0] = vld1q_u16(avg_buffer + c); - avg_vec[1] = vld1q_u16(avg_buffer + c + 8); - uint8x16_t in_vec = vld1q_u8(in_ptr + c); - avg_vec[0] = vaddw_u8(avg_vec[0], vget_low_u8(in_vec)); - avg_vec[1] = vaddw_u8(avg_vec[1], vget_high_u8(in_vec)); - vst1q_u16(avg_buffer + c, avg_vec[0]); - vst1q_u16(avg_buffer + c + 8, avg_vec[1]); - } - for (; c <= channels - 8; c += 8) { - uint16x8_t avg_vec = vld1q_u16(avg_buffer + c); - uint8x8_t in_vec = vld1_u8(in_ptr + c); - avg_vec = vaddw_u8(avg_vec, in_vec); - vst1q_u16(avg_buffer + c, avg_vec); - } + for (; c <= channels - 16; c += 16) { + uint16x8_t avg_vec[2]; + avg_vec[0] = vld1q_u16(avg_buffer + c); + avg_vec[1] = vld1q_u16(avg_buffer + c + 8); + uint8x16_t in_vec = vld1q_u8(in_ptr + c); + avg_vec[0] = vaddw_u8(avg_vec[0], vget_low_u8(in_vec)); + avg_vec[1] = vaddw_u8(avg_vec[1], vget_high_u8(in_vec)); + vst1q_u16(avg_buffer + c, avg_vec[0]); + vst1q_u16(avg_buffer + c + 8, avg_vec[1]); + } + for (; c <= channels - 8; c += 8) { + uint16x8_t avg_vec = vld1q_u16(avg_buffer + c); + uint8x8_t in_vec = vld1_u8(in_ptr + c); + avg_vec = vaddw_u8(avg_vec, in_vec); + vst1q_u16(avg_buffer + c, avg_vec); + } #endif - for (; c < channels; ++c) { - avg_buffer[c] += in_ptr[c]; + for (; c < channels; ++c) { + avg_buffer[c] += in_ptr[c]; + } } } - } - uint8_t *out_ptr = - output + ((b * out_height + h) * out_width + w) * channels; - for (index_t c = 0; c < channels; ++c) { - out_ptr[c] = static_cast( - (avg_buffer[c] + block_size / 2) / block_size); + uint8_t *out_ptr = + output + ((b * out_height + h) * out_width + w) * channels; + for (index_t c = 0; c < channels; ++c) { + out_ptr[c] = static_cast( + (avg_buffer[c] + block_size / 2) / block_size); + } } } } - } + }, 0, out_shape[0], 1, 0, out_shape[1], 1, 0, out_shape[2], 1); } }; #endif // MACE_ENABLE_QUANTIZE @@ -454,7 +498,6 @@ class PoolingOp : public PoolingOpBase { }; #endif // MACE_ENABLE_OPENCL - void RegisterPooling(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp, DeviceType::CPU, float); diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc index f9a83a23e41ef290fde9d8005bcf8419a2b217ea..104b67bc304de59a16d54bcdc6c66c68c987c0c7 100644 --- a/mace/ops/pooling_test.cc +++ b/mace/ops/pooling_test.cc @@ -216,7 +216,7 @@ TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) { SimpleMaxPooling3S2(); } namespace { template void MaxPooling3S2(const std::vector &input_shape, - const std::vector strides, + const std::vector &strides, Padding padding) { // Construct graph OpsTestNet net; diff --git a/mace/ops/prior_box.cc b/mace/ops/prior_box.cc index 3226d2be63f0380feac80f9ddd52cb7172da928f..62040d272d4eb7ba46ba8b6d3bc20db401f9c644 100644 --- a/mace/ops/prior_box.cc +++ b/mace/ops/prior_box.cc @@ -113,7 +113,6 @@ class PriorBoxOp : public Operation { } if (clip_) { -#pragma omp parallel for schedule(runtime) for (int i = 0; i < dim; ++i) { T min = 0; T max = 1; @@ 
-122,7 +121,6 @@ class PriorBoxOp : public Operation { } output_data += dim; -#pragma omp parallel for schedule(runtime) for (int i = 0; i < dim / 4; ++i) { int index = i * 4; output_data[0 + index] = variance_[0]; diff --git a/mace/ops/quantize.cc b/mace/ops/quantize.cc index 6be719c5feb4e8ae8af3f1ad1734e9843961b8df..09354a45a5513783d9962adbbe1ea25f27b33529 100644 --- a/mace/ops/quantize.cc +++ b/mace/ops/quantize.cc @@ -19,15 +19,15 @@ #include "mace/core/operator.h" #include "mace/core/tensor.h" -#include "mace/utils/quantize.h" +#include "mace/core/quantize.h" namespace mace { namespace ops { -template +template class QuantizeOp; -template <> +template<> class QuantizeOp : public Operation { public: explicit QuantizeOp(OpConstructContext *context) @@ -36,7 +36,8 @@ class QuantizeOp : public Operation { static_cast(Operation::GetOptionalArg("non_zero", 0))), find_range_every_time_(static_cast(Operation::GetOptionalArg( "find_range_every_time", - 0))) {} + 0))), + quantize_util_(&context->device()->cpu_runtime()->thread_pool()) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -48,20 +49,20 @@ class QuantizeOp : public Operation { const float *input_data = input->data(); uint8_t *output_data = output->mutable_data(); if (!find_range_every_time_ && output->scale() > 0.f) { - QuantizeWithScaleAndZeropoint(input_data, - input->size(), - output->scale(), - output->zero_point(), - output_data); + quantize_util_.QuantizeWithScaleAndZeropoint(input_data, + input->size(), + output->scale(), + output->zero_point(), + output_data); } else { float scale; int32_t zero_point; - Quantize(input_data, - input->size(), - non_zero_, - output_data, - &scale, - &zero_point); + quantize_util_.Quantize(input_data, + input->size(), + non_zero_, + output_data, + &scale, + &zero_point); output->SetScale(scale); output->SetZeroPoint(zero_point); } @@ -71,16 +72,18 @@ class QuantizeOp : public Operation { private: bool non_zero_; bool find_range_every_time_; + QuantizeUtil quantize_util_; }; -template +template class DequantizeOp; -template +template class DequantizeOp : public Operation { public: explicit DequantizeOp(OpConstructContext *context) - : Operation(context) {} + : Operation(context), + quantize_util_(&context->device()->cpu_runtime()->thread_pool()) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -91,13 +94,16 @@ class DequantizeOp : public Operation { Tensor::MappingGuard output_guard(output); const T *input_data = input->data(); float *output_data = output->mutable_data(); - Dequantize(input_data, - input->size(), - input->scale(), - input->zero_point(), - output_data); + quantize_util_.Dequantize(input_data, + input->size(), + input->scale(), + input->zero_point(), + output_data); return MaceStatus::MACE_SUCCESS; } + + private: + QuantizeUtil quantize_util_; }; void RegisterQuantize(OpRegistryBase *op_registry) { diff --git a/mace/ops/reduce.cc b/mace/ops/reduce.cc index 068212f204d85a3129d1f7ad9e9cbe0cfca06491..29ce821b84a98f8552ce4d3e60a0f9d693f39f0d 100644 --- a/mace/ops/reduce.cc +++ b/mace/ops/reduce.cc @@ -33,12 +33,12 @@ namespace ops { class ReduceOpBase : public Operation { public: explicit ReduceOpBase(OpConstructContext *context) - : Operation(context), - reduce_type_( - static_cast(Operation::GetOptionalArg( - "reduce_type", static_cast(MEAN)))), - axis_(Operation::GetRepeatedArgs("axis")), - keep_dims_(Operation::GetOptionalArg("keepdims", false)) { + : Operation(context), + reduce_type_( + static_cast(Operation::GetOptionalArg( + 
"reduce_type", static_cast(MEAN)))), + axis_(Operation::GetRepeatedArgs("axis")), + keep_dims_(Operation::GetOptionalArg("keepdims", false)) { } protected: @@ -54,15 +54,15 @@ class ReduceOpBase : public Operation { } protected: - ReduceType reduce_type_; + ReduceType reduce_type_; std::vector axis_; bool keep_dims_; }; -template +template class ReduceOp; -template +template class ReduceOp : public ReduceOpBase { public: explicit ReduceOp(OpConstructContext *context) @@ -78,7 +78,7 @@ class ReduceOp : public ReduceOpBase { output->SetScale(input->scale()); output->SetZeroPoint(input->zero_point()); output->Resize(out_shape_); - Compute(input, output); + Compute(context, input, output); return MaceStatus::MACE_SUCCESS; } @@ -92,8 +92,8 @@ class ReduceOp : public ReduceOpBase { } else { for (unsigned int i = 0; i < axis_.size(); ++i) { int index = axis_[i] >= 0 ? - axis_[i] : - axis_[i] + input->dim_size(); + axis_[i] : + axis_[i] + input->dim_size(); auto has_df = Operation::GetOptionalArg( "has_data_format", 0); if (has_df && DataTypeToEnum::value != DT_UINT8 @@ -128,7 +128,7 @@ class ReduceOp : public ReduceOpBase { if (n == 1) { bitmap[dim_index] = bitmap[dim_index - 1]; } - if (bitmap[dim_index-1] != bitmap[dim_index]) { + if (bitmap[dim_index - 1] != bitmap[dim_index]) { data_reshape_.push_back(n); } else { data_reshape_.back() *= n; @@ -137,7 +137,11 @@ class ReduceOp : public ReduceOpBase { } } - void Reduce1Dims(const T *input, ReduceType type, T *output) { + void Reduce1Dims(const OpContext *context, + const T *input, + ReduceType type, + T *output) { + MACE_UNUSED(context); if (reduce_first_axis_) { if (type == ReduceType::MEAN) { T tmp = 0; @@ -157,13 +161,13 @@ class ReduceOp : public ReduceOpBase { tmp = std::max(tmp, input[i]); } output[0] = tmp; - } else if (type == ReduceType::PROD) { + } else if (type == ReduceType::PROD) { T tmp = input[0]; for (int i = 1; i < data_reshape_[0]; ++i) { tmp = tmp * input[i]; } output[0] = tmp; - } else { + } else { MACE_NOT_IMPLEMENTED; } } else { @@ -171,359 +175,367 @@ class ReduceOp : public ReduceOpBase { } } - void Reduce2Dims(const T *input, ReduceType type, T *output) { + void Reduce2Dims(const OpContext *context, + const T *input, + ReduceType type, + T *output) { + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + if (reduce_first_axis_) { - if (type == ReduceType::MEAN) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - T tmp = 0; - for (int j = 0; j < data_reshape_[0]; ++j) { - tmp += input[j * data_reshape_[1] + i]; + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + if (type == ReduceType::MEAN) { + for (index_t i = start; i < end; i += step) { + T tmp = 0; + for (int j = 0; j < data_reshape_[0]; ++j) { + tmp += input[j * data_reshape_[1] + i]; + } + output[i] = tmp / data_reshape_[0]; } - output[i] = tmp / data_reshape_[0]; - } - } else if (type == ReduceType::MIN) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - T tmp = input[i]; - for (int j = 1; j < data_reshape_[0]; ++j) { - tmp = std::min(tmp, input[j * data_reshape_[1] + i]); + } else if (type == ReduceType::MIN) { + for (index_t i = start; i < end; i += step) { + T tmp = input[i]; + for (int j = 1; j < data_reshape_[0]; ++j) { + tmp = std::min(tmp, input[j * data_reshape_[1] + i]); + } + output[i] = tmp; } - output[i] = tmp; - } - } else if (type == ReduceType::MAX) { -#pragma omp parallel for schedule(runtime) - for (int i = 
0; i < data_reshape_[1]; ++i) { - T tmp = input[i]; - for (int j = 1; j < data_reshape_[0]; ++j) { - tmp = std::max(tmp, input[j * data_reshape_[1] + i]); + } else if (type == ReduceType::MAX) { + for (index_t i = start; i < end; i += step) { + T tmp = input[i]; + for (int j = 1; j < data_reshape_[0]; ++j) { + tmp = std::max(tmp, input[j * data_reshape_[1] + i]); + } + output[i] = tmp; } - output[i] = tmp; - } - } else if (type == ReduceType::PROD) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - T tmp = input[i]; - for (int j = 1; j < data_reshape_[0]; ++j) { - tmp = tmp * input[j * data_reshape_[1] + i]; + } else if (type == ReduceType::PROD) { + for (index_t i = start; i < end; i += step) { + T tmp = input[i]; + for (int j = 1; j < data_reshape_[0]; ++j) { + tmp = tmp * input[j * data_reshape_[1] + i]; + } + output[i] = tmp; } - output[i] = tmp; + } else { + MACE_NOT_IMPLEMENTED; } - } else { - MACE_NOT_IMPLEMENTED; - } + }, 0, data_reshape_[1], 1); } else { - if (type == ReduceType::MEAN) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - T tmp = 0; - for (int j = 0; j < data_reshape_[1]; ++j) { - tmp += input[i * data_reshape_[1] + j]; + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + if (type == ReduceType::MEAN) { + for (index_t i = start; i < end; i += step) { + T tmp = 0; + for (int j = 0; j < data_reshape_[1]; ++j) { + tmp += input[i * data_reshape_[1] + j]; + } + output[i] = tmp / data_reshape_[1]; } - output[i] = tmp / data_reshape_[1]; - } - } else if (type == ReduceType::MIN) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - T tmp = input[i * data_reshape_[1]]; - for (int j = 1; j < data_reshape_[1]; ++j) { - tmp = std::min(tmp, input[i * data_reshape_[1] + j]); + } else if (type == ReduceType::MIN) { + for (index_t i = start; i < end; i += step) { + T tmp = input[i * data_reshape_[1]]; + for (int j = 1; j < data_reshape_[1]; ++j) { + tmp = std::min(tmp, input[i * data_reshape_[1] + j]); + } + output[i] = tmp; } - output[i] = tmp; - } - } else if (type == ReduceType::MAX) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - T tmp = input[i * data_reshape_[1]]; - for (int j = 1; j < data_reshape_[1]; ++j) { - tmp = std::max(tmp, input[i * data_reshape_[1] + j]); + } else if (type == ReduceType::MAX) { + for (index_t i = start; i < end; i += step) { + T tmp = input[i * data_reshape_[1]]; + for (int j = 1; j < data_reshape_[1]; ++j) { + tmp = std::max(tmp, input[i * data_reshape_[1] + j]); + } + output[i] = tmp; } - output[i] = tmp; - } - } else if (type == ReduceType::PROD) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - T tmp = input[i * data_reshape_[1]]; - for (int j = 1; j < data_reshape_[1]; ++j) { - tmp = tmp * input[i * data_reshape_[1] + j]; + } else if (type == ReduceType::PROD) { + for (index_t i = start; i < end; i += step) { + T tmp = input[i * data_reshape_[1]]; + for (int j = 1; j < data_reshape_[1]; ++j) { + tmp = tmp * input[i * data_reshape_[1] + j]; + } + output[i] = tmp; } - output[i] = tmp; + } else { + MACE_NOT_IMPLEMENTED; } - } else { - MACE_NOT_IMPLEMENTED; - } + }, 0, data_reshape_[0], 1); } } - void Reduce3Dims(const T *input, ReduceType type, T *output) { + void Reduce3Dims(const OpContext *context, + const T *input, + ReduceType type, + T *output) { + utils::ThreadPool + &thread_pool = 
context->device()->cpu_runtime()->thread_pool(); + if (reduce_first_axis_) { - if (type == ReduceType::MEAN) { -#pragma omp parallel for collapse(1) schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - for (int k = 0; k < data_reshape_[0]; ++k) { - output[i] += - input[(k * data_reshape_[1] + i) * data_reshape_[2] - + j]; + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + if (type == ReduceType::MEAN) { + for (index_t i = start; i < end; i += step) { + for (int j = 0; j < data_reshape_[2]; ++j) { + for (int k = 0; k < data_reshape_[0]; ++k) { + output[i] += + input[(k * data_reshape_[1] + i) * data_reshape_[2] + + j]; + } } + output[i] /= (data_reshape_[0] * data_reshape_[2]); } - output[i] /= (data_reshape_[0] * data_reshape_[2]); - } - } else if (type == ReduceType::MIN) { -#pragma omp parallel for collapse(1) schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - T tmp = input[i * data_reshape_[2]]; - for (int j = 0; j < data_reshape_[2]; ++j) { - for (int k = 0; k < data_reshape_[0]; ++k) { - tmp = std::min(tmp, - input[(k * data_reshape_[1] + i) * data_reshape_[2] - + j]); + } else if (type == ReduceType::MIN) { + for (index_t i = start; i < end; i += step) { + T tmp = input[i * data_reshape_[2]]; + for (int j = 0; j < data_reshape_[2]; ++j) { + for (int k = 0; k < data_reshape_[0]; ++k) { + tmp = std::min(tmp, + input[ + (k * data_reshape_[1] + i) * data_reshape_[2] + + j]); + } } + output[i] = tmp; } - output[i] = tmp; - } - } else if (type == ReduceType::MAX) { -#pragma omp parallel for collapse(1) schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - T tmp = input[i * data_reshape_[2]]; - for (int j = 0; j < data_reshape_[2]; ++j) { - for (int k = 0; k < data_reshape_[0]; ++k) { - tmp = - std::max(tmp, - input[(k * data_reshape_[1] + i) - * data_reshape_[2] + j]); + } else if (type == ReduceType::MAX) { + for (index_t i = start; i < end; i += step) { + T tmp = input[i * data_reshape_[2]]; + for (int j = 0; j < data_reshape_[2]; ++j) { + for (int k = 0; k < data_reshape_[0]; ++k) { + tmp = + std::max(tmp, + input[(k * data_reshape_[1] + i) + * data_reshape_[2] + j]); + } } + output[i] = tmp; } - output[i] = tmp; - } - } else if (type == ReduceType::PROD) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - T tmp = 1; - for (int j = 0; j < data_reshape_[2]; ++j) { - for (int k = 0; k < data_reshape_[0]; ++k) { - tmp *= - input[(k * data_reshape_[1] + i) * data_reshape_[2] - + j]; + } else if (type == ReduceType::PROD) { + for (index_t i = start; i < end; i += step) { + T tmp = 1; + for (int j = 0; j < data_reshape_[2]; ++j) { + for (int k = 0; k < data_reshape_[0]; ++k) { + tmp *= + input[(k * data_reshape_[1] + i) * data_reshape_[2] + + j]; + } } + output[i] = tmp; } - output[i] = tmp; + } else { + MACE_NOT_IMPLEMENTED; } - } else { - MACE_NOT_IMPLEMENTED; - } + }, 0, data_reshape_[1], 1); } else { - if (type == ReduceType::MEAN) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - for (int k = 0; k < data_reshape_[1]; ++k) { - output[i * data_reshape_[2] + j] += - input[(i * data_reshape_[1] + k) * data_reshape_[2] - + j]; + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + if (type == ReduceType::MEAN) { + for (index_t i = start; i < end; i += step) { + for (int j = 0; j < data_reshape_[2]; ++j) { + for (int k 
= 0; k < data_reshape_[1]; ++k) { + output[i * data_reshape_[2] + j] += + input[(i * data_reshape_[1] + k) * data_reshape_[2] + + j]; + } + output[i * data_reshape_[2] + j] /= data_reshape_[1]; } - output[i * data_reshape_[2] + j] /= data_reshape_[1]; } - } - } else if (type == ReduceType::MIN) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - T tmp = input[i * data_reshape_[1] * data_reshape_[2] + j]; - for (int k = 1; k < data_reshape_[1]; ++k) { - tmp = std::min(tmp, - input[(i * data_reshape_[1] + k) * - data_reshape_[2] + j]); + } else if (type == ReduceType::MIN) { + for (index_t i = start; i < end; i += step) { + for (int j = 0; j < data_reshape_[2]; ++j) { + T tmp = input[i * data_reshape_[1] * data_reshape_[2] + j]; + for (int k = 1; k < data_reshape_[1]; ++k) { + tmp = std::min(tmp, + input[(i * data_reshape_[1] + k) * + data_reshape_[2] + j]); + } + output[i * data_reshape_[2] + j] = tmp; } - output[i * data_reshape_[2] + j] = tmp; } - } - } else if (type == ReduceType::MAX) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - T tmp = input[i * data_reshape_[1] * data_reshape_[2] + j]; - for (int k = 1; k < data_reshape_[1]; ++k) { - tmp = std::max(tmp, - input[(i * data_reshape_[1] + k) * - data_reshape_[2] + j]); + } else if (type == ReduceType::MAX) { + for (index_t i = start; i < end; i += step) { + for (int j = 0; j < data_reshape_[2]; ++j) { + T tmp = input[i * data_reshape_[1] * data_reshape_[2] + j]; + for (int k = 1; k < data_reshape_[1]; ++k) { + tmp = std::max(tmp, + input[(i * data_reshape_[1] + k) * + data_reshape_[2] + j]); + } + output[i * data_reshape_[2] + j] = tmp; } - output[i * data_reshape_[2] + j] = tmp; } - } - } else if (type == ReduceType::PROD) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - T tmp = input[i * data_reshape_[1] * data_reshape_[2] + j]; - for (int k = 1; k < data_reshape_[1]; ++k) { - tmp *= input[(i * data_reshape_[1] + k) * - data_reshape_[2] + j]; + } else if (type == ReduceType::PROD) { + for (index_t i = start; i < end; i += step) { + for (int j = 0; j < data_reshape_[2]; ++j) { + T tmp = input[i * data_reshape_[1] * data_reshape_[2] + j]; + for (int k = 1; k < data_reshape_[1]; ++k) { + tmp *= input[(i * data_reshape_[1] + k) * + data_reshape_[2] + j]; + } + output[i * data_reshape_[2] + j] = tmp; } - output[i * data_reshape_[2] + j] = tmp; } + } else { + MACE_NOT_IMPLEMENTED; } - } else { - MACE_NOT_IMPLEMENTED; - } + }, 0, data_reshape_[0], 1); } } - void Reduce4Dims(const T *input, ReduceType type, T *output) { + void Reduce4Dims(const OpContext *context, + const T *input, + ReduceType type, + T *output) { + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + if (reduce_first_axis_) { - if (type == ReduceType::MEAN) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - for (int j = 0; j < data_reshape_[3]; ++j) { - for (int k = 0; k < data_reshape_[2]; ++k) { - for (int t = 0; t < data_reshape_[0]; ++t) { - output[i * data_reshape_[3] + j] += - input[((t * data_reshape_[1] + i) * - data_reshape_[2] + k)*data_reshape_[3] + j]; + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + if 
(type == ReduceType::MEAN) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + for (int k = 0; k < data_reshape_[2]; ++k) { + for (int t = 0; t < data_reshape_[0]; ++t) { + output[i * data_reshape_[3] + j] += + input[((t * data_reshape_[1] + i) * + data_reshape_[2] + k) * data_reshape_[3] + j]; + } } + output[i * data_reshape_[3] + j] /= + (data_reshape_[0] * data_reshape_[2]); } - output[i * data_reshape_[3] + j] /= - (data_reshape_[0] * data_reshape_[2]); } - } - } else if (type == ReduceType::MIN) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - for (int j = 0; j < data_reshape_[3]; ++j) { - T tmp = input[i * data_reshape_[2] * data_reshape_[3] + j]; - for (int k = 0; k < data_reshape_[2]; ++k) { - for (int t = 0; t < data_reshape_[0]; ++t) { - tmp = std::min(tmp, - input[((t * data_reshape_[1] + i) * - data_reshape_[2] + k)*data_reshape_[3] + j]); + } else if (type == ReduceType::MIN) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + T tmp = input[i * data_reshape_[2] * data_reshape_[3] + j]; + for (int k = 0; k < data_reshape_[2]; ++k) { + for (int t = 0; t < data_reshape_[0]; ++t) { + tmp = std::min(tmp, + input[((t * data_reshape_[1] + i) * + data_reshape_[2] + k) * data_reshape_[3] + + j]); + } } + output[i * data_reshape_[3] + j] = tmp; } - output[i * data_reshape_[3] + j] = tmp; } - } - } else if (type == ReduceType::MAX) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - for (int j = 0; j < data_reshape_[3]; ++j) { - T tmp = input[i * data_reshape_[2] * data_reshape_[3] + j]; - for (int k = 0; k < data_reshape_[2]; ++k) { - for (int t = 0; t < data_reshape_[0]; ++t) { - tmp = std::max(tmp, - input[((t * data_reshape_[1] + i) * - data_reshape_[2] + k)*data_reshape_[3] + j]); + } else if (type == ReduceType::MAX) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + T tmp = input[i * data_reshape_[2] * data_reshape_[3] + j]; + for (int k = 0; k < data_reshape_[2]; ++k) { + for (int t = 0; t < data_reshape_[0]; ++t) { + tmp = std::max(tmp, + input[((t * data_reshape_[1] + i) * + data_reshape_[2] + k) * data_reshape_[3] + + j]); + } } + output[i * data_reshape_[3] + j] = tmp; } - output[i * data_reshape_[3] + j] = tmp; } - } - } else if (type == ReduceType::PROD) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - for (int j = 0; j < data_reshape_[3]; ++j) { - T tmp = 1; - for (int k = 0; k < data_reshape_[2]; ++k) { - for (int t = 0; t < data_reshape_[0]; ++t) { - tmp = tmp * input[((t * data_reshape_[1] + i) * - data_reshape_[2] + k)*data_reshape_[3] + j]; + } else if (type == ReduceType::PROD) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + T tmp = 1; + for (int k = 0; k < data_reshape_[2]; ++k) { + for (int t = 0; t < data_reshape_[0]; ++t) { + tmp = tmp * input[((t * data_reshape_[1] + i) * + data_reshape_[2] + k) * data_reshape_[3] + j]; + } } + output[i * data_reshape_[3] + j] = tmp; } - output[i * data_reshape_[3] + j] = tmp; } + } else { + MACE_NOT_IMPLEMENTED; } - } else { - MACE_NOT_IMPLEMENTED; - } + }, 0, data_reshape_[1], 1, 0, data_reshape_[3], 1); } else { - if (type == ReduceType::MEAN) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[0]; 
++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - for (int k = 0; k < data_reshape_[1]; ++k) { - for (int t = 0; t < data_reshape_[3]; ++t) { - output[i * data_reshape_[2] + j] += - input[((i * data_reshape_[1] + k) * - data_reshape_[2] + j)*data_reshape_[3] + t]; + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + if (type == ReduceType::MEAN) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + for (int k = 0; k < data_reshape_[1]; ++k) { + for (int t = 0; t < data_reshape_[3]; ++t) { + output[i * data_reshape_[2] + j] += + input[((i * data_reshape_[1] + k) * + data_reshape_[2] + j) * data_reshape_[3] + t]; + } } + output[i * data_reshape_[2] + j] /= + (data_reshape_[1] * data_reshape_[3]); } - output[i * data_reshape_[2] + j] /= - (data_reshape_[1] * data_reshape_[3]); } - } - } else if (type == ReduceType::MIN) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - T tmp = input[(i * data_reshape_[1] * - data_reshape_[2] + j)*data_reshape_[3]]; - for (int k = 0; k < data_reshape_[1]; ++k) { - for (int t = 0; t < data_reshape_[3]; ++t) { - tmp = - std::min(tmp, - input[((i * data_reshape_[1] + k) * - data_reshape_[2] + j)*data_reshape_[3] + t]); + } else if (type == ReduceType::MIN) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + T tmp = input[(i * data_reshape_[1] * + data_reshape_[2] + j) * data_reshape_[3]]; + for (int k = 0; k < data_reshape_[1]; ++k) { + for (int t = 0; t < data_reshape_[3]; ++t) { + tmp = + std::min(tmp, + input[((i * data_reshape_[1] + k) * + data_reshape_[2] + j) * data_reshape_[3] + + t]); + } } + output[i * data_reshape_[2] + j] = tmp; } - output[i * data_reshape_[2] + j] = tmp; } - } - } else if (type == ReduceType::MAX) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - T tmp = input[(i * data_reshape_[1] * - data_reshape_[2] + j)*data_reshape_[3]]; - for (int k = 0; k < data_reshape_[1]; ++k) { - for (int t = 0; t < data_reshape_[3]; ++t) { - tmp = - std::max(tmp, - input[((i * data_reshape_[1] + k) * - data_reshape_[2] + j)*data_reshape_[3] + t]); + } else if (type == ReduceType::MAX) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + T tmp = input[(i * data_reshape_[1] * + data_reshape_[2] + j) * data_reshape_[3]]; + for (int k = 0; k < data_reshape_[1]; ++k) { + for (int t = 0; t < data_reshape_[3]; ++t) { + tmp = + std::max(tmp, + input[((i * data_reshape_[1] + k) * + data_reshape_[2] + j) * data_reshape_[3] + + t]); + } } + output[i * data_reshape_[2] + j] = tmp; } - output[i * data_reshape_[2] + j] = tmp; } - } - } else if (type == ReduceType::PROD) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - T tmp = 1; - for (int k = 0; k < data_reshape_[1]; ++k) { - for (int t = 0; t < data_reshape_[3]; ++t) { - tmp = tmp * input[((i * data_reshape_[1] + k) * - data_reshape_[2] + j)*data_reshape_[3] + t]; + } else if (type == ReduceType::PROD) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + T tmp = 1; + for (int k = 0; k < data_reshape_[1]; ++k) { + for (int t = 0; t < 
data_reshape_[3]; ++t) { + tmp = tmp * input[((i * data_reshape_[1] + k) * + data_reshape_[2] + j) * data_reshape_[3] + t]; + } } + output[i * data_reshape_[2] + j] = tmp; } - output[i * data_reshape_[2] + j] = tmp; } + } else { + MACE_NOT_IMPLEMENTED; } - } else { - MACE_NOT_IMPLEMENTED; - } + }, 0, data_reshape_[0], 1, 0, data_reshape_[2], 1); } } - void Compute(const Tensor *input, Tensor *output) { + void Compute(const OpContext *context, const Tensor *input, Tensor *output) { Tensor::MappingGuard input_mapper(input); const T *input_ptr = input->data(); Tensor::MappingGuard output_map(output); T *output_ptr = output->mutable_data(); memset(output_ptr, 0, output->size() * sizeof(T)); switch (data_reshape_.size()) { - case 1: - Reduce1Dims(input_ptr, reduce_type_, output_ptr); + case 1:Reduce1Dims(context, input_ptr, reduce_type_, output_ptr); break; - case 2: - Reduce2Dims(input_ptr, reduce_type_, output_ptr); + case 2:Reduce2Dims(context, input_ptr, reduce_type_, output_ptr); break; - case 3: - Reduce3Dims(input_ptr, reduce_type_, output_ptr); + case 3:Reduce3Dims(context, input_ptr, reduce_type_, output_ptr); break; - case 4: - Reduce4Dims(input_ptr, reduce_type_, output_ptr); + case 4:Reduce4Dims(context, input_ptr, reduce_type_, output_ptr); break; - default: - MACE_CHECK(false, "not implemented in mace") + default:MACE_CHECK(false, "not implemented in mace") << "data reshape size" << data_reshape_.size() << "reduce first axis:" << reduce_first_axis_; break; @@ -537,9 +549,11 @@ class ReduceOp : public ReduceOpBase { }; #ifdef MACE_ENABLE_QUANTIZE -template <> +template<> void ReduceOp::Reduce1Dims( + const OpContext *context, const uint8_t *input, ReduceType type, uint8_t *output) { + MACE_UNUSED(context); if (reduce_first_axis_) { if (type == ReduceType::MEAN) { uint32_t tmp = 0; @@ -568,275 +582,286 @@ void ReduceOp::Reduce1Dims( } } -template <> +template<> void ReduceOp::Reduce2Dims( + const OpContext *context, const uint8_t *input, ReduceType type, uint8_t *output) { + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + if (reduce_first_axis_) { - if (type == ReduceType::MEAN) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - uint32_t tmp = 0; - for (int j = 0; j < data_reshape_[0]; ++j) { - tmp += input[j * data_reshape_[1] + i]; + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + if (type == ReduceType::MEAN) { + for (index_t i = start; i < end; i += step) { + uint32_t tmp = 0; + for (int j = 0; j < data_reshape_[0]; ++j) { + tmp += input[j * data_reshape_[1] + i]; + } + output[i] = static_cast( + (tmp + data_reshape_[0] / 2) / data_reshape_[0]); } - output[i] = static_cast( - (tmp + data_reshape_[0] / 2) / data_reshape_[0]); - } - } else if (type == ReduceType::MIN) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - uint8_t tmp = input[i]; - for (int j = 1; j < data_reshape_[0]; ++j) { - tmp = std::min(tmp, input[j * data_reshape_[1] + i]); + } else if (type == ReduceType::MIN) { + for (index_t i = start; i < end; i += step) { + uint8_t tmp = input[i]; + for (int j = 1; j < data_reshape_[0]; ++j) { + tmp = std::min(tmp, input[j * data_reshape_[1] + i]); + } + output[i] = tmp; } - output[i] = tmp; - } - } else if (type == ReduceType::MAX) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - uint8_t tmp = input[i]; - for (int j = 1; j < data_reshape_[0]; ++j) { - tmp = std::max(tmp, input[j * 
data_reshape_[1] + i]); + } else if (type == ReduceType::MAX) { + for (index_t i = start; i < end; i += step) { + uint8_t tmp = input[i]; + for (int j = 1; j < data_reshape_[0]; ++j) { + tmp = std::max(tmp, input[j * data_reshape_[1] + i]); + } + output[i] = tmp; } - output[i] = tmp; + } else { + MACE_NOT_IMPLEMENTED; } - } else { - MACE_NOT_IMPLEMENTED; - } + }, 0, data_reshape_[1], 1); } else { - if (type == ReduceType::MEAN) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - uint32_t tmp = 0; - for (int j = 0; j < data_reshape_[1]; ++j) { - tmp += input[i * data_reshape_[1] + j]; + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + if (type == ReduceType::MEAN) { + for (index_t i = start; i < end; i += step) { + uint32_t tmp = 0; + for (int j = 0; j < data_reshape_[1]; ++j) { + tmp += input[i * data_reshape_[1] + j]; + } + output[i] = static_cast( + (tmp + data_reshape_[1] / 2) / data_reshape_[1]); } - output[i] = static_cast( - (tmp + data_reshape_[1] / 2) / data_reshape_[1]); - } - } else if (type == ReduceType::MIN) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - uint8_t tmp = input[i * data_reshape_[1]]; - for (int j = 1; j < data_reshape_[1]; ++j) { - tmp = std::min(tmp, input[i * data_reshape_[1] + j]); + } else if (type == ReduceType::MIN) { + for (index_t i = start; i < end; i += step) { + uint8_t tmp = input[i * data_reshape_[1]]; + for (int j = 1; j < data_reshape_[1]; ++j) { + tmp = std::min(tmp, input[i * data_reshape_[1] + j]); + } + output[i] = tmp; } - output[i] = tmp; - } - } else if (type == ReduceType::MAX) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - uint8_t tmp = input[i * data_reshape_[1]]; - for (int j = 1; j < data_reshape_[1]; ++j) { - tmp = std::max(tmp, input[i * data_reshape_[1] + j]); + } else if (type == ReduceType::MAX) { + for (index_t i = start; i < end; i += step) { + uint8_t tmp = input[i * data_reshape_[1]]; + for (int j = 1; j < data_reshape_[1]; ++j) { + tmp = std::max(tmp, input[i * data_reshape_[1] + j]); + } + output[i] = tmp; } - output[i] = tmp; + } else { + MACE_NOT_IMPLEMENTED; } - } else { - MACE_NOT_IMPLEMENTED; - } + }, 0, data_reshape_[0], 1); } } -template <> +template<> void ReduceOp::Reduce3Dims( + const OpContext *context, const uint8_t *input, ReduceType type, uint8_t *output) { + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + if (reduce_first_axis_) { - if (type == ReduceType::MEAN) { -#pragma omp parallel for collapse(1) schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - uint32_t tmp = 0; - for (int j = 0; j < data_reshape_[2]; ++j) { - for (int k = 0; k < data_reshape_[0]; ++k) { - tmp += input[(k * data_reshape_[1] + i) * data_reshape_[2] + j]; + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + if (type == ReduceType::MEAN) { + for (index_t i = start; i < end; i += step) { + uint32_t tmp = 0; + for (int j = 0; j < data_reshape_[2]; ++j) { + for (int k = 0; k < data_reshape_[0]; ++k) { + tmp += input[(k * data_reshape_[1] + i) * data_reshape_[2] + j]; + } } + index_t dim = data_reshape_[0] * data_reshape_[2]; + output[i] = static_cast((tmp + dim / 2) / dim); } - index_t dim = data_reshape_[0] * data_reshape_[2]; - output[i] = static_cast((tmp + dim / 2) / dim); - } - } else if (type == ReduceType::MIN) { -#pragma omp parallel for collapse(1) schedule(runtime) - for (int i = 0; i < 
data_reshape_[1]; ++i) { - uint8_t tmp = input[i * data_reshape_[2]]; - for (int j = 0; j < data_reshape_[2]; ++j) { - for (int k = 0; k < data_reshape_[0]; ++k) { - tmp = std::min(tmp, - input[(k * data_reshape_[1] + i) * data_reshape_[2] - + j]); + } else if (type == ReduceType::MIN) { + for (index_t i = start; i < end; i += step) { + uint8_t tmp = input[i * data_reshape_[2]]; + for (int j = 0; j < data_reshape_[2]; ++j) { + for (int k = 0; k < data_reshape_[0]; ++k) { + tmp = std::min(tmp, + input[(k * data_reshape_[1] + i) * data_reshape_[2] + + j]); + } } + output[i] = tmp; } - output[i] = tmp; - } - } else if (type == ReduceType::MAX) { -#pragma omp parallel for collapse(1) schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - uint8_t tmp = input[i * data_reshape_[2]]; - for (int j = 0; j < data_reshape_[2]; ++j) { - for (int k = 0; k < data_reshape_[0]; ++k) { - tmp = - std::max(tmp, - input[(k * data_reshape_[1] + i) - * data_reshape_[2] + j]); + } else if (type == ReduceType::MAX) { + for (index_t i = start; i < end; i += step) { + uint8_t tmp = input[i * data_reshape_[2]]; + for (int j = 0; j < data_reshape_[2]; ++j) { + for (int k = 0; k < data_reshape_[0]; ++k) { + tmp = + std::max(tmp, + input[(k * data_reshape_[1] + i) + * data_reshape_[2] + j]); + } } + output[i] = tmp; } - output[i] = tmp; + } else { + MACE_NOT_IMPLEMENTED; } - } else { - MACE_NOT_IMPLEMENTED; - } + }, 0, data_reshape_[1], 1); } else { - if (type == ReduceType::MEAN) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - uint32_t tmp = 0; - for (int k = 0; k < data_reshape_[1]; ++k) { - tmp += input[(i * data_reshape_[1] + k) * data_reshape_[2] + j]; + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + if (type == ReduceType::MEAN) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + uint32_t tmp = 0; + for (int k = 0; k < data_reshape_[1]; ++k) { + tmp += input[(i * data_reshape_[1] + k) * data_reshape_[2] + j]; + } + output[i * data_reshape_[2] + j] = + static_cast((tmp + data_reshape_[1] / 2) / + data_reshape_[1]); } - output[i * data_reshape_[2] + j] = - static_cast((tmp + data_reshape_[1] / 2) / - data_reshape_[1]); } - } - } else if (type == ReduceType::MIN) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - uint8_t tmp = input[i * data_reshape_[1] * data_reshape_[2] + j]; - for (int k = 1; k < data_reshape_[1]; ++k) { - tmp = std::min(tmp, - input[(i * data_reshape_[1] + k) * - data_reshape_[2] + j]); - } - output[i * data_reshape_[2] + j] = tmp; + } else if (type == ReduceType::MIN) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + uint8_t tmp = input[i * data_reshape_[1] * data_reshape_[2] + j]; + for (int k = 1; k < data_reshape_[1]; ++k) { + tmp = std::min(tmp, + input[(i * data_reshape_[1] + k) * + data_reshape_[2] + j]); + } + output[i * data_reshape_[2] + j] = tmp; + } } - } - } else if (type == ReduceType::MAX) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - uint8_t tmp = input[i * data_reshape_[1] * data_reshape_[2] + j]; - for (int k = 1; k < data_reshape_[1]; ++k) { - tmp = std::max(tmp, - input[(i 
* data_reshape_[1] + k) * - data_reshape_[2] + j]); - } - output[i * data_reshape_[2] + j] = tmp; + } else if (type == ReduceType::MAX) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + uint8_t tmp = input[i * data_reshape_[1] * data_reshape_[2] + j]; + for (int k = 1; k < data_reshape_[1]; ++k) { + tmp = std::max(tmp, + input[(i * data_reshape_[1] + k) * + data_reshape_[2] + j]); + } + output[i * data_reshape_[2] + j] = tmp; + } } + } else { + MACE_NOT_IMPLEMENTED; } - } else { - MACE_NOT_IMPLEMENTED; - } + }, 0, data_reshape_[0], 1, 0, data_reshape_[2], 1); } } -template <> +template<> void ReduceOp::Reduce4Dims( + const OpContext *context, const uint8_t *input, ReduceType type, uint8_t *output) { + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + if (reduce_first_axis_) { - if (type == ReduceType::MEAN) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - for (int j = 0; j < data_reshape_[3]; ++j) { - uint32_t tmp = 0; - for (int k = 0; k < data_reshape_[2]; ++k) { - for (int t = 0; t < data_reshape_[0]; ++t) { - tmp += input[((t * data_reshape_[1] + i) * - data_reshape_[2] + k)*data_reshape_[3] + j]; + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + if (type == ReduceType::MEAN) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + uint32_t tmp = 0; + for (int k = 0; k < data_reshape_[2]; ++k) { + for (int t = 0; t < data_reshape_[0]; ++t) { + tmp += input[((t * data_reshape_[1] + i) * + data_reshape_[2] + k) * data_reshape_[3] + j]; + } } + index_t dim = data_reshape_[0] * data_reshape_[2]; + output[i * data_reshape_[3] + j] = + static_cast((tmp + dim / 2) / dim); } - index_t dim = data_reshape_[0] * data_reshape_[2]; - output[i * data_reshape_[3] + j] = - static_cast((tmp + dim / 2) / dim); } - } - } else if (type == ReduceType::MIN) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - for (int j = 0; j < data_reshape_[3]; ++j) { - uint8_t tmp = input[i * data_reshape_[2] * data_reshape_[3] + j]; - for (int k = 0; k < data_reshape_[2]; ++k) { - for (int t = 0; t < data_reshape_[0]; ++t) { - tmp = std::min(tmp, - input[((t * data_reshape_[1] + i) * - data_reshape_[2] + k)*data_reshape_[3] + j]); + } else if (type == ReduceType::MIN) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + uint8_t tmp = input[i * data_reshape_[2] * data_reshape_[3] + j]; + for (int k = 0; k < data_reshape_[2]; ++k) { + for (int t = 0; t < data_reshape_[0]; ++t) { + tmp = std::min(tmp, + input[((t * data_reshape_[1] + i) * + data_reshape_[2] + k) * data_reshape_[3] + + j]); + } } + output[i * data_reshape_[3] + j] = tmp; } - output[i * data_reshape_[3] + j] = tmp; } - } - } else if (type == ReduceType::MAX) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - for (int j = 0; j < data_reshape_[3]; ++j) { - uint8_t tmp = input[i * data_reshape_[2] * data_reshape_[3] + j]; - for (int k = 0; k < data_reshape_[2]; ++k) { - for (int t = 0; t < data_reshape_[0]; ++t) { - tmp = std::max(tmp, - input[((t * data_reshape_[1] + i) * - data_reshape_[2] + k)*data_reshape_[3] + j]); + } else if (type == ReduceType::MAX) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = 
start1; j < end1; j += step1) { + uint8_t tmp = input[i * data_reshape_[2] * data_reshape_[3] + j]; + for (int k = 0; k < data_reshape_[2]; ++k) { + for (int t = 0; t < data_reshape_[0]; ++t) { + tmp = std::max(tmp, + input[((t * data_reshape_[1] + i) * + data_reshape_[2] + k) * data_reshape_[3] + + j]); + } } + output[i * data_reshape_[3] + j] = tmp; } - output[i * data_reshape_[3] + j] = tmp; } + } else { + MACE_NOT_IMPLEMENTED; } - } else { - MACE_NOT_IMPLEMENTED; - } + }, 0, data_reshape_[1], 1, 0, data_reshape_[3], 1); } else { - if (type == ReduceType::MEAN) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - uint32_t tmp = 0; - for (int k = 0; k < data_reshape_[1]; ++k) { - for (int t = 0; t < data_reshape_[3]; ++t) { - tmp += input[((i * data_reshape_[1] + k) * - data_reshape_[2] + j)*data_reshape_[3] + t]; + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + if (type == ReduceType::MEAN) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + uint32_t tmp = 0; + for (int k = 0; k < data_reshape_[1]; ++k) { + for (int t = 0; t < data_reshape_[3]; ++t) { + tmp += input[((i * data_reshape_[1] + k) * + data_reshape_[2] + j) * data_reshape_[3] + t]; + } } + index_t dim = data_reshape_[1] * data_reshape_[3]; + output[i * data_reshape_[2] + j] = + static_cast((tmp + dim / 2) / dim); } - index_t dim = data_reshape_[1] * data_reshape_[3]; - output[i * data_reshape_[2] + j] = - static_cast((tmp + dim / 2) / dim); } - } - } else if (type == ReduceType::MIN) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - uint8_t tmp = input[(i * data_reshape_[1] * - data_reshape_[2] + j)*data_reshape_[3]]; - for (int k = 0; k < data_reshape_[1]; ++k) { - for (int t = 0; t < data_reshape_[3]; ++t) { - tmp = - std::min(tmp, - input[((i * data_reshape_[1] + k) * - data_reshape_[2] + j)*data_reshape_[3] + t]); + } else if (type == ReduceType::MIN) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + uint8_t tmp = input[(i * data_reshape_[1] * + data_reshape_[2] + j) * data_reshape_[3]]; + for (int k = 0; k < data_reshape_[1]; ++k) { + for (int t = 0; t < data_reshape_[3]; ++t) { + tmp = + std::min(tmp, + input[((i * data_reshape_[1] + k) * + data_reshape_[2] + j) * data_reshape_[3] + t]); + } } + output[i * data_reshape_[2] + j] = tmp; } - output[i * data_reshape_[2] + j] = tmp; } - } - } else if (type == ReduceType::MAX) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - uint8_t tmp = input[(i * data_reshape_[1] * - data_reshape_[2] + j)*data_reshape_[3]]; - for (int k = 0; k < data_reshape_[1]; ++k) { - for (int t = 0; t < data_reshape_[3]; ++t) { - tmp = - std::max(tmp, - input[((i * data_reshape_[1] + k) * - data_reshape_[2] + j)*data_reshape_[3] + t]); + } else if (type == ReduceType::MAX) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + uint8_t tmp = input[(i * data_reshape_[1] * + data_reshape_[2] + j) * data_reshape_[3]]; + for (int k = 0; k < data_reshape_[1]; ++k) { + for (int t = 0; t < data_reshape_[3]; ++t) { + tmp = + std::max(tmp, + input[((i * data_reshape_[1] + k) 
* + data_reshape_[2] + j) * data_reshape_[3] + t]); + } } + output[i * data_reshape_[2] + j] = tmp; } - output[i * data_reshape_[2] + j] = tmp; } + } else { + MACE_NOT_IMPLEMENTED; } - } else { - MACE_NOT_IMPLEMENTED; - } + }, 0, data_reshape_[0], 1, 0, data_reshape_[2], 1); } } #endif // MACE_ENABLE_QUANTIZE diff --git a/mace/ops/ref/activation.cc b/mace/ops/ref/activation.cc new file mode 100644 index 0000000000000000000000000000000000000000..4e2e65dbe71ef5b0e243a2be7d7803028de1f8d8 --- /dev/null +++ b/mace/ops/ref/activation.cc @@ -0,0 +1,104 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "mace/ops/ref/activation.h" + +namespace mace { +namespace ops { +namespace ref { + +Activation::Activation(ActivationType type, + const float limit, + const float leakyrelu_coefficient) + : type_(type), + limit_(limit), + leakyrelu_coefficient_(leakyrelu_coefficient) {} + +MaceStatus Activation::Compute(const OpContext *context, + const Tensor *input, + Tensor *output) { + Tensor::MappingGuard input_guard(input); + if (input != output) { + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + Tensor::MappingGuard output_guard(output); + DoActivation(context, input, output); + } else { + DoActivation(context, input, output); + } + + return MaceStatus::MACE_SUCCESS; +} + +void Activation::DoActivation(const OpContext *context, + const Tensor *input, + Tensor *output) { + MACE_UNUSED(context); + auto input_ptr = input->data(); + auto output_ptr = output->mutable_data(); + const index_t size = input->size(); + + switch (type_) { + case RELU: { + for (index_t i = 0; i < size; ++i) { + *output_ptr++ = std::max(0.f, *input_ptr++); + } + + break; + } + + case RELUX: { + for (index_t i = 0; i < size; ++i) { + *output_ptr++ = std::max(0.f, std::min(limit_, *input_ptr++)); + } + + break; + } + + case LEAKYRELU: { + for (index_t i = 0; i < size; ++i) { + *output_ptr = + std::max(*input_ptr, 0.f) + + std::min(*input_ptr, 0.f) * leakyrelu_coefficient_; + ++input_ptr; + ++output_ptr; + } + + break; + } + + case TANH: { + for (index_t i = 0; i < size; ++i) { + *output_ptr++ = std::tanh(*input_ptr++); + } + + break; + } + + case SIGMOID: { + for (index_t i = 0; i < size; ++i) { + *output_ptr++ = 1 / (1 + std::exp(-(*input_ptr++))); + } + break; + } + + case NOOP:break; + + default:MACE_NOT_IMPLEMENTED; + } +} + +} // namespace ref +} // namespace ops +} // namespace mace diff --git a/mace/ops/ref/activation.h b/mace/ops/ref/activation.h new file mode 100644 index 0000000000000000000000000000000000000000..7ad986a50ceed14b021abf2a4d81f2bb7b336e19 --- /dev/null +++ b/mace/ops/ref/activation.h @@ -0,0 +1,51 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
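For readers following the reduce.cc hunks above: every removed `#pragma omp parallel for` loop becomes a `utils::ThreadPool::Compute1D` (or `Compute2D`) call whose callback receives a per-worker (start, end, step) range and iterates `for (i = start; i < end; i += step)`. The standalone sketch below illustrates that shape with a column-wise mean; `Compute1D` here is a simplified stand-in written with std::thread, not MACE's thread pool, and its chunking policy is an assumption made only for illustration.

// Minimal sketch, not MACE's implementation: a stand-in for
// utils::ThreadPool::Compute1D that splits [start, end) into contiguous
// chunks, one per worker thread, and invokes the same (start, end, step)
// callback shape used in the reduce.cc hunks above.
#include <algorithm>
#include <cstdint>
#include <functional>
#include <thread>
#include <vector>

using index_t = int64_t;  // MACE uses a signed 64-bit index type.

void Compute1D(const std::function<void(index_t, index_t, index_t)> &fn,
               index_t start, index_t end, index_t step,
               int num_threads = 4) {
  std::vector<std::thread> workers;
  const index_t total = end - start;
  const index_t chunk = (total + num_threads - 1) / num_threads;
  for (int t = 0; t < num_threads; ++t) {
    const index_t s = start + t * chunk;
    const index_t e = std::min(end, s + chunk);
    if (s >= e) break;
    workers.emplace_back(fn, s, e, step);
  }
  for (auto &w : workers) w.join();
}

int main() {
  // Column-wise mean of a rows x cols matrix, mirroring the MEAN branch of
  // Reduce2Dims when the first (row) axis is reduced.
  const index_t rows = 64, cols = 256;
  std::vector<float> input(rows * cols, 1.0f), output(cols, 0.0f);

  Compute1D([&](index_t start, index_t end, index_t step) {
    for (index_t i = start; i < end; i += step) {
      float tmp = 0;
      for (index_t j = 0; j < rows; ++j) {
        tmp += input[j * cols + i];
      }
      output[i] = tmp / rows;
    }
  }, 0, cols, 1);
  return 0;
}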
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_REF_ACTIVATION_H_ +#define MACE_OPS_REF_ACTIVATION_H_ + +#include "mace/core/op_context.h" +#include "mace/ops/common/activation_type.h" + +namespace mace { +namespace ops { +namespace ref { + +class Activation { + public: + explicit Activation(ActivationType type, + const float limit, + const float leakyrelu_coefficient); + ~Activation() = default; + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + Tensor *output); + + private: + void DoActivation(const OpContext *context, + const Tensor *input, + Tensor *output); + + ActivationType type_; + const float limit_; + const float leakyrelu_coefficient_; +}; + +} // namespace ref +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_REF_ACTIVATION_H_ diff --git a/mace/ops/ref/bias_add.cc b/mace/ops/ref/bias_add.cc new file mode 100644 index 0000000000000000000000000000000000000000..c3be25b08d070ac2791fe971325cce57d96de831 --- /dev/null +++ b/mace/ops/ref/bias_add.cc @@ -0,0 +1,76 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
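The DoActivation switch in activation.cc above applies simple element-wise formulas. The sketch below restates them over a raw float buffer; the `ApplyActivation` helper and the `Act` enum are illustrative names, not part of the MACE API.

// Illustrative only: the element-wise formulas used in DoActivation above,
// written as a free function over a raw buffer.
#include <algorithm>
#include <cmath>
#include <cstdint>

enum class Act { kRelu, kRelux, kLeakyRelu, kTanh, kSigmoid };

void ApplyActivation(const float *in, float *out, int64_t size,
                     Act type, float limit, float leaky_coeff) {
  for (int64_t i = 0; i < size; ++i) {
    const float x = in[i];
    switch (type) {
      case Act::kRelu:      out[i] = std::max(0.f, x); break;
      case Act::kRelux:     out[i] = std::max(0.f, std::min(limit, x)); break;
      // Branch-free leaky ReLU, as in the LEAKYRELU case above:
      // max(x, 0) keeps the positive part, min(x, 0) * coeff scales the rest.
      case Act::kLeakyRelu: out[i] = std::max(x, 0.f) +
                                     std::min(x, 0.f) * leaky_coeff; break;
      case Act::kTanh:      out[i] = std::tanh(x); break;
      case Act::kSigmoid:   out[i] = 1.f / (1.f + std::exp(-x)); break;
    }
  }
}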
+ +#include "mace/ops/ref/bias_add.h" + +namespace mace { +namespace ops { +namespace ref { + +MaceStatus BiasAdd::Compute(const OpContext *context, + const Tensor *input, + const Tensor *bias, + Tensor *output) { + Tensor::MappingGuard input_guard(input); + Tensor::MappingGuard bias_guard(bias); + if (input != output) { + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + if (bias == nullptr) { + output->Copy(*input); + } else { + Tensor::MappingGuard output_guard(output); + AddBias(context, input, bias, output); + } + } else { + if (bias != nullptr) { + AddBias(context, input, bias, output); + } + } + + return MaceStatus::MACE_SUCCESS; +} + +void BiasAdd::AddBias(const OpContext *context, + const Tensor *input, + const Tensor *bias, + mace::Tensor *output) { + MACE_UNUSED(context); + auto input_data = input->data(); + auto bias_data = bias->data(); + auto output_data = output->mutable_data(); + + const index_t batch = input->dim(0); + const index_t channels = input->dim(1); + const index_t height = output->dim(2); + const index_t width = output->dim(3); + const index_t image_size = height * width; + + for (index_t b = 0; b < batch; ++b) { + for (index_t c = 0; c < channels; ++c) { + const index_t offset = (b * channels + c) * image_size; + auto input_ptr = input_data + offset; + auto output_ptr = output_data + offset; + const float bias = bias_data[c]; + + for (index_t i = 0; i < image_size; ++i) { + (*output_ptr++) = (*input_ptr++) + bias; + } + } + } +} + +} // namespace ref +} // namespace ops +} // namespace mace + diff --git a/mace/ops/arm/activation_neon.h b/mace/ops/ref/bias_add.h similarity index 56% rename from mace/ops/arm/activation_neon.h rename to mace/ops/ref/bias_add.h index d640e689a2c1e91cb614826b9af1b53d7c90ef94..f3dc6096e0ae409d0a4b226ebd21b04d6e0228b5 100644 --- a/mace/ops/arm/activation_neon.h +++ b/mace/ops/ref/bias_add.h @@ -12,23 +12,35 @@ // See the License for the specific language governing permissions and // limitations under the License. 
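BiasAdd::AddBias above walks NCHW data plane by plane, adding one scalar bias per channel. A minimal sketch of that indexing follows, with plain buffers standing in for mace::Tensor; the function name is chosen here for illustration.

// Sketch of the NCHW indexing used by BiasAdd::AddBias above: each
// (batch, channel) plane of height*width elements gets a single scalar
// bias added.
#include <cstdint>

void AddBiasNCHW(const float *input, const float *bias, float *output,
                 int64_t batch, int64_t channels,
                 int64_t height, int64_t width) {
  const int64_t image_size = height * width;
  for (int64_t b = 0; b < batch; ++b) {
    for (int64_t c = 0; c < channels; ++c) {
      const int64_t offset = (b * channels + c) * image_size;
      for (int64_t i = 0; i < image_size; ++i) {
        output[offset + i] = input[offset + i] + bias[c];
      }
    }
  }
}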
-#ifndef MACE_OPS_ARM_ACTIVATION_NEON_H_ -#define MACE_OPS_ARM_ACTIVATION_NEON_H_ +#ifndef MACE_OPS_REF_BIAS_ADD_H_ +#define MACE_OPS_REF_BIAS_ADD_H_ -#include "mace/core/types.h" +#include "mace/core/op_context.h" namespace mace { namespace ops { - -void ReluNeon(const float *input, const index_t size, float *output); - -void ReluxNeon(const float *input, const float limit, - const index_t size, float *output); - -void LeakyReluNeon(const float *input, const float alpha, - const index_t size, float *output); - +namespace ref { + +class BiasAdd { + public: + BiasAdd() = default; + ~BiasAdd() = default; + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *bias, + Tensor *output); + + private: + void AddBias(const OpContext *context, + const Tensor *input, + const Tensor *bias, + Tensor *output); +}; + +} // namespace ref } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_ACTIVATION_NEON_H_ +#endif // MACE_OPS_REF_BIAS_ADD_H_ diff --git a/mace/ops/ref/conv_2d.cc b/mace/ops/ref/conv_2d.cc index e5b7952a334b8fb5bcc4d13d8264fc6f76d8c41d..1c69ee9d72e98dbb357347ed2d4e10d971e1cb07 100644 --- a/mace/ops/ref/conv_2d.cc +++ b/mace/ops/ref/conv_2d.cc @@ -66,7 +66,6 @@ MaceStatus Conv2d::Compute(const OpContext *context, auto filter_data = filter->data(); auto output_data = output->mutable_data(); -#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < in_shape[0]; b++) { for (index_t m = 0; m < filter_shape[0]; ++m) { const index_t in_height = in_shape[2]; diff --git a/mace/ops/ref/conv_2d.h b/mace/ops/ref/conv_2d.h index c04eff0fdecef6579f8065f1eb91a0dfef60a8b2..9a9fbb8f92363fed058d9a96929714c8870ab028 100644 --- a/mace/ops/ref/conv_2d.h +++ b/mace/ops/ref/conv_2d.h @@ -30,9 +30,9 @@ namespace ref { template class Conv2d { public: - Conv2d(const std::vector strides, - const std::vector dilations, - const std::vector paddings, + Conv2d(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, const Padding padding_type) : strides_(strides), dilations_(dilations), @@ -55,9 +55,9 @@ class Conv2d { template<> class Conv2d { public: - Conv2d(const std::vector strides, - const std::vector dilations, - const std::vector paddings, + Conv2d(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, const Padding padding_type) : strides_(strides), dilations_(dilations), diff --git a/mace/ops/ref/deconv_2d.cc b/mace/ops/ref/deconv_2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..6044af3b7fefa5e698bb6db02220832a8802af79 --- /dev/null +++ b/mace/ops/ref/deconv_2d.cc @@ -0,0 +1,167 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
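The quantized (uint8) ReduceOp specializations earlier in this diff keep the input's scale and zero point and compute MEAN in integer arithmetic, adding half the divisor before dividing so the integer division rounds to nearest instead of truncating. A minimal sketch of that rounding; the helper name is illustrative only.

// Sketch of the rounding used by the quantized MEAN branches above:
// accumulate into a wider integer, then add half the divisor before the
// division. Assumes count > 0 and that the sum fits in 32 bits.
#include <cstdint>

uint8_t RoundedMean(const uint8_t *data, int64_t count) {
  uint32_t sum = 0;
  for (int64_t i = 0; i < count; ++i) {
    sum += data[i];
  }
  // (sum + count / 2) / count == round(sum / count) for non-negative values.
  return static_cast<uint8_t>((sum + count / 2) / count);
}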
+ +#include +#include +#include +#include +#include "mace/ops/ref/deconv_2d.h" +#include "mace/utils/memory.h" + +namespace mace { +namespace ops { +namespace ref { + +MaceStatus Deconv2d::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + MACE_UNUSED(context); + + std::vector out_shape; + if (output_shape) { + Tensor::MappingGuard out_shape_guard(output_shape); + MACE_CHECK(output_shape->size() == 4, "output shape should be 4-dims"); + out_shape = + std::vector(output_shape->data(), + output_shape->data() + 4); + } + std::vector padded_out_shape; + std::vector out_pad_size; + CalDeconvOutputShapeAndPadSize(input->shape(), + filter->shape(), + strides_, + padding_type_, + paddings_, + 1, + &out_shape, + nullptr, + &out_pad_size, + &padded_out_shape, + framework_type_, + NCHW); + + MACE_RETURN_IF_ERROR(output->Resize(out_shape)); + + const bool is_out_padded = + padded_out_shape[2] != out_shape[2] + || padded_out_shape[3] != out_shape[3]; + + std::unique_ptr padded_output(nullptr); + if (is_out_padded) { + index_t padded_out_size = + std::accumulate(padded_out_shape.begin(), + padded_out_shape.end(), + 1, + std::multiplies()) * sizeof(float); + ScratchBuffer *scratch = context->device()->scratch_buffer(); + scratch->Rewind(); + index_t scratch_size = PadAlignSize(padded_out_size); + scratch->GrowSize(scratch_size); + + std::unique_ptr + padded_out + (make_unique(scratch->Scratch(scratch_size), DT_FLOAT)); + padded_out->Reshape(padded_out_shape); + padded_output = std::move(padded_out); + } + Tensor *out_tensor = output; + if (padded_output != nullptr) { + out_tensor = padded_output.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto pad_out_data = out_tensor->mutable_data(); + auto out_data = output->mutable_data(); + + auto &in_shape = input->shape(); + + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t pad_out_height = padded_out_shape[2]; + const index_t pad_out_width = padded_out_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_img_size = pad_out_height * pad_out_width; + const index_t in_img_size = in_height * in_width; + const index_t kernel_h = filter->dim(2); + const index_t kernel_w = filter->dim(3); + const int kernel_size = static_cast(kernel_h * kernel_w); + const index_t pad_top = out_pad_size[0] / 2; + const index_t pad_left = out_pad_size[1] / 2; + + std::vector index_map(kernel_size, 0); + for (index_t i = 0; i < kernel_h; ++i) { + for (index_t j = 0; j < kernel_w; ++j) { + index_map[i * kernel_w + j] = i * pad_out_width + j; + } + } + + const index_t batch = in_shape[0]; + const index_t out_channels = out_shape[1]; + const index_t in_channels = in_shape[1]; + + for (index_t b = 0; b < batch; ++b) { + for (index_t oc = 0; oc < out_channels; ++oc) { + float *out_base = + pad_out_data + (b * out_channels + oc) * out_img_size; + for (index_t i = 0; i < in_height; ++i) { + for (index_t j = 0; j < in_width; ++j) { + const index_t out_offset = + i * strides_[0] * pad_out_width + j * strides_[1]; + for (index_t ic = 0; ic < in_channels; ++ic) { + const index_t input_idx = + (b * in_channels + ic) * in_img_size + i * in_width + j; + const float val = input_data[input_idx]; + const index_t 
kernel_offset = + (oc * in_channels + ic) * kernel_size; + for (int k = 0; k < kernel_size; ++k) { + const index_t out_idx = out_offset + index_map[k]; + const index_t kernel_idx = kernel_offset + k; + out_base[out_idx] += val * filter_data[kernel_idx]; + } + } + } + } + } + } + if (out_tensor != output) { + for (index_t i = 0; i < batch; ++i) { + for (index_t j = 0; j < out_channels; ++j) { + for (index_t k = 0; k < out_height; ++k) { + const float *input_base = + pad_out_data + + ((i * out_channels + j) * pad_out_height + (k + pad_top)) + * pad_out_width; + float *output_base = + out_data + ((i * out_channels + j) * out_height + k) * out_width; + memcpy(output_base, input_base + pad_left, out_width * sizeof(float)); + } + } + } + } + return MaceStatus::MACE_SUCCESS; +} + +} // namespace ref +} // namespace ops +} // namespace mace diff --git a/mace/ops/ref/deconv_2d.h b/mace/ops/ref/deconv_2d.h new file mode 100644 index 0000000000000000000000000000000000000000..a8ab6722b47037f2552faaea8d8cca5151f463ae --- /dev/null +++ b/mace/ops/ref/deconv_2d.h @@ -0,0 +1,97 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#ifndef MACE_OPS_REF_DECONV_2D_H_ +#define MACE_OPS_REF_DECONV_2D_H_ + +#include + +#include "mace/public/mace.h" +#include "mace/core/tensor.h" +#include "mace/core/op_context.h" +#include "mace/ops/common/conv_pool_2d_util.h" + +namespace mace { +namespace ops { +namespace ref { + +template +class Deconv2d { + public: + Deconv2d(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, + const Padding padding_type, + const FrameworkType framework_type) + : strides_(strides), + dilations_(dilations), + paddings_(paddings), + padding_type_(padding_type), + framework_type_(framework_type) {} + + ~Deconv2d() = default; + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output); + + private: + const std::vector strides_; + const std::vector dilations_; + const std::vector paddings_; + const Padding padding_type_; + const FrameworkType framework_type_; +}; + +template<> +class Deconv2d { + public: + Deconv2d(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, + const Padding padding_type, + const FrameworkType framework_type) + : strides_(strides), + dilations_(dilations), + paddings_(paddings), + padding_type_(padding_type), + framework_type_(framework_type) {} + + ~Deconv2d() = default; + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output); + + private: + const std::vector strides_; + const std::vector dilations_; + const std::vector paddings_; + const Padding padding_type_; + const FrameworkType framework_type_; +}; + +} // namespace ref +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_REF_DECONV_2D_H_ + diff --git a/mace/ops/ref/depthwise_conv_2d.cc 
b/mace/ops/ref/depthwise_conv_2d.cc index b9f8b31f6ad517ae07ae15295dcc1f7688584861..bff950690d719103c31f4ddeb36a7cd934e256c3 100644 --- a/mace/ops/ref/depthwise_conv_2d.cc +++ b/mace/ops/ref/depthwise_conv_2d.cc @@ -69,7 +69,6 @@ MaceStatus DepthwiseConv2d::Compute(const OpContext *context, auto filter_data = filter->data(); auto output_data = output->mutable_data(); -#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < in_shape[0]; b++) { for (index_t m = 0; m < out_shape[1]; ++m) { const index_t c = m / multiplier; @@ -119,5 +118,3 @@ MaceStatus DepthwiseConv2d::Compute(const OpContext *context, } // namespace ref } // namespace ops } // namespace mace - - diff --git a/mace/ops/ref/depthwise_conv_2d.h b/mace/ops/ref/depthwise_conv_2d.h index ad493eb207ac8a8edaaada7589aa364d080e5b16..91a95192a43ba2cc97bc9cc08b9774e2fc6d0a8a 100644 --- a/mace/ops/ref/depthwise_conv_2d.h +++ b/mace/ops/ref/depthwise_conv_2d.h @@ -30,9 +30,9 @@ namespace ref { template class DepthwiseConv2d { public: - DepthwiseConv2d(const std::vector strides, - const std::vector dilations, - const std::vector paddings, + DepthwiseConv2d(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, const Padding padding_type) : strides_(strides), dilations_(dilations), @@ -55,9 +55,9 @@ class DepthwiseConv2d { template<> class DepthwiseConv2d { public: - DepthwiseConv2d(const std::vector strides, - const std::vector dilations, - const std::vector paddings, + DepthwiseConv2d(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, const Padding padding_type) : strides_(strides), dilations_(dilations), diff --git a/mace/ops/ref/depthwise_deconv_2d.cc b/mace/ops/ref/depthwise_deconv_2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..0da81faa60b5268d0effb3777669f9419483f77b --- /dev/null +++ b/mace/ops/ref/depthwise_deconv_2d.cc @@ -0,0 +1,307 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
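The ref Deconv2d and DepthwiseDeconv2d kernels above share one pattern: precompute, for every filter tap, its flat offset in the padded output (`index_map`), scatter-accumulate each input element times the filter into the padded output, and finally crop away the pad_top/pad_left border. The single-channel sketch below shows only the scatter-add step; it assumes the caller has zeroed `padded_out` and sized it to at least (in_h - 1) * stride_h + k_h rows by out_w columns, and the function name is illustrative.

// Minimal single-channel sketch of the scatter-add used by the ref deconv
// kernels above. Each input element is multiplied by the whole filter and
// accumulated into the (padded) output at offsets from index_map.
#include <cstdint>
#include <vector>

void Deconv2dSingleChannel(const float *input, int64_t in_h, int64_t in_w,
                           const float *filter, int64_t k_h, int64_t k_w,
                           int64_t stride_h, int64_t stride_w,
                           float *padded_out, int64_t out_w) {
  // Precompute, for each filter tap, its flat offset in the padded output.
  std::vector<int64_t> index_map(k_h * k_w);
  for (int64_t i = 0; i < k_h; ++i) {
    for (int64_t j = 0; j < k_w; ++j) {
      index_map[i * k_w + j] = i * out_w + j;
    }
  }
  for (int64_t i = 0; i < in_h; ++i) {
    for (int64_t j = 0; j < in_w; ++j) {
      const int64_t out_offset = i * stride_h * out_w + j * stride_w;
      const float val = input[i * in_w + j];
      for (int64_t k = 0; k < k_h * k_w; ++k) {
        padded_out[out_offset + index_map[k]] += val * filter[k];
      }
    }
  }
}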
+ +#include +#include +#include +#include "mace/ops/ref/depthwise_deconv_2d.h" +#include "mace/utils/memory.h" + +namespace mace { +namespace ops { +namespace ref { + +MaceStatus DepthwiseDeconv2d::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + MACE_UNUSED(context); + + std::vector out_shape; + if (output_shape) { + Tensor::MappingGuard out_shape_guard(output_shape); + MACE_CHECK(output_shape->size() == 4, "output shape should be 4-dims"); + out_shape = + std::vector(output_shape->data(), + output_shape->data() + 4); + } + std::vector padded_out_shape; + std::vector out_pad_size; + CalDeconvOutputShapeAndPadSize(input->shape(), + filter->shape(), + strides_, + padding_type_, + paddings_, + input->dim(1), + &out_shape, + nullptr, + &out_pad_size, + &padded_out_shape, + framework_type_, + NCHW); + + MACE_RETURN_IF_ERROR(output->Resize(out_shape)); + + const bool is_out_padded = + padded_out_shape[2] != out_shape[2] + || padded_out_shape[3] != out_shape[3]; + + std::unique_ptr padded_output(nullptr); + if (is_out_padded) { + index_t padded_out_size = + std::accumulate(padded_out_shape.begin(), + padded_out_shape.end(), + 1, + std::multiplies()) * sizeof(float); + ScratchBuffer *scratch = context->device()->scratch_buffer(); + scratch->Rewind(); + index_t scratch_size = PadAlignSize(padded_out_size); + scratch->GrowSize(scratch_size); + + std::unique_ptr + padded_out + (make_unique(scratch->Scratch(scratch_size), DT_FLOAT)); + padded_out->Reshape(padded_out_shape); + padded_output = std::move(padded_out); + } + Tensor *out_tensor = output; + if (padded_output != nullptr) { + out_tensor = padded_output.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto pad_out_data = out_tensor->mutable_data(); + auto out_data = output->mutable_data(); + + auto &in_shape = input->shape(); + + const index_t batch = in_shape[0]; + const index_t channels = in_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t pad_out_height = padded_out_shape[2]; + const index_t pad_out_width = padded_out_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_img_size = pad_out_height * pad_out_width; + const index_t in_img_size = in_height * in_width; + const index_t kernel_h = filter->dim(2); + const index_t kernel_w = filter->dim(3); + const int kernel_size = static_cast(kernel_h * kernel_w); + const index_t pad_top = out_pad_size[0] / 2; + const index_t pad_left = out_pad_size[1] / 2; + + std::vector index_map(kernel_size, 0); + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + index_map[i * kernel_w + j] = i * pad_out_width + j; + } + } + + for (index_t b = 0; b < batch; ++b) { + for (index_t c = 0; c < channels; ++c) { + float *out_base = + pad_out_data + (b * channels + c) * out_img_size; + for (index_t i = 0; i < in_height; ++i) { + for (index_t j = 0; j < in_width; ++j) { + const index_t out_offset = + i * strides_[0] * pad_out_width + j * strides_[1]; + const index_t input_idx = + (b * channels + c) * in_img_size + i * in_width + j; + const float val = input_data[input_idx]; + const index_t kernel_offset = c * kernel_size; + for (int k = 0; k < kernel_size; ++k) { + const index_t out_idx = 
out_offset + index_map[k]; + const index_t kernel_idx = kernel_offset + k; + out_base[out_idx] += val * filter_data[kernel_idx]; + } + } + } + } + } + + if (out_tensor != output) { + for (index_t i = 0; i < batch; ++i) { + for (index_t j = 0; j < channels; ++j) { + for (index_t k = 0; k < out_height; ++k) { + const float *input_base = + pad_out_data + + ((i * channels + j) * pad_out_height + (k + pad_top)) + * pad_out_width; + float *output_base = + out_data + ((i * channels + j) * out_height + k) * out_width; + memcpy(output_base, input_base + pad_left, out_width * sizeof(float)); + } + } + } + } + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus GroupDeconv2d::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + MACE_UNUSED(context); + + std::vector out_shape; + if (output_shape) { + Tensor::MappingGuard out_shape_guard(output_shape); + MACE_CHECK(output_shape->size() == 4, "output shape should be 4-dims"); + out_shape = + std::vector(output_shape->data(), + output_shape->data() + 4); + } + std::vector padded_out_shape; + std::vector out_pad_size; + CalDeconvOutputShapeAndPadSize(input->shape(), + filter->shape(), + strides_, + padding_type_, + paddings_, + group_, + &out_shape, + nullptr, + &out_pad_size, + &padded_out_shape, + framework_type_, + NCHW); + + MACE_RETURN_IF_ERROR(output->Resize(out_shape)); + + const bool is_out_padded = + padded_out_shape[2] != out_shape[2] + || padded_out_shape[3] != out_shape[3]; + + std::unique_ptr padded_output(nullptr); + if (is_out_padded) { + index_t padded_out_size = + std::accumulate(padded_out_shape.begin(), + padded_out_shape.end(), + 1, + std::multiplies()) * sizeof(float); + ScratchBuffer *scratch = context->device()->scratch_buffer(); + scratch->Rewind(); + index_t scratch_size = PadAlignSize(padded_out_size); + scratch->GrowSize(scratch_size); + + std::unique_ptr + padded_out + (make_unique(scratch->Scratch(scratch_size), DT_FLOAT)); + padded_out->Reshape(padded_out_shape); + padded_output = std::move(padded_out); + } + Tensor *out_tensor = output; + if (padded_output != nullptr) { + out_tensor = padded_output.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto pad_out_data = out_tensor->mutable_data(); + auto out_data = output->mutable_data(); + + auto &in_shape = input->shape(); + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t pad_out_height = padded_out_shape[2]; + const index_t pad_out_width = padded_out_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_img_size = pad_out_height * pad_out_width; + const index_t in_img_size = in_height * in_width; + const index_t kernel_h = filter->dim(2); + const index_t kernel_w = filter->dim(3); + const int kernel_size = static_cast(kernel_h * kernel_w); + const index_t pad_top = out_pad_size[0] / 2; + const index_t pad_left = out_pad_size[1] / 2; + + std::vector index_map(kernel_size, 0); + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + index_map[i * kernel_w + j] = i * out_width + j; + } + } + + const int in_channels_g = in_channels / group_; + const int 
out_channels_g = out_channels / group_; + for (int b = 0; b < in_shape[0]; ++b) { + for (int g = 0; g < group_; ++g) { + for (int p = 0; p < out_channels_g; ++p) { + const index_t out_base = + ((b * group_ + g) * out_channels_g + p) * out_img_size; + for (int i = 0; i < in_height; ++i) { + for (int j = 0; j < in_width; ++j) { + const index_t out_offset = + i * strides_[0] * out_width + j * strides_[1]; + for (int q = 0; q < in_channels_g; ++q) { + const index_t in_base = + ((b * group_ + g) * in_channels_g + q) * in_img_size; + const index_t in_offset = + in_base + i * in_width + j; + const float val = input_data[in_offset]; + const index_t k_offset = + ((p * group_ + g) * in_channels_g + q) * kernel_size; + for (int k = 0; k < kernel_size; ++k) { + const index_t out_idx = out_base + out_offset + index_map[k]; + const float w = filter_data[k_offset + k]; + pad_out_data[out_idx] += val * w; + } + } + } + } + } + } + } + + if (out_tensor != output) { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < out_channels; ++j) { + for (int k = 0; k < out_height; ++k) { + const float *input_base = + pad_out_data + + ((i * out_channels + j) * pad_out_height + (k + pad_top)) + * pad_out_width; + float *output_base = + out_data + ((i * out_channels + j) * out_height + k) * out_width; + memcpy(output_base, input_base + pad_left, out_width * sizeof(float)); + } + } + } + } + return MaceStatus::MACE_SUCCESS; +} + +} // namespace ref +} // namespace ops +} // namespace mace diff --git a/mace/ops/ref/depthwise_deconv_2d.h b/mace/ops/ref/depthwise_deconv_2d.h new file mode 100644 index 0000000000000000000000000000000000000000..5da7487192a3762e6219716969a826e3f602a85a --- /dev/null +++ b/mace/ops/ref/depthwise_deconv_2d.h @@ -0,0 +1,153 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
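DepthwiseDeconv2d::Compute above is a scatter-style transposed convolution: each input element is multiplied by the whole kernel and accumulated into a padded output at offset (i * stride_h, j * stride_w) through index_map, and the padded buffer is then cropped by (pad_top, pad_left) into the real output. A stripped-down 1-D illustration of the same scatter-and-crop idea (standalone toy code with made-up numbers, not the MACE implementation):

#include <cstdio>
#include <vector>

int main() {
  const std::vector<float> input = {1.f, 2.f, 3.f};
  const std::vector<float> kernel = {1.f, 0.5f};
  const int stride = 2, pad = 1;

  // Padded output length for a 1-D transposed convolution.
  const int padded_len = (static_cast<int>(input.size()) - 1) * stride
      + static_cast<int>(kernel.size());  // 6
  std::vector<float> padded(padded_len, 0.f);

  // Scatter: every input element contributes a full copy of the kernel,
  // accumulated at its strided position (mirrors the out_base[out_idx] += ...).
  for (size_t i = 0; i < input.size(); ++i) {
    for (size_t k = 0; k < kernel.size(); ++k) {
      padded[i * stride + k] += input[i] * kernel[k];
    }
  }

  // Crop the padding away, like the memcpy over pad_top/pad_left above.
  for (int i = pad; i < padded_len - pad; ++i) {
    std::printf("%g ", padded[i]);  // prints: 0.5 2 1 3
  }
  return 0;
}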
+ + +#ifndef MACE_OPS_REF_DEPTHWISE_DECONV_2D_H_ +#define MACE_OPS_REF_DEPTHWISE_DECONV_2D_H_ + +#include + +#include "mace/public/mace.h" +#include "mace/core/tensor.h" +#include "mace/core/op_context.h" +#include "mace/ops/common/conv_pool_2d_util.h" + +namespace mace { +namespace ops { +namespace ref { + +template +class GroupDeconv2d { + public: + GroupDeconv2d(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, + const Padding padding_type, + const index_t group, + const FrameworkType framework_type) + : strides_(strides), + dilations_(dilations), + paddings_(paddings), + padding_type_(padding_type), + group_(group), + framework_type_(framework_type) {} + + virtual ~GroupDeconv2d() = default; + + virtual MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output); + + private: + const std::vector strides_; + const std::vector dilations_; + const std::vector paddings_; + const Padding padding_type_; + const index_t group_; + const FrameworkType framework_type_; +}; + +template +class DepthwiseDeconv2d : public GroupDeconv2d { + public: + DepthwiseDeconv2d(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, + const Padding padding_type, + const FrameworkType framework_type) + : GroupDeconv2d(strides, + dilations, + paddings, + padding_type, + 0, + framework_type) {} + + ~DepthwiseDeconv2d() = default; + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output); +}; + +template<> +class GroupDeconv2d { + public: + GroupDeconv2d(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, + const Padding padding_type, + const index_t group, + const FrameworkType framework_type) + : strides_(strides), + dilations_(dilations), + paddings_(paddings), + padding_type_(padding_type), + group_(group), + framework_type_(framework_type) {} + + virtual ~GroupDeconv2d() = default; + + virtual MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output); + + protected: + const std::vector strides_; + const std::vector dilations_; + const std::vector paddings_; + const Padding padding_type_; + const index_t group_; + const FrameworkType framework_type_; +}; + +template<> +class DepthwiseDeconv2d : public GroupDeconv2d { + public: + DepthwiseDeconv2d(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, + const Padding padding_type, + const FrameworkType framework_type) + : GroupDeconv2d(strides, + dilations, + paddings, + padding_type, + 0, + framework_type) {} + + ~DepthwiseDeconv2d() = default; + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output); +}; + +} // namespace ref +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_REF_DEPTHWISE_DECONV_2D_H_ + diff --git a/mace/ops/ref/gemv.cc b/mace/ops/ref/gemv.cc index 59fc31dc3e80f5e63084aa41fc9337b49a4cba86..bf0366f3ce8cab2c848172b511cdfb98d1cb9d27 100644 --- a/mace/ops/ref/gemv.cc +++ b/mace/ops/ref/gemv.cc @@ -16,7 +16,7 @@ #include "mace/ops/ref/gemv.h" #if defined(MACE_ENABLE_QUANTIZE) -#include "mace/utils/quantize.h" +#include "mace/core/quantize.h" #endif // MACE_ENABLE_QUANTIZE namespace mace { diff --git a/mace/ops/resize_bicubic.cc 
b/mace/ops/resize_bicubic.cc index 236e670f1d26b97471e219ba746102d777a008b5..f06692b9711c87e04e710eaaa2c1bce39f44f38f 100644 --- a/mace/ops/resize_bicubic.cc +++ b/mace/ops/resize_bicubic.cc @@ -77,7 +77,8 @@ inline float Interpolate1D(const std::vector &weights, values[2] * weights[2] + values[3] * weights[3]; } -inline void ResizeImage(const float *images, +inline void ResizeImage(const OpContext *context, + const float *images, const index_t batch_size, const index_t in_height, const index_t in_width, @@ -87,47 +88,52 @@ inline void ResizeImage(const float *images, const float height_scale, const float width_scale, float *output) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch_size; ++b) { - for (index_t y = 0; y < out_height; ++y) { - std::vector y_weights; - std::vector y_indices; - GetWeightsAndIndices(height_scale, y, in_height, &y_weights, - &y_indices); - for (index_t x = 0; x < out_width; ++x) { - std::vector x_weights; - std::vector x_indices; - GetWeightsAndIndices(width_scale, x, in_width, &x_weights, - &x_indices); - - for (index_t c = 0; c < channels; ++c) { - // Use a 4x4 patch to compute the interpolated output value at - // (b, y, x, c). - const float *channel_input_ptr = - images + (b * channels + c) * in_height * in_width; - float *channel_output_ptr = - output + (b * channels + c) * out_height * out_width; - std::vector coeff(4, 0.0); - for (index_t i = 0; i < 4; ++i) { - const std::vector values = { - channel_input_ptr[y_indices[i] * in_width + x_indices[0]], - channel_input_ptr[y_indices[i] * in_width + x_indices[1]], - channel_input_ptr[y_indices[i] * in_width + x_indices[2]], - channel_input_ptr[y_indices[i] * in_width + x_indices[3]]}; - coeff[i] = Interpolate1D(x_weights, values); + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t y = start1; y < end1; y += step1) { + std::vector y_weights; + std::vector y_indices; + GetWeightsAndIndices(height_scale, y, in_height, &y_weights, + &y_indices); + for (index_t x = 0; x < out_width; ++x) { + std::vector x_weights; + std::vector x_indices; + GetWeightsAndIndices(width_scale, x, in_width, &x_weights, + &x_indices); + + for (index_t c = 0; c < channels; ++c) { + // Use a 4x4 patch to compute the interpolated output value at + // (b, y, x, c). 
+ const float *channel_input_ptr = + images + (b * channels + c) * in_height * in_width; + float *channel_output_ptr = + output + (b * channels + c) * out_height * out_width; + std::vector coeff(4, 0.0); + for (index_t i = 0; i < 4; ++i) { + const std::vector values = { + channel_input_ptr[y_indices[i] * in_width + x_indices[0]], + channel_input_ptr[y_indices[i] * in_width + x_indices[1]], + channel_input_ptr[y_indices[i] * in_width + x_indices[2]], + channel_input_ptr[y_indices[i] * in_width + x_indices[3]]}; + coeff[i] = Interpolate1D(x_weights, values); + } + channel_output_ptr[y * out_width + x] = + Interpolate1D(y_weights, coeff); } - channel_output_ptr[y * out_width + x] = - Interpolate1D(y_weights, coeff); } } } - } + }, 0, batch_size, 1, 0, out_height, 1); } -template +template class ResizeBicubicOp; -template <> +template<> class ResizeBicubicOp : public Operation { public: explicit ResizeBicubicOp(OpConstructContext *context) @@ -175,8 +181,17 @@ class ResizeBicubicOp : public Operation { out_width, align_corners_); - ResizeImage(input_data, batch, in_height, in_width, out_height, out_width, - channels, height_scale, width_scale, output_data); + ResizeImage(context, + input_data, + batch, + in_height, + in_width, + out_height, + out_width, + channels, + height_scale, + width_scale, + output_data); return MaceStatus::MACE_SUCCESS; } diff --git a/mace/ops/resize_bilinear.cc b/mace/ops/resize_bilinear.cc index 46720b3c29d32d01f82902a0bfcc49071aa6aa2a..1fe13f42b2ee20258fb55634746b85f492eea70e 100644 --- a/mace/ops/resize_bilinear.cc +++ b/mace/ops/resize_bilinear.cc @@ -20,7 +20,7 @@ #include "mace/core/operator.h" #include "mace/utils/memory.h" -#include "mace/utils/quantize.h" +#include "mace/core/quantize.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/resize_bilinear.h" #endif // MACE_ENABLE_OPENCL @@ -51,7 +51,7 @@ inline void ComputeInterpolationWeights( } } -template +template inline T ComputeLerp(const T top_left, const T top_right, const T bottom_left, @@ -59,7 +59,7 @@ inline T ComputeLerp(const T top_left, const float x_lerp, const float y_lerp); -template <> +template<> inline float ComputeLerp(const float top_left, const float top_right, const float bottom_left, @@ -71,7 +71,7 @@ inline float ComputeLerp(const float top_left, return top + (bottom - top) * y_lerp; } -template <> +template<> inline uint8_t ComputeLerp(const uint8_t top_left, const uint8_t top_right, const uint8_t bottom_left, @@ -83,8 +83,9 @@ inline uint8_t ComputeLerp(const uint8_t top_left, return Saturate(roundf(top + (bottom - top) * y_lerp)); } -template -inline void ResizeImageNCHW(const T *images, +template +inline void ResizeImageNCHW(const OpContext *context, + const T *images, const index_t batch_size, const index_t in_height, const index_t in_width, @@ -96,38 +97,44 @@ inline void ResizeImageNCHW(const T *images, T *output) { const CachedInterpolation *xs = xs_vec.data(); -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch_size; ++b) { - for (index_t c = 0; c < channels; ++c) { - const T - *channel_input_ptr = - images + (b * channels + c) * in_height * in_width; - T *channel_output_ptr = - output + (b * channels + c) * out_height * out_width; - for (index_t y = 0; y < out_height; ++y) { - const T *y_lower_input_ptr = - channel_input_ptr + ys[y].lower * in_width; - const T *y_upper_input_ptr = - channel_input_ptr + ys[y].upper * in_width; - const float ys_lerp = ys[y].lerp; - - for (index_t x = 0; x < out_width; ++x) { - const float xs_lerp = 
xs[x].lerp; - const T top_left = y_lower_input_ptr[xs[x].lower]; - const T top_right = y_lower_input_ptr[xs[x].upper]; - const T bottom_left = y_upper_input_ptr[xs[x].lower]; - const T bottom_right = y_upper_input_ptr[xs[x].upper]; - channel_output_ptr[y * out_width + x] = - ComputeLerp(top_left, top_right, bottom_left, - bottom_right, xs_lerp, ys_lerp); + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const T + *channel_input_ptr = + images + (b * channels + c) * in_height * in_width; + T *channel_output_ptr = + output + (b * channels + c) * out_height * out_width; + for (index_t y = 0; y < out_height; ++y) { + const T *y_lower_input_ptr = + channel_input_ptr + ys[y].lower * in_width; + const T *y_upper_input_ptr = + channel_input_ptr + ys[y].upper * in_width; + const float ys_lerp = ys[y].lerp; + + for (index_t x = 0; x < out_width; ++x) { + const float xs_lerp = xs[x].lerp; + const T top_left = y_lower_input_ptr[xs[x].lower]; + const T top_right = y_lower_input_ptr[xs[x].upper]; + const T bottom_left = y_upper_input_ptr[xs[x].lower]; + const T bottom_right = y_upper_input_ptr[xs[x].upper]; + channel_output_ptr[y * out_width + x] = + ComputeLerp(top_left, top_right, bottom_left, + bottom_right, xs_lerp, ys_lerp); + } } } } - } + }, 0, batch_size, 1, 0, channels, 1); } -template -inline void ResizeImageNHWC(const T *images, +template +inline void ResizeImageNHWC(const OpContext *context, + const T *images, const index_t batch_size, const index_t in_height, const index_t in_width, @@ -139,39 +146,44 @@ inline void ResizeImageNHWC(const T *images, T *output) { const CachedInterpolation *xs = xs_vec.data(); + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + for (index_t b = 0; b < batch_size; ++b) { const T *input_base = images + b * channels * in_height * in_width; T *output_base = output + b * channels * out_height * out_width; -#pragma omp parallel for schedule(runtime) - for (index_t y = 0; y < out_height; ++y) { - const T - *y_lower_input_ptr = input_base + ys[y].lower * in_width * channels; - const T - *y_upper_input_ptr = input_base + ys[y].upper * in_width * channels; - const float ys_lerp = ys[y].lerp; - - for (index_t x = 0; x < out_width; ++x) { - const float xs_lerp = xs[x].lerp; - const T *top_left = y_lower_input_ptr + xs[x].lower * channels; - const T *top_right = y_lower_input_ptr + xs[x].upper * channels; - const T *bottom_left = y_upper_input_ptr + xs[x].lower * channels; - const T *bottom_right = y_upper_input_ptr + xs[x].upper * channels; - - T *output_ptr = output_base + (y * out_width + x) * channels; - for (index_t c = 0; c < channels; ++c) { - output_ptr[c] = - ComputeLerp(top_left[c], top_right[c], bottom_left[c], - bottom_right[c], xs_lerp, ys_lerp); + + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t y = start; y < end; y += step) { + const T + *y_lower_input_ptr = input_base + ys[y].lower * in_width * channels; + const T + *y_upper_input_ptr = input_base + ys[y].upper * in_width * channels; + const float ys_lerp = ys[y].lerp; + + for (index_t x = 0; x < out_width; ++x) { + const float xs_lerp = xs[x].lerp; + const T *top_left = y_lower_input_ptr + xs[x].lower * channels; + const T *top_right = y_lower_input_ptr + xs[x].upper * 
channels; + const T *bottom_left = y_upper_input_ptr + xs[x].lower * channels; + const T *bottom_right = y_upper_input_ptr + xs[x].upper * channels; + + T *output_ptr = output_base + (y * out_width + x) * channels; + for (index_t c = 0; c < channels; ++c) { + output_ptr[c] = + ComputeLerp(top_left[c], top_right[c], bottom_left[c], + bottom_right[c], xs_lerp, ys_lerp); + } } } - } + }, 0, out_height, 1); } } -template +template class ResizeBilinearOp; -template +template class ResizeBilinearOp : public Operation { public: explicit ResizeBilinearOp(OpConstructContext *context) @@ -226,7 +238,8 @@ class ResizeBilinearOp : public Operation { ComputeInterpolationWeights(out_height, in_height, height_scale, ys.data()); ComputeInterpolationWeights(out_width, in_width, width_scale, xs.data()); - ResizeImageNCHW(input_data, + ResizeImageNCHW(context, + input_data, batch, in_height, in_width, @@ -301,7 +314,8 @@ class ResizeBilinearOp : public Operation { ComputeInterpolationWeights(out_height, in_height, height_scale, ys.data()); ComputeInterpolationWeights(out_width, in_width, width_scale, xs.data()); - ResizeImageNHWC(input_data, + ResizeImageNHWC(context, + input_data, batch, in_height, in_width, diff --git a/mace/ops/resize_nearest_neighbor.cc b/mace/ops/resize_nearest_neighbor.cc index 5cdbf07fa101881c4b1c5a4b66476a01199cacee..8840458f96f171ae0886b0181163b43c0093b02e 100644 --- a/mace/ops/resize_nearest_neighbor.cc +++ b/mace/ops/resize_nearest_neighbor.cc @@ -26,8 +26,9 @@ namespace mace { namespace ops { -template -inline void ResizeImageNCHW(const T *images, +template +inline void ResizeImageNCHW(const OpContext *context, + const T *images, const index_t batch_size, const index_t in_height, const index_t in_width, @@ -38,36 +39,41 @@ inline void ResizeImageNCHW(const T *images, const float width_scale, bool align_corners, T *output) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch_size; ++b) { - for (index_t c = 0; c < channels; ++c) { - const T - *channel_input_ptr = - images + (b * channels + c) * in_height * in_width; - T *channel_output_ptr = - output + (b * channels + c) * out_height * out_width; - for (index_t y = 0; y < out_height; ++y) { - const index_t in_y = std::min( - (align_corners) ? static_cast(roundf(y * height_scale)) - : static_cast(floorf(y * height_scale)), - in_height - 1); - for (int x = 0; x < out_width; ++x) { - const index_t in_x = std::min( - (align_corners) ? static_cast(roundf(x * width_scale)) - : static_cast(floorf(x * width_scale)), - in_width - 1); - channel_output_ptr[y * out_width + x] = - channel_input_ptr[in_y * in_width + in_x]; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const T + *channel_input_ptr = + images + (b * channels + c) * in_height * in_width; + T *channel_output_ptr = + output + (b * channels + c) * out_height * out_width; + for (index_t y = 0; y < out_height; ++y) { + const index_t in_y = std::min( + (align_corners) ? static_cast(roundf(y * height_scale)) + : static_cast(floorf(y * height_scale)), + in_height - 1); + for (int x = 0; x < out_width; ++x) { + const index_t in_x = std::min( + (align_corners) ? 
static_cast(roundf(x * width_scale)) + : static_cast(floorf(x * width_scale)), + in_width - 1); + channel_output_ptr[y * out_width + x] = + channel_input_ptr[in_y * in_width + in_x]; + } } } } - } + }, 0, batch_size, 1, 0, channels, 1); } -template +template class ResizeNearestNeighborOp; -template +template class ResizeNearestNeighborOp : public Operation { public: explicit ResizeNearestNeighborOp(OpConstructContext *context) @@ -116,7 +122,8 @@ class ResizeNearestNeighborOp : public Operation { resize_nearest_neighbor::CalculateResizeScale(in_width, out_width, align_corners_); - ResizeImageNCHW(input_data, + ResizeImageNCHW(context, + input_data, batch, in_height, in_width, diff --git a/mace/ops/slice.cc b/mace/ops/slice.cc index f990912d0ce1f02ea65ab95d2334cf411aee2750..ac7ca64a9a700412a19a9600afaccdc2e56d81a8 100644 --- a/mace/ops/slice.cc +++ b/mace/ops/slice.cc @@ -66,7 +66,6 @@ class SliceOp : public Operation { const T *input_data = input->data(); T *output_data = output->mutable_data(); -#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < frames; ++i) { const T *input_base = input_data + i * input_dim + offset; diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc index 427a29eb850c3a5577c4fd57a5b49e401e255b51..0eda5bf3ccee4973d9d9997ebdaac7fa5293ffa3 100644 --- a/mace/ops/softmax.cc +++ b/mace/ops/softmax.cc @@ -55,6 +55,9 @@ class SoftmaxOp : public Operation { const float *input_data = input->data(); float *output_data = output->mutable_data(); + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + // softmax for nchw image if (input->dim_size() == 4) { const index_t batch = input->dim(0); @@ -63,46 +66,47 @@ class SoftmaxOp : public Operation { const index_t batch_size = class_count * class_size; for (index_t b = 0; b < batch; ++b) { -#pragma omp parallel for schedule(runtime) - for (index_t k = 0; k < class_size; ++k) { - const float *input_ptr = input_data + b * batch_size + k; - float *output_ptr = output_data + b * batch_size + k; - - float max_val = std::numeric_limits::lowest(); - index_t channel_offset = 0; - for (index_t c = 0; c < class_count; ++c) { - float data = input_ptr[channel_offset]; - if (data > max_val) { - max_val = data; - } - channel_offset += class_size; - } - - channel_offset = 0; - float sum = 0; - for (index_t c = 0; c < class_count; ++c) { - float exp_value = ::exp(input_ptr[channel_offset] - max_val); - sum += exp_value; - output_ptr[channel_offset] = exp_value; - channel_offset += class_size; - } + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t k = start; k < end; k += step) { + const float *input_ptr = input_data + b * batch_size + k; + float *output_ptr = output_data + b * batch_size + k; - sum = std::max(sum, std::numeric_limits::min()); - channel_offset = 0; - if (use_log_) { + float max_val = std::numeric_limits::lowest(); + index_t channel_offset = 0; for (index_t c = 0; c < class_count; ++c) { - output_ptr[channel_offset] /= sum; - output_ptr[channel_offset] = - std::log(output_ptr[channel_offset]); + float data = input_ptr[channel_offset]; + if (data > max_val) { + max_val = data; + } channel_offset += class_size; } - } else { + + channel_offset = 0; + float sum = 0; for (index_t c = 0; c < class_count; ++c) { - output_ptr[channel_offset] /= sum; + float exp_value = ::exp(input_ptr[channel_offset] - max_val); + sum += exp_value; + output_ptr[channel_offset] = exp_value; channel_offset += class_size; } - } - } // k + + sum = std::max(sum, 
std::numeric_limits::min()); + channel_offset = 0; + if (use_log_) { + for (index_t c = 0; c < class_count; ++c) { + output_ptr[channel_offset] /= sum; + output_ptr[channel_offset] = + std::log(output_ptr[channel_offset]); + channel_offset += class_size; + } + } else { + for (index_t c = 0; c < class_count; ++c) { + output_ptr[channel_offset] /= sum; + channel_offset += class_size; + } + } + } // k + }, 0, class_size, 1); } // b } else if (input->dim_size() == 2 || input->dim_size() == 3) { // normal 2d softmax and 3d softmax (dim(0) is batch) @@ -115,35 +119,36 @@ class SoftmaxOp : public Operation { class_size = input->dim(0) * input->dim(1); class_count = input->dim(2); } -#pragma omp parallel for schedule(runtime) - for (index_t k = 0; k < class_size; ++k) { - const float *input_ptr = input_data + k * class_count; - float *output_ptr = output_data + k * class_count; - - float max_val = std::numeric_limits::lowest(); - for (index_t c = 0; c < class_count; ++c) { - max_val = std::max(max_val, input_ptr[c]); - } - - float sum = 0; - for (index_t c = 0; c < class_count; ++c) { - float exp_value = std::exp(input_ptr[c] - max_val); - sum += exp_value; - output_ptr[c] = exp_value; - } + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t k = start; k < end; k += step) { + const float *input_ptr = input_data + k * class_count; + float *output_ptr = output_data + k * class_count; - sum = std::max(sum, std::numeric_limits::min()); - if (use_log_) { + float max_val = std::numeric_limits::lowest(); for (index_t c = 0; c < class_count; ++c) { - output_ptr[c] /= sum; - output_ptr[c] = std::log(output_ptr[c]); + max_val = std::max(max_val, input_ptr[c]); } - } else { + + float sum = 0; for (index_t c = 0; c < class_count; ++c) { - output_ptr[c] /= sum; + float exp_value = std::exp(input_ptr[c] - max_val); + sum += exp_value; + output_ptr[c] = exp_value; + } + + sum = std::max(sum, std::numeric_limits::min()); + if (use_log_) { + for (index_t c = 0; c < class_count; ++c) { + output_ptr[c] /= sum; + output_ptr[c] = std::log(output_ptr[c]); + } + } else { + for (index_t c = 0; c < class_count; ++c) { + output_ptr[c] /= sum; + } } } - } + }, 0, class_size, 1); } else { MACE_NOT_IMPLEMENTED; } @@ -202,30 +207,35 @@ class SoftmaxOp : public Operation { float input_scale = input->scale(); uint8_t *output_data = output->mutable_data(); + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + // If depth is short, do it using float32. Float computation should not // be here, but as long as it is on CPU, it is fine. 
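All of the removed "#pragma omp parallel for" loops in this file follow the same translation: the loop body moves into a lambda that receives a half-open sub-range [start, end) plus a step, and the thread pool decides how the full iteration space is split across workers. The stand-in below is a serial toy Compute1D (not the real mace::utils::ThreadPool) that shows only the calling contract assumed by the rewritten softmax code:

#include <cstdint>
#include <cstdio>
#include <functional>

using index_t = int64_t;

// Toy stand-in: runs the whole range on the calling thread. The real pool
// splits [start, end) into tiles and hands each tile to a worker through the
// same (start, end, step) signature.
void Compute1D(const std::function<void(index_t, index_t, index_t)> &func,
               index_t start, index_t end, index_t step) {
  func(start, end, step);
}

int main() {
  float data[8] = {3, 1, 4, 1, 5, 9, 2, 6};
  // Old form:  #pragma omp parallel for
  //            for (index_t i = 0; i < 8; ++i) data[i] *= 2.f;
  // New form: the body only iterates over the sub-range it was given.
  Compute1D([&](index_t start, index_t end, index_t step) {
    for (index_t i = start; i < end; i += step) {
      data[i] *= 2.f;
    }
  }, 0, 8, 1);
  for (float v : data) std::printf("%g ", v);  // 6 2 8 2 10 18 4 12
  return 0;
}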
if (depth < 32) { -#pragma omp parallel for schedule(runtime) - for (index_t b = 0; b < batch; ++b) { - const uint8_t *input_ptr = input_data + b * depth; - uint8_t *output_ptr = output_data + b * depth; + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t b = start; b < end; b += step) { + const uint8_t *input_ptr = input_data + b * depth; + uint8_t *output_ptr = output_data + b * depth; - uint8_t max_value = FindMax(input_ptr, depth); - float sum = 0; - std::vector depth_cache(depth); - for (index_t d = 0; d < depth; ++d) { - float exp_value = ::exp((static_cast(input_ptr[d]) - max_value) - * input_scale); - sum += exp_value; - depth_cache[d] = exp_value; - } + uint8_t max_value = FindMax(input_ptr, depth); + float sum = 0; + std::vector depth_cache(depth); + for (index_t d = 0; d < depth; ++d) { + float exp_value = ::exp((static_cast(input_ptr[d]) - max_value) + * input_scale); + sum += exp_value; + depth_cache[d] = exp_value; + } - sum = std::max(sum, std::numeric_limits::min()); - for (index_t d = 0; d < depth; ++d) { - double output_f = depth_cache[d] / sum; - output_ptr[d] = static_cast(output_f * 255); + sum = std::max(sum, std::numeric_limits::min()); + for (index_t d = 0; d < depth; ++d) { + double output_f = depth_cache[d] / sum; + output_ptr[d] = static_cast(output_f * 255); + } } - } + }, 0, batch, 1); + return MaceStatus::MACE_SUCCESS; } @@ -234,19 +244,19 @@ class SoftmaxOp : public Operation { (1ll << 31) - 1.0)); int32_t input_delta_limit = -((1ll << 31) - 1) / scale_q; -#pragma omp parallel for schedule(runtime) - for (index_t b = 0; b < batch; ++b) { - const uint8_t *input_ptr = input_data + b * depth; - uint8_t *output_ptr = output_data + b * depth; + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t b = start; b < end; b += step) { + const uint8_t *input_ptr = input_data + b * depth; + uint8_t *output_ptr = output_data + b * depth; - FixPointSumExp sum = FixPointSumExp::Zero(); - uint8_t max_value = FindMax(input_ptr, depth); - index_t d = 0; + FixPointSumExp sum = FixPointSumExp::Zero(); + uint8_t max_value = FindMax(input_ptr, depth); + index_t d = 0; - // Neon optimization is not useful so far as we benchmark. - // Enable it when we find a case that proves it useful. + // Neon optimization is not useful so far as we benchmark. + // Enable it when we find a case that proves it useful. 
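For larger depths the quantized softmax below stays in 32-bit fixed point: exp(scale * delta) is evaluated with gemmlowp's fixed-point helpers and the normalized result is rescaled to the 0..255 output range. As a generic illustration of the underlying Q-format arithmetic (plain integers here, not the gemmlowp types used in this file):

#include <cstdint>
#include <cstdio>

// Q0.31-style fixed point: value = raw / 2^31, usable for numbers in [-1, 1).
int32_t ToQ31(double x) { return static_cast<int32_t>(x * (1ll << 31)); }
double FromQ31(int32_t raw) { return static_cast<double>(raw) / (1ll << 31); }

int32_t MulQ31(int32_t a, int32_t b) {
  // A 64-bit intermediate keeps the full product before shifting back down.
  return static_cast<int32_t>((static_cast<int64_t>(a) * b) >> 31);
}

int main() {
  int32_t a = ToQ31(0.25), b = ToQ31(0.5);
  std::printf("%f\n", FromQ31(MulQ31(a, b)));  // ~0.125
  return 0;
}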
#if 0 && defined(MACE_ENABLE_NEON) - using FixPointInputDeltaInt32x4 = gemmlowp::FixedPoint; using FixPointSumExpInt32x4 = gemmlowp::FixedPoint; @@ -305,33 +315,33 @@ class SoftmaxOp : public Operation { vpadd_s32(vsum_reduced_2_s32, vsum_reduced_2_s32); sum = FixPointSumExp::FromRaw(vget_lane_s32(vsum_reduced_1_s32, 0)); #endif - for (; d < depth; ++d) { - int32_t input_delta = static_cast(input_ptr[d]) - max_value; - if (input_delta >= input_delta_limit) { - int32_t scaled_input_delta_q = scale_q * input_delta; - FixPointInputDelta scaled_input_delta_fp = - FixPointInputDelta::FromRaw(scaled_input_delta_q); - sum = sum + gemmlowp::Rescale( - exp_on_negative_values(scaled_input_delta_fp)); + for (; d < depth; ++d) { + int32_t input_delta = static_cast(input_ptr[d]) - max_value; + if (input_delta >= input_delta_limit) { + int32_t scaled_input_delta_q = scale_q * input_delta; + FixPointInputDelta scaled_input_delta_fp = + FixPointInputDelta::FromRaw(scaled_input_delta_q); + sum = sum + gemmlowp::Rescale( + exp_on_negative_values(scaled_input_delta_fp)); + } } - } - int32_t sum_q = sum.raw(); - int left_zero_count = - __builtin_clz(static_cast(sum_q)); - int tail_count = kSumExpIntBits - left_zero_count; - int32_t fractional_q0 = static_cast( - (static_cast(sum_q) << left_zero_count) - - (static_cast(1) << 31)); - FixPoint0 recip_sum_q0 = gemmlowp::one_over_one_plus_x_for_x_in_0_1( - FixPoint0::FromRaw(fractional_q0)); + int32_t sum_q = sum.raw(); + int left_zero_count = + __builtin_clz(static_cast(sum_q)); + int tail_count = kSumExpIntBits - left_zero_count; + int32_t fractional_q0 = static_cast( + (static_cast(sum_q) << left_zero_count) - + (static_cast(1) << 31)); + FixPoint0 recip_sum_q0 = gemmlowp::one_over_one_plus_x_for_x_in_0_1( + FixPoint0::FromRaw(fractional_q0)); - d = 0; + d = 0; - // Neon optimization is not useful so far as we benchmark. - // Enable it when we find a case that proves it useful. + // Neon optimization is not useful so far as we benchmark. + // Enable it when we find a case that proves it useful. 
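Before taking the reciprocal of the accumulated sum, the code above normalizes sum_q with __builtin_clz: shifting left by the leading-zero count puts the most significant set bit at position 31, so the value lands in [2^31, 2^32) and its fractional part can be handed to the 1/(1+x) helper. A small standalone check of that normalization step (GCC/Clang builtin, illustrative value):

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t sum_q = 300000;                     // some positive accumulator
  int left_zero_count = __builtin_clz(sum_q);  // leading zero bits, here 13
  // Shifting by that count scales sum_q into [2^31, 2^32), i.e. the
  // "1 <= x < 2" form whose reciprocal the fixed-point helper expects.
  uint64_t normalized = static_cast<uint64_t>(sum_q) << left_zero_count;
  std::printf("%d %llu\n", left_zero_count,
              static_cast<unsigned long long>(normalized));
  return 0;
}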
#if 0 && defined(MACE_ENABLE_NEON) - FixPoint0Int32x4 vrecip_sum_q0_s32_fp = + FixPoint0Int32x4 vrecip_sum_q0_s32_fp = FixPoint0Int32x4::FromScalarRaw(recip_sum_q0.raw()); int16x8_t vinput_delta_limit_s16 = vdupq_n_s16(input_delta_limit); for (; d <= depth - 8; d += 8) { @@ -371,21 +381,23 @@ class SoftmaxOp : public Operation { vst1_u8(output_ptr + d, voutput_u8); } #endif - for (; d < depth; ++d) { - int32_t input_delta = static_cast(input_ptr[d]) - max_value; - if (input_delta >= input_delta_limit) { - int32_t scaled_input_delta_q = scale_q * input_delta; - FixPointInputDelta scaled_input_delta_fp = - FixPointInputDelta::FromRaw(scaled_input_delta_q); - - FixPoint0 exp = exp_on_negative_values(scaled_input_delta_fp); - int32_t output_data = gemmlowp::RoundingDivideByPOT( - (recip_sum_q0 * exp).raw(), tail_count + 31 - 8); - - output_ptr[d] = std::max(std::min(output_data, 255), 0); + for (; d < depth; ++d) { + int32_t input_delta = static_cast(input_ptr[d]) - max_value; + if (input_delta >= input_delta_limit) { + int32_t scaled_input_delta_q = scale_q * input_delta; + FixPointInputDelta scaled_input_delta_fp = + FixPointInputDelta::FromRaw(scaled_input_delta_q); + + FixPoint0 exp = exp_on_negative_values(scaled_input_delta_fp); + int32_t output_data = gemmlowp::RoundingDivideByPOT( + (recip_sum_q0 * exp).raw(), tail_count + 31 - 8); + + output_ptr[d] = std::max(std::min(output_data, 255), 0); + } } } - } + }, 0, batch, 1); + return MaceStatus::MACE_SUCCESS; } diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc index ece9b6f61dd25e0fe4c6d2f5aff1aeea4ed55302..b239193c2641af400fb5c67f25be2efff8c04859 100644 --- a/mace/ops/space_to_batch.cc +++ b/mace/ops/space_to_batch.cc @@ -130,7 +130,6 @@ class SpaceToBatchNDOp : public SpaceToBatchOpBase { std::max(static_cast(1), 8 * 1024 / block_shape_w / in_width); // make channel outter loop so we can make best use of cache -#pragma omp parallel for collapse(3) schedule(runtime) for (index_t c = 0; c < channels; ++c) { for (index_t block_h = 0; block_h < out_height; block_h += block_h_size) { @@ -239,7 +238,6 @@ class SpaceToBatchNDOp : public SpaceToBatchOpBase { index_t out_width = batch_tensor->dim(2); index_t channels = batch_tensor->dim(3); -#pragma omp parallel for schedule(runtime) for (index_t b = 0; b < out_batches; ++b) { const index_t in_b = b % in_batches; const index_t tile_index = b / in_batches; diff --git a/mace/ops/space_to_depth.cc b/mace/ops/space_to_depth.cc index 4e40227c5b5857d065195d509bcafe55fbef1c59..918ae678b5cb09c2f6c8f2a584f3b5fbb5d47997 100644 --- a/mace/ops/space_to_depth.cc +++ b/mace/ops/space_to_depth.cc @@ -58,7 +58,6 @@ class SpaceToDepthOp : public Operation { const T *input_ptr = input->data(); T *output_ptr = output->mutable_data(); -#pragma omp parallel for for (index_t b = 0; b < batch_size; ++b) { for (index_t d = 0; d < input_depth; ++d) { for (index_t h = 0; h < input_height; ++h) { diff --git a/mace/ops/splice.cc b/mace/ops/splice.cc index 8517b6b831c80397086e0598f8803aeff0be81ce..6d47732904820f6a83712cb1cb309e424b459615 100644 --- a/mace/ops/splice.cc +++ b/mace/ops/splice.cc @@ -32,10 +32,10 @@ namespace mace { namespace ops { -template +template class SpliceOp; -template +template class SpliceOp : public Operation { public: explicit SpliceOp(OpConstructContext *context) @@ -85,7 +85,6 @@ class SpliceOp : public Operation { const T *input_data = input->data(); T *output_data = output->mutable_data(); -#pragma omp parallel for collapse(3) schedule(runtime) for (int b = 0; b < batch; ++b) 
{ for (index_t i = 0; i < out_chunk; ++i) { for (index_t c = 0; c < num_splice; ++c) { @@ -102,7 +101,6 @@ class SpliceOp : public Operation { if (const_dim_ > 0) { const index_t output_offset = output_dim - const_dim_; const index_t input_offset = dim; -#pragma omp parallel for collapse(2) schedule(runtime) for (int b = 0; b < batch; ++b) { for (index_t i = 0; i < out_chunk; ++i) { T *output_base = output_data + + b * output_stride + i * output_dim; diff --git a/mace/ops/split.cc b/mace/ops/split.cc index 7c920d4c115f9650973ab62a2c79d29b677faf83..e1523a06253c2a38c2451046e4daa1b0c51d2713 100644 --- a/mace/ops/split.cc +++ b/mace/ops/split.cc @@ -24,10 +24,10 @@ namespace mace { namespace ops { -template +template class SplitOp; -template +template class SplitOp : public Operation { public: explicit SplitOp(OpConstructContext *context) @@ -70,19 +70,18 @@ class SplitOp : public Operation { output_shape.end(), 1, std::multiplies()); - for (size_t i= 0; i < outputs_count; ++i) { + for (size_t i = 0; i < outputs_count; ++i) { MACE_RETURN_IF_ERROR(output_list[i]->Resize(output_shape)); output_ptrs[i] = output_list[i]->mutable_data(); } const T *input_ptr = input->data(); -#pragma omp parallel for for (int outer_idx = 0; outer_idx < outer_size; ++outer_idx) { index_t input_idx = outer_idx * input_channels * inner_size; index_t output_idx = outer_idx * output_channels * inner_size; for (size_t i = 0; i < outputs_count; ++i) { if (DataTypeCanUseMemcpy(DataTypeToEnum::v())) { - memcpy(output_ptrs[i]+output_idx, input_ptr+input_idx, + memcpy(output_ptrs[i] + output_idx, input_ptr + input_idx, output_channels * inner_size * sizeof(T)); } else { for (index_t k = 0; k < output_channels * inner_size; ++k) { @@ -100,7 +99,6 @@ class SplitOp : public Operation { bool checked_; }; - #ifdef MACE_ENABLE_OPENCL template class SplitOp : public Operation { @@ -130,7 +128,6 @@ class SplitOp : public Operation { }; #endif // MACE_ENABLE_OPENCL - void RegisterSplit(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Split", SplitOp, DeviceType::CPU, float); @@ -150,15 +147,15 @@ void RegisterSplit(OpRegistryBase *op_registry) { [](OpConstructContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; } int axis = ProtoArgHelper::GetOptionalArg( *op, "axis", 3); if (axis != 3 || op->output_shape(0).dims_size() != 4 || (op->output_shape(0).dims()[3] % 4 != 0)) { - return { DeviceType::CPU }; + return {DeviceType::CPU}; } - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; })); } diff --git a/mace/ops/sqrdiff_mean.cc b/mace/ops/sqrdiff_mean.cc index b937b259322615abcbb929e4c17c0f41e3844167..d58191c4d0bd6b2d992af9495c56b1a7dca4bc44 100644 --- a/mace/ops/sqrdiff_mean.cc +++ b/mace/ops/sqrdiff_mean.cc @@ -65,7 +65,7 @@ class SqrDiffMeanOp : public Operation { const index_t img_size = input0->dim(2) * input0->dim(3); const index_t bc = input0->dim(0) * input0->dim(1); -#pragma omp parallel for schedule(runtime) + for (int i = 0; i < bc; ++i) { for (int j = 0; j < img_size; ++j) { T diff = input_ptr0[i * img_size + j] - input_ptr1[i]; diff --git a/mace/ops/sum_group.cc b/mace/ops/sum_group.cc index 21c83b68f98b791a9a061fb1226b6b86edfceba6..0efdfe2a764d81ab35d035bf93f7ffeeb6e66174 100644 --- a/mace/ops/sum_group.cc +++ b/mace/ops/sum_group.cc @@ -81,7 +81,6 @@ class SumGroupOp : public Operation { << "size value over-ranged:" 
<< cur_index << "<=" << input_dim; } -#pragma omp parallel for collapse(2) schedule(runtime) for (index_t i = 0; i < bh; ++i) { for (index_t j = 0; j < output_dim; ++j) { int start_col = sum_indexes[j].first; diff --git a/mace/ops/target_rms_norm.cc b/mace/ops/target_rms_norm.cc index 7b769fe712c35cc39cf282731f2a5d64d21d8695..80d42a1d0579fe563ea3fec01655de98e610dd08 100644 --- a/mace/ops/target_rms_norm.cc +++ b/mace/ops/target_rms_norm.cc @@ -91,7 +91,6 @@ class TargetRMSNormOp : public Operation { const float *input_data = input->data(); float *output_data = output->mutable_data(); -#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < bh; ++i) { float scale = SquareSum(input_data + i * dim, dim); scale = static_cast(1.0 / std::sqrt(scale / d_scale)); diff --git a/mace/ops/thread_pool_benchmark.cc b/mace/ops/thread_pool_benchmark.cc index 1fd14713d87157a13d98e30fe1a845fe189d8078..f800929809725ac73cd63fb082831e0dd7a38dd5 100644 --- a/mace/ops/thread_pool_benchmark.cc +++ b/mace/ops/thread_pool_benchmark.cc @@ -29,16 +29,16 @@ namespace test { namespace { -const size_t kMaxSize = 100000000; -const size_t image_size = 56 * 56; -std::vector output_data(kMaxSize), bias_data(kMaxSize); +const index_t kMaxSize = 100000000; +const index_t image_size = 56 * 56; +std::vector output_data(kMaxSize), bias_data(kMaxSize); void OpenMPBenchmark1D(int iters, int size) { while (iters--) { const int b = 0; #pragma omp parallel for schedule(runtime) for (int c = 0; c < size; ++c) { - for (size_t i = 0; i < image_size; ++i) { + for (index_t i = 0; i < image_size; ++i) { output_data[(b * size + c) * image_size + i] += bias_data[c]; } } @@ -52,11 +52,10 @@ void ThreadPoolBenchmark1D(int iters, int size) { mace::testing::StartTiming(); while (iters--) { - const int b = 0; // 'const' keyword affects performance - int batch_size = size * image_size; - thread_pool.Compute1D([&](size_t start0, size_t end0, size_t step0) { - for (size_t c = start0; c < end0; c += step0) { - for (size_t i = 0; i < image_size; ++i) { + const int b = 0; + thread_pool.Compute1D([=](index_t start0, index_t end0, index_t step0) { + for (index_t c = start0; c < end0; c += step0) { + for (index_t i = 0; i < image_size; ++i) { output_data[(b * size + c) * image_size + i] += bias_data[c]; } } @@ -67,14 +66,13 @@ void ThreadPoolBenchmark1D(int iters, int size) { void OpenMPBenchmark2D(int iters, int size0, int size1) { while (iters--) { #pragma omp parallel for collapse(2) schedule(runtime) - for (int b = 0; b < size0; ++b) { - for (int c = 0; c < size1; ++c) { - for (size_t i = 0; i < image_size; ++i) { - // it seems like OpenMP optimize the following mac - output_data[(b * size1 + c) * image_size + i] += bias_data[c]; - } + for (int b = 0; b < size0; ++b) { + for (int c = 0; c < size1; ++c) { + for (index_t i = 0; i < image_size; ++i) { + output_data[(b * size1 + c) * image_size + i] += bias_data[c]; } } + } } } @@ -85,11 +83,11 @@ void ThreadPoolBenchmark2D(int iters, int size0, int size1) { mace::testing::StartTiming(); while (iters--) { - thread_pool.Compute2D([&](size_t start0, size_t end0, size_t step0, - size_t start1, size_t end1, size_t step1) { - for (size_t b = start0; b < end0; b += step0) { - for (size_t c = start1; c < end1; c += step1) { - for (size_t i = 0; i < image_size; ++i) { + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + for (index_t 
i = 0; i < image_size; ++i) { output_data[(b * size1 + c) * image_size + i] += bias_data[c]; } } diff --git a/mace/ops/transpose.cc b/mace/ops/transpose.cc index 678f3ee642f210083904c189fc4752dcd8c5bd4e..6c6993e065a9dbf1f0a0bf0e336ea32598a9989b 100644 --- a/mace/ops/transpose.cc +++ b/mace/ops/transpose.cc @@ -26,10 +26,10 @@ namespace mace { namespace ops { -template +template class TransposeOp; -template +template class TransposeOp : public Operation { public: explicit TransposeOp(OpConstructContext *context) @@ -55,7 +55,8 @@ class TransposeOp : public Operation { const float *input_data = input->data(); float *output_data = output->mutable_data(); - return Transpose(input_data, input->shape(), dims_, output_data); + return Transpose(&context->device()->cpu_runtime()->thread_pool(), + input_data, input->shape(), dims_, output_data); } private: diff --git a/mace/public/mace.h b/mace/public/mace.h index 8cc251132d9d2ee26ecf70b2684e7eee25f50f15..fd39fdba6c501b6f1aa4eb6cb7980fa5158012ca 100644 --- a/mace/public/mace.h +++ b/mace/public/mace.h @@ -299,11 +299,9 @@ class MACE_API MaceEngineConfig { /// \param status MACE_SUCCESS for successful, or it can't reliabley /// detect big-LITTLE cores (see GetBigLittleCoreIDs). In such cases, it's /// suggested to use AFFINITY_NONE to use all cores. - /// \param use_gemmlowp use gemmlowp for cpu quantized inference /// \return MaceStatus::MACE_SUCCESS for success, other for failed. MaceStatus SetCPUThreadPolicy(int num_threads_hint, - CPUAffinityPolicy policy, - bool use_gemmlowp = false); + CPUAffinityPolicy policy); private: class Impl; diff --git a/mace/test/mace_api_test.h b/mace/test/mace_api_test.h index 2257b2162ca6d53e81fd29367594bf860ff115ec..9cc1402f7558c9e5d0d1116eaef2fb161adda194 100644 --- a/mace/test/mace_api_test.h +++ b/mace/test/mace_api_test.h @@ -163,7 +163,7 @@ void CheckOutputs(const NetDef &net_def, std::unique_ptr tmp_tensor( new Tensor(allocator.get(), DataTypeToEnum::v())); - auto output_shape = output.second.shape(); + auto &output_shape = output.second.shape(); const int64_t data_size = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/validation/mace_run.cc index cc77b6b303a42a32ae9aab0d1ee9033c82dcba62..7fc0690df25c3f2dc094cc4f36109b3eba392e23 100644 --- a/mace/tools/validation/mace_run.cc +++ b/mace/tools/validation/mace_run.cc @@ -167,8 +167,7 @@ bool RunModel(const std::string &model_name, MaceEngineConfig config(device_type); status = config.SetCPUThreadPolicy( FLAGS_omp_num_threads, - static_cast(FLAGS_cpu_affinity_policy), - true); + static_cast(FLAGS_cpu_affinity_policy)); if (status != MaceStatus::MACE_SUCCESS) { LOG(WARNING) << "Set openmp or cpu affinity failed."; } diff --git a/mace/utils/quantize.h b/mace/utils/quantize.h deleted file mode 100644 index 30595046cabffc6d33a57803dcf59d638962a6d4..0000000000000000000000000000000000000000 --- a/mace/utils/quantize.h +++ /dev/null @@ -1,301 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_UTILS_QUANTIZE_H_ -#define MACE_UTILS_QUANTIZE_H_ - -#include -#include -#include - -#if defined(MACE_ENABLE_NEON) -#include -#endif // MACE_ENABLE_NEON - -#include "mace/utils/logging.h" - -namespace mace { - -template -inline void AdjustRange(const float in_min_data, - const float in_max_data, - const bool non_zero, - float *scale, - int32_t *zero_point) { - // re-range to make range include zero float and - // make zero float as integer u8 - const T quantized_min = std::numeric_limits::lowest(); - const T quantized_max = std::numeric_limits::max(); - if (quantized_min < 0) { - MACE_ASSERT(!non_zero, "Cannot nudge to non_zero quantize value."); - } - - float out_max = std::max(0.f, in_max_data); - float out_min = std::min(0.f, in_min_data); - // make in_min_data quantize as greater than 1 - if (non_zero) { - out_min = std::min(out_min, - in_min_data - (out_max - in_min_data) - / (quantized_max - quantized_min - 1)); - } - - *scale = (out_max - out_min) / (quantized_max - quantized_min); - const float kEps = 1e-6; - if (out_min < -kEps && out_max > kEps) { - float quantized_zero = -out_min / *scale; - int32_t - quantized_zero_near_int = static_cast(roundf(quantized_zero)); - *zero_point = quantized_zero_near_int; - if (fabs(quantized_zero - quantized_zero_near_int) > kEps && non_zero) { - *zero_point = static_cast(std::ceil(quantized_zero)); - } - } else if (out_min > -kEps) { - *zero_point = quantized_min; - } else { - *zero_point = quantized_max; - } -} - -template -inline T Saturate(float value) { - int rounded_value = static_cast(value); - if (rounded_value <= std::numeric_limits::lowest()) { - return std::numeric_limits::lowest(); - } else if (rounded_value >= std::numeric_limits::max()) { - return std::numeric_limits::max(); - } else { - return static_cast(rounded_value); - } -} - -inline void FindMinMax(const float *input, - const index_t size, - float *min_val, float *max_val) { - float max_v = std::numeric_limits::lowest(); - float min_v = std::numeric_limits::max(); - for (index_t i = 0; i < size; ++i) { - max_v = std::max(max_v, input[i]); - min_v = std::min(min_v, input[i]); - } - *min_val = min_v; - *max_val = max_v; -} - -template -inline void QuantizeWithScaleAndZeropoint(const float *input, - const index_t size, - float scale, - int32_t zero_point, - T *output) { - float recip_scale = 1 / scale; -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < size; ++i) { - output[i] = Saturate(roundf(zero_point + recip_scale * input[i])); - } -} - -template -inline void Quantize(const float *input, - const index_t size, - bool non_zero, - T *output, - float *scale, - int32_t *zero_point) { - float in_min_data; - float in_max_data; - FindMinMax(input, size, &in_min_data, &in_max_data); - - AdjustRange(in_min_data, in_max_data, non_zero, - scale, zero_point); - - QuantizeWithScaleAndZeropoint(input, size, *scale, *zero_point, output); -} - -template -inline void Quantize(const Tensor &input, - Tensor *output, - float *min_out, - float *max_out) { - MACE_CHECK(input.size() != 0); - Tensor::MappingGuard input_guard(&input); - Tensor::MappingGuard output_guard(output); - auto *input_data = input.data(); - auto *output_data = output->mutable_data(); - float scale; - int32_t zero_point; - - Quantize(input_data, input.size(), false, output_data, &scale, &zero_point); - - *min_out = scale * (std::numeric_limits::lowest() - zero_point); - *max_out = scale * 
(std::numeric_limits::max() - zero_point); -} - -template -inline void Dequantize(const T *input, - const index_t size, - const float scale, - const int32_t zero_point, - float *output) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < size; ++i) { - output[i] = scale * (input[i] - zero_point); - } -} - -#if defined(MACE_ENABLE_NEON) -template<> -inline void QuantizeWithScaleAndZeropoint(const float *input, - const index_t size, - float scale, - int32_t zero_point, - uint8_t *output) { - const float32x4_t vround = vdupq_n_f32(0.5); - const float32x4_t - vzero = vaddq_f32(vround, vcvtq_f32_s32(vdupq_n_s32(zero_point))); - const float recip_scale = 1.f / scale; - const float32x4_t vrecip_scale = vdupq_n_f32(recip_scale); - const index_t block_count = size / 16; - -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < block_count; ++i) { - float32x4_t vi0 = vld1q_f32(input + i * 16); - float32x4_t vi1 = vld1q_f32(input + i * 16 + 4); - float32x4_t vi2 = vld1q_f32(input + i * 16 + 8); - float32x4_t vi3 = vld1q_f32(input + i * 16 + 12); - - int32x4_t vo0_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi0, vrecip_scale)); - int32x4_t vo1_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi1, vrecip_scale)); - int32x4_t vo2_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi2, vrecip_scale)); - int32x4_t vo3_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi3, vrecip_scale)); - - uint8x8_t vo0_u8 = - vqmovun_s16(vcombine_s16(vqmovn_s32(vo0_s32), vqmovn_s32(vo1_s32))); - uint8x8_t vo1_u8 = - vqmovun_s16(vcombine_s16(vqmovn_s32(vo2_s32), vqmovn_s32(vo3_s32))); - uint8x16_t vo = vcombine_u8(vo0_u8, vo1_u8); - - vst1q_u8(output + i * 16, vo); - } - -#pragma omp parallel for schedule(runtime) - for (index_t i = block_count * 16; i < size; ++i) { - output[i] = Saturate(roundf(zero_point + recip_scale * input[i])); - } -} - -template<> -inline void Dequantize(const int32_t *input, - const index_t size, - const float scale, - const int32_t zero_point, - float *output) { - const index_t block_count = size / 4; - const int32x4_t vzero = vdupq_n_s32(zero_point); - const float32x4_t vscale = vdupq_n_f32(scale); - -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < block_count; ++i) { - int32x4_t vi = vld1q_s32(input + i * 4); - float32x4_t vo = vmulq_f32(vscale, vcvtq_f32_s32(vsubq_s32(vi, vzero))); - vst1q_f32(output + i * 4, vo); - } - for (index_t i = block_count * 4; i < size; ++i) { - output[i] = scale * (input[i] - zero_point); - } -} - -template<> -inline void Dequantize(const uint8_t *input, - const index_t size, - const float scale, - const int32_t zero_point, - float *output) { - const index_t block_count = size / 16; - const int32x4_t vzero = vdupq_n_s32(zero_point); - const float32x4_t vscale = vdupq_n_f32(scale); - -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < block_count; ++i) { - uint8x16_t vi = vld1q_u8(input + i * 16); - float32x4x4_t vo = { - vmulq_f32(vscale, - vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( - vget_low_u16(vmovl_u8(vget_low_u8(vi))))), vzero))), - vmulq_f32(vscale, - vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( - vget_high_u16(vmovl_u8(vget_low_u8(vi))))), vzero))), - vmulq_f32(vscale, - vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( - vget_low_u16(vmovl_u8(vget_high_u8(vi))))), vzero))), - vmulq_f32(vscale, - vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( - vget_high_u16(vmovl_u8(vget_high_u8(vi))))), vzero))), - }; - vst1q_f32(output + i * 16, vo.val[0]); - vst1q_f32(output + i * 16 + 4, vo.val[1]); - 
vst1q_f32(output + i * 16 + 8, vo.val[2]); - vst1q_f32(output + i * 16 + 12, vo.val[3]); - } - for (index_t i = block_count * 16; i < size; ++i) { - output[i] = scale * (input[i] - zero_point); - } -} -#endif // MACE_ENABLE_NEON - -template -inline void DeQuantize(const Tensor &input, - const float min_in, - const float max_in, - Tensor *output) { - MACE_CHECK(input.size() != 0); - Tensor::MappingGuard input_guard(&input); - Tensor::MappingGuard output_guard(output); - auto *input_data = input.data(); - auto *output_data = output->mutable_data(); - float scale; - int32_t zero_point; - - AdjustRange(min_in, max_in, false, &scale, &zero_point); - - Dequantize(input_data, input.size(), scale, zero_point, output_data); -} - -inline void QuantizeMultiplier(double multiplier, - int32_t *output_multiplier, - int32_t *shift) { - const double q = std::frexp(multiplier, shift); - auto qint = static_cast(roundl(q * (1ll << 31))); - if (qint == (1ll << 31)) { - qint /= 2; - ++*shift; - } - *output_multiplier = static_cast(qint); - MACE_CHECK(*output_multiplier <= std::numeric_limits::max()); -} - -inline void GetOutputMultiplierAndShift( - const float lhs_scale, const float rhs_scale, const float output_scale, - int32_t *quantized_multiplier, int *right_shift) { - float real_multiplier = lhs_scale * rhs_scale / output_scale; - MACE_CHECK(real_multiplier > 0.f && real_multiplier < 1.f, real_multiplier); - - int exponent; - QuantizeMultiplier(real_multiplier, quantized_multiplier, &exponent); - *right_shift = -exponent; - MACE_CHECK(*right_shift >= 0); -} - -} // namespace mace - -#endif // MACE_UTILS_QUANTIZE_H_ diff --git a/mace/utils/thread_pool.cc b/mace/utils/thread_pool.cc index 92c128a79d5baa43e3e5426c984bf5b14ca0e405..8d7d98c8535065b9ac1a9a55325f441898bec4a1 100644 --- a/mace/utils/thread_pool.cc +++ b/mace/utils/thread_pool.cc @@ -163,19 +163,19 @@ void ThreadPool::Init() { count_down_latch_.Wait(); } -void ThreadPool::Run(const std::function &func, - size_t iterations) { +void ThreadPool::Run(const std::function &func, + const int64_t iterations) { const size_t thread_count = threads_.size(); - const size_t iters_per_thread = iterations / thread_count; - const size_t remainder = iterations % thread_count; - size_t iters_offset = 0; + const int64_t iters_per_thread = iterations / thread_count; + const int64_t remainder = iterations % thread_count; + int64_t iters_offset = 0; std::unique_lock run_lock(run_mutex_); for (size_t i = 0; i < thread_count; ++i) { - size_t count = iters_per_thread + (i < remainder); + int64_t count = iters_per_thread + (static_cast(i) < remainder); thread_infos_[i].range_start = iters_offset; - size_t range_end = std::min(iterations, iters_offset + count); + int64_t range_end = std::min(iterations, iters_offset + count); thread_infos_[i].range_end = range_end; thread_infos_[i].range_len = range_end - iters_offset; thread_infos_[i].func = reinterpret_cast(&func); @@ -263,10 +263,10 @@ void ThreadPool::ThreadLoop(size_t tid) { void ThreadPool::ThreadRun(size_t tid) { ThreadInfo &thread_info = thread_infos_[tid]; uintptr_t func_ptr = thread_info.func; - const std::function *func = - reinterpret_cast *>(func_ptr); + const std::function *func = + reinterpret_cast *>(func_ptr); // do own work - size_t range_len; + int64_t range_len; while ((range_len = thread_info.range_len) > 0) { if (thread_info.range_len.compare_exchange_strong(range_len, range_len - 1)) { @@ -280,33 +280,33 @@ void ThreadPool::ThreadRun(size_t tid) { t = (t + 1) % thread_count) { ThreadInfo 
diff --git a/mace/utils/thread_pool.cc b/mace/utils/thread_pool.cc
index 92c128a79d5baa43e3e5426c984bf5b14ca0e405..8d7d98c8535065b9ac1a9a55325f441898bec4a1 100644
--- a/mace/utils/thread_pool.cc
+++ b/mace/utils/thread_pool.cc
@@ -163,19 +163,19 @@ void ThreadPool::Init() {
   count_down_latch_.Wait();
 }
 
-void ThreadPool::Run(const std::function<void(size_t)> &func,
-                     size_t iterations) {
+void ThreadPool::Run(const std::function<void(int64_t)> &func,
+                     const int64_t iterations) {
   const size_t thread_count = threads_.size();
-  const size_t iters_per_thread = iterations / thread_count;
-  const size_t remainder = iterations % thread_count;
-  size_t iters_offset = 0;
+  const int64_t iters_per_thread = iterations / thread_count;
+  const int64_t remainder = iterations % thread_count;
+  int64_t iters_offset = 0;
 
   std::unique_lock<std::mutex> run_lock(run_mutex_);
 
   for (size_t i = 0; i < thread_count; ++i) {
-    size_t count = iters_per_thread + (i < remainder);
+    int64_t count = iters_per_thread + (static_cast<int64_t>(i) < remainder);
     thread_infos_[i].range_start = iters_offset;
-    size_t range_end = std::min(iterations, iters_offset + count);
+    int64_t range_end = std::min(iterations, iters_offset + count);
     thread_infos_[i].range_end = range_end;
     thread_infos_[i].range_len = range_end - iters_offset;
     thread_infos_[i].func = reinterpret_cast<uintptr_t>(&func);
@@ -263,10 +263,10 @@ void ThreadPool::ThreadLoop(size_t tid) {
 void ThreadPool::ThreadRun(size_t tid) {
   ThreadInfo &thread_info = thread_infos_[tid];
   uintptr_t func_ptr = thread_info.func;
-  const std::function<void(size_t)> *func =
-      reinterpret_cast<const std::function<void(size_t)> *>(func_ptr);
+  const std::function<void(int64_t)> *func =
+      reinterpret_cast<const std::function<void(int64_t)> *>(func_ptr);
   // do own work
-  size_t range_len;
+  int64_t range_len;
   while ((range_len = thread_info.range_len) > 0) {
     if (thread_info.range_len.compare_exchange_strong(range_len,
                                                       range_len - 1)) {
@@ -280,33 +280,33 @@ void ThreadPool::ThreadRun(size_t tid) {
        t = (t + 1) % thread_count) {
     ThreadInfo &other_thread_info = thread_infos_[t];
     uintptr_t other_func_ptr = other_thread_info.func;
-    const std::function<void(size_t)> *other_func =
-        reinterpret_cast<const std::function<void(size_t)> *>(
+    const std::function<void(int64_t)> *other_func =
+        reinterpret_cast<const std::function<void(int64_t)> *>(
             other_func_ptr);
     while ((range_len = other_thread_info.range_len) > 0) {
       if (other_thread_info.range_len.compare_exchange_strong(range_len,
                                                               range_len - 1)) {
-        size_t tail = other_thread_info.range_end--;
+        int64_t tail = other_thread_info.range_end--;
         other_func->operator()(tail - 1);
       }
     }
   }
 }
 
-void ThreadPool::Compute1D(const std::function<void(size_t,
-                                                    size_t,
-                                                    size_t)> &func,
-                           size_t start,
-                           size_t end,
-                           size_t step,
-                           size_t tile_size,
-                           int cost_per_item) {
+void ThreadPool::Compute1D(const std::function<void(int64_t,
+                                                    int64_t,
+                                                    int64_t)> &func,
+                           const int64_t start,
+                           const int64_t end,
+                           const int64_t step,
+                           int64_t tile_size,
+                           const int cost_per_item) {
   if (start >= end) {
     return;
   }
 
-  size_t items = 1 + (end - start - 1) / step;
+  int64_t items = 1 + (end - start - 1) / step;
   if (threads_.size() <= 1 || (cost_per_item >= 0 &&
       items * cost_per_item < kMaxCostUsingSingleThread)) {
     func(start, end, step);
@@ -314,39 +314,39 @@ void ThreadPool::Compute1D(const std::function<void(size_t,
   }
 
   if (tile_size == 0) {
-    tile_size = std::max(static_cast<size_t>(1), items / default_tile_count_);
+    tile_size = std::max(static_cast<int64_t>(1), items / default_tile_count_);
   }
 
-  size_t step_tile_size = step * tile_size;
-  size_t tile_count = RoundUpDiv(items, tile_size);
-  Run([&](size_t tile_idx) {
-    size_t tile_start = start + tile_idx * step_tile_size;
-    size_t tile_end = std::min(end, tile_start + step_tile_size);
+  int64_t step_tile_size = step * tile_size;
+  int64_t tile_count = RoundUpDiv(items, tile_size);
+  Run([&](int64_t tile_idx) {
+    int64_t tile_start = start + tile_idx * step_tile_size;
+    int64_t tile_end = std::min(end, tile_start + step_tile_size);
     func(tile_start, tile_end, step);
   }, tile_count);
 }
 
-void ThreadPool::Compute2D(const std::function<void(size_t,
-                                                    size_t,
-                                                    size_t,
-                                                    size_t,
-                                                    size_t,
-                                                    size_t)> &func,
-                           size_t start0,
-                           size_t end0,
-                           size_t step0,
-                           size_t start1,
-                           size_t end1,
-                           size_t step1,
-                           size_t tile_size0,
-                           size_t tile_size1,
-                           int cost_per_item) {
+void ThreadPool::Compute2D(const std::function<void(int64_t,
+                                                    int64_t,
+                                                    int64_t,
+                                                    int64_t,
+                                                    int64_t,
+                                                    int64_t)> &func,
+                           const int64_t start0,
+                           const int64_t end0,
+                           const int64_t step0,
+                           const int64_t start1,
+                           const int64_t end1,
+                           const int64_t step1,
+                           int64_t tile_size0,
+                           int64_t tile_size1,
+                           const int cost_per_item) {
   if (start0 >= end0 || start1 >= end1) {
     return;
   }
 
-  size_t items0 = 1 + (end0 - start0 - 1) / step0;
-  size_t items1 = 1 + (end1 - start1 - 1) / step1;
+  int64_t items0 = 1 + (end0 - start0 - 1) / step0;
+  int64_t items1 = 1 + (end1 - start1 - 1) / step1;
   if (threads_.size() <= 1 || (cost_per_item >= 0 &&
       items0 * items1 * cost_per_item < kMaxCostUsingSingleThread)) {
     func(start0, end0, step0, start1, end1, step1);
@@ -359,56 +359,56 @@ void ThreadPool::Compute2D(const std::function<void(size_t,
       tile_size1 = items1;
     } else {
       tile_size0 = 1;
-      tile_size1 = std::max(static_cast<size_t>(1),
+      tile_size1 = std::max(static_cast<int64_t>(1),
                             items1 * items0 / default_tile_count_);
     }
   }
 
-  size_t step_tile_size0 = step0 * tile_size0;
-  size_t step_tile_size1 = step1 * tile_size1;
-  size_t tile_count0 = RoundUpDiv(items0, tile_size0);
-  size_t tile_count1 = RoundUpDiv(items1, tile_size1);
-
-  Run([&](size_t tile_idx) {
-    size_t tile_idx0 = tile_idx / tile_count1;
-    size_t tile_idx1 = tile_idx - tile_idx0 * tile_count1;
-    size_t tile_start0 = start0 + tile_idx0 * step_tile_size0;
-    size_t tile_end0 = std::min(end0, tile_start0 + step_tile_size0);
-    size_t tile_start1 = start1 + tile_idx1 * step_tile_size1;
-    size_t tile_end1 = std::min(end1, tile_start1 + step_tile_size1);
+  int64_t step_tile_size0 = step0 * tile_size0;
+  int64_t step_tile_size1 = step1 * tile_size1;
+  int64_t tile_count0 = RoundUpDiv(items0, tile_size0);
+  int64_t tile_count1 = RoundUpDiv(items1, tile_size1);
+
+  Run([&](int64_t tile_idx) {
+    int64_t tile_idx0 = tile_idx / tile_count1;
+    int64_t tile_idx1 = tile_idx - tile_idx0 * tile_count1;
+    int64_t tile_start0 = start0 + tile_idx0 * step_tile_size0;
+    int64_t tile_end0 = std::min(end0, tile_start0 + step_tile_size0);
+    int64_t tile_start1 = start1 + tile_idx1 * step_tile_size1;
+    int64_t tile_end1 = std::min(end1, tile_start1 + step_tile_size1);
     func(tile_start0, tile_end0, step0, tile_start1, tile_end1, step1);
   }, tile_count0 * tile_count1);
 }
 
-void ThreadPool::Compute3D(const std::function<void(size_t,
-                                                    size_t,
-                                                    size_t,
-                                                    size_t,
-                                                    size_t,
-                                                    size_t,
-                                                    size_t,
-                                                    size_t,
-                                                    size_t)> &func,
-                           size_t start0,
-                           size_t end0,
-                           size_t step0,
-                           size_t start1,
-                           size_t end1,
-                           size_t step1,
-                           size_t start2,
-                           size_t end2,
-                           size_t step2,
-                           size_t tile_size0,
-                           size_t tile_size1,
-                           size_t tile_size2,
-                           int cost_per_item) {
+void ThreadPool::Compute3D(const std::function<void(int64_t,
+                                                    int64_t,
+                                                    int64_t,
+                                                    int64_t,
+                                                    int64_t,
+                                                    int64_t,
+                                                    int64_t,
+                                                    int64_t,
+                                                    int64_t)> &func,
+                           const int64_t start0,
+                           const int64_t end0,
+                           const int64_t step0,
+                           const int64_t start1,
+                           const int64_t end1,
+                           const int64_t step1,
+                           const int64_t start2,
+                           const int64_t end2,
+                           const int64_t step2,
+                           int64_t tile_size0,
+                           int64_t tile_size1,
+                           int64_t tile_size2,
+                           const int cost_per_item) {
   if (start0 >= end0 || start1 >= end1 || start2 >= end1) {
     return;
   }
 
-  size_t items0 = 1 + (end0 - start0 - 1) / step0;
-  size_t items1 = 1 + (end1 - start1 - 1) / step1;
-  size_t items2 = 1 + (end2 - start2 - 1) / step2;
+  int64_t items0 = 1 + (end0 - start0 - 1) / step0;
+  int64_t items1 = 1 + (end1 - start1 - 1) / step1;
+  int64_t items2 = 1 + (end2 - start2 - 1) / step2;
   if (threads_.size() <= 1 || (cost_per_item >= 0 &&
      items0 * items1 * items2 * cost_per_item < kMaxCostUsingSingleThread)) {
@@ -423,37 +423,37 @@ void ThreadPool::Compute3D(const std::function<void(size_t,
       tile_size2 = items2;
     } else {
       tile_size0 = 1;
-      size_t items01 = items1 * items0;
+      int64_t items01 = items1 * items0;
       if (items01 >= default_tile_count_) {
         tile_size1 = items01 / default_tile_count_;
         tile_size2 = items2;
       } else {
         tile_size1 = 1;
-        tile_size2 = std::max(static_cast<size_t>(1),
+        tile_size2 = std::max(static_cast<int64_t>(1),
                               items01 * items2 / default_tile_count_);
       }
     }
   }
 
-  size_t step_tile_size0 = step0 * tile_size0;
-  size_t step_tile_size1 = step1 * tile_size1;
-  size_t step_tile_size2 = step2 * tile_size2;
-  size_t tile_count0 = RoundUpDiv(items0, tile_size0);
-  size_t tile_count1 = RoundUpDiv(items1, tile_size1);
-  size_t tile_count2 = RoundUpDiv(items2, tile_size2);
-  size_t tile_count12 = tile_count1 * tile_count2;
-
-  Run([&](size_t tile_idx) {
-    size_t tile_idx0 = tile_idx / tile_count12;
-    size_t tile_idx12 = tile_idx - tile_idx0 * tile_count12;
-    size_t tile_idx1 = tile_idx12 / tile_count2;
-    size_t tile_idx2 = tile_idx12 - tile_idx1 * tile_count2;
-    size_t tile_start0 = start0 + tile_idx0 * step_tile_size0;
-    size_t tile_end0 = std::min(end0, tile_start0 + step_tile_size0);
-    size_t tile_start1 = start1 + tile_idx1 * step_tile_size1;
-    size_t tile_end1 = std::min(end1, tile_start1 + step_tile_size1);
-    size_t tile_start2 = start2 + tile_idx2 * step_tile_size2;
-    size_t tile_end2 = std::min(end2, tile_start2 + step_tile_size2);
+  int64_t step_tile_size0 = step0 * tile_size0;
+  int64_t step_tile_size1 = step1 * tile_size1;
+  int64_t step_tile_size2 = step2 * tile_size2;
+  int64_t tile_count0 = RoundUpDiv(items0, tile_size0);
+  int64_t tile_count1 = RoundUpDiv(items1, tile_size1);
+  int64_t tile_count2 = RoundUpDiv(items2, tile_size2);
+  int64_t tile_count12 = tile_count1 * tile_count2;
+
+  Run([&](int64_t tile_idx) {
+    int64_t tile_idx0 = tile_idx / tile_count12;
+    int64_t tile_idx12 = tile_idx - tile_idx0 * tile_count12;
+    int64_t tile_idx1 = tile_idx12 / tile_count2;
+    int64_t tile_idx2 = tile_idx12 - tile_idx1 * tile_count2;
+    int64_t tile_start0 = start0 + tile_idx0 * step_tile_size0;
+    int64_t tile_end0 = std::min(end0, tile_start0 + step_tile_size0);
+    int64_t tile_start1 = start1 + tile_idx1 * step_tile_size1;
+    int64_t tile_end1 = std::min(end1, tile_start1 + step_tile_size1);
+    int64_t tile_start2 = start2 + tile_idx2 * step_tile_size2;
+    int64_t tile_end2 = std::min(end2, tile_start2 + step_tile_size2);
     func(tile_start0, tile_end0, step0,
diff --git a/mace/utils/thread_pool.h b/mace/utils/thread_pool.h
index 67fa89cf112cc3b3fc55124a73ef8bb63633e57c..90d30257bf66da0b7d6d82776b87071779396b9f 100644
--- a/mace/utils/thread_pool.h
+++ b/mace/utils/thread_pool.h
@@ -37,54 +37,55 @@ class ThreadPool {
 
   void Init();
 
-  void Run(const std::function<void(size_t)> &func, size_t iterations);
-
-  void Compute1D(const std::function<void(size_t,
-                                          size_t,
-                                          size_t)> &func,
-                 size_t start,
-                 size_t end,
-                 size_t step,
-                 size_t tile_size = 0,
+  void Run(const std::function<void(int64_t)> &func,
+           const int64_t iterations);
+
+  void Compute1D(const std::function<void(int64_t,
+                                          int64_t,
+                                          int64_t)> &func,
+                 int64_t start,
+                 int64_t end,
+                 int64_t step,
+                 int64_t tile_size = 0,
                  int cost_per_item = -1);
 
-  void Compute2D(const std::function<void(size_t,
-                                          size_t,
-                                          size_t,
-                                          size_t,
-                                          size_t,
-                                          size_t)> &func,
-                 size_t start0,
-                 size_t end0,
-                 size_t step0,
-                 size_t start1,
-                 size_t end1,
-                 size_t step1,
-                 size_t tile_size0 = 0,
-                 size_t tile_size1 = 0,
+  void Compute2D(const std::function<void(int64_t,
+                                          int64_t,
+                                          int64_t,
+                                          int64_t,
+                                          int64_t,
+                                          int64_t)> &func,
+                 int64_t start0,
+                 int64_t end0,
+                 int64_t step0,
+                 int64_t start1,
+                 int64_t end1,
+                 int64_t step1,
+                 int64_t tile_size0 = 0,
+                 int64_t tile_size1 = 0,
                  int cost_per_item = -1);
 
-  void Compute3D(const std::function<void(size_t,
-                                          size_t,
-                                          size_t,
-                                          size_t,
-                                          size_t,
-                                          size_t,
-                                          size_t,
-                                          size_t,
-                                          size_t)> &func,
-                 size_t start0,
-                 size_t end0,
-                 size_t step0,
-                 size_t start1,
-                 size_t end1,
-                 size_t step1,
-                 size_t start2,
-                 size_t end2,
-                 size_t step2,
-                 size_t tile_size0 = 0,
-                 size_t tile_size1 = 0,
-                 size_t tile_size2 = 0,
+  void Compute3D(const std::function<void(int64_t,
+                                          int64_t,
+                                          int64_t,
+                                          int64_t,
+                                          int64_t,
+                                          int64_t,
+                                          int64_t,
+                                          int64_t,
+                                          int64_t)> &func,
+                 int64_t start0,
+                 int64_t end0,
+                 int64_t step0,
+                 int64_t start1,
+                 int64_t end1,
+                 int64_t step1,
+                 int64_t start2,
+                 int64_t end2,
+                 int64_t step2,
+                 int64_t tile_size0 = 0,
+                 int64_t tile_size1 = 0,
+                 int64_t tile_size2 = 0,
                  int cost_per_item = -1);
 
  private:
@@ -100,16 +101,16 @@ class ThreadPool {
   std::mutex run_mutex_;
 
   struct ThreadInfo {
-    size_t range_start;
-    std::atomic<size_t> range_end;
-    std::atomic<size_t> range_len;
+    std::atomic<int64_t> range_start;
+    std::atomic<int64_t> range_end;
+    std::atomic<int64_t> range_len;
    uintptr_t func;
     std::vector<size_t> cpu_cores;
   };
 
   std::vector<ThreadInfo> thread_infos_;
   std::vector<std::thread> threads_;
-  size_t default_tile_count_;
+  int64_t default_tile_count_;
 };
 
 }  // namespace utils
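These signatures are what the OpenMP pragmas removed elsewhere in this change migrate to. A minimal sketch of the pattern, relying only on the Compute1D declaration shown above (the wrapper function and buffer names are hypothetical, and constructing the pool itself is outside this hunk):

#include <cstdint>

#include "mace/utils/thread_pool.h"

// Before: "#pragma omp parallel for" over i in [0, size).
// After: the loop body becomes a lambda over a [start, end) sub-range and the
// pool decides how to tile and distribute the range across its workers.
void ScaleBuffer(mace::utils::ThreadPool *thread_pool,
                 const float *input, float *output,
                 const int64_t size, const float scale) {
  thread_pool->Compute1D(
      [=](int64_t start, int64_t end, int64_t step) {
        for (int64_t i = start; i < end; i += step) {
          output[i] = scale * input[i];
        }
      },
      0, size, 1);  // start, end, step; tile_size and cost_per_item default
}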
diff --git a/mace/utils/thread_pool_test.cc b/mace/utils/thread_pool_test.cc
index 281b335b42b24d0b86e5c6816d03a50cb8845633..a6d5cb04d4f6122b1e18774a737aa470e7261f25 100644
--- a/mace/utils/thread_pool_test.cc
+++ b/mace/utils/thread_pool_test.cc
@@ -30,27 +30,29 @@ class ThreadPoolTest : public ::testing::Test {
   ThreadPool thread_pool;
 };
 
-void Test1D(size_t start, size_t end, size_t step, std::vector<float> *res) {
-  for (size_t i = start; i < end; i += step) {
+void Test1D(int64_t start, int64_t end, int64_t step, std::vector<float> *res) {
+  for (int64_t i = start; i < end; i += step) {
     (*res)[i]++;
   }
 }
 
-void Test2D(size_t start0, size_t end0, size_t step0,
-            size_t start1, size_t end1, size_t step1, std::vector<float> *res) {
-  for (size_t i = start0; i < end0; i += step0) {
-    for (size_t j = start1; j < end1; j += step1) {
+void Test2D(int64_t start0, int64_t end0, int64_t step0,
+            int64_t start1, int64_t end1, int64_t step1,
+            std::vector<float> *res) {
+  for (int64_t i = start0; i < end0; i += step0) {
+    for (int64_t j = start1; j < end1; j += step1) {
       (*res)[i * 100 + j]++;
     }
   }
 }
 
-void Test3D(size_t start0, size_t end0, size_t step0,
-            size_t start1, size_t end1, size_t step1,
-            size_t start2, size_t end2, size_t step2, std::vector<float> *res) {
-  for (size_t i = start0; i < end0; i += step0) {
-    for (size_t j = start1; j < end1; j += step1) {
-      for (size_t k = start2; k < end2; k += step2) {
+void Test3D(int64_t start0, int64_t end0, int64_t step0,
+            int64_t start1, int64_t end1, int64_t step1,
+            int64_t start2, int64_t end2, int64_t step2,
+            std::vector<float> *res) {
+  for (int64_t i = start0; i < end0; i += step0) {
+    for (int64_t j = start1; j < end1; j += step1) {
+      for (int64_t k = start2; k < end2; k += step2) {
         (*res)[(i * 100 + j) * 100 + k]++;
       }
     }
@@ -58,47 +60,47 @@ void Test3D(size_t start0, size_t end0, size_t step0,
 }
 
 TEST_F(ThreadPoolTest, Compute1D) {
-  size_t test_size = 100;
+  int64_t test_size = 100;
   std::vector<float> actual(test_size, 0);
-  thread_pool.Compute1D([&](size_t start, size_t end, size_t step) {
+  thread_pool.Compute1D([&](int64_t start, int64_t end, int64_t step) {
     Test1D(start, end, step, &actual);
   }, 0, test_size, 2);
 
   std::vector<float> expected(test_size, 0);
   Test1D(0, test_size, 2, &expected);
-  for (size_t i = 0; i < test_size; ++i) {
+  for (int64_t i = 0; i < test_size; ++i) {
     EXPECT_EQ(expected[i], actual[i]);
   }
 }
 
 TEST_F(ThreadPoolTest, Compute2D) {
-  size_t test_size = 100;
+  int64_t test_size = 100;
   std::vector<float> actual(test_size * test_size, 0);
-  thread_pool.Compute2D([&](size_t start0, size_t end0, size_t step0,
-                            size_t start1, size_t end1, size_t step1) {
+  thread_pool.Compute2D([&](int64_t start0, int64_t end0, int64_t step0,
+                            int64_t start1, int64_t end1, int64_t step1) {
     Test2D(start0, end0, step0, start1, end1, step1, &actual);
   }, 0, test_size, 2, 0, test_size, 2);
 
   std::vector<float> expected(test_size * test_size, 0);
   Test2D(0, test_size, 2, 0, test_size, 2, &expected);
-  for (size_t i = 0; i < test_size * test_size; ++i) {
+  for (int64_t i = 0; i < test_size * test_size; ++i) {
     EXPECT_EQ(expected[i], actual[i]);
   }
 }
 
 TEST_F(ThreadPoolTest, Compute3D) {
-  size_t test_size = 100;
+  int64_t test_size = 100;
   std::vector<float> actual(test_size * test_size * test_size, 0);
-  thread_pool.Compute3D([&](size_t start0, size_t end0, size_t step0,
-                            size_t start1, size_t end1, size_t step1,
-                            size_t start2, size_t end2, size_t step2) {
+  thread_pool.Compute3D([&](int64_t start0, int64_t end0, int64_t step0,
+                            int64_t start1, int64_t end1, int64_t step1,
+                            int64_t start2, int64_t end2, int64_t step2) {
     Test3D(start0, end0, step0, start1, end1, step1,
            start2, end2, step2, &actual);
   }, 0, test_size, 2, 0, test_size, 2, 0, test_size, 2);
 
   std::vector<float> expected(test_size * test_size * test_size, 0);
   Test3D(0, test_size, 2, 0, test_size, 2, 0, test_size, 2, &expected);
-  for (size_t i = 0; i < test_size * test_size * test_size; ++i) {
+  for (int64_t i = 0; i < test_size * test_size * test_size; ++i) {
     EXPECT_EQ(expected[i], actual[i]);
   }
 }
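The tests above rely on the default tiling (tile_size = 0, cost_per_item = -1). Per the Compute1D implementation earlier in this diff, tile_size = 0 makes the pool derive a tile from default_tile_count_, while a non-negative cost_per_item lets a call short-circuit to a plain inline loop when items * cost_per_item stays under kMaxCostUsingSingleThread. A hypothetical call passing explicit hints (function name and values are illustrative only):

#include <cstdint>
#include <vector>

#include "mace/utils/thread_pool.h"

void AddBias(mace::utils::ThreadPool *thread_pool,
             std::vector<float> *data, const float bias) {
  const int64_t size = static_cast<int64_t>(data->size());
  float *ptr = data->data();
  thread_pool->Compute1D(
      [=](int64_t start, int64_t end, int64_t step) {
        for (int64_t i = start; i < end; i += step) {
          ptr[i] += bias;
        }
      },
      0, size, 1,
      /*tile_size=*/1024,    // fixed work-stealing granularity
      /*cost_per_item=*/2);  // rough per-element cost hint
}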
diff --git a/tools/bazel.rc b/tools/bazel.rc
index ef5fd59791bcb68cb0bc1ffc75ad936b7f3d58c4..0a49d2cebd18add2bab3b7eae341b730acf771dd 100644
--- a/tools/bazel.rc
+++ b/tools/bazel.rc
@@ -8,7 +8,6 @@ build --copt=-std=c++11
 build --copt=-fPIC
 build --copt=-D_GLIBCXX_USE_C99_MATH_TR1
 build --copt=-DMACE_OBFUSCATE_LITERALS
-build --copt=-DGEMMLOWP_USE_OPENMP
 
 # Usage example: bazel build --config android
 build:android --linkopt=-pie
diff --git a/tools/bazel_adb_run.py b/tools/bazel_adb_run.py
index 7083d3180b94acca93778ad33667bb7869b46f17..dec4e21f566490c4dc8f4cf218f2999a25e44d3a 100644
--- a/tools/bazel_adb_run.py
+++ b/tools/bazel_adb_run.py
@@ -107,8 +107,8 @@ def parse_args():
     parser.add_argument(
         "--enable_openmp",
         type=str2bool,
-        default=True,
-        help="Disable openmp for multiple thread.")
+        default=False,
+        help="Whether to use openmp")
     parser.add_argument(
         '--address_sanitizer',
         action="store_true",
@@ -143,9 +143,9 @@ def main(unused_args):
        abi=target_abi,
        toolchain=toolchain,
        enable_neon=FLAGS.enable_neon,
+       enable_openmp=FLAGS.enable_openmp,
        address_sanitizer=FLAGS.address_sanitizer,
-       debug_mode=FLAGS.debug_mode,
-       enable_openmp=FLAGS.enable_openmp)
+       debug_mode=FLAGS.debug_mode)
     if FLAGS.run_target:
         target_devices = DeviceManager.list_devices(FLAGS.device_yml)
         if FLAGS.target_socs != TargetSOCTag.all and\
diff --git a/tools/build-standalone-lib.sh b/tools/build-standalone-lib.sh
index 5d35e541a5be5252ff257865bbfaddac464925e7..f812450d1371b5bb7b5a713dad3a84da28d4ea9b 100755
--- a/tools/build-standalone-lib.sh
+++ b/tools/build-standalone-lib.sh
@@ -33,67 +33,67 @@ mkdir -p $LIB_DIR/aarch64_linux_gnu/cpu_gpu
 
 # build shared libraries
 echo "build shared lib for armeabi-v7a + cpu_gpu_dsp"
-bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true --define hexagon=true --define quantize=true --cpu=armeabi-v7a
+bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=false --define opencl=true --define hexagon=true --define quantize=true --cpu=armeabi-v7a
 cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/armeabi-v7a/cpu_gpu_dsp/
 cp third_party/nnlib/armeabi-v7a/*so $LIB_DIR/armeabi-v7a/cpu_gpu_dsp/
 
 echo "build shared lib for arm64-v8a + cpu_gpu_dsp"
-bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true --define hexagon=true --define quantize=true --cpu=arm64-v8a
+bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=false --define opencl=true --define hexagon=true --define quantize=true --cpu=arm64-v8a
 cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/arm64-v8a/cpu_gpu_dsp/
 cp third_party/nnlib/arm64-v8a/*so $LIB_DIR/arm64-v8a/cpu_gpu_dsp/
 
 echo "build shared lib for armeabi-v7a + cpu_gpu"
-bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true --define quantize=true --cpu=armeabi-v7a
+bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=false --define opencl=true --define quantize=true --cpu=armeabi-v7a
 cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/armeabi-v7a/cpu_gpu/
 
 echo "build shared lib for arm64-v8a + cpu_gpu"
-bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true --define quantize=true --cpu=arm64-v8a
+bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=false --define opencl=true --define quantize=true --cpu=arm64-v8a
 cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/arm64-v8a/cpu_gpu/
 
 echo "build shared lib for arm_linux_gnueabihf + cpu_gpu"
-bazel build --config arm_linux_gnueabihf --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true --define quantize=true
+bazel build --config arm_linux_gnueabihf --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=false --define opencl=true --define quantize=true
 cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/arm_linux_gnueabihf/cpu_gpu/
 
 echo "build shared lib for aarch64_linux_gnu + cpu_gpu"
-bazel build --config aarch64_linux_gnu --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true --define quantize=true
+bazel build --config aarch64_linux_gnu --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=false --define opencl=true --define quantize=true
 cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/aarch64_linux_gnu/cpu_gpu/
 
 if [[ "$OSTYPE" != "darwin"* ]];then
   echo "build shared lib for linux-x86-64"
-  bazel build mace/libmace:libmace_dynamic --config optimization --define quantize=true --define openmp=true
+  bazel build mace/libmace:libmace_dynamic --config optimization --define quantize=true --define openmp=false
   cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/linux-x86-64/
 fi
 
 # build static libraries
 echo "build static lib for armeabi-v7a + cpu_gpu_dsp"
-bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true --define hexagon=true --define quantize=true --cpu=armeabi-v7a
+bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=false --define opencl=true --define hexagon=true --define quantize=true --cpu=armeabi-v7a
 cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/armeabi-v7a/cpu_gpu_dsp/
 cp third_party/nnlib/armeabi-v7a/*so $LIB_DIR/armeabi-v7a/cpu_gpu_dsp/
 
 echo "build static lib for arm64-v8a + cpu_gpu_dsp"
-bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true --define hexagon=true --define quantize=true --cpu=arm64-v8a
+bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=false --define opencl=true --define hexagon=true --define quantize=true --cpu=arm64-v8a
 cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/arm64-v8a/cpu_gpu_dsp/
 cp third_party/nnlib/arm64-v8a/*so $LIB_DIR/arm64-v8a/cpu_gpu_dsp/
 
 echo "build static lib for armeabi-v7a + cpu_gpu"
-bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true --define quantize=true --cpu=armeabi-v7a
+bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=false --define opencl=true --define quantize=true --cpu=armeabi-v7a
 cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/armeabi-v7a/cpu_gpu/
 
 echo "build static lib for arm64-v8a + cpu_gpu"
-bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true --define quantize=true --cpu=arm64-v8a
+bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=false --define opencl=true --define quantize=true --cpu=arm64-v8a
 cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/arm64-v8a/cpu_gpu/
 
 echo "build static lib for arm_linux_gnueabihf + cpu_gpu"
-bazel build --config arm_linux_gnueabihf --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true --define quantize=true
+bazel build --config arm_linux_gnueabihf --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=false --define opencl=true --define quantize=true
 cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/arm_linux_gnueabihf/cpu_gpu/
 
 echo "build static lib for aarch64_linux_gnu + cpu_gpu"
-bazel build --config aarch64_linux_gnu --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true --define quantize=true
+bazel build --config aarch64_linux_gnu --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=false --define opencl=true --define quantize=true
 cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/aarch64_linux_gnu/cpu_gpu/
 
 if [[ "$OSTYPE" != "darwin"* ]];then
   echo "build static lib for linux-x86-64"
-  bazel build mace/libmace:libmace_static --config optimization --define quantize=true --define openmp=true
+  bazel build mace/libmace:libmace_static --config optimization --define quantize=true --define openmp=false
   cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/linux-x86-64/
 fi
diff --git a/tools/converter.py b/tools/converter.py
index 7bf387bd5835517d0b2c524d709febaf89fee175..a5df88a9cecd8493b26b6462b33a9aaff729f99b 100644
--- a/tools/converter.py
+++ b/tools/converter.py
@@ -983,7 +983,7 @@ def run_mace(flags):
                 build_example(configs,
                               target_abi,
                               toolchain,
-                              not flags.disable_openmp,
+                              flags.enable_openmp,
                               flags.mace_lib_type,
                               flags.cl_binary_to_code,
                               device,
@@ -992,7 +992,7 @@ def run_mace(flags):
                 build_mace_run(configs,
                                target_abi,
                                toolchain,
-                               not flags.disable_openmp,
+                               flags.enable_openmp,
                                flags.address_sanitizer,
                                flags.mace_lib_type,
                                flags.debug_mode)
@@ -1081,7 +1081,7 @@ def benchmark_model(flags):
             build_benchmark_model(configs,
                                   target_abi,
                                   toolchain,
-                                  not flags.disable_openmp,
+                                  flags.enable_openmp,
                                   flags.mace_lib_type,
                                   flags.debug_mode)
             device = DeviceWrapper(dev)
@@ -1171,9 +1171,9 @@ def parse_args():
         default=DefaultValues.mace_lib_type,
         help="[static | dynamic], Which type MACE library to use.")
     run_bm_parent_parser.add_argument(
-        "--disable_openmp",
+        "--enable_openmp",
         action="store_true",
-        help="Disable openmp for multiple thread.")
+        help="Enable openmp for multiple thread.")
     run_bm_parent_parser.add_argument(
         "--omp_num_threads",
         type=int,
diff --git a/tools/sh_commands.py b/tools/sh_commands.py
index 44047c26518ea0ba00a3ea58a384b132992cc284..3b98c7a691bf6a047bdc91bfd4c90cc36d336d4e 100644
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -265,7 +265,7 @@ def bazel_build(target,
                 toolchain='android',
                 enable_hexagon=False,
                 enable_hta=False,
-                enable_openmp=True,
+                enable_openmp=False,
                 enable_neon=True,
                 enable_opencl=True,
                 enable_quantize=True,