diff --git a/mace/examples/android/gradle/wrapper/gradle-wrapper.properties b/mace/examples/android/gradle/wrapper/gradle-wrapper.properties index 810d36ffaadac7c0ed9b1df9de730b53998ee86c..929e9008d387b7b251068dcde25327e03ba43aeb 100644 --- a/mace/examples/android/gradle/wrapper/gradle-wrapper.properties +++ b/mace/examples/android/gradle/wrapper/gradle-wrapper.properties @@ -1,4 +1,4 @@ -#Wed May 02 11:53:35 CST 2018 +#Wed Sep 12 13:35:24 CST 2018 distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists zipStoreBase=GRADLE_USER_HOME diff --git a/mace/kernels/arm/deconv_2d_neon.h b/mace/kernels/arm/deconv_2d_neon.h new file mode 100644 index 0000000000000000000000000000000000000000..87b86b066bb1274ca3a3350a85b4d409880d43d6 --- /dev/null +++ b/mace/kernels/arm/deconv_2d_neon.h @@ -0,0 +1,100 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_ARM_DECONV_2D_NEON_H_ +#define MACE_KERNELS_ARM_DECONV_2D_NEON_H_ + +#if defined(MACE_ENABLE_NEON) +#include +#endif + +#include "mace/core/types.h" + +namespace mace { +namespace kernels { + +void Deconv2dNeonK3x3S1(const float *input, + const float *filter, + const float *bias, + const index_t *in_shape, + const index_t *out_shape, + float *output); + +void Deconv2dNeonK3x3S2(const float *input, + const float *filter, + const float *bias, + const index_t *in_shape, + const index_t *out_shape, + float *output); + +void Deconv2dNeonK4x4S1(const float *input, + const float *filter, + const float *bias, + const index_t *in_shape, + const index_t *out_shape, + float *output); + +void Deconv2dNeonK4x4S2(const float *input, + const float *filter, + const float *bias, + const index_t *in_shape, + const index_t *out_shape, + float *output); + +#ifdef MACE_ENABLE_NEON +inline float32x4_t neon_vfma_lane_0(float32x4_t a, + float32x4_t b, + float32x4_t c) { +#ifdef __aarch64__ + return vfmaq_laneq_f32(a, b, c, 0); +#else + return vmlaq_lane_f32(a, b, vget_low_f32(c), 0); +#endif +} + +inline float32x4_t neon_vfma_lane_1(float32x4_t a, + float32x4_t b, + float32x4_t c) { +#ifdef __aarch64__ + return vfmaq_laneq_f32(a, b, c, 1); +#else + return vmlaq_lane_f32(a, b, vget_low_f32(c), 1); +#endif +} + +inline float32x4_t neon_vfma_lane_2(float32x4_t a, + float32x4_t b, + float32x4_t c) { +#ifdef __aarch64__ + return vfmaq_laneq_f32(a, b, c, 2); +#else + return vmlaq_lane_f32(a, b, vget_high_f32(c), 0); +#endif +} + +inline float32x4_t neon_vfma_lane_3(float32x4_t a, + float32x4_t b, + float32x4_t c) { +#ifdef __aarch64__ + return vfmaq_laneq_f32(a, b, c, 3); +#else + return vmlaq_lane_f32(a, b, vget_high_f32(c), 1); +#endif +} +#endif + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_ARM_DECONV_2D_NEON_H_ diff --git a/mace/kernels/arm/deconv_2d_neon_3x3.cc b/mace/kernels/arm/deconv_2d_neon_3x3.cc new file mode 100644 index 0000000000000000000000000000000000000000..d4d7d0cdffe767ca8dd5bf8092bbce57b504d8f5 --- /dev/null +++ b/mace/kernels/arm/deconv_2d_neon_3x3.cc @@ -0,0 +1,405 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/macros.h" +#include "mace/kernels/arm/deconv_2d_neon.h" + +namespace mace { +namespace kernels { + +void Deconv2dNeonK3x3S1(const float *input, + const float *filter, + const float *bias, + const index_t *in_shape, + const index_t *out_shape, + float *output) { + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + + const index_t out_img_size = outh * outw; + +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t oc = 0; oc < outch; oc += 2) { + if (oc + 1 < outch) { + float *out_base0 = output + (b * outch + oc) * out_img_size; + float *out_base1 = out_base0 + out_img_size; + + const float bias_value0 = bias ? bias[oc] : 0.f; + const float bias_value1 = bias ? bias[oc + 1] : 0.f; + std::fill_n(out_base0, out_img_size, bias_value0); + std::fill_n(out_base1, out_img_size, bias_value1); + + for (index_t ic = 0; ic < inch; ++ic) { + const float *input_base = input + (b * inch + ic) * h * w; + const float *kernel_base0 = filter + (oc * inch + ic) * 9; + const float *kernel_base1 = kernel_base0 + inch * 9; + const float *in = input_base; + + // output channel 0 + const float *k0_0 = kernel_base0; + const float *k0_1 = kernel_base0 + 3; + const float *k0_2 = kernel_base0 + 5; + // output channel 1 + const float *k1_0 = kernel_base1; + const float *k1_1 = kernel_base1 + 3; + const float *k1_2 = kernel_base1 + 5; + +#if defined(MACE_ENABLE_NEON) + // load filter + float32x4_t k00_vec, k01_vec, k02_vec; + float32x4_t k10_vec, k11_vec, k12_vec; + + k00_vec = vld1q_f32(k0_0); + k01_vec = vld1q_f32(k0_1); + k02_vec = vld1q_f32(k0_2); + + k10_vec = vld1q_f32(k1_0); + k11_vec = vld1q_f32(k1_1); + k12_vec = vld1q_f32(k1_2); +#endif + for (index_t i = 0; i < h; ++i) { + float *out_row_base0 = out_base0 + i * outw; + float *out_row0_0 = out_row_base0; + float *out_row0_1 = out_row_base0 + outw; + float *out_row0_2 = out_row_base0 + 2 * outw; + + float *out_row_base1 = out_base1 + i * outw; + float *out_row1_0 = out_row_base1; + float *out_row1_1 = out_row_base1 + outw; + float *out_row1_2 = out_row_base1 + 2 * outw; + + index_t j = 0; +#if defined(MACE_ENABLE_NEON) + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00, out01, out02; + float32x4_t out10, out11, out12; + float32x4_t out20, out21, out22; + + out00 = vld1q_f32(out_row0_0); + out00 = neon_vfma_lane_0(out00, in_vec, k00_vec); + vst1q_f32(out_row0_0, out00); + + out01 = vld1q_f32(out_row0_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 1, out01); + + out02 = vld1q_f32(out_row0_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 2, out02); + + out10 = vld1q_f32(out_row0_1 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 0, out10); + + out11 = vld1q_f32(out_row0_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 1, out11); + + out12 = vld1q_f32(out_row0_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 2, out12); + + out20 = vld1q_f32(out_row0_2 + 0); + out20 = neon_vfma_lane_1(out20, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 0, out20); + + out21 = vld1q_f32(out_row0_2 + 1); + out21 = neon_vfma_lane_2(out21, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 1, out21); + + out22 = vld1q_f32(out_row0_2 + 2); + out22 = neon_vfma_lane_3(out22, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 2, out22); + + out00 = vld1q_f32(out_row1_0 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 0, out00); + + out01 = vld1q_f32(out_row1_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 1, out01); + + out02 = vld1q_f32(out_row1_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 2, out02); + + out10 = vld1q_f32(out_row1_1 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 0, out10); + + out11 = vld1q_f32(out_row1_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 1, out11); + + out12 = vld1q_f32(out_row1_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 2, out12); + + out20 = vld1q_f32(out_row1_2 + 0); + out20 = neon_vfma_lane_1(out20, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 0, out20); + + out21 = vld1q_f32(out_row1_2 + 1); + out21 = neon_vfma_lane_2(out21, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 1, out21); + + out22 = vld1q_f32(out_row1_2 + 2); + out22 = neon_vfma_lane_3(out22, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 2, out22); + + in += 4; + out_row0_0 += 4; + out_row0_1 += 4; + out_row0_2 += 4; + out_row1_0 += 4; + out_row1_1 += 4; + out_row1_2 += 4; + } +#endif + for (; j < w; ++j) { + float val = in[0]; + for (int k = 0; k < 3; ++k) { + out_row0_0[k] += val * k0_0[k]; + out_row0_1[k] += val * k0_1[k]; + out_row0_2[k] += val * k0_2[k + 1]; + out_row1_0[k] += val * k1_0[k]; + out_row1_1[k] += val * k1_1[k]; + out_row1_2[k] += val * k1_2[k + 1]; + } + in++; + out_row0_0++; + out_row0_1++; + out_row0_2++; + out_row1_0++; + out_row1_1++; + out_row1_2++; + } + } + } + } else { + float *out_base0 = output + (b * outch + oc) * outh * outw; + const float bias_value0 = bias ? bias[oc] : 0.f; + std::fill_n(out_base0, outh * outw, bias_value0); + for (index_t ic = 0; ic < inch; ++ic) { + const float *input_base = input + (b * inch + ic) * h * w; + const float *kernel_base0 = filter + (oc * inch + ic) * 9; + const float *in = input_base; + const float *k0_0 = kernel_base0; + const float *k0_1 = kernel_base0 + 3; + const float *k0_2 = kernel_base0 + 5; + +#if defined(MACE_ENABLE_NEON) + // load filter + float32x4_t k00_vec = vld1q_f32(k0_0); + float32x4_t k01_vec = vld1q_f32(k0_1); + float32x4_t k02_vec = vld1q_f32(k0_2); +#endif + for (index_t i = 0; i < h; ++i) { + float *out_row_base0 = out_base0 + i * outw; + float *out_row0_0 = out_row_base0; + float *out_row0_1 = out_row_base0 + outw; + float *out_row0_2 = out_row_base0 + 2 * outw; + index_t j = 0; +#if defined(MACE_ENABLE_NEON) + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00, out01, out02; + float32x4_t out10, out11, out12; + float32x4_t out20, out21, out22; + + out00 = vld1q_f32(out_row0_0 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 0, out00); + + out01 = vld1q_f32(out_row0_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 1, out01); + + out02 = vld1q_f32(out_row0_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 2, out02); + + out10 = vld1q_f32(out_row0_1 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 0, out10); + + out11 = vld1q_f32(out_row0_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 1, out11); + + out12 = vld1q_f32(out_row0_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 2, out12); + + out20 = vld1q_f32(out_row0_2 + 0); + out20 = neon_vfma_lane_1(out20, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 0, out20); + + out21 = vld1q_f32(out_row0_2 + 1); + out21 = neon_vfma_lane_2(out21, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 1, out21); + + out22 = vld1q_f32(out_row0_2 + 2); + out22 = neon_vfma_lane_3(out22, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 2, out22); + + in += 4; + out_row0_0 += 4; + out_row0_1 += 4; + out_row0_2 += 4; + } +#endif + for (; j < w; ++j) { + float val = in[0]; + for (int k = 0; k < 3; ++k) { + out_row0_0[k] += val * k0_0[k]; + out_row0_1[k] += val * k0_1[k]; + out_row0_2[k] += val * k0_2[k + 1]; + } + in++; + out_row0_0++; + out_row0_1++; + out_row0_2++; + } + } + } + } + } + } +} + +void Deconv2dNeonK3x3S2(const float *input, + const float *filter, + const float *bias, + const index_t *in_shape, + const index_t *out_shape, + float *output) { + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + const index_t out_img_size = outh * outw; + +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t oc = 0; oc < outch; ++oc) { + float *out_base = output + (b * outch + oc) * out_img_size; + + const float bias_value = bias ? bias[oc] : 0.f; + std::fill_n(out_base, out_img_size, bias_value); + + for (index_t ic = 0; ic < inch; ++ic) { + const float *input_base = input + (b * inch + ic) * h * w; + const float *kernel_base = filter + (oc * inch + ic) * 9; + const float *in = input_base; + + const float *k0 = kernel_base; + const float *k1 = kernel_base + 3; + const float *k2 = kernel_base + 5; + +#if defined(MACE_ENABLE_NEON) + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); +#endif + for (index_t i = 0; i < h; ++i) { + float *out_row_base = out_base + i * 2 * outw; + float *out_row_0 = out_row_base; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + + index_t j = 0; +#if defined(MACE_ENABLE_NEON) + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + // out row 0 + float32x4x2_t out00 = vld2q_f32(out_row_0); + out00.val[0] = + neon_vfma_lane_0(out00.val[0], in_vec, k0_vec); + out00.val[1] = + neon_vfma_lane_1(out00.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0, out00); + + float32x4x2_t out01 = vld2q_f32(out_row_0 + 2); + out01.val[0] = + neon_vfma_lane_2(out01.val[0], in_vec, k0_vec); + vst2q_f32(out_row_0 + 2, out01); + + // out row 1 + float32x4x2_t out10 = vld2q_f32(out_row_1); + out10.val[0] = + neon_vfma_lane_0(out10.val[0], in_vec, k1_vec); + out10.val[1] = + neon_vfma_lane_1(out10.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1, out10); + + float32x4x2_t out11 = vld2q_f32(out_row_1 + 2); + out11.val[0] = + neon_vfma_lane_2(out11.val[0], in_vec, k1_vec); + vst2q_f32(out_row_1 + 2, out11); + + // out row 2 + float32x4x2_t out20 = vld2q_f32(out_row_2); + out20.val[0] = + neon_vfma_lane_1(out20.val[0], in_vec, k2_vec); + out20.val[1] = + neon_vfma_lane_2(out20.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2, out20); + + float32x4x2_t out21 = vld2q_f32(out_row_2 + 2); + out21.val[0] = + neon_vfma_lane_3(out21.val[0], in_vec, k2_vec); + vst2q_f32(out_row_2 + 2, out21); + + in += 4; + out_row_0 += 8; + out_row_1 += 8; + out_row_2 += 8; + } +#endif + for (; j < w; ++j) { + float val = in[0]; + + for (int k = 0; k < 3; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k + 1]; + } + + in++; + out_row_0 += 2; + out_row_1 += 2; + out_row_2 += 2; + } + } + } + } + } +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/arm/deconv_2d_neon_4x4.cc b/mace/kernels/arm/deconv_2d_neon_4x4.cc new file mode 100644 index 0000000000000000000000000000000000000000..719a17e34fb1fce03276c59fa26a9f0476f3c75b --- /dev/null +++ b/mace/kernels/arm/deconv_2d_neon_4x4.cc @@ -0,0 +1,516 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/macros.h" +#include "mace/kernels/arm/deconv_2d_neon.h" + +namespace mace { +namespace kernels { + +void Deconv2dNeonK4x4S1(const float *input, + const float *filter, + const float *bias, + const index_t *in_shape, + const index_t *out_shape, + float *output) { + const index_t w = in_shape[3]; + const index_t h = in_shape[2]; + const index_t inch = in_shape[1]; + + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + const index_t outch = out_shape[1]; + const index_t out_img_size = outh * outw; +#pragma omp parallel for + for (int b = 0; b < out_shape[0]; ++b) { + for (int oc = 0; oc < outch; oc += 2) { + if (oc + 1 < outch) { + float *out_base = output + (b * outch + oc) * out_img_size; + float *out_base1 = out_base + out_img_size; + const float bias_value = bias ? bias[oc] : 0.f; + std::fill_n(out_base, out_img_size, bias_value); + const float bias_value1 = bias ? bias[oc + 1] : 0.f; + std::fill_n(out_base1, out_img_size, bias_value1); + for (int q = 0; q < inch; q++) { + const float *input_base = input + (b * inch + q) * h * w; + const float *in = input_base; + const float *kernel_base = filter + (oc * inch + q) * 16; + const float *k0 = kernel_base; + const float *k1 = kernel_base + 4; + const float *k2 = kernel_base + 8; + const float *k3 = kernel_base + 12; + + const float *kernel_base1 = kernel_base + inch * 16; + const float *k10 = kernel_base1; + const float *k11 = kernel_base1 + 4; + const float *k12 = kernel_base1 + 8; + const float *k13 = kernel_base1 + 12; +#if defined(MACE_ENABLE_NEON) + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + float32x4_t k3_vec = vld1q_f32(k3); + + float32x4_t k10_vec = vld1q_f32(k10); + float32x4_t k11_vec = vld1q_f32(k11); + float32x4_t k12_vec = vld1q_f32(k12); + float32x4_t k13_vec = vld1q_f32(k13); +#endif + for (int i = 0; i < h; i++) { + float *out_row = out_base + i * outw; + + float *out_row_0 = out_row; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + float *out_row_3 = out_row_2 + outw; + + float *out_row1 = out_base1 + i * outw; + + float *out_row1_0 = out_row1; + float *out_row1_1 = out_row1_0 + outw; + float *out_row1_2 = out_row1_1 + outw; + float *out_row1_3 = out_row1_2 + outw; + + int j = 0; +#if defined(MACE_ENABLE_NEON) + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + float32x4_t out00, out01, out02, out03; + float32x4_t out10, out11, out12, out13; + + out00 = vld1q_f32(out_row_0); + out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); + vst1q_f32(out_row_0, out00); + + out10 = vld1q_f32(out_row1_0); + out10 = neon_vfma_lane_0(out10, in_vec, k10_vec); + vst1q_f32(out_row1_0, out10); + + out01 = vld1q_f32(out_row_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); + vst1q_f32(out_row_0 + 1, out01); + + out11 = vld1q_f32(out_row1_0 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 1, out11); + + out02 = vld1q_f32(out_row_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); + vst1q_f32(out_row_0 + 2, out02); + + out12 = vld1q_f32(out_row1_0 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 2, out12); + + out03 = vld1q_f32(out_row_0 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); + vst1q_f32(out_row_0 + 3, out03); + + out13 = vld1q_f32(out_row1_0 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 3, out13); + + // + out00 = vld1q_f32(out_row_1); + out00 = neon_vfma_lane_0(out00, in_vec, k1_vec); + vst1q_f32(out_row_1, out00); + + out10 = vld1q_f32(out_row1_1); + out10 = neon_vfma_lane_0(out10, in_vec, k11_vec); + vst1q_f32(out_row1_1, out10); + + out01 = vld1q_f32(out_row_1 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k1_vec); + vst1q_f32(out_row_1 + 1, out01); + + out11 = vld1q_f32(out_row1_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 1, out11); + + out02 = vld1q_f32(out_row_1 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k1_vec); + vst1q_f32(out_row_1 + 2, out02); + + out12 = vld1q_f32(out_row1_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 2, out12); + + out03 = vld1q_f32(out_row_1 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k1_vec); + vst1q_f32(out_row_1 + 3, out03); + + out13 = vld1q_f32(out_row1_1 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 3, out13); + + // + out00 = vld1q_f32(out_row_2 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k2_vec); + vst1q_f32(out_row_2 + 0, out00); + + out10 = vld1q_f32(out_row1_2 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 0, out10); + + out01 = vld1q_f32(out_row_2 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k2_vec); + vst1q_f32(out_row_2 + 1, out01); + + out11 = vld1q_f32(out_row1_2 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 1, out11); + + out02 = vld1q_f32(out_row_2 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k2_vec); + vst1q_f32(out_row_2 + 2, out02); + + out12 = vld1q_f32(out_row1_2 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 2, out12); + + out03 = vld1q_f32(out_row_2 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k2_vec); + vst1q_f32(out_row_2 + 3, out03); + + out13 = vld1q_f32(out_row1_2 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 3, out13); + + // + out00 = vld1q_f32(out_row_3 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k3_vec); + vst1q_f32(out_row_3 + 0, out00); + + out10 = vld1q_f32(out_row1_3 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k13_vec); + vst1q_f32(out_row1_3 + 0, out10); + + out01 = vld1q_f32(out_row_3 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k3_vec); + vst1q_f32(out_row_3 + 1, out01); + + out11 = vld1q_f32(out_row1_3 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k13_vec); + vst1q_f32(out_row1_3 + 1, out11); + + out02 = vld1q_f32(out_row_3 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k3_vec); + vst1q_f32(out_row_3 + 2, out02); + + out12 = vld1q_f32(out_row1_3 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k13_vec); + vst1q_f32(out_row1_3 + 2, out12); + + out03 = vld1q_f32(out_row_3 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k3_vec); + vst1q_f32(out_row_3 + 3, out03); + + out13 = vld1q_f32(out_row1_3 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k13_vec); + vst1q_f32(out_row1_3 + 3, out13); + + in += 4; + out_row_0 += 4; + out_row_1 += 4; + out_row_2 += 4; + out_row_3 += 4; + out_row1_0 += 4; + out_row1_1 += 4; + out_row1_2 += 4; + out_row1_3 += 4; + } +#endif + for (; j < w; j++) { + float val = in[0]; + for (int k = 0; k < 4; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k]; + out_row_3[k] += val * k3[k]; + out_row1_0[k] += val * k10[k]; + out_row1_1[k] += val * k11[k]; + out_row1_2[k] += val * k12[k]; + out_row1_3[k] += val * k13[k]; + } + in++; + out_row_0++; + out_row_1++; + out_row_2++; + out_row_3++; + out_row1_0++; + out_row1_1++; + out_row1_2++; + out_row1_3++; + } + } + } + } else { + float *out_base = output + (b * outch + oc) * out_img_size; + const float bias_value = bias ? bias[oc] : 0.f; + std::fill_n(out_base, out_img_size, bias_value); + for (int q = 0; q < inch; q++) { + const float *input_base = input + (b * inch + q) * h * w; + const float *kernel_base = filter + (oc * inch + q) * 16; + const float *in = input_base; + const float *k0 = kernel_base; + const float *k1 = kernel_base + 4; + const float *k2 = kernel_base + 8; + const float *k3 = kernel_base + 12; +#if defined(MACE_ENABLE_NEON) + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + float32x4_t k3_vec = vld1q_f32(k3); +#endif + for (int i = 0; i < h; i++) { + float *out_row = out_base + i * outw; + float *out_row_0 = out_row; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + float *out_row_3 = out_row_2 + outw; + int j = 0; +#if defined(MACE_ENABLE_NEON) + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00 = vld1q_f32(out_row_0); + out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); + vst1q_f32(out_row_0, out00); + + float32x4_t out01 = vld1q_f32(out_row_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); + vst1q_f32(out_row_0 + 1, out01); + + float32x4_t out02 = vld1q_f32(out_row_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); + vst1q_f32(out_row_0 + 2, out02); + + float32x4_t out03 = vld1q_f32(out_row_0 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); + vst1q_f32(out_row_0 + 3, out03); + + // + float32x4_t out10 = vld1q_f32(out_row_1); + out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); + vst1q_f32(out_row_1, out10); + + float32x4_t out11 = vld1q_f32(out_row_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); + vst1q_f32(out_row_1 + 1, out11); + + float32x4_t out12 = vld1q_f32(out_row_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); + vst1q_f32(out_row_1 + 2, out12); + + float32x4_t out13 = vld1q_f32(out_row_1 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k1_vec); + vst1q_f32(out_row_1 + 3, out13); + + // + float32x4_t out20 = vld1q_f32(out_row_2 + 0); + out20 = neon_vfma_lane_0(out20, in_vec, k2_vec); + vst1q_f32(out_row_2 + 0, out20); + + float32x4_t out21 = vld1q_f32(out_row_2 + 1); + out21 = neon_vfma_lane_1(out21, in_vec, k2_vec); + vst1q_f32(out_row_2 + 1, out21); + + float32x4_t out22 = vld1q_f32(out_row_2 + 2); + out22 = neon_vfma_lane_2(out22, in_vec, k2_vec); + vst1q_f32(out_row_2 + 2, out22); + + float32x4_t out23 = vld1q_f32(out_row_2 + 3); + out23 = neon_vfma_lane_3(out23, in_vec, k2_vec); + vst1q_f32(out_row_2 + 3, out23); + + // + float32x4_t out30 = vld1q_f32(out_row_3 + 0); + out30 = neon_vfma_lane_0(out30, in_vec, k3_vec); + vst1q_f32(out_row_3 + 0, out30); + + float32x4_t out31 = vld1q_f32(out_row_3 + 1); + out31 = neon_vfma_lane_1(out31, in_vec, k3_vec); + vst1q_f32(out_row_3 + 1, out31); + + float32x4_t out32 = vld1q_f32(out_row_3 + 2); + out32 = neon_vfma_lane_2(out32, in_vec, k3_vec); + vst1q_f32(out_row_3 + 2, out32); + + float32x4_t out33 = vld1q_f32(out_row_3 + 3); + out33 = neon_vfma_lane_3(out33, in_vec, k3_vec); + vst1q_f32(out_row_3 + 3, out33); + + in += 4; + out_row_0 += 4; + out_row_1 += 4; + out_row_2 += 4; + out_row_3 += 4; + } +#endif + for (; j < w; j++) { + float val = in[0]; + for (int k = 0; k < 4; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k]; + out_row_3[k] += val * k3[k]; + } + in++; + out_row_0++; + out_row_1++; + out_row_2++; + out_row_3++; + } + } + } + } + } + } +} + +void Deconv2dNeonK4x4S2(const float *input, + const float *filter, + const float *bias, + const index_t *in_shape, + const index_t *out_shape, + float *output) { + const index_t w = in_shape[3]; + const index_t h = in_shape[2]; + const index_t inch = in_shape[1]; + + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + const index_t outch = out_shape[1]; + const index_t out_img_size = outh * outw; + +#pragma omp parallel for + for (int b = 0; b < out_shape[0]; ++b) { + for (int p = 0; p < outch; p++) { + float *out_base = output + (b * outch + p) * out_img_size; + const float bias_value = bias ? bias[p] : 0.f; + std::fill_n(out_base, outh * outw, bias_value); + + for (int q = 0; q < inch; q++) { + const float *input_base = input + (b * inch + q) * h * w; + const float *kernel_base = filter + (p * inch + q) * 16; + const float *in = input_base; + + const float *k0 = kernel_base; + const float *k1 = kernel_base + 4; + const float *k2 = kernel_base + 8; + const float *k3 = kernel_base + 12; +#if defined(MACE_ENABLE_NEON) + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + float32x4_t k3_vec = vld1q_f32(k3); +#endif + for (int i = 0; i < h; i++) { + float *out_row = out_base + 2 * i * outw; + + float *out_row_0 = out_row; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + float *out_row_3 = out_row_2 + outw; + + int j = 0; +#if defined(MACE_ENABLE_NEON) + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + // row 0 + float32x4x2_t out0 = vld2q_f32(out_row_0); + out0.val[0] = + neon_vfma_lane_0(out0.val[0], in_vec, k0_vec); + out0.val[1] = + neon_vfma_lane_1(out0.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0, out0); + out0 = vld2q_f32(out_row_0 + 2); + out0.val[0] = + neon_vfma_lane_2(out0.val[0], in_vec, k0_vec); + out0.val[1] = + neon_vfma_lane_3(out0.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0 + 2, out0); + + // row 1 + float32x4x2_t out1 = vld2q_f32(out_row_1); + out1.val[0] = + neon_vfma_lane_0(out1.val[0], in_vec, k1_vec); + out1.val[1] = + neon_vfma_lane_1(out1.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1, out1); + out1 = vld2q_f32(out_row_1 + 2); + out1.val[0] = + neon_vfma_lane_2(out1.val[0], in_vec, k1_vec); + out1.val[1] = + neon_vfma_lane_3(out1.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1 + 2, out1); + + // row 2 + float32x4x2_t out2 = vld2q_f32(out_row_2); + out2.val[0] = + neon_vfma_lane_0(out2.val[0], in_vec, k2_vec); + out2.val[1] = + neon_vfma_lane_1(out2.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2, out2); + out2 = vld2q_f32(out_row_2 + 2); + out2.val[0] = + neon_vfma_lane_2(out2.val[0], in_vec, k2_vec); + out2.val[1] = + neon_vfma_lane_3(out2.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2 + 2, out2); + + // row 3 + float32x4x2_t out3 = vld2q_f32(out_row_3); + out3.val[0] = + neon_vfma_lane_0(out3.val[0], in_vec, k3_vec); + out3.val[1] = + neon_vfma_lane_1(out3.val[1], in_vec, k3_vec); + vst2q_f32(out_row_3, out3); + out3 = vld2q_f32(out_row_3 + 2); + out3.val[0] = + neon_vfma_lane_2(out3.val[0], in_vec, k3_vec); + out3.val[1] = + neon_vfma_lane_3(out3.val[1], in_vec, k3_vec); + vst2q_f32(out_row_3 + 2, out3); + + in += 4; + out_row_0 += 8; + out_row_1 += 8; + out_row_2 += 8; + out_row_3 += 8; + } +#endif + for (; j < w; j++) { + float val = in[0]; + for (int k = 0; k < 4; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k]; + out_row_3[k] += val * k3[k]; + } + in++; + out_row_0 += 2; + out_row_1 += 2; + out_row_2 += 2; + out_row_3 += 2; + } + } + } + } + } +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/deconv_2d.h b/mace/kernels/deconv_2d.h index 72edfca81b886d1624ce8828e72209c29d4c2ce7..aeead4e5b72152d806ca57c1d04aae9582310b81 100644 --- a/mace/kernels/deconv_2d.h +++ b/mace/kernels/deconv_2d.h @@ -15,76 +15,25 @@ #ifndef MACE_KERNELS_DECONV_2D_H_ #define MACE_KERNELS_DECONV_2D_H_ -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) +#if defined(MACE_ENABLE_NEON) #include #endif + #include +#include #include #include #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/kernels/activation.h" +#include "mace/kernels/arm/deconv_2d_neon.h" #include "mace/kernels/conv_pool_2d_util.h" #include "mace/utils/utils.h" namespace mace { namespace kernels { -namespace deconv { - -template -void Deconv2dNCHW(const T *input, - const T *filter, - const T *bias, - const index_t *in_shape, - const index_t *out_shape, - const index_t *kernel_hw, - const int *strides, - const int *padding, - float *output) { -#pragma omp parallel for collapse(4) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t oc = 0; oc < out_shape[1]; ++oc) { - for (index_t oh = 0; oh < out_shape[2]; ++oh) { - for (index_t ow = 0; ow < out_shape[3]; ++ow) { - index_t filter_start_y, filter_start_x; - index_t start_x = std::max(0, ow + strides[1] -1 - padding[1]); - index_t start_y = std::max(0, oh + strides[0] -1 - padding[0]); - start_x /= strides[1]; - start_y /= strides[0]; - filter_start_x = padding[1] + strides[1] * start_x - ow; - filter_start_y = padding[0] + strides[0] * start_y - oh; - filter_start_x = kernel_hw[1] - 1 - filter_start_x; - filter_start_y = kernel_hw[0] - 1 - filter_start_y; - T out_value = 0; - index_t out_pos = - ((b * out_shape[1] + oc) * out_shape[2] + oh) * out_shape[3] + ow; - for (index_t ic = 0; ic < in_shape[1]; ++ic) { - for (index_t f_y = filter_start_y, ih = start_y; - f_y >= 0 && ih < in_shape[2]; f_y -= strides[0], ++ih) { - for (index_t f_x = filter_start_x, iw = start_x; - f_x >= 0 && iw < in_shape[3]; f_x -= strides[1], ++iw) { - index_t weight_pos = - ((oc * in_shape[1] + ic) * kernel_hw[0] + f_y) - * kernel_hw[1] + f_x; - index_t in_pos = - ((b * in_shape[1] + ic) * in_shape[2] + ih) - * in_shape[3] + iw; - out_value += input[in_pos] * filter[weight_pos]; - } - } - } - if (bias != nullptr) - out_value += bias[oc]; - output[out_pos] = out_value; - } - } - } - } -} -} // namespace deconv - struct Deconv2dFunctorBase : OpKernel { Deconv2dFunctorBase(OpKernelContext *context, const std::vector &strides, @@ -107,6 +56,7 @@ struct Deconv2dFunctorBase : OpKernel { const int *strides, index_t *output_shape, const int *padding_size, + int *input_padding, const bool isNCHW = false) { MACE_CHECK_NOTNULL(output_shape); MACE_CHECK_NOTNULL(padding_size); @@ -119,13 +69,18 @@ struct Deconv2dFunctorBase : OpKernel { const index_t in_height = isNCHW ? input_shape[2] : input_shape[1]; const index_t in_width = isNCHW ? input_shape[3] : input_shape[2]; - const index_t filter_h = filter_shape[2]; - const index_t filter_w = filter_shape[3]; + const index_t kernel_h = filter_shape[2]; + const index_t kernel_w = filter_shape[3]; + + input_padding[0] = static_cast((kernel_h -1) * 2 - padding_size[0]); + input_padding[1] = static_cast((kernel_w -1) * 2 - padding_size[1]); + input_padding[0] = std::max(0, input_padding[0]); + input_padding[1] = std::max(0, input_padding[1]); index_t out_height = - (in_height - 1) * strides[0] + filter_h -padding_size[0]; + (in_height - 1) * strides[0] + kernel_h - padding_size[0]; index_t out_width = - (in_width - 1) * strides[1] + filter_w -padding_size[1]; + (in_width - 1) * strides[1] + kernel_w - padding_size[1]; output_shape[0] = input_shape[0]; if (isNCHW) { @@ -206,8 +161,12 @@ struct Deconv2dFunctorBase : OpKernel { const float relux_max_limit_; }; -template -struct Deconv2dFunctor : Deconv2dFunctorBase { + +template +struct Deconv2dFunctor; + +template<> +struct Deconv2dFunctor: Deconv2dFunctorBase { Deconv2dFunctor(OpKernelContext *context, const std::vector &strides, const Padding &padding_type, @@ -223,6 +182,92 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { activation, relux_max_limit) {} + void Deconv2dGeneral(const float *input, + const float *filter, + const float *bias, + const index_t kernel_h, + const index_t kernel_w, + const int *strides, + const index_t *in_shape, + const index_t *out_shape, + float *output) { + const index_t kernel_size = kernel_h * kernel_w; + std::vector out_map(kernel_size); + int p0 = 0; + int p1 = 0; + index_t gap = out_shape[3] - kernel_w; + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + out_map[p0] = p1; + p0++; + p1++; + } + p1 += gap; + } + + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_img_size = out_height * out_width; + const index_t in_img_size = in_height * in_width; + +#pragma omp parallel for + for (int b = 0; b < in_shape[0]; ++b) { + for (int oc = 0; oc < out_shape[1]; ++oc) { + float *out_base = + output + (b * out_shape[1] + oc) * out_img_size; + const float bias_value = bias ? bias[oc] : 0.f; + std::fill_n(out_base, out_img_size, bias_value); + for (int i = 0; i < in_height; ++i) { + for (int j = 0; j < in_width; ++j) { + const index_t out_offset = + i * strides[0] * out_width + j * strides[1]; + for (int ic = 0; ic < in_shape[1]; ++ic) { + const index_t input_idx = + (b * in_shape[1] + ic) * in_img_size + i * in_width + j; + const float val = input[input_idx]; + const index_t kernel_offset = + (oc * in_shape[1] + ic) * kernel_size; + for (int k = 0; k < kernel_size; ++k) { + const index_t out_idx = out_offset + out_map[k]; + const index_t kernel_idx = kernel_offset + k; + out_base[out_idx] += val * filter[kernel_idx]; + } + } + } + } + } + } + } + + void CropPadOut(const float *input, + const index_t *in_shape, + const index_t *out_shape, + const index_t pad_h, + const index_t pad_w, + float *output) { + const index_t batch = in_shape[0]; + const index_t channel = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; +#pragma omp parallel for + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channel; ++j) { + for (int k = 0; k < out_height; ++k) { + const float *input_base = + input + ((i * channel + j) * in_height + (k + pad_h)) * in_width; + float *output_base = + output + ((i * channel + j) * out_height + k)* out_width; + memcpy(output_base, input_base + pad_w, out_width * sizeof(float)); + } + } + } + } + MaceStatus operator()(const Tensor *input, // NCHW const Tensor *filter, // OIHW const Tensor *bias, @@ -235,6 +280,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { MACE_CHECK_NOTNULL(output); std::vector paddings(2); + std::vector out_paddings(2); std::vector output_shape(4); if (paddings_.empty()) { // tensorflow paddings = std::vector(2, 0); @@ -261,19 +307,20 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { output_shape.data(), paddings.data(), true); } else { // caffe - paddings = paddings_; + out_paddings = paddings_; output_shape = std::vector(4, 0); CalcDeconvOutputSize(input->shape().data(), filter->shape().data(), strides_.data(), output_shape.data(), - paddings.data(), true); + out_paddings.data(), + paddings.data(), + true); } MACE_RETURN_IF_ERROR(output->Resize(output_shape)); index_t kernel_h = filter->dim(2); index_t kernel_w = filter->dim(3); const index_t *in_shape = input->shape().data(); - const index_t kernel_hw[2] = {kernel_h, kernel_w}; MACE_CHECK(filter->dim(0) == output_shape[1], filter->dim(0), " != ", output_shape[1]); @@ -281,28 +328,144 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { in_shape[1]); MACE_CHECK(in_shape[0] == output_shape[0], "Input/Output batch size mismatch"); + std::function deconv_func; + Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard filter_mapper(filter); Tensor::MappingGuard bias_mapper(bias); Tensor::MappingGuard output_mapper(output); - auto input_data = input->data(); - auto filter_data = filter->data(); - auto bias_data = bias == nullptr ? nullptr : bias->data(); - auto output_data = output->mutable_data(); - int padding[2]; - padding[0] = (paddings[0] + 1) >> 1; - padding[1] = (paddings[1] + 1) >> 1; - deconv::Deconv2dNCHW(input_data, - filter_data, - bias_data, - in_shape, - output_shape.data(), - kernel_hw, - strides_.data(), - padding, - output_data); - - DoActivation(output_data, + auto input_data = input->data(); + auto filter_data = filter->data(); + auto bias_data = bias == nullptr ? nullptr : bias->data(); + auto output_data = output->mutable_data(); + + const index_t padded_out_h = (in_shape[2] - 1) * strides_[0] + kernel_h; + const index_t padded_out_w = (in_shape[3] - 1) * strides_[1] + kernel_w; + const index_t pad_h = (padded_out_h - output_shape[2]) / 2; + const index_t pad_w = (padded_out_w - output_shape[3]) / 2; + + std::vector padded_out_shape({output_shape[0], output_shape[1], + padded_out_h, padded_out_w}); + index_t padded_out_size = + std::accumulate(padded_out_shape.begin(), + padded_out_shape.end(), + 1, + std::multiplies()) * sizeof(float); + ScratchBuffer *scratch = context_->device()->scratch_buffer(); + scratch->Rewind(); + scratch->GrowSize(padded_out_size); + Tensor padded_out(scratch->Scratch(padded_out_size), DT_FLOAT); + auto *padded_out_data = padded_out.mutable_data(); + + bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 && + strides_[0] == strides_[1] && strides_[0] == 1; + bool use_neon_3x3_s2 = kernel_h == kernel_w && kernel_h == 3 && + strides_[0] == strides_[1] && strides_[0] == 2; + + bool use_neon_4x4_s1 = kernel_h == kernel_w && kernel_h == 4 && + strides_[0] == strides_[1] && strides_[0] == 1; + bool use_neon_4x4_s2 = kernel_h == kernel_w && kernel_h == 4 && + strides_[0] == strides_[1] && strides_[0] == 2; + + if (use_neon_3x3_s1) { + deconv_func = [=](const float *input, + const float *filter, + const float *bias, + const index_t *in_shape, + const index_t *padded_out_shape, + float *padded_output) { + Deconv2dNeonK3x3S1(input, + filter, + bias, + in_shape, + padded_out_shape, + padded_output); + }; + } else if (use_neon_3x3_s2) { + deconv_func = [=](const float *input, + const float *filter, + const float *bias, + const index_t *in_shape, + const index_t *padded_out_shape, + float *padded_output) { + Deconv2dNeonK3x3S2(input, + filter, + bias, + in_shape, + padded_out_shape, + padded_output); + }; + } else if (use_neon_4x4_s1) { + deconv_func = [=](const float *input, + const float *filter, + const float *bias, + const index_t *in_shape, + const index_t *padded_out_shape, + float *padded_output) { + Deconv2dNeonK4x4S1(input, + filter, + bias, + in_shape, + padded_out_shape, + padded_output); + }; + } else if (use_neon_4x4_s2) { + deconv_func = [=](const float *input, + const float *filter, + const float *bias, + const index_t *in_shape, + const index_t *padded_out_shape, + float *padded_output) { + Deconv2dNeonK4x4S2(input, + filter, + bias, + in_shape, + padded_out_shape, + padded_output); + }; + } else { + deconv_func = [=](const float *input, + const float *filter, + const float *bias, + const index_t *in_shape, + const index_t *padded_out_shape, + float *padded_output) { + Deconv2dGeneral(input, + filter, + bias, + kernel_h, + kernel_w, + strides_.data(), + in_shape, + padded_out_shape, + padded_output); + }; + } + + bool no_pad = + padded_out_h == output_shape[2] && padded_out_w == output_shape[3]; + float *out_data = no_pad ? output_data : padded_out_data; + deconv_func(input_data, + filter_data, + bias_data, + in_shape, + padded_out_shape.data(), + out_data); + if (!no_pad) { + CropPadOut(out_data, + padded_out_shape.data(), + output_shape.data(), + pad_h, + pad_w, + output_data); + } + + DoActivation(output_data, output_data, output->size(), activation_, diff --git a/mace/kernels/opencl/deconv_2d.cc b/mace/kernels/opencl/deconv_2d.cc index 4911e26beb36f53aba64f6451d854066d2ceb614..524f6ef8daa3bf637b4bd3794f2ec0344e0ad08f 100644 --- a/mace/kernels/opencl/deconv_2d.cc +++ b/mace/kernels/opencl/deconv_2d.cc @@ -53,6 +53,7 @@ MaceStatus Deconv2dFunctor::operator()( MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(output); std::vector paddings(2); + std::vector out_paddings(2); std::vector output_shape(4); if (paddings_.empty()) { paddings = std::vector(2, 0); @@ -74,12 +75,14 @@ MaceStatus Deconv2dFunctor::operator()( output_shape.data(), paddings.data()); } else { - paddings = paddings_; + out_paddings = paddings_; + paddings = std::vector(2, 0); output_shape = std::vector(4, 0); CalcDeconvOutputSize(input->shape().data(), filter->shape().data(), strides_.data(), output_shape.data(), + out_paddings.data(), paddings.data()); } diff --git a/mace/ops/deconv_2d_benchmark.cc b/mace/ops/deconv_2d_benchmark.cc index 2a414e3c5e7b2e8a83855a6bec7ea9aa606d8d41..9835bc4d1f9c2f57903980d31415317657abff82 100644 --- a/mace/ops/deconv_2d_benchmark.cc +++ b/mace/ops/deconv_2d_benchmark.cc @@ -116,6 +116,7 @@ static void Deconv2d(int iters, // TODO(liutuo): add cpu benchmark when optimized. #define MACE_BM_DECONV_2D(N, C, H, W, KH, KW, S, OH, OW, P, OC) \ + MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, float, CPU); \ MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, float, GPU); \ MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, half, GPU); @@ -124,6 +125,11 @@ MACE_BM_DECONV_2D(1, 32, 60, 60, 1, 1, 1, 60, 60, VALID, 128); MACE_BM_DECONV_2D(1, 128, 60, 60, 3, 3, 1, 62, 62, VALID, 128); MACE_BM_DECONV_2D(1, 32, 60, 60, 3, 3, 1, 60, 60, SAME, 32); + +MACE_BM_DECONV_2D(1, 128, 60, 60, 4, 4, 1, 63, 63, VALID, 128); +MACE_BM_DECONV_2D(1, 32, 60, 60, 4, 4, 1, 60, 60, SAME, 32); +MACE_BM_DECONV_2D(1, 3, 224, 224, 4, 4, 2, 448, 448, SAME, 32); +MACE_BM_DECONV_2D(1, 3, 224, 224, 4, 4, 2, 450, 450, VALID, 32); MACE_BM_DECONV_2D(1, 3, 512, 512, 7, 7, 2, 1023, 1023, SAME, 32); MACE_BM_DECONV_2D(1, 128, 16, 16, 5, 5, 1, 20, 20, VALID, 32); MACE_BM_DECONV_2D(1, 128, 64, 64, 5, 5, 1, 68, 68, VALID, 32); @@ -134,6 +140,7 @@ MACE_BM_DECONV_2D(1, 64, 32, 32, 1, 1, 1, 32, 32, VALID, 128); MACE_BM_DECONV_2D(1, 64, 33, 32, 3, 3, 2, 65, 63, SAME, 128); MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 447, 447, SAME, 32); MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 449, 449, VALID, 32); +MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 448, 448, SAME, 32); } // namespace test } // namespace ops diff --git a/mace/ops/deconv_2d_test.cc b/mace/ops/deconv_2d_test.cc index 02fea790ba1498a7283e072ca06b059bc4a1a29e..fa17a090b53e55a55d2af2567091036ee215aaee 100644 --- a/mace/ops/deconv_2d_test.cc +++ b/mace/ops/deconv_2d_test.cc @@ -380,11 +380,11 @@ void TestComplexDeconvNxNS12(const int batch, 1e-4); }; - for (int kernel_size : {1, 3, 5, 7}) { + for (int kernel_size : {3, 4, 5, 7}) { func(kernel_size, kernel_size, stride, stride, VALID, -1); func(kernel_size, kernel_size, stride, stride, SAME, -1); - func(kernel_size, kernel_size, stride, stride, VALID, 1); func(kernel_size, kernel_size, stride, stride, VALID, 2); + func(kernel_size, kernel_size, stride, stride, VALID, 3); } } } // namespace @@ -410,8 +410,8 @@ TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNS34) { } TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNMultiBatch) { - TestComplexDeconvNxNS12(3, {17, 13, 5, 7}, 1); - TestComplexDeconvNxNS12(5, {17, 13, 5, 7}, 2); + TestComplexDeconvNxNS12(3, {17, 113, 5, 7}, 1); + TestComplexDeconvNxNS12(5, {17, 113, 5, 7}, 2); } } // namespace test