From c057b7ec5c7df03fe3fc0da7f4ce1856056e1b85 Mon Sep 17 00:00:00 2001 From: liutuo Date: Fri, 28 Sep 2018 09:56:04 +0800 Subject: [PATCH] add depthwise deconv --- mace/ops/arm/common_neon.h | 70 ++ mace/ops/arm/deconv_2d_neon.h | 47 +- mace/ops/arm/depthwise_deconv2d_neon.h | 79 ++ mace/ops/arm/depthwise_deconv2d_neon_3x3.cc | 627 ++++++++++++++ mace/ops/arm/depthwise_deconv2d_neon_4x4.cc | 805 ++++++++++++++++++ mace/ops/deconv_2d.cc | 39 +- mace/ops/deconv_2d.h | 30 + mace/ops/depthwise_deconv2d.cc | 547 ++++++++++++ mace/ops/depthwise_deconv2d_benchmark.cc | 130 +++ mace/ops/depthwise_deconv2d_test.cc | 286 +++++++ mace/ops/opencl/cl/deconv_2d.cl | 3 +- mace/ops/opencl/cl/depthwise_deconv2d.cl | 149 ++++ mace/ops/opencl/depthwise_deconv2d.h | 49 ++ mace/ops/opencl/image/deconv_2d.h | 4 +- mace/ops/opencl/image/depthwise_deconv2d.h | 196 +++++ mace/ops/ops_registry.cc | 2 + .../tools/converter_tool/base_converter.py | 2 + .../tools/converter_tool/caffe_converter.py | 17 +- .../tools/converter_tool/shape_inference.py | 5 + .../tools/converter_tool/transformer.py | 53 +- .../opencl-kernel/opencl_kernel_configure.bzl | 1 + tools/validate.py | 12 +- 22 files changed, 3037 insertions(+), 116 deletions(-) create mode 100644 mace/ops/arm/common_neon.h create mode 100644 mace/ops/arm/depthwise_deconv2d_neon.h create mode 100644 mace/ops/arm/depthwise_deconv2d_neon_3x3.cc create mode 100644 mace/ops/arm/depthwise_deconv2d_neon_4x4.cc create mode 100644 mace/ops/depthwise_deconv2d.cc create mode 100644 mace/ops/depthwise_deconv2d_benchmark.cc create mode 100644 mace/ops/depthwise_deconv2d_test.cc create mode 100644 mace/ops/opencl/cl/depthwise_deconv2d.cl create mode 100644 mace/ops/opencl/depthwise_deconv2d.h create mode 100644 mace/ops/opencl/image/depthwise_deconv2d.h diff --git a/mace/ops/arm/common_neon.h b/mace/ops/arm/common_neon.h new file mode 100644 index 00000000..be0daf79 --- /dev/null +++ b/mace/ops/arm/common_neon.h @@ -0,0 +1,70 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_ARM_COMMON_NEON_H_ +#define MACE_OPS_ARM_COMMON_NEON_H_ + +#if defined(MACE_ENABLE_NEON) +#include +#endif + +namespace mace { +namespace ops { + +#ifdef MACE_ENABLE_NEON +inline float32x4_t neon_vfma_lane_0(float32x4_t a, + float32x4_t b, + float32x4_t c) { +#ifdef __aarch64__ + return vfmaq_laneq_f32(a, b, c, 0); +#else + return vmlaq_lane_f32(a, b, vget_low_f32(c), 0); +#endif +} + +inline float32x4_t neon_vfma_lane_1(float32x4_t a, + float32x4_t b, + float32x4_t c) { +#ifdef __aarch64__ + return vfmaq_laneq_f32(a, b, c, 1); +#else + return vmlaq_lane_f32(a, b, vget_low_f32(c), 1); +#endif +} + +inline float32x4_t neon_vfma_lane_2(float32x4_t a, + float32x4_t b, + float32x4_t c) { +#ifdef __aarch64__ + return vfmaq_laneq_f32(a, b, c, 2); +#else + return vmlaq_lane_f32(a, b, vget_high_f32(c), 0); +#endif +} + +inline float32x4_t neon_vfma_lane_3(float32x4_t a, + float32x4_t b, + float32x4_t c) { +#ifdef __aarch64__ + return vfmaq_laneq_f32(a, b, c, 3); +#else + return vmlaq_lane_f32(a, b, vget_high_f32(c), 1); +#endif +} +#endif + +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_COMMON_NEON_H_ diff --git a/mace/ops/arm/deconv_2d_neon.h b/mace/ops/arm/deconv_2d_neon.h index d8abe427..abed4926 100644 --- a/mace/ops/arm/deconv_2d_neon.h +++ b/mace/ops/arm/deconv_2d_neon.h @@ -15,11 +15,8 @@ #ifndef MACE_OPS_ARM_DECONV_2D_NEON_H_ #define MACE_OPS_ARM_DECONV_2D_NEON_H_ -#if defined(MACE_ENABLE_NEON) -#include -#endif - #include "mace/core/types.h" +#include "mace/ops/arm/common_neon.h" namespace mace { namespace ops { @@ -48,48 +45,6 @@ void Deconv2dNeonK4x4S2(const float *input, const index_t *out_shape, float *output); -#ifdef MACE_ENABLE_NEON -inline float32x4_t neon_vfma_lane_0(float32x4_t a, - float32x4_t b, - float32x4_t c) { -#ifdef __aarch64__ - return vfmaq_laneq_f32(a, b, c, 0); -#else - return vmlaq_lane_f32(a, b, vget_low_f32(c), 0); -#endif -} - -inline float32x4_t neon_vfma_lane_1(float32x4_t a, - float32x4_t b, - float32x4_t c) { -#ifdef __aarch64__ - return vfmaq_laneq_f32(a, b, c, 1); -#else - return vmlaq_lane_f32(a, b, vget_low_f32(c), 1); -#endif -} - -inline float32x4_t neon_vfma_lane_2(float32x4_t a, - float32x4_t b, - float32x4_t c) { -#ifdef __aarch64__ - return vfmaq_laneq_f32(a, b, c, 2); -#else - return vmlaq_lane_f32(a, b, vget_high_f32(c), 0); -#endif -} - -inline float32x4_t neon_vfma_lane_3(float32x4_t a, - float32x4_t b, - float32x4_t c) { -#ifdef __aarch64__ - return vfmaq_laneq_f32(a, b, c, 3); -#else - return vmlaq_lane_f32(a, b, vget_high_f32(c), 1); -#endif -} -#endif - } // namespace ops } // namespace mace diff --git a/mace/ops/arm/depthwise_deconv2d_neon.h b/mace/ops/arm/depthwise_deconv2d_neon.h new file mode 100644 index 00000000..35460d75 --- /dev/null +++ b/mace/ops/arm/depthwise_deconv2d_neon.h @@ -0,0 +1,79 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_ARM_DEPTHWISE_DECONV2D_NEON_H_ +#define MACE_OPS_ARM_DEPTHWISE_DECONV2D_NEON_H_ + +#include "mace/core/types.h" +#include "mace/ops/arm/common_neon.h" + +namespace mace { +namespace ops { + +void DepthwiseDeconv2dNeonK3x3S1(const float *input, + const float *filter, + const index_t *in_shape, + const index_t *out_shape, + float *output); + +void DepthwiseDeconv2dNeonK3x3S2(const float *input, + const float *filter, + const index_t *in_shape, + const index_t *out_shape, + float *output); + +void DepthwiseDeconv2dNeonK4x4S1(const float *input, + const float *filter, + const index_t *in_shape, + const index_t *out_shape, + float *output); + +void DepthwiseDeconv2dNeonK4x4S2(const float *input, + const float *filter, + const index_t *in_shape, + const index_t *out_shape, + float *output); + +void GroupDeconv2dNeonK3x3S1(const float *input, + const float *filter, + const int group, + const index_t *in_shape, + const index_t *out_shape, + float *output); + +void GroupDeconv2dNeonK3x3S2(const float *input, + const float *filter, + const int group, + const index_t *in_shape, + const index_t *out_shape, + float *output); + +void GroupDeconv2dNeonK4x4S1(const float *input, + const float *filter, + const int group, + const index_t *in_shape, + const index_t *out_shape, + float *output); + +void GroupDeconv2dNeonK4x4S2(const float *input, + const float *filter, + const int group, + const index_t *in_shape, + const index_t *out_shape, + float *output); + +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_DEPTHWISE_DECONV2D_NEON_H_ diff --git a/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc b/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc new file mode 100644 index 00000000..8a90b9fc --- /dev/null +++ b/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc @@ -0,0 +1,627 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/macros.h" +#include "mace/ops/arm/depthwise_deconv2d_neon.h" + +namespace mace { +namespace ops { + +void DepthwiseDeconv2dNeonK3x3S1(const float *input, + const float *filter, + const index_t *in_shape, + const index_t *out_shape, + float *output) { + const index_t channels = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + const index_t in_img_size = h * w; + + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + const index_t out_img_size = outh * outw; + +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t c = 0; c < channels; ++c) { + const index_t offset = b * channels + c; + float *out_base = output + offset * out_img_size; + const float *input_base = input + offset * in_img_size; + const float *kernel_base = filter + c * 9; + const float *in = input_base; + const float *k0 = kernel_base; + const float *k1 = kernel_base + 3; + const float *k2 = kernel_base + 5; + +#if defined(MACE_ENABLE_NEON) + // load filter + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); +#endif + for (index_t i = 0; i < h; ++i) { + float *out_row_base = out_base + i * outw; + float *out_row0 = out_row_base; + float *out_row1 = out_row_base + outw; + float *out_row2 = out_row_base + 2 * outw; + index_t j = 0; +#if defined(MACE_ENABLE_NEON) + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00, out01, out02; + float32x4_t out10, out11, out12; + float32x4_t out20, out21, out22; + + out00 = vld1q_f32(out_row0 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); + vst1q_f32(out_row0 + 0, out00); + + out01 = vld1q_f32(out_row0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); + vst1q_f32(out_row0 + 1, out01); + + out02 = vld1q_f32(out_row0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); + vst1q_f32(out_row0 + 2, out02); + + out10 = vld1q_f32(out_row1 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); + vst1q_f32(out_row1 + 0, out10); + + out11 = vld1q_f32(out_row1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); + vst1q_f32(out_row1 + 1, out11); + + out12 = vld1q_f32(out_row1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); + vst1q_f32(out_row1 + 2, out12); + + out20 = vld1q_f32(out_row2 + 0); + out20 = neon_vfma_lane_1(out20, in_vec, k2_vec); + vst1q_f32(out_row2 + 0, out20); + + out21 = vld1q_f32(out_row2 + 1); + out21 = neon_vfma_lane_2(out21, in_vec, k2_vec); + vst1q_f32(out_row2 + 1, out21); + + out22 = vld1q_f32(out_row2 + 2); + out22 = neon_vfma_lane_3(out22, in_vec, k2_vec); + vst1q_f32(out_row2 + 2, out22); + + in += 4; + out_row0 += 4; + out_row1 += 4; + out_row2 += 4; + } +#endif + for (; j < w; ++j) { + float val = in[0]; + for (int k = 0; k < 3; ++k) { + out_row0[k] += val * k0[k]; + out_row1[k] += val * k1[k]; + out_row2[k] += val * k2[k + 1]; + } + in++; + out_row0++; + out_row1++; + out_row2++; + } + } + } + } +} + +void DepthwiseDeconv2dNeonK3x3S2(const float *input, + const float *filter, + const index_t *in_shape, + const index_t *out_shape, + float *output) { + const index_t channels = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + const index_t in_img_size = h * w; + + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + const index_t out_img_size = outh * outw; + +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t c = 0; c < channels; ++c) { + const index_t offset = b * channels + c; + float *out_base = output + offset * out_img_size; + const float *input_base = input + offset * in_img_size; + const float *kernel_base = filter + c * 9; + const float *in = input_base; + + const float *k0 = kernel_base; + const float *k1 = kernel_base + 3; + const float *k2 = kernel_base + 5; + +#if defined(MACE_ENABLE_NEON) + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); +#endif + for (index_t i = 0; i < h; ++i) { + float *out_row_base = out_base + i * 2 * outw; + float *out_row_0 = out_row_base; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + + index_t j = 0; +#if defined(MACE_ENABLE_NEON) + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + // out row 0 + float32x4x2_t out00 = vld2q_f32(out_row_0); + out00.val[0] = + neon_vfma_lane_0(out00.val[0], in_vec, k0_vec); + out00.val[1] = + neon_vfma_lane_1(out00.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0, out00); + + float32x4x2_t out01 = vld2q_f32(out_row_0 + 2); + out01.val[0] = + neon_vfma_lane_2(out01.val[0], in_vec, k0_vec); + vst2q_f32(out_row_0 + 2, out01); + + // out row 1 + float32x4x2_t out10 = vld2q_f32(out_row_1); + out10.val[0] = + neon_vfma_lane_0(out10.val[0], in_vec, k1_vec); + out10.val[1] = + neon_vfma_lane_1(out10.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1, out10); + + float32x4x2_t out11 = vld2q_f32(out_row_1 + 2); + out11.val[0] = + neon_vfma_lane_2(out11.val[0], in_vec, k1_vec); + vst2q_f32(out_row_1 + 2, out11); + + // out row 2 + float32x4x2_t out20 = vld2q_f32(out_row_2); + out20.val[0] = + neon_vfma_lane_1(out20.val[0], in_vec, k2_vec); + out20.val[1] = + neon_vfma_lane_2(out20.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2, out20); + + float32x4x2_t out21 = vld2q_f32(out_row_2 + 2); + out21.val[0] = + neon_vfma_lane_3(out21.val[0], in_vec, k2_vec); + vst2q_f32(out_row_2 + 2, out21); + + in += 4; + out_row_0 += 8; + out_row_1 += 8; + out_row_2 += 8; + } +#endif + for (; j < w; ++j) { + float val = in[0]; + + for (int k = 0; k < 3; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k + 1]; + } + + in++; + out_row_0 += 2; + out_row_1 += 2; + out_row_2 += 2; + } + } + } + } +} + +void GroupDeconv2dNeonK3x3S1(const float *input, + const float *filter, + const int group, + const index_t *in_shape, + const index_t *out_shape, + float *output) { + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + + const index_t in_img_size = h * w; + const index_t out_img_size = outh * outw; + + const index_t inch_g = inch / group; + const index_t outch_g = outch / group; + +#pragma omp parallel for collapse(3) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (int g = 0; g < group; ++g) { + for (index_t oc = 0; oc < outch_g; oc += 2) { + if (oc + 1 < outch_g) { + const index_t out_offset = b * outch + outch_g * g + oc; + float *out_base0 = output + out_offset * out_img_size; + float *out_base1 = out_base0 + out_img_size; + for (index_t ic = 0; ic < inch_g; ++ic) { + const index_t in_offset = b * inch + inch_g * g + ic; + const float *input_base = input + in_offset * in_img_size; + const index_t kernel_offset = (oc * group + g) * inch_g + ic; + const float *kernel_base0 = filter + kernel_offset * 9; + const float *kernel_base1 = kernel_base0 + inch * 9; + const float *in = input_base; + + // output channel 0 + const float *k0_0 = kernel_base0; + const float *k0_1 = kernel_base0 + 3; + const float *k0_2 = kernel_base0 + 5; + // output channel 1 + const float *k1_0 = kernel_base1; + const float *k1_1 = kernel_base1 + 3; + const float *k1_2 = kernel_base1 + 5; + +#if defined(MACE_ENABLE_NEON) + // load filter + float32x4_t k00_vec, k01_vec, k02_vec; + float32x4_t k10_vec, k11_vec, k12_vec; + + k00_vec = vld1q_f32(k0_0); + k01_vec = vld1q_f32(k0_1); + k02_vec = vld1q_f32(k0_2); + + k10_vec = vld1q_f32(k1_0); + k11_vec = vld1q_f32(k1_1); + k12_vec = vld1q_f32(k1_2); +#endif + for (index_t i = 0; i < h; ++i) { + float *out_row_base0 = out_base0 + i * outw; + float *out_row0_0 = out_row_base0; + float *out_row0_1 = out_row_base0 + outw; + float *out_row0_2 = out_row_base0 + 2 * outw; + + float *out_row_base1 = out_base1 + i * outw; + float *out_row1_0 = out_row_base1; + float *out_row1_1 = out_row_base1 + outw; + float *out_row1_2 = out_row_base1 + 2 * outw; + + index_t j = 0; +#if defined(MACE_ENABLE_NEON) + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00, out01, out02; + float32x4_t out10, out11, out12; + float32x4_t out20, out21, out22; + + out00 = vld1q_f32(out_row0_0); + out00 = neon_vfma_lane_0(out00, in_vec, k00_vec); + vst1q_f32(out_row0_0, out00); + + out01 = vld1q_f32(out_row0_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 1, out01); + + out02 = vld1q_f32(out_row0_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 2, out02); + + out10 = vld1q_f32(out_row0_1 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 0, out10); + + out11 = vld1q_f32(out_row0_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 1, out11); + + out12 = vld1q_f32(out_row0_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 2, out12); + + out20 = vld1q_f32(out_row0_2 + 0); + out20 = neon_vfma_lane_1(out20, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 0, out20); + + out21 = vld1q_f32(out_row0_2 + 1); + out21 = neon_vfma_lane_2(out21, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 1, out21); + + out22 = vld1q_f32(out_row0_2 + 2); + out22 = neon_vfma_lane_3(out22, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 2, out22); + + out00 = vld1q_f32(out_row1_0 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 0, out00); + + out01 = vld1q_f32(out_row1_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 1, out01); + + out02 = vld1q_f32(out_row1_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 2, out02); + + out10 = vld1q_f32(out_row1_1 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 0, out10); + + out11 = vld1q_f32(out_row1_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 1, out11); + + out12 = vld1q_f32(out_row1_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 2, out12); + + out20 = vld1q_f32(out_row1_2 + 0); + out20 = neon_vfma_lane_1(out20, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 0, out20); + + out21 = vld1q_f32(out_row1_2 + 1); + out21 = neon_vfma_lane_2(out21, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 1, out21); + + out22 = vld1q_f32(out_row1_2 + 2); + out22 = neon_vfma_lane_3(out22, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 2, out22); + + in += 4; + out_row0_0 += 4; + out_row0_1 += 4; + out_row0_2 += 4; + out_row1_0 += 4; + out_row1_1 += 4; + out_row1_2 += 4; + } +#endif + for (; j < w; ++j) { + float val = in[0]; + for (int k = 0; k < 3; ++k) { + out_row0_0[k] += val * k0_0[k]; + out_row0_1[k] += val * k0_1[k]; + out_row0_2[k] += val * k0_2[k + 1]; + out_row1_0[k] += val * k1_0[k]; + out_row1_1[k] += val * k1_1[k]; + out_row1_2[k] += val * k1_2[k + 1]; + } + in++; + out_row0_0++; + out_row0_1++; + out_row0_2++; + out_row1_0++; + out_row1_1++; + out_row1_2++; + } + } + } + } else { + const index_t out_offset = b * outch + outch_g * g + oc; + float *out_base0 = output + out_offset * out_img_size; + for (index_t ic = 0; ic < inch_g; ++ic) { + const index_t in_offset = (b * group + g) * inch_g + ic; + const float *input_base = input + in_offset * in_img_size; + const index_t kernel_offset = (oc * group + g) * inch_g + ic; + const float *kernel_base0 = filter + kernel_offset * 9; + const float *in = input_base; + const float *k0_0 = kernel_base0; + const float *k0_1 = kernel_base0 + 3; + const float *k0_2 = kernel_base0 + 5; + +#if defined(MACE_ENABLE_NEON) + // load filter + float32x4_t k00_vec = vld1q_f32(k0_0); + float32x4_t k01_vec = vld1q_f32(k0_1); + float32x4_t k02_vec = vld1q_f32(k0_2); +#endif + for (index_t i = 0; i < h; ++i) { + float *out_row_base0 = out_base0 + i * outw; + float *out_row0_0 = out_row_base0; + float *out_row0_1 = out_row_base0 + outw; + float *out_row0_2 = out_row_base0 + 2 * outw; + index_t j = 0; +#if defined(MACE_ENABLE_NEON) + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00, out01, out02; + float32x4_t out10, out11, out12; + float32x4_t out20, out21, out22; + + out00 = vld1q_f32(out_row0_0 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 0, out00); + + out01 = vld1q_f32(out_row0_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 1, out01); + + out02 = vld1q_f32(out_row0_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 2, out02); + + out10 = vld1q_f32(out_row0_1 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 0, out10); + + out11 = vld1q_f32(out_row0_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 1, out11); + + out12 = vld1q_f32(out_row0_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 2, out12); + + out20 = vld1q_f32(out_row0_2 + 0); + out20 = neon_vfma_lane_1(out20, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 0, out20); + + out21 = vld1q_f32(out_row0_2 + 1); + out21 = neon_vfma_lane_2(out21, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 1, out21); + + out22 = vld1q_f32(out_row0_2 + 2); + out22 = neon_vfma_lane_3(out22, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 2, out22); + + in += 4; + out_row0_0 += 4; + out_row0_1 += 4; + out_row0_2 += 4; + } +#endif + for (; j < w; ++j) { + float val = in[0]; + for (int k = 0; k < 3; ++k) { + out_row0_0[k] += val * k0_0[k]; + out_row0_1[k] += val * k0_1[k]; + out_row0_2[k] += val * k0_2[k + 1]; + } + in++; + out_row0_0++; + out_row0_1++; + out_row0_2++; + } + } + } + } + } + } + } +} + +void GroupDeconv2dNeonK3x3S2(const float *input, + const float *filter, + const int group, + const index_t *in_shape, + const index_t *out_shape, + float *output) { + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + + const index_t in_img_size = h * w; + const index_t out_img_size = outh * outw; + + const index_t inch_g = inch / group; + const index_t outch_g = outch / group; + +#pragma omp parallel for collapse(3) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (int g = 0; g < group; ++g) { + for (index_t oc = 0; oc < outch_g; ++oc) { + const index_t out_offset = b * outch + outch_g * g + oc; + float *out_base = output + out_offset * out_img_size; + for (index_t ic = 0; ic < inch_g; ++ic) { + const index_t in_offset = b * inch + inch_g * g + ic; + const float *input_base = input + in_offset * in_img_size; + const index_t kernel_offset = (oc * group + g) * inch_g + ic; + const float *kernel_base = filter + kernel_offset * 9; + const float *in = input_base; + + const float *k0 = kernel_base; + const float *k1 = kernel_base + 3; + const float *k2 = kernel_base + 5; + +#if defined(MACE_ENABLE_NEON) + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); +#endif + for (index_t i = 0; i < h; ++i) { + float *out_row_base = out_base + i * 2 * outw; + float *out_row_0 = out_row_base; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + + index_t j = 0; +#if defined(MACE_ENABLE_NEON) + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + // out row 0 + float32x4x2_t out00 = vld2q_f32(out_row_0); + out00.val[0] = + neon_vfma_lane_0(out00.val[0], in_vec, k0_vec); + out00.val[1] = + neon_vfma_lane_1(out00.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0, out00); + + float32x4x2_t out01 = vld2q_f32(out_row_0 + 2); + out01.val[0] = + neon_vfma_lane_2(out01.val[0], in_vec, k0_vec); + vst2q_f32(out_row_0 + 2, out01); + + // out row 1 + float32x4x2_t out10 = vld2q_f32(out_row_1); + out10.val[0] = + neon_vfma_lane_0(out10.val[0], in_vec, k1_vec); + out10.val[1] = + neon_vfma_lane_1(out10.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1, out10); + + float32x4x2_t out11 = vld2q_f32(out_row_1 + 2); + out11.val[0] = + neon_vfma_lane_2(out11.val[0], in_vec, k1_vec); + vst2q_f32(out_row_1 + 2, out11); + + // out row 2 + float32x4x2_t out20 = vld2q_f32(out_row_2); + out20.val[0] = + neon_vfma_lane_1(out20.val[0], in_vec, k2_vec); + out20.val[1] = + neon_vfma_lane_2(out20.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2, out20); + + float32x4x2_t out21 = vld2q_f32(out_row_2 + 2); + out21.val[0] = + neon_vfma_lane_3(out21.val[0], in_vec, k2_vec); + vst2q_f32(out_row_2 + 2, out21); + + in += 4; + out_row_0 += 8; + out_row_1 += 8; + out_row_2 += 8; + } +#endif + for (; j < w; ++j) { + float val = in[0]; + + for (int k = 0; k < 3; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k + 1]; + } + + in++; + out_row_0 += 2; + out_row_1 += 2; + out_row_2 += 2; + } + } + } + } + } + } +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc b/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc new file mode 100644 index 00000000..6ae7dbb1 --- /dev/null +++ b/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc @@ -0,0 +1,805 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/macros.h" +#include "mace/ops/arm/deconv_2d_neon.h" + +namespace mace { +namespace ops { + +void DepthwiseDeconv2dNeonK4x4S1(const float *input, + const float *filter, + const index_t *in_shape, + const index_t *out_shape, + float *output) { + const index_t batch = in_shape[0]; + const index_t channels = in_shape[1]; + const index_t w = in_shape[3]; + const index_t h = in_shape[2]; + const index_t in_img_size = h * w; + + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + const index_t out_img_size = outh * outw; + +#pragma omp parallel for collapse(2) + for (int b = 0; b < batch; ++b) { + for (int c = 0; c < channels; ++c) { + const index_t offset = b * channels + c; + float *out_base = output + offset * out_img_size; + const float *input_base = input + offset * in_img_size; + const float *kernel_base = filter + c * 16; + const float *in = input_base; + const float *k0 = kernel_base; + const float *k1 = kernel_base + 4; + const float *k2 = kernel_base + 8; + const float *k3 = kernel_base + 12; +#if defined(MACE_ENABLE_NEON) + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + float32x4_t k3_vec = vld1q_f32(k3); +#endif + for (int i = 0; i < h; i++) { + float *out_row = out_base + i * outw; + float *out_row_0 = out_row; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + float *out_row_3 = out_row_2 + outw; + int j = 0; +#if defined(MACE_ENABLE_NEON) + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00 = vld1q_f32(out_row_0); + out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); + vst1q_f32(out_row_0, out00); + + float32x4_t out01 = vld1q_f32(out_row_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); + vst1q_f32(out_row_0 + 1, out01); + + float32x4_t out02 = vld1q_f32(out_row_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); + vst1q_f32(out_row_0 + 2, out02); + + float32x4_t out03 = vld1q_f32(out_row_0 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); + vst1q_f32(out_row_0 + 3, out03); + + // + float32x4_t out10 = vld1q_f32(out_row_1); + out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); + vst1q_f32(out_row_1, out10); + + float32x4_t out11 = vld1q_f32(out_row_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); + vst1q_f32(out_row_1 + 1, out11); + + float32x4_t out12 = vld1q_f32(out_row_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); + vst1q_f32(out_row_1 + 2, out12); + + float32x4_t out13 = vld1q_f32(out_row_1 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k1_vec); + vst1q_f32(out_row_1 + 3, out13); + + // + float32x4_t out20 = vld1q_f32(out_row_2 + 0); + out20 = neon_vfma_lane_0(out20, in_vec, k2_vec); + vst1q_f32(out_row_2 + 0, out20); + + float32x4_t out21 = vld1q_f32(out_row_2 + 1); + out21 = neon_vfma_lane_1(out21, in_vec, k2_vec); + vst1q_f32(out_row_2 + 1, out21); + + float32x4_t out22 = vld1q_f32(out_row_2 + 2); + out22 = neon_vfma_lane_2(out22, in_vec, k2_vec); + vst1q_f32(out_row_2 + 2, out22); + + float32x4_t out23 = vld1q_f32(out_row_2 + 3); + out23 = neon_vfma_lane_3(out23, in_vec, k2_vec); + vst1q_f32(out_row_2 + 3, out23); + + // + float32x4_t out30 = vld1q_f32(out_row_3 + 0); + out30 = neon_vfma_lane_0(out30, in_vec, k3_vec); + vst1q_f32(out_row_3 + 0, out30); + + float32x4_t out31 = vld1q_f32(out_row_3 + 1); + out31 = neon_vfma_lane_1(out31, in_vec, k3_vec); + vst1q_f32(out_row_3 + 1, out31); + + float32x4_t out32 = vld1q_f32(out_row_3 + 2); + out32 = neon_vfma_lane_2(out32, in_vec, k3_vec); + vst1q_f32(out_row_3 + 2, out32); + + float32x4_t out33 = vld1q_f32(out_row_3 + 3); + out33 = neon_vfma_lane_3(out33, in_vec, k3_vec); + vst1q_f32(out_row_3 + 3, out33); + + in += 4; + out_row_0 += 4; + out_row_1 += 4; + out_row_2 += 4; + out_row_3 += 4; + } +#endif + for (; j < w; j++) { + float val = in[0]; + for (int k = 0; k < 4; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k]; + out_row_3[k] += val * k3[k]; + } + in++; + out_row_0++; + out_row_1++; + out_row_2++; + out_row_3++; + } + } + } + } +} + +void DepthwiseDeconv2dNeonK4x4S2(const float *input, + const float *filter, + const index_t *in_shape, + const index_t *out_shape, + float *output) { + const index_t w = in_shape[3]; + const index_t h = in_shape[2]; + const index_t channels = in_shape[1]; + const index_t in_img_size = h * w; + + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + const index_t out_img_size = outh * outw; + +#pragma omp parallel for collapse(2) + for (int b = 0; b < out_shape[0]; ++b) { + for (int c = 0; c < channels; ++c) { + const index_t offset = b * channels + c; + float *out_base = output + offset * out_img_size; + const float *input_base = input + offset * in_img_size; + const float *kernel_base = filter + c * 16; + const float *in = input_base; + + const float *k0 = kernel_base; + const float *k1 = kernel_base + 4; + const float *k2 = kernel_base + 8; + const float *k3 = kernel_base + 12; +#if defined(MACE_ENABLE_NEON) + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + float32x4_t k3_vec = vld1q_f32(k3); +#endif + for (int i = 0; i < h; i++) { + float *out_row = out_base + 2 * i * outw; + + float *out_row_0 = out_row; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + float *out_row_3 = out_row_2 + outw; + + int j = 0; +#if defined(MACE_ENABLE_NEON) + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + // row 0 + float32x4x2_t out0 = vld2q_f32(out_row_0); + out0.val[0] = + neon_vfma_lane_0(out0.val[0], in_vec, k0_vec); + out0.val[1] = + neon_vfma_lane_1(out0.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0, out0); + out0 = vld2q_f32(out_row_0 + 2); + out0.val[0] = + neon_vfma_lane_2(out0.val[0], in_vec, k0_vec); + out0.val[1] = + neon_vfma_lane_3(out0.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0 + 2, out0); + + // row 1 + float32x4x2_t out1 = vld2q_f32(out_row_1); + out1.val[0] = + neon_vfma_lane_0(out1.val[0], in_vec, k1_vec); + out1.val[1] = + neon_vfma_lane_1(out1.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1, out1); + out1 = vld2q_f32(out_row_1 + 2); + out1.val[0] = + neon_vfma_lane_2(out1.val[0], in_vec, k1_vec); + out1.val[1] = + neon_vfma_lane_3(out1.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1 + 2, out1); + + // row 2 + float32x4x2_t out2 = vld2q_f32(out_row_2); + out2.val[0] = + neon_vfma_lane_0(out2.val[0], in_vec, k2_vec); + out2.val[1] = + neon_vfma_lane_1(out2.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2, out2); + out2 = vld2q_f32(out_row_2 + 2); + out2.val[0] = + neon_vfma_lane_2(out2.val[0], in_vec, k2_vec); + out2.val[1] = + neon_vfma_lane_3(out2.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2 + 2, out2); + + // row 3 + float32x4x2_t out3 = vld2q_f32(out_row_3); + out3.val[0] = + neon_vfma_lane_0(out3.val[0], in_vec, k3_vec); + out3.val[1] = + neon_vfma_lane_1(out3.val[1], in_vec, k3_vec); + vst2q_f32(out_row_3, out3); + out3 = vld2q_f32(out_row_3 + 2); + out3.val[0] = + neon_vfma_lane_2(out3.val[0], in_vec, k3_vec); + out3.val[1] = + neon_vfma_lane_3(out3.val[1], in_vec, k3_vec); + vst2q_f32(out_row_3 + 2, out3); + + in += 4; + out_row_0 += 8; + out_row_1 += 8; + out_row_2 += 8; + out_row_3 += 8; + } +#endif + for (; j < w; j++) { + float val = in[0]; + for (int k = 0; k < 4; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k]; + out_row_3[k] += val * k3[k]; + } + in++; + out_row_0 += 2; + out_row_1 += 2; + out_row_2 += 2; + out_row_3 += 2; + } + } + } + } +} + +void GroupDeconv2dNeonK4x4S1(const float *input, + const float *filter, + const int group, + const index_t *in_shape, + const index_t *out_shape, + float *output) { + const index_t w = in_shape[3]; + const index_t h = in_shape[2]; + const index_t inch = in_shape[1]; + + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + const index_t outch = out_shape[1]; + + const index_t in_img_size = h * w; + const index_t out_img_size = outh * outw; + + const index_t inch_g = inch / group; + const index_t outch_g = outch / group; + +#pragma omp parallel for collapse(3) + for (int b = 0; b < out_shape[0]; ++b) { + for (int g = 0; g < group; ++g) { + for (int oc = 0; oc < outch_g; oc += 2) { + if (oc + 1 < outch_g) { + const index_t out_offset = + (b * outch + outch_g * g + oc) * out_img_size; + float *out_base = output + out_offset; + float *out_base1 = out_base + out_img_size; + for (int ic = 0; ic < inch_g; ic++) { + const index_t in_offset = + (b * inch + inch_g * g + ic) * in_img_size; + const float *input_base = input + in_offset; + const float *in = input_base; + const index_t kernel_offset = + ((oc * group + g) * inch_g + ic) * 16; + const float *kernel_base = filter + kernel_offset; + const float *k0 = kernel_base; + const float *k1 = kernel_base + 4; + const float *k2 = kernel_base + 8; + const float *k3 = kernel_base + 12; + + const float *kernel_base1 = kernel_base + inch * 16; + const float *k10 = kernel_base1; + const float *k11 = kernel_base1 + 4; + const float *k12 = kernel_base1 + 8; + const float *k13 = kernel_base1 + 12; +#if defined(MACE_ENABLE_NEON) + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + float32x4_t k3_vec = vld1q_f32(k3); + + float32x4_t k10_vec = vld1q_f32(k10); + float32x4_t k11_vec = vld1q_f32(k11); + float32x4_t k12_vec = vld1q_f32(k12); + float32x4_t k13_vec = vld1q_f32(k13); +#endif + for (int i = 0; i < h; i++) { + float *out_row = out_base + i * outw; + + float *out_row_0 = out_row; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + float *out_row_3 = out_row_2 + outw; + + float *out_row1 = out_base1 + i * outw; + + float *out_row1_0 = out_row1; + float *out_row1_1 = out_row1_0 + outw; + float *out_row1_2 = out_row1_1 + outw; + float *out_row1_3 = out_row1_2 + outw; + + int j = 0; +#if defined(MACE_ENABLE_NEON) + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + float32x4_t out00, out01, out02, out03; + float32x4_t out10, out11, out12, out13; + + out00 = vld1q_f32(out_row_0); + out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); + vst1q_f32(out_row_0, out00); + + out10 = vld1q_f32(out_row1_0); + out10 = neon_vfma_lane_0(out10, in_vec, k10_vec); + vst1q_f32(out_row1_0, out10); + + out01 = vld1q_f32(out_row_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); + vst1q_f32(out_row_0 + 1, out01); + + out11 = vld1q_f32(out_row1_0 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 1, out11); + + out02 = vld1q_f32(out_row_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); + vst1q_f32(out_row_0 + 2, out02); + + out12 = vld1q_f32(out_row1_0 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 2, out12); + + out03 = vld1q_f32(out_row_0 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); + vst1q_f32(out_row_0 + 3, out03); + + out13 = vld1q_f32(out_row1_0 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 3, out13); + + // + out00 = vld1q_f32(out_row_1); + out00 = neon_vfma_lane_0(out00, in_vec, k1_vec); + vst1q_f32(out_row_1, out00); + + out10 = vld1q_f32(out_row1_1); + out10 = neon_vfma_lane_0(out10, in_vec, k11_vec); + vst1q_f32(out_row1_1, out10); + + out01 = vld1q_f32(out_row_1 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k1_vec); + vst1q_f32(out_row_1 + 1, out01); + + out11 = vld1q_f32(out_row1_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 1, out11); + + out02 = vld1q_f32(out_row_1 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k1_vec); + vst1q_f32(out_row_1 + 2, out02); + + out12 = vld1q_f32(out_row1_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 2, out12); + + out03 = vld1q_f32(out_row_1 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k1_vec); + vst1q_f32(out_row_1 + 3, out03); + + out13 = vld1q_f32(out_row1_1 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 3, out13); + + // + out00 = vld1q_f32(out_row_2 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k2_vec); + vst1q_f32(out_row_2 + 0, out00); + + out10 = vld1q_f32(out_row1_2 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 0, out10); + + out01 = vld1q_f32(out_row_2 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k2_vec); + vst1q_f32(out_row_2 + 1, out01); + + out11 = vld1q_f32(out_row1_2 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 1, out11); + + out02 = vld1q_f32(out_row_2 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k2_vec); + vst1q_f32(out_row_2 + 2, out02); + + out12 = vld1q_f32(out_row1_2 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 2, out12); + + out03 = vld1q_f32(out_row_2 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k2_vec); + vst1q_f32(out_row_2 + 3, out03); + + out13 = vld1q_f32(out_row1_2 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 3, out13); + + // + out00 = vld1q_f32(out_row_3 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k3_vec); + vst1q_f32(out_row_3 + 0, out00); + + out10 = vld1q_f32(out_row1_3 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k13_vec); + vst1q_f32(out_row1_3 + 0, out10); + + out01 = vld1q_f32(out_row_3 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k3_vec); + vst1q_f32(out_row_3 + 1, out01); + + out11 = vld1q_f32(out_row1_3 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k13_vec); + vst1q_f32(out_row1_3 + 1, out11); + + out02 = vld1q_f32(out_row_3 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k3_vec); + vst1q_f32(out_row_3 + 2, out02); + + out12 = vld1q_f32(out_row1_3 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k13_vec); + vst1q_f32(out_row1_3 + 2, out12); + + out03 = vld1q_f32(out_row_3 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k3_vec); + vst1q_f32(out_row_3 + 3, out03); + + out13 = vld1q_f32(out_row1_3 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k13_vec); + vst1q_f32(out_row1_3 + 3, out13); + + in += 4; + out_row_0 += 4; + out_row_1 += 4; + out_row_2 += 4; + out_row_3 += 4; + out_row1_0 += 4; + out_row1_1 += 4; + out_row1_2 += 4; + out_row1_3 += 4; + } +#endif + for (; j < w; j++) { + float val = in[0]; + for (int k = 0; k < 4; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k]; + out_row_3[k] += val * k3[k]; + out_row1_0[k] += val * k10[k]; + out_row1_1[k] += val * k11[k]; + out_row1_2[k] += val * k12[k]; + out_row1_3[k] += val * k13[k]; + } + in++; + out_row_0++; + out_row_1++; + out_row_2++; + out_row_3++; + out_row1_0++; + out_row1_1++; + out_row1_2++; + out_row1_3++; + } + } + } + } else { + const index_t out_offset = + (b * outch + outch_g * g + oc) * out_img_size; + float *out_base = output + out_offset; + for (int ic = 0; ic < inch_g; ++ic) { + const index_t in_offset = + (b * inch + inch_g * g + ic) * in_img_size; + const index_t kernel_offset = + ((oc * group + g) * inch_g + ic) * 16; + + const float *input_base = input + in_offset; + const float *kernel_base = filter + kernel_offset; + const float *in = input_base; + const float *k0 = kernel_base; + const float *k1 = kernel_base + 4; + const float *k2 = kernel_base + 8; + const float *k3 = kernel_base + 12; +#if defined(MACE_ENABLE_NEON) + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + float32x4_t k3_vec = vld1q_f32(k3); +#endif + for (int i = 0; i < h; i++) { + float *out_row = out_base + i * outw; + float *out_row_0 = out_row; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + float *out_row_3 = out_row_2 + outw; + int j = 0; +#if defined(MACE_ENABLE_NEON) + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00 = vld1q_f32(out_row_0); + out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); + vst1q_f32(out_row_0, out00); + + float32x4_t out01 = vld1q_f32(out_row_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); + vst1q_f32(out_row_0 + 1, out01); + + float32x4_t out02 = vld1q_f32(out_row_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); + vst1q_f32(out_row_0 + 2, out02); + + float32x4_t out03 = vld1q_f32(out_row_0 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); + vst1q_f32(out_row_0 + 3, out03); + + // + float32x4_t out10 = vld1q_f32(out_row_1); + out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); + vst1q_f32(out_row_1, out10); + + float32x4_t out11 = vld1q_f32(out_row_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); + vst1q_f32(out_row_1 + 1, out11); + + float32x4_t out12 = vld1q_f32(out_row_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); + vst1q_f32(out_row_1 + 2, out12); + + float32x4_t out13 = vld1q_f32(out_row_1 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k1_vec); + vst1q_f32(out_row_1 + 3, out13); + + // + float32x4_t out20 = vld1q_f32(out_row_2 + 0); + out20 = neon_vfma_lane_0(out20, in_vec, k2_vec); + vst1q_f32(out_row_2 + 0, out20); + + float32x4_t out21 = vld1q_f32(out_row_2 + 1); + out21 = neon_vfma_lane_1(out21, in_vec, k2_vec); + vst1q_f32(out_row_2 + 1, out21); + + float32x4_t out22 = vld1q_f32(out_row_2 + 2); + out22 = neon_vfma_lane_2(out22, in_vec, k2_vec); + vst1q_f32(out_row_2 + 2, out22); + + float32x4_t out23 = vld1q_f32(out_row_2 + 3); + out23 = neon_vfma_lane_3(out23, in_vec, k2_vec); + vst1q_f32(out_row_2 + 3, out23); + + // + float32x4_t out30 = vld1q_f32(out_row_3 + 0); + out30 = neon_vfma_lane_0(out30, in_vec, k3_vec); + vst1q_f32(out_row_3 + 0, out30); + + float32x4_t out31 = vld1q_f32(out_row_3 + 1); + out31 = neon_vfma_lane_1(out31, in_vec, k3_vec); + vst1q_f32(out_row_3 + 1, out31); + + float32x4_t out32 = vld1q_f32(out_row_3 + 2); + out32 = neon_vfma_lane_2(out32, in_vec, k3_vec); + vst1q_f32(out_row_3 + 2, out32); + + float32x4_t out33 = vld1q_f32(out_row_3 + 3); + out33 = neon_vfma_lane_3(out33, in_vec, k3_vec); + vst1q_f32(out_row_3 + 3, out33); + + in += 4; + out_row_0 += 4; + out_row_1 += 4; + out_row_2 += 4; + out_row_3 += 4; + } +#endif + for (; j < w; j++) { + float val = in[0]; + for (int k = 0; k < 4; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k]; + out_row_3[k] += val * k3[k]; + } + in++; + out_row_0++; + out_row_1++; + out_row_2++; + out_row_3++; + } + } + } + } + } + } + } +} + +void GroupDeconv2dNeonK4x4S2(const float *input, + const float *filter, + const int group, + const index_t *in_shape, + const index_t *out_shape, + float *output) { + const index_t w = in_shape[3]; + const index_t h = in_shape[2]; + const index_t inch = in_shape[1]; + + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + const index_t outch = out_shape[1]; + const index_t in_img_size = h * w; + const index_t out_img_size = outh * outw; + + const index_t inch_g = inch / group; + const index_t outch_g = outch / group; + +#pragma omp parallel for collapse(3) + for (int b = 0; b < out_shape[0]; ++b) { + for (int g = 0; g < group; ++g) { + for (int oc = 0; oc < outch_g; oc++) { + const index_t out_offset = + (b * outch + outch_g * g + oc) * out_img_size; + float *out_base = output + out_offset; + for (int ic = 0; ic < inch_g; ic++) { + const index_t in_offset = + (b * inch + inch_g * g + ic) * in_img_size; + const index_t kernel_offset = + ((oc * group + g) * inch_g + ic) * 16; + const float *input_base = input + in_offset; + const float *kernel_base = filter + kernel_offset; + const float *in = input_base; + + const float *k0 = kernel_base; + const float *k1 = kernel_base + 4; + const float *k2 = kernel_base + 8; + const float *k3 = kernel_base + 12; +#if defined(MACE_ENABLE_NEON) + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + float32x4_t k3_vec = vld1q_f32(k3); +#endif + for (int i = 0; i < h; i++) { + float *out_row = out_base + 2 * i * outw; + + float *out_row_0 = out_row; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + float *out_row_3 = out_row_2 + outw; + + int j = 0; +#if defined(MACE_ENABLE_NEON) + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + // row 0 + float32x4x2_t out0 = vld2q_f32(out_row_0); + out0.val[0] = + neon_vfma_lane_0(out0.val[0], in_vec, k0_vec); + out0.val[1] = + neon_vfma_lane_1(out0.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0, out0); + out0 = vld2q_f32(out_row_0 + 2); + out0.val[0] = + neon_vfma_lane_2(out0.val[0], in_vec, k0_vec); + out0.val[1] = + neon_vfma_lane_3(out0.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0 + 2, out0); + + // row 1 + float32x4x2_t out1 = vld2q_f32(out_row_1); + out1.val[0] = + neon_vfma_lane_0(out1.val[0], in_vec, k1_vec); + out1.val[1] = + neon_vfma_lane_1(out1.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1, out1); + out1 = vld2q_f32(out_row_1 + 2); + out1.val[0] = + neon_vfma_lane_2(out1.val[0], in_vec, k1_vec); + out1.val[1] = + neon_vfma_lane_3(out1.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1 + 2, out1); + + // row 2 + float32x4x2_t out2 = vld2q_f32(out_row_2); + out2.val[0] = + neon_vfma_lane_0(out2.val[0], in_vec, k2_vec); + out2.val[1] = + neon_vfma_lane_1(out2.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2, out2); + out2 = vld2q_f32(out_row_2 + 2); + out2.val[0] = + neon_vfma_lane_2(out2.val[0], in_vec, k2_vec); + out2.val[1] = + neon_vfma_lane_3(out2.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2 + 2, out2); + + // row 3 + float32x4x2_t out3 = vld2q_f32(out_row_3); + out3.val[0] = + neon_vfma_lane_0(out3.val[0], in_vec, k3_vec); + out3.val[1] = + neon_vfma_lane_1(out3.val[1], in_vec, k3_vec); + vst2q_f32(out_row_3, out3); + out3 = vld2q_f32(out_row_3 + 2); + out3.val[0] = + neon_vfma_lane_2(out3.val[0], in_vec, k3_vec); + out3.val[1] = + neon_vfma_lane_3(out3.val[1], in_vec, k3_vec); + vst2q_f32(out_row_3 + 2, out3); + + in += 4; + out_row_0 += 8; + out_row_1 += 8; + out_row_2 += 8; + out_row_3 += 8; + } +#endif + for (; j < w; j++) { + float val = in[0]; + for (int k = 0; k < 4; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k]; + out_row_3[k] += val * k3[k]; + } + in++; + out_row_0 += 2; + out_row_1 += 2; + out_row_2 += 2; + out_row_3 += 2; + } + } + } + } + } + } +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc index 0bfa8200..001b38a9 100644 --- a/mace/ops/deconv_2d.cc +++ b/mace/ops/deconv_2d.cc @@ -359,12 +359,12 @@ class Deconv2dOp : public Deconv2dOpBase { padded_out_shape.data(), out_data); if (!no_pad) { - CropPadOut(out_data, - padded_out_shape.data(), - output_shape.data(), - pad_h, - pad_w, - output_data); + CropPadOut(out_data, + padded_out_shape.data(), + output_shape.data(), + pad_h, + pad_w, + output_data); } if (bias_data != nullptr) { @@ -445,33 +445,6 @@ class Deconv2dOp : public Deconv2dOpBase { } } } - - void CropPadOut(const float *input, - const index_t *in_shape, - const index_t *out_shape, - const index_t pad_h, - const index_t pad_w, - float *output) { - const index_t batch = in_shape[0]; - const index_t channel = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; -#pragma omp parallel for collapse(3) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channel; ++j) { - for (int k = 0; k < out_height; ++k) { - const float *input_base = - input + ((i * channel + j) * in_height + (k + pad_h)) * in_width; - float *output_base = - output + ((i * channel + j) * out_height + k)* out_width; - memcpy(output_base, input_base + pad_w, out_width * sizeof(float)); - } - } - } - } }; #ifdef MACE_ENABLE_OPENCL diff --git a/mace/ops/deconv_2d.h b/mace/ops/deconv_2d.h index 35dcee8b..1af7362b 100644 --- a/mace/ops/deconv_2d.h +++ b/mace/ops/deconv_2d.h @@ -15,6 +15,8 @@ #ifndef MACE_OPS_DECONV_2D_H_ #define MACE_OPS_DECONV_2D_H_ +#include "mace/core/types.h" + namespace mace { namespace ops { @@ -23,6 +25,34 @@ enum FrameworkType { CAFFE = 1, }; +template +void CropPadOut(const T *input, + const index_t *in_shape, + const index_t *out_shape, + const index_t pad_h, + const index_t pad_w, + T *output) { + const index_t batch = in_shape[0]; + const index_t channel = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; +#pragma omp parallel for collapse(3) + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channel; ++j) { + for (int k = 0; k < out_height; ++k) { + const T *input_base = + input + ((i * channel + j) * in_height + (k + pad_h)) * in_width; + T *output_base = + output + ((i * channel + j) * out_height + k)* out_width; + memcpy(output_base, input_base + pad_w, out_width * sizeof(T)); + } + } + } +} + } // namespace ops } // namespace mace diff --git a/mace/ops/depthwise_deconv2d.cc b/mace/ops/depthwise_deconv2d.cc new file mode 100644 index 00000000..a4b2ba1c --- /dev/null +++ b/mace/ops/depthwise_deconv2d.cc @@ -0,0 +1,547 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/deconv_2d.h" + +#if defined(MACE_ENABLE_NEON) +#include +#endif + +#include +#include +#include +#include + +#include "mace/core/future.h" +#include "mace/core/operator.h" +#include "mace/core/tensor.h" +#include "mace/ops/activation.h" +#include "mace/ops/arm/depthwise_deconv2d_neon.h" +#include "mace/ops/conv_pool_2d_util.h" +#include "mace/utils/utils.h" +#include "mace/public/mace.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/ops/opencl/image/depthwise_deconv2d.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace ops { + +class DepthwiseDeconv2dOpBase : public Operation { + public: + explicit DepthwiseDeconv2dOpBase(OpConstructContext *context) + : Operation(context), + strides_(Operation::GetRepeatedArgs("strides")), + paddings_(Operation::GetRepeatedArgs("padding_values")), + group_(Operation::GetOptionalArg("group", 1)), + activation_(ops::StringToActivationType( + Operation::GetOptionalArg("activation", + "NOOP"))), + relux_max_limit_(Operation::GetOptionalArg("max_limit", + 0.0f)) {} + + static void CalcGroupDeconvOutputSize( + const index_t *input_shape, // NHWC + const index_t *filter_shape, // OIHW + const int group, + const int *strides, + const int *paddings, + int *pre_paddings, + index_t *out_shape, + index_t *padded_out_shape, + const bool isNCHW = false) { + MACE_CHECK_NOTNULL(paddings); + MACE_CHECK_NOTNULL(input_shape); + MACE_CHECK_NOTNULL(filter_shape); + MACE_CHECK_NOTNULL(strides); + + const index_t in_height = isNCHW ? input_shape[2] : input_shape[1]; + const index_t in_width = isNCHW ? input_shape[3] : input_shape[2]; + + const index_t output_channel = filter_shape[0] * group; + + const index_t kernel_h = filter_shape[2]; + const index_t kernel_w = filter_shape[3]; + + index_t padded_out_height = + (in_height - 1) * strides[0] + kernel_h; + index_t padded_out_width = + (in_width - 1) * strides[1] + kernel_w; + + if (pre_paddings != nullptr) { + pre_paddings[0] = static_cast((kernel_h - 1) * 2 - paddings[0]); + pre_paddings[1] = static_cast((kernel_w - 1) * 2 - paddings[1]); + pre_paddings[0] = std::max(0, pre_paddings[0]); + pre_paddings[1] = std::max(0, pre_paddings[1]); + } + + if (padded_out_shape != nullptr) { + padded_out_shape[0] = input_shape[0]; + padded_out_shape[1] = isNCHW ? output_channel : padded_out_height; + padded_out_shape[2] = isNCHW ? padded_out_height : padded_out_width; + padded_out_shape[3] = isNCHW ? padded_out_width : output_channel; + } + + if (out_shape != nullptr) { + index_t out_height = padded_out_height - paddings[0]; + index_t out_width = padded_out_width - paddings[1]; + out_shape[0] = input_shape[0]; + out_shape[1] = isNCHW ? output_channel : out_height; + out_shape[2] = isNCHW ? out_height : out_width; + out_shape[3] = isNCHW ? out_width : output_channel; + } + } + + protected: + std::vector strides_; // [stride_h, stride_w] + std::vector paddings_; + const int group_; + const ActivationType activation_; + const float relux_max_limit_; +}; + + + +template +class DepthwiseDeconv2dOp; + +template<> +class DepthwiseDeconv2dOp + : public DepthwiseDeconv2dOpBase { + public: + explicit DepthwiseDeconv2dOp(OpConstructContext *context) + : DepthwiseDeconv2dOpBase(context) {} + + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + const Tensor *filter = this->Input(1); + const Tensor *bias = this->InputSize() >= 3 ? this->Input(2) : nullptr; + Tensor *output = this->Output(0); + + MACE_CHECK_NOTNULL(input); + MACE_CHECK_NOTNULL(filter); + MACE_CHECK_NOTNULL(output); + + std::vector out_paddings(2, 0); + std::vector out_shape(4, 0); + std::vector padded_out_shape(4, 0); + + if (!paddings_.empty()) out_paddings = paddings_; + CalcGroupDeconvOutputSize(input->shape().data(), + filter->shape().data(), + group_, + strides_.data(), + out_paddings.data(), + nullptr, + out_shape.data(), + padded_out_shape.data(), + true); + MACE_RETURN_IF_ERROR(output->Resize(out_shape)); + output->Clear(); + index_t kernel_h = filter->dim(2); + index_t kernel_w = filter->dim(3); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard bias_mapper(bias); + Tensor::MappingGuard output_mapper(output); + auto input_data = input->data(); + auto filter_data = filter->data(); + auto bias_data = bias == nullptr ? nullptr : bias->data(); + + auto output_data = output->mutable_data(); + + const index_t pad_left = out_paddings[0] / 2; + const index_t pad_top = out_paddings[1] / 2; + + index_t padded_out_size = + std::accumulate(padded_out_shape.begin(), + padded_out_shape.end(), + 1, + std::multiplies()) * sizeof(float); + ScratchBuffer *scratch = context->device()->scratch_buffer(); + scratch->Rewind(); + scratch->GrowSize(padded_out_size); + Tensor padded_out(scratch->Scratch(padded_out_size), DT_FLOAT); + padded_out.Reshape(padded_out_shape); + padded_out.Clear(); + auto *padded_out_data = padded_out.mutable_data(); + + const index_t in_channels = input->dim(1); + const index_t out_channels = output->dim(1); + + bool no_pad = paddings_[0] == 0 && paddings_[1] == 0; + float *out_data = no_pad ? output_data : padded_out_data; + + bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 && + strides_[0] == strides_[1] && strides_[0] == 1; + bool use_neon_3x3_s2 = kernel_h == kernel_w && kernel_h == 3 && + strides_[0] == strides_[1] && strides_[0] == 2; + bool use_neon_4x4_s1 = kernel_h == kernel_w && kernel_h == 4 && + strides_[0] == strides_[1] && strides_[0] == 1; + bool use_neon_4x4_s2 = kernel_h == kernel_w && kernel_h == 4 && + strides_[0] == strides_[1] && strides_[0] == 2; + + bool is_depthwise = (group_ == in_channels && group_ == out_channels); + + std::function kernel_func; + + if (use_neon_3x3_s1) { + kernel_func = [=](const float *input, + const float *filter, + const int group, + const index_t *in_shape, + const index_t *padded_out_shape, + float *padded_output) { + if (is_depthwise) { + DepthwiseDeconv2dNeonK3x3S1(input, + filter, + in_shape, + padded_out_shape, + padded_output); + } else { + GroupDeconv2dNeonK3x3S1(input, + filter, + group, + in_shape, + padded_out_shape, + padded_output); + } + }; + } else if (use_neon_3x3_s2) { + kernel_func = [=](const float *input, + const float *filter, + const int group, + const index_t *in_shape, + const index_t *padded_out_shape, + float *padded_output) { + if (is_depthwise) { + DepthwiseDeconv2dNeonK3x3S2(input, + filter, + in_shape, + padded_out_shape, + padded_output); + } else { + GroupDeconv2dNeonK3x3S2(input, + filter, + group, + in_shape, + padded_out_shape, + padded_output); + } + }; + } else if (use_neon_4x4_s1) { + kernel_func = [=](const float *input, + const float *filter, + const int group, + const index_t *in_shape, + const index_t *padded_out_shape, + float *padded_output) { + if (is_depthwise) { + DepthwiseDeconv2dNeonK4x4S1(input, + filter, + in_shape, + padded_out_shape, + padded_output); + } else { + GroupDeconv2dNeonK4x4S1(input, + filter, + group, + in_shape, + padded_out_shape, + padded_output); + } + }; + } else if (use_neon_4x4_s2) { + kernel_func = [=](const float *input, + const float *filter, + const int group, + const index_t *in_shape, + const index_t *padded_out_shape, + float *padded_output) { + if (is_depthwise) { + DepthwiseDeconv2dNeonK4x4S2(input, + filter, + in_shape, + padded_out_shape, + padded_output); + } else { + GroupDeconv2dNeonK4x4S2(input, + filter, + group, + in_shape, + padded_out_shape, + padded_output); + } + }; + } else { + kernel_func = [=](const float *input, + const float *filter, + const int group, + const index_t *in_shape, + const index_t *padded_out_shape, + float *padded_output) { + if (is_depthwise) { + DepthwiseDeconv2dGeneral(input, + filter, + kernel_h, + kernel_w, + strides_.data(), + in_shape, + padded_out_shape, + padded_output); + } else { + GroupDeconv2dGeneral(input, + filter, + kernel_h, + kernel_w, + strides_.data(), + group, + in_shape, + padded_out_shape, + padded_output); + } + }; + } + + kernel_func(input_data, + filter_data, + group_, + input->shape().data(), + padded_out_shape.data(), + out_data); + + + if (!no_pad) { + CropPadOut(out_data, + padded_out_shape.data(), + out_shape.data(), + pad_left, + pad_top, + output_data); + } + + if (bias_data != nullptr) { + const index_t batch = out_shape[0]; + const index_t channels = out_shape[1]; + const index_t img_size = out_shape[2] * out_shape[3]; +#pragma omp parallel for collapse(3) + for (index_t b = 0; b < batch; ++b) { + for (index_t c = 0; c < channels; ++c) { + for (index_t i = 0; i < img_size; ++i) { + output_data[(b * channels + c) * img_size + i] += + bias_data[c]; + } + } + } + } + + DoActivation(output_data, + output_data, + output->size(), + activation_, + relux_max_limit_); + + return MaceStatus::MACE_SUCCESS; + } + + private: + void DepthwiseDeconv2dGeneral(const float *input, + const float *filter, + const index_t kernel_h, + const index_t kernel_w, + const int *strides, + const index_t *in_shape, + const index_t *out_shape, + float *output) { + const index_t batch = in_shape[0]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + + const index_t out_img_size = out_height * out_width; + const index_t in_img_size = in_height * in_width; + + const int kernel_size = kernel_h * kernel_w; + std::vector index_map(kernel_size, 0); + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + index_map[i * kernel_w + j] = i * out_width + j; + } + } + +#pragma omp parallel for collapse(2) + for (int b = 0; b < batch; ++b) { + for (int c = 0; c < channels; ++c) { + float *out_base = + output + (b * channels + c) * out_img_size; + for (int i = 0; i < in_height; ++i) { + for (int j = 0; j < in_width; ++j) { + const index_t out_offset = + i * strides[0] * out_width + j * strides[1]; + const index_t input_idx = + (b * channels + c) * in_img_size + i * in_width + j; + const float val = input[input_idx]; + const index_t kernel_offset = c * kernel_size; + for (int k = 0; k < kernel_size; ++k) { + const index_t out_idx = out_offset + index_map[k]; + const index_t kernel_idx = kernel_offset + k; + out_base[out_idx] += val * filter[kernel_idx]; + } + } + } + } + } + } + + void GroupDeconv2dGeneral(const float *input, + const float *filter, + const index_t kernel_h, + const index_t kernel_w, + const int *strides, + const int group, + const index_t *in_shape, + const index_t *out_shape, + float *output) { + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + + MACE_CHECK(in_channels % group == 0 && out_channels % group == 0, + "invalid input/output channel and group."); + + const index_t out_img_size = out_height * out_width; + const index_t in_img_size = in_height * in_width; + + const int kernel_size = kernel_h * kernel_w; + std::vector index_map(kernel_size, 0); + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + index_map[i * kernel_w + j] = i * out_width + j; + } + } + + const int in_channels_g = in_channels / group; + const int out_channels_g = out_channels / group; +#pragma omp parallel for collapse(3) + for (int b = 0; b < in_shape[0]; ++b) { + for (int g = 0; g < group; ++g) { + for (int p = 0; p < out_channels_g; ++p) { + const index_t out_base = + ((b * group + g) * out_channels_g + p) * out_img_size; + for (int i = 0; i < in_height; ++i) { + for (int j = 0; j < in_width; ++j) { + const index_t out_offset = + i * strides[0] * out_width + j * strides[1]; + for (int q = 0; q < in_channels_g; ++q) { + const index_t in_base = + ((b * group + g) * in_channels_g + q) * in_img_size; + const index_t in_offset = + in_base + i * in_width + j; + const float val = input[in_offset]; + const index_t k_offset = + ((p * group + g) * in_channels_g + q) * kernel_size; + for (int k = 0; k < kernel_size; ++k) { + const index_t out_idx = out_base + out_offset + index_map[k]; + const float w = filter[k_offset + k]; + output[out_idx] += val * w; + } + } + } + } + } + } + } + } +}; + +#ifdef MACE_ENABLE_OPENCL +template +class DepthwiseDeconv2dOp : public DepthwiseDeconv2dOpBase { + public: + explicit DepthwiseDeconv2dOp(OpConstructContext *context) + : DepthwiseDeconv2dOpBase(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::DepthwiseDeconv2dKernel); + } else { + MACE_NOT_IMPLEMENTED; + } + } + + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + const Tensor *filter = this->Input(1); + const Tensor *bias = this->InputSize() >= 3 ? this->Input(2) : nullptr; + Tensor *output = this->Output(0); + MACE_CHECK_NOTNULL(input); + MACE_CHECK_NOTNULL(filter); + MACE_CHECK_NOTNULL(output); + + std::vector in_paddings(2, 0); + std::vector out_shape(4, 0); + + CalcGroupDeconvOutputSize(input->shape().data(), + filter->shape().data(), + group_, + strides_.data(), + paddings_.data(), + in_paddings.data(), + out_shape.data(), + nullptr); + + return kernel_->Compute(context, + input, + filter, + bias, + strides_.data(), + in_paddings.data(), + group_, + activation_, + relux_max_limit_, + out_shape, + output); + } + + private: + std::unique_ptr kernel_; +}; +#endif // MACE_ENABLE_OPENCL + +void RegisterDepthwiseDeconv2d(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d", + DepthwiseDeconv2dOp, DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d", + DepthwiseDeconv2dOp, DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d", + DepthwiseDeconv2dOp, DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/depthwise_deconv2d_benchmark.cc b/mace/ops/depthwise_deconv2d_benchmark.cc new file mode 100644 index 00000000..3e3da26f --- /dev/null +++ b/mace/ops/depthwise_deconv2d_benchmark.cc @@ -0,0 +1,130 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "mace/core/operator.h" +#include "mace/core/testing/test_benchmark.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +template +static void DepthwiseDeconv2d(int iters, + int batch, + int channels, + int height, + int width, + int kernel_h, + int kernel_w, + int stride, + int padding) { + mace::testing::StopTiming(); + + OpsTestNet net; + + // Add input data + if (D == DeviceType::CPU) { + net.AddRandomInput("Input", {batch, channels, height, width}); + } else { + net.AddRandomInput("Input", {batch, height, width, channels}); + } + net.AddRandomInput("Filter", + {1, channels, kernel_h, + kernel_w}); + if (D == DeviceType::GPU) { + BufferToImage(&net, "Input", "InputImage", + ops::BufferType::IN_OUT_CHANNEL); + BufferToImage(&net, "Filter", "FilterImage", + ops::BufferType::DW_CONV2D_FILTER); + OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Output("Output") + .AddIntsArg("strides", {stride, stride}) + .AddIntsArg("padding_values", {padding, padding}) + .AddIntArg("group", channels) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + } else { + OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest") + .Input("Input") + .Input("Filter") + .Output("Output") + .AddIntsArg("strides", {stride, stride}) + .AddIntsArg("padding_values", {padding, padding}) + .AddIntArg("group", channels) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + } + + net.Setup(D); + + // Warm-up + for (int i = 0; i < 2; ++i) { + net.Run(); + net.Sync(); + } + + mace::testing::StartTiming(); + while (iters--) { + net.Run(); + net.Sync(); + } +} + +// In common network, there are usually more than 1 layers, this is used to +// approximate the amortized latency. The OpenCL runtime for Mali/Adreno is +// in-order. + +#define MACE_BM_DEPTHWISE_DECONV2D_MACRO( \ + N, C, H, W, KH, KW, S, P, TYPE, DEVICE) \ + static void \ + MACE_BM_DEPTHWISE_DECONV2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##S##_##P\ + ##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + const int64_t macc = \ + static_cast(iters) * N * H * W * KH * KW * C; \ + mace::testing::MaccProcessed(macc); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + DepthwiseDeconv2d(iters, N, C, H, W, KH, KW, S, P); \ + } \ + MACE_BENCHMARK( \ + MACE_BM_DEPTHWISE_DECONV2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##S##_##P\ + ##_##TYPE##_##DEVICE) + +#define MACE_BM_DEPTHWISE_DECONV2D(N, C, H, W, KH, KW, S, P) \ + MACE_BM_DEPTHWISE_DECONV2D_MACRO(N, C, H, W, KH, KW, S, P, float, CPU); \ + MACE_BM_DEPTHWISE_DECONV2D_MACRO(N, C, H, W, KH, KW, S, P, float, GPU); \ + MACE_BM_DEPTHWISE_DECONV2D_MACRO(N, C, H, W, KH, KW, S, P, half, GPU); + +MACE_BM_DEPTHWISE_DECONV2D(1, 128, 15, 15, 1, 1, 1, 0); +MACE_BM_DEPTHWISE_DECONV2D(1, 32, 60, 60, 1, 1, 1, 0); + +MACE_BM_DEPTHWISE_DECONV2D(1, 32, 60, 60, 3, 3, 1, 0); + +MACE_BM_DEPTHWISE_DECONV2D(1, 128, 60, 60, 4, 4, 1, 0); +MACE_BM_DEPTHWISE_DECONV2D(1, 3, 224, 224, 4, 4, 2, 0); +MACE_BM_DEPTHWISE_DECONV2D(1, 3, 512, 512, 7, 7, 2, 0); +MACE_BM_DEPTHWISE_DECONV2D(1, 128, 16, 16, 5, 5, 1, 0); + +MACE_BM_DEPTHWISE_DECONV2D(1, 64, 32, 32, 1, 1, 1, 0); +MACE_BM_DEPTHWISE_DECONV2D(1, 64, 33, 32, 3, 3, 2, 0); +MACE_BM_DEPTHWISE_DECONV2D(1, 3, 224, 224, 3, 3, 2, 0); +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/depthwise_deconv2d_test.cc b/mace/ops/depthwise_deconv2d_test.cc new file mode 100644 index 00000000..b1f36845 --- /dev/null +++ b/mace/ops/depthwise_deconv2d_test.cc @@ -0,0 +1,286 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +class DepthwiseDeconv2dOpTest : public OpsTestBase {}; + +namespace { +template +void RunTestSimple(const int group, + const std::vector &input_shape, + const std::vector &input_data, + const std::vector &bias_data, + const int stride, + const std::vector &paddings, + const std::vector &filter_shape, + const std::vector &filter_data, + const std::vector &expected_shape, + const std::vector &expected_data) { + OpsTestNet net; + // Add input data + net.AddInputFromArray("Input", input_shape, input_data); + net.AddInputFromArray("Filter", filter_shape, filter_data); + net.TransformDataFormat("Filter", HWOI, "FilterOIHW", OIHW); + const index_t out_channels = expected_shape[3]; + net.AddInputFromArray("Bias", {out_channels}, bias_data); + + if (D == DeviceType::GPU) { + BufferToImage(&net, "Input", "InputImage", + ops::BufferType::IN_OUT_CHANNEL); + BufferToImage(&net, "FilterOIHW", "FilterImage", + ops::BufferType::DW_CONV2D_FILTER); + BufferToImage(&net, "Bias", "BiasImage", + ops::BufferType::ARGUMENT); + OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {stride, stride}) + .AddIntArg("group", group) + .AddIntsArg("padding_values", paddings) + .Finalize(net.NewOperatorDef()); + + net.RunOp(D); + + // Transfer output + ImageToBuffer(&net, "OutputImage", "Output", + ops::BufferType::IN_OUT_CHANNEL); + } else { + net.TransformDataFormat("Input", NHWC, + "InputNCHW", NCHW); + OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest") + .Input("InputNCHW") + .Input("FilterOIHW") + .Input("Bias") + .Output("OutputNCHW") + .AddIntArg("group", group) + .AddIntsArg("strides", {stride, stride}) + .AddIntsArg("padding_values", paddings) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + net.TransformDataFormat("OutputNCHW", NCHW, + "Output", NHWC); + } + + auto expected = net.CreateTensor(expected_shape, expected_data); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.0001); +} + +template +void TestNHWCSimple3x3_DW() { + RunTestSimple(3, + {1, 3, 3, 3}, + {1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 0}, + 1, {0, 0}, + {3, 3, 1, 3}, + {1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1}, + {1, 5, 5, 3}, + {1, 1, 1, 2, 2, 2, 3, 3, 3, 2, 2, 2, 1, 1, 1, + 2, 2, 2, 4, 4, 4, 6, 6, 6, 4, 4, 4, 2, 2, 2, + 3, 3, 3, 6, 6, 6, 9, 9, 9, 6, 6, 6, 3, 3, 3, + 2, 2, 2, 4, 4, 4, 6, 6, 6, 4, 4, 4, 2, 2, 2, + 1, 1, 1, 2, 2, 2, 3, 3, 3, 2, 2, 2, 1, 1, 1}); +} + +template +void TestNHWCSimple3x3_Group() { + RunTestSimple(2, + {1, 3, 3, 4}, + {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4}, + {0, 0, 0, 0, 0, 0}, + 1, {0, 0}, + {3, 3, 3, 4}, + {1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, + 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, + 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, + 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, + 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, + 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, + 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, + 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, + 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1}, + {1, 5, 5, 6}, + {3, 6, 3, 7, 14, 7, + 6, 12, 6, 14, 28, 14, + 9, 18, 9, 21, 42, 21, + 6, 12, 6, 14, 28, 14, + 3, 6, 3, 7, 14, 7, + 6, 12, 6, 14, 28, 14, + 12, 24, 12, 28, 56, 28, + 18, 36, 18, 42, 84, 42, + 12, 24, 12, 28, 56, 28, + 6, 12, 6, 14, 28, 14, + 9, 18, 9, 21, 42, 21, + 18, 36, 18, 42, 84, 42, + 27, 54, 27, 63, 126, 63, + 18, 36, 18, 42, 84, 42, + 9, 18, 9, 21, 42, 21, + 6, 12, 6, 14, 28, 14, + 12, 24, 12, 28, 56, 28, + 18, 36, 18, 42, 84, 42, + 12, 24, 12, 28, 56, 28, + 6, 12, 6, 14, 28, 14, + 3, 6, 3, 7, 14, 7, + 6, 12, 6, 14, 28, 14, + 9, 18, 9, 21, 42, 21, + 6, 12, 6, 14, 28, 14, + 3, 6, 3, 7, 14, 7}); +} +} // namespace + +TEST_F(DepthwiseDeconv2dOpTest, CPUSimple3X3Depthwise) { + TestNHWCSimple3x3_DW(); +} + +TEST_F(DepthwiseDeconv2dOpTest, CPUSimple3X3Group) { +TestNHWCSimple3x3_Group(); +} + +TEST_F(DepthwiseDeconv2dOpTest, GPUSimple3X3Depthwise) { +TestNHWCSimple3x3_DW(); +} + +namespace { +template +void RandomTest(index_t batch, + index_t channel, + index_t height, + index_t width, + index_t kernel, + int stride, + int padding) { + testing::internal::LogToStderr(); + // Construct graph + OpsTestNet net; + int multiplier = 1; + + // Add input data + std::vector input_data(batch * height * width * channel); + GenerateRandomRealTypeData({batch, height, width, channel}, &input_data); + net.AddInputFromArray("Input", + {batch, + height, + width, + channel}, + input_data); + std::vector filter_data(kernel * kernel * channel * multiplier); + GenerateRandomRealTypeData({multiplier, channel, kernel, kernel}, + &filter_data); + net.AddInputFromArray( + "Filter", {multiplier, channel, kernel, kernel}, filter_data); + std::vector bias_data(channel * multiplier); + GenerateRandomRealTypeData({channel * multiplier}, &bias_data); + net.AddInputFromArray("Bias", + {channel * multiplier}, + bias_data); + + net.TransformDataFormat("Input", NHWC, "InputNCHW", + NCHW); + OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest") + .Input("InputNCHW") + .Input("Filter") + .Input("Bias") + .Output("OutputNCHW") + .AddIntsArg("strides", {stride, stride}) + .AddIntsArg("padding_values", {padding, padding}) + .AddIntArg("group", channel) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(DeviceType::CPU); + net.TransformDataFormat("OutputNCHW", NCHW, + "Output", NHWC); + + + // Check + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); + + + BufferToImage(&net, "Input", "InputImage", + ops::BufferType::IN_OUT_CHANNEL); + BufferToImage(&net, "Filter", "FilterImage", + ops::BufferType::DW_CONV2D_FILTER); + BufferToImage(&net, "Bias", "BiasImage", + ops::BufferType::ARGUMENT); + OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {stride, stride}) + .AddIntsArg("padding_values", {padding, padding}) + .AddIntArg("group", channel) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + + net.RunOp(DeviceType::GPU); + + // Transfer output + ImageToBuffer(&net, "OutputImage", "OPENCLOutput", + ops::BufferType::IN_OUT_CHANNEL); + + if (DataTypeToEnum::value == DT_FLOAT) { + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); + } else { + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2); + } +} + +TEST_F(DepthwiseDeconv2dOpTest, RandomTestFloat) { + RandomTest(1, 32, 256, 256, 5, 1, 2); + RandomTest(1, 3, 256, 256, 5, 1, 1); + RandomTest(1, 3, 256, 256, 5, 2, 2); + RandomTest(1, 3, 256, 256, 5, 1, 3); + RandomTest(1, 3, 256, 256, 5, 2, 4); + RandomTest(1, 4, 256, 256, 5, 1, 1); + RandomTest(1, 4, 256, 256, 5, 2, 2); + RandomTest(1, 4, 256, 256, 5, 1, 3); + RandomTest(1, 4, 256, 256, 5, 2, 4); +} +// +TEST_F(DepthwiseDeconv2dOpTest, RandomTestHalf) { + RandomTest(1, 32, 256, 256, 5, 1, 2); + RandomTest(1, 3, 256, 256, 5, 1, 1); + RandomTest(1, 3, 256, 256, 5, 2, 2); + RandomTest(1, 3, 256, 256, 5, 1, 3); + RandomTest(1, 3, 256, 256, 5, 2, 4); + RandomTest(1, 4, 256, 256, 5, 1, 1); + RandomTest(1, 4, 256, 256, 5, 2, 2); + RandomTest(1, 4, 256, 256, 5, 1, 3); + RandomTest(1, 4, 256, 256, 5, 2, 4); +} + +} // namespace +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/cl/deconv_2d.cl b/mace/ops/opencl/cl/deconv_2d.cl index ae08b409..d39d3fe3 100644 --- a/mace/ops/opencl/cl/deconv_2d.cl +++ b/mace/ops/opencl/cl/deconv_2d.cl @@ -163,4 +163,5 @@ __kernel void deconv_2d(OUT_OF_RANGE_PARAMS out_pos.x += stride_w; WRITE_IMAGET(output, out_pos, out4); } -} \ No newline at end of file +} + diff --git a/mace/ops/opencl/cl/depthwise_deconv2d.cl b/mace/ops/opencl/cl/depthwise_deconv2d.cl new file mode 100644 index 00000000..9d648b65 --- /dev/null +++ b/mace/ops/opencl/cl/depthwise_deconv2d.cl @@ -0,0 +1,149 @@ +#include + +__kernel void depthwise_deconv2d(OUT_OF_RANGE_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 + __read_only image2d_t input, + __read_only image2d_t weights, +#ifdef BIAS + __read_only image2d_t bias, +#endif + __write_only image2d_t output, + __private const float relux_max_limit, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int out_width, + __private const int out_channel, + __private const int stride_h, + __private const int stride_w, + __private const float stride_h_r, + __private const float stride_w_r, + __private const int align_h, + __private const int align_w, + __private const int padding_h, + __private const int padding_w, + __private const int kernel_h, + __private const int kernel_w, + __private const int kernel_size, + __private const int out_channel_blocks) +{ + const int c = get_global_id(0); + const int w_id = get_global_id(1); + const int hb = get_global_id(2); + +#ifndef NON_UNIFORM_WORK_GROUP + if (c >= global_size_dim0 || w_id >= global_size_dim1 + || hb >= global_size_dim2) { + return; + } +#endif + +#ifdef BIAS + DATA_TYPE4 out0 = + READ_IMAGET(bias, SAMPLER, (int2)(c, 0)); + DATA_TYPE4 out1 = out0; + DATA_TYPE4 out2 = out0; + DATA_TYPE4 out3 = out0; + DATA_TYPE4 out4 = out0; +#else + DATA_TYPE4 out0 = 0; + DATA_TYPE4 out1 = 0; + DATA_TYPE4 out2 = 0; + DATA_TYPE4 out3 = 0; + DATA_TYPE4 out4 = 0; +#endif + const int n_stride = mad(w_id, stride_w_r, 0); + const int mod_stride = w_id - mul24(n_stride, stride_w); + const int w = mad24(mul24(n_stride, 5), stride_w, mod_stride); + const int b = hb / out_height; + const int h = hb - mul24(b, out_height); + if (w < out_width) { + + int start_x = floor((float) (w + align_w) * stride_w_r); + int start_y = (h + align_h) * stride_h_r; + start_y = max(0, start_y); + + int f_start_x = mad24(start_x, stride_w, padding_w) - w; + int f_start_y = mad24(start_y, stride_h, padding_h) - h; + f_start_x = kernel_w - 1 - f_start_x; + f_start_y = kernel_h - 1 - f_start_y; + + int2 in_pos; + int f_pos; + DATA_TYPE4 in0, in1, in2, in3, in4; + DATA_TYPE4 weight; + int idx_w0, idx_w1, idx_w2, idx_w3, idx_w4; + int index_x, index_y; + for (int f_y = f_start_y, idx_h = start_y ; f_y >= 0; f_y -= stride_h, ++idx_h) { + index_y = mad24(b, in_height, idx_h); + in_pos.y = select(index_y, -1, idx_h < 0 || idx_h >= in_height); + for (int f_x = f_start_x, idx_w = start_x; f_x >= 0; f_x -= stride_w, ++idx_w) { + idx_w0 = idx_w; + idx_w1 = idx_w + 1; + idx_w2 = idx_w + 2; + idx_w3 = idx_w + 3; + idx_w4 = idx_w + 4; + +#define READ_INPUT(i) \ + index_x = mad24(c, in_width, idx_w##i); \ + in_pos.x = \ + select(index_x, -1, idx_w##i < 0 || idx_w##i >= in_width); \ + in##i = READ_IMAGET(input, SAMPLER, in_pos); + + READ_INPUT(0); + READ_INPUT(1); + READ_INPUT(2); + READ_INPUT(3); + READ_INPUT(4); +#undef READ_INPUT + + f_pos = mad24(f_y, kernel_w, f_x); + weight = READ_IMAGET(weights, SAMPLER, (int2)(f_pos, c)); + out0 = mad(in0, weight, out0); + out1 = mad(in1, weight, out1); + out2 = mad(in2, weight, out2); + out3 = mad(in3, weight, out3); + out4 = mad(in4, weight, out4); + + } + } + +#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID) + out0 = do_activation(out0, relux_max_limit); + out1 = do_activation(out1, relux_max_limit); + out2 = do_activation(out2, relux_max_limit); + out3 = do_activation(out3, relux_max_limit); + out4 = do_activation(out4, relux_max_limit); +#endif + + + + int2 out_pos; + out_pos.y = hb; + + int ow = w; + if (ow >= out_width) return; + out_pos.x = mad24(c, out_width, ow); + WRITE_IMAGET(output, out_pos, out0); + + ow += stride_w; + if (ow >= out_width) return; + out_pos.x += stride_w; + WRITE_IMAGET(output, out_pos, out1); + + ow += stride_w; + if (ow >= out_width) return; + out_pos.x += stride_w; + WRITE_IMAGET(output, out_pos, out2); + + ow += stride_w; + if (ow >= out_width) return; + out_pos.x += stride_w; + WRITE_IMAGET(output, out_pos, out3); + + ow += stride_w; + if (ow >= out_width) return; + out_pos.x += stride_w; + WRITE_IMAGET(output, out_pos, out4); + } +} \ No newline at end of file diff --git a/mace/ops/opencl/depthwise_deconv2d.h b/mace/ops/opencl/depthwise_deconv2d.h new file mode 100644 index 00000000..994c98a2 --- /dev/null +++ b/mace/ops/opencl/depthwise_deconv2d.h @@ -0,0 +1,49 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_OPENCL_DEPTHWISE_DECONV2D_H_ +#define MACE_OPS_OPENCL_DEPTHWISE_DECONV2D_H_ + +#include + +#include "mace/ops/activation.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace ops { + +class OpenCLDepthwiseDeconv2dKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const int *padding_data, + const int group, + const ActivationType activation, + const float relux_max_limit, + const std::vector &output_shape, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLDepthwiseDeconv2dKernel); +}; + +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_OPENCL_DEPTHWISE_DECONV2D_H_ diff --git a/mace/ops/opencl/image/deconv_2d.h b/mace/ops/opencl/image/deconv_2d.h index c4dfa2bf..a0f51874 100644 --- a/mace/ops/opencl/image/deconv_2d.h +++ b/mace/ops/opencl/image/deconv_2d.h @@ -79,10 +79,10 @@ MaceStatus Deconv2dKernel::Compute( const int stride_h = strides[0]; const int stride_w = strides[1]; MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0."); -#define MACE_WIDTH_BLK 5 + const int width_tile = 5; const index_t n_strides = (width + stride_w - 1) / stride_w; const index_t width_blocks = - ((n_strides + MACE_WIDTH_BLK - 1) / MACE_WIDTH_BLK) * stride_w; + ((n_strides + width_tile - 1) / width_tile) * stride_w; const float stride_h_r = 1.f / static_cast(stride_h); const float stride_w_r = 1.f / static_cast(stride_w); const int padding_h = (padding_data[0] + 1) >> 1; diff --git a/mace/ops/opencl/image/depthwise_deconv2d.h b/mace/ops/opencl/image/depthwise_deconv2d.h new file mode 100644 index 00000000..040c349d --- /dev/null +++ b/mace/ops/opencl/image/depthwise_deconv2d.h @@ -0,0 +1,196 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef MACE_OPS_OPENCL_IMAGE_DEPTHWISE_DECONV2D_H_ +#define MACE_OPS_OPENCL_IMAGE_DEPTHWISE_DECONV2D_H_ + +#include "mace/ops/opencl/depthwise_deconv2d.h" + +#include +#include +#include +#include + +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" +#include "mace/ops/opencl/helper.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +template +class DepthwiseDeconv2dKernel : public OpenCLDepthwiseDeconv2dKernel { + public: + MaceStatus Compute( + OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const int *padding_data, + const int group, + const ActivationType activation, + const float relux_max_limit, + const std::vector &output_shape, + Tensor *output) override; + + private: + cl::Kernel kernel_; + uint32_t kwg_size_; + std::vector input_shape_; +}; + +template +MaceStatus DepthwiseDeconv2dKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const int *padding_data, + const int group, + const ActivationType activation, + const float relux_max_limit, + const std::vector &output_shape, + Tensor *output) { + const index_t batch = output_shape[0]; + const index_t height = output_shape[1]; + const index_t width = output_shape[2]; + const index_t channels = output_shape[3]; + const index_t input_channels = input->dim(3); + const index_t multiplier = filter->dim(0); + + MACE_CHECK(group == channels && group == input_channels && multiplier == 1, + "opencl image deconv only supports depthwise type group."); + + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + const DataType dt = DataTypeToEnum::value; + + const index_t channel_blocks = RoundUpDiv4(channels); + const int stride_h = strides[0]; + const int stride_w = strides[1]; + MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0."); + const int width_tile = 5; + const index_t n_strides = (width + stride_w - 1) / stride_w; + const index_t width_blocks = + ((n_strides + width_tile - 1) / width_tile) * stride_w; + const float stride_h_r = 1.f / static_cast(stride_h); + const float stride_w_r = 1.f / static_cast(stride_w); + const int padding_h = (padding_data[0] + 1) >> 1; + const int padding_w = (padding_data[1] + 1) >> 1; + + const int align_h = stride_h - 1 - padding_h; + const int align_w = stride_w - 1 - padding_w; + const int kernel_size = filter->dim(2) * filter->dim(3); + + auto runtime = context->device()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_deconv2d"); + built_options.emplace("-Ddepthwise_deconv2d=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + built_options.emplace(bias != nullptr ? "-DBIAS" : ""); + switch (activation) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + default: + LOG(FATAL) << "Unknown activation type: " << activation; + } + + MACE_RETURN_IF_ERROR(runtime->BuildKernel("depthwise_deconv2d", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width_blocks), + static_cast(height * batch)}; + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(filter->opencl_image())); + if (bias != nullptr) { + kernel_.setArg(idx++, *(bias->opencl_image())); + } + kernel_.setArg(idx++, *(output->opencl_image())); + kernel_.setArg(idx++, relux_max_limit); + kernel_.setArg(idx++, static_cast(input->dim(1))); + kernel_.setArg(idx++, static_cast(input->dim(2))); + kernel_.setArg(idx++, static_cast(height)); + kernel_.setArg(idx++, static_cast(width)); + kernel_.setArg(idx++, static_cast(channels)); + kernel_.setArg(idx++, static_cast(stride_h)); + kernel_.setArg(idx++, static_cast(stride_w)); + kernel_.setArg(idx++, stride_h_r); + kernel_.setArg(idx++, stride_w_r); + kernel_.setArg(idx++, static_cast(align_h)); + kernel_.setArg(idx++, static_cast(align_w)); + kernel_.setArg(idx++, static_cast(padding_h)); + kernel_.setArg(idx++, static_cast(padding_w)); + kernel_.setArg(idx++, static_cast(filter->dim(2))); + kernel_.setArg(idx++, static_cast(filter->dim(3))); + kernel_.setArg(idx++, static_cast(kernel_size)); + kernel_.setArg(idx++, static_cast(channel_blocks)); + + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("depthwise_deconv2d_kernel_", + activation, + output->dim(0), + output->dim(1), + output->dim(2), + output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_OPENCL_IMAGE_DEPTHWISE_DECONV2D_H_ diff --git a/mace/ops/ops_registry.cc b/mace/ops/ops_registry.cc index e330d66a..bd3038c5 100644 --- a/mace/ops/ops_registry.cc +++ b/mace/ops/ops_registry.cc @@ -32,6 +32,7 @@ extern void RegisterCrop(OpRegistryBase *op_registry); extern void RegisterDeconv2D(OpRegistryBase *op_registry); extern void RegisterDepthToSpace(OpRegistryBase *op_registry); extern void RegisterDepthwiseConv2d(OpRegistryBase *op_registry); +extern void RegisterDepthwiseDeconv2d(OpRegistryBase *op_registry); extern void RegisterDequantize(OpRegistryBase *op_registry); extern void RegisterEltwise(OpRegistryBase *op_registry); extern void RegisterExpandDims(OpRegistryBase *op_registry); @@ -89,6 +90,7 @@ OpRegistry::OpRegistry() : OpRegistryBase() { ops::RegisterDeconv2D(this); ops::RegisterDepthToSpace(this); ops::RegisterDepthwiseConv2d(this); + ops::RegisterDepthwiseDeconv2d(this); ops::RegisterDequantize(this); ops::RegisterEltwise(this); ops::RegisterExpandDims(this); diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py index e7dff9bd..fe030985 100644 --- a/mace/python/tools/converter_tool/base_converter.py +++ b/mace/python/tools/converter_tool/base_converter.py @@ -91,6 +91,7 @@ MaceSupportedOps = [ 'Deconv2D', 'DepthToSpace', 'DepthwiseConv2d', + 'DepthwiseDeconv2d', 'Dequantize', 'Eltwise', 'ExpandDims', @@ -183,6 +184,7 @@ class MaceKeyword(object): mace_scalar_input_index_str = 'scalar_input_index' mace_opencl_mem_type = "opencl_mem_type" mace_framework_type_str = "framework_type" + mace_group_str = "group" class TransformerRule(Enum): diff --git a/mace/python/tools/converter_tool/caffe_converter.py b/mace/python/tools/converter_tool/caffe_converter.py index 374d1073..a2e89a6d 100644 --- a/mace/python/tools/converter_tool/caffe_converter.py +++ b/mace/python/tools/converter_tool/caffe_converter.py @@ -411,17 +411,14 @@ class CaffeConverter(base_converter.ConverterInterface): def convert_deconv2d(self, caffe_op): op = self.convert_general_op(caffe_op) param = caffe_op.layer.convolution_param - is_depthwise = False - if param.HasField(caffe_group_str) and param.group > 1: - filter_data = caffe_op.blobs[0] - mace_check(param.group == filter_data.shape[0] and - filter_data.shape[1] == 1, - "Mace does not support group deconvolution yet") - is_depthwise = True - mace_check(is_depthwise is False, - "Mace do not support depthwise deconvolution yet") - op.type = MaceOp.Deconv2D.name + if param.HasField(caffe_group_str) and param.group > 1: + group_arg = op.arg.add() + group_arg.name = MaceKeyword.mace_group_str + group_arg.i = param.group + op.type = MaceOp.DepthwiseDeconv2d.name + else: + op.type = MaceOp.Deconv2D.name self.add_stride_pad_kernel_arg(param, op) # dilation is specific for convolution in caffe diff --git a/mace/python/tools/converter_tool/shape_inference.py b/mace/python/tools/converter_tool/shape_inference.py index e62affaf..fbc22783 100644 --- a/mace/python/tools/converter_tool/shape_inference.py +++ b/mace/python/tools/converter_tool/shape_inference.py @@ -36,6 +36,7 @@ class ShapeInference(object): MaceOp.Conv2D.name: self.infer_shape_conv_pool_shape, MaceOp.Deconv2D.name: self.infer_shape_deconv, MaceOp.DepthwiseConv2d.name: self.infer_shape_conv_pool_shape, + MaceOp.DepthwiseDeconv2d.name: self.infer_shape_deconv, MaceOp.Eltwise.name: self.infer_shape_general, MaceOp.BatchNorm.name: self.infer_shape_general, MaceOp.AddN.name: self.infer_shape_general, @@ -159,11 +160,15 @@ class ShapeInference(object): dilations = [1, 1] round_func = math.floor + group_arg = ConverterUtil.get_arg(op, + MaceKeyword.mace_group_str) output_shape[0] = input_shape[0] if ConverterUtil.data_format(op) == DataFormat.NCHW \ and ConverterUtil.filter_format(self._net) == FilterFormat.OIHW: # noqa # filter format: IOHW output_shape[1] = filter_shape[1] + if group_arg is not None and group_arg.i > 1: + output_shape[1] = group_arg.i * filter_shape[1] output_shape[2] = int( round_func((input_shape[2] - 1) * strides[0] + (filter_shape[2] - 1) * (dilations[0] - 1) + diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py index 1ab81452..107aab02 100644 --- a/mace/python/tools/converter_tool/transformer.py +++ b/mace/python/tools/converter_tool/transformer.py @@ -583,7 +583,7 @@ class Transformer(base_converter.ConverterInterface): def fold_deconv_and_bn(self): net = self._model for op in net.op: - if (op.type == MaceOp.Deconv2D.name) \ + if (op.type in [MaceOp.Deconv2D.name, MaceOp.DepthwiseDeconv2d]) \ and self.consumer_count(op.output[0]) == 1: consumer_op = self._consumers[op.output[0]][0] if consumer_op.type == MaceOp.BatchNorm.name: @@ -1365,7 +1365,8 @@ class Transformer(base_converter.ConverterInterface): self.set_filter_format(FilterFormat.OIHW) # deconv's filter's output channel and input channel is reversed for op in net.op: - if op.type == MaceOp.Deconv2D.name \ + if op.type in [MaceOp.Deconv2D.name, + MaceOp.DepthwiseDeconv2d] \ and op.input[1] not in transposed_deconv_filter: filter = self._consts[op.input[1]] filter_data = np.array(filter.float_data).reshape( @@ -1427,11 +1428,17 @@ class Transformer(base_converter.ConverterInterface): self.buffer_transform(op, 1, OpenCLBufferType.CONV2D_FILTER) if len(op.input) >= 3: self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT) - elif op.type == MaceOp.Deconv2D.name: - self.buffer_transform(op, 1, OpenCLBufferType.CONV2D_FILTER) + elif op.type == MaceOp.Deconv2D.name\ + or op.type == MaceOp.DepthwiseDeconv2d.name: + if op.type == MaceOp.Deconv2D.name: + self.buffer_transform(op, 1, + OpenCLBufferType.CONV2D_FILTER) + elif op.type == MaceOp.DepthwiseDeconv2d.name: + self.buffer_transform(op, 1, + OpenCLBufferType.DW_CONV2D_FILTER) if ConverterUtil.get_arg( op, - MaceKeyword.mace_framework_type_str).i ==\ + MaceKeyword.mace_framework_type_str).i == \ FrameworkType.CAFFE.value: if len(op.input) >= 3: self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT) @@ -1456,8 +1463,10 @@ class Transformer(base_converter.ConverterInterface): if len(op.input) >= 4: self.buffer_transform(op, 3, OpenCLBufferType.ARGUMENT) elif op.type == MaceOp.MatMul.name and \ - ConverterUtil.get_arg(op, - MaceKeyword.mace_winograd_filter_transformed) is not None: # noqa + ConverterUtil.get_arg( + op, + MaceKeyword.mace_winograd_filter_transformed + ) is not None: # noqa self.buffer_transform(op, 0, OpenCLBufferType.WINOGRAD_FILTER) elif op.type == MaceOp.WinogradInverseTransform.name \ and len(op.input) >= 3: @@ -1467,8 +1476,10 @@ class Transformer(base_converter.ConverterInterface): if len(op.input) >= 3: self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT) elif op.type == MaceOp.Activation.name: - if ConverterUtil.get_arg(op, - MaceKeyword.mace_activation_type_str).s == ActivationType.PRELU.name: # noqa + if ConverterUtil.get_arg( + op, + MaceKeyword.mace_activation_type_str + ).s == ActivationType.PRELU.name: # noqa self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT) elif op.type == MaceOp.LSTMCell.name: if op.input[1] in self._consts: @@ -1793,24 +1804,24 @@ class Transformer(base_converter.ConverterInterface): check_conv = False check_deconv = False if ops is not None and len(ops) == 1: - check_conv =\ - ops[0].type in [MaceOp.Conv2D.name, - MaceOp.DepthwiseConv2d.name, - MaceOp.FullyConnected.name]\ - and len(ops[0].input) >= 3\ - and ops[0].input[2] == tensor.name + if len(ops[0].input) >= 3: + check_conv =\ + ops[0].type in [MaceOp.Conv2D.name, + MaceOp.DepthwiseConv2d.name, + MaceOp.FullyConnected.name]\ + and ops[0].input[2] == tensor.name # in tensorflow deconv's bias is the forth input - if ops[0].type == MaceOp.Deconv2D.name: + if ops[0].type in [MaceOp.Deconv2D.name, + MaceOp.DepthwiseDeconv2d]: from_caffe = ConverterUtil.get_arg( ops[0], MaceKeyword.mace_framework_type_str).i ==\ FrameworkType.CAFFE.value - if from_caffe: - check_deconv = len(ops[0].input) >= 3\ - and ops[0].input[2] == tensor.name + if from_caffe and len(ops[0].input) >= 3: + check_deconv = ops[0].input[2] == tensor.name else: - check_deconv = len(ops[0].input) >= 4\ - and ops[0].input[3] == tensor.name + if len(ops[0].input) >= 4: + check_deconv = ops[0].input[3] == tensor.name if check_conv or check_deconv: if self._option.device == DeviceType.CPU.value: conv_op = ops[0] diff --git a/repository/opencl-kernel/opencl_kernel_configure.bzl b/repository/opencl-kernel/opencl_kernel_configure.bzl index c844a5d3..65cc5635 100644 --- a/repository/opencl-kernel/opencl_kernel_configure.bzl +++ b/repository/opencl-kernel/opencl_kernel_configure.bzl @@ -37,6 +37,7 @@ def _opencl_encrypt_kernel_impl(repository_ctx): unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/conv_2d_buffer.cl")) unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/crop.cl")) unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/deconv_2d.cl")) + unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/depthwise_deconv2d.cl")) unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/depth_to_space.cl")) unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/depthwise_conv2d.cl")) unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/depthwise_conv2d_buffer.cl")) diff --git a/tools/validate.py b/tools/validate.py index be499c1a..3c85fb55 100644 --- a/tools/validate.py +++ b/tools/validate.py @@ -18,8 +18,6 @@ import os import os.path import numpy as np import re -from scipy import spatial -from scipy import stats import common @@ -60,14 +58,22 @@ def calculate_sqnr(expected, actual): return signal_power_sum / (noise_power_sum + 1e-15) +def calculate_similarity(u, v, data_type=np.float64): + if u.dtype is not data_type: + u = u.astype(data_type) + if v.dtype is not data_type: + v = v.astype(data_type) + return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)) + + def compare_output(platform, device_type, output_name, mace_out_value, out_value, validation_threshold): if mace_out_value.size != 0: out_value = out_value.reshape(-1) mace_out_value = mace_out_value.reshape(-1) assert len(out_value) == len(mace_out_value) - similarity = (1 - spatial.distance.cosine(out_value, mace_out_value)) sqnr = calculate_sqnr(out_value, mace_out_value) + similarity = calculate_similarity(out_value, mace_out_value) common.MaceLogger.summary( output_name + ' MACE VS ' + platform.upper() + ' similarity: ' + str(similarity) + ' , sqnr: ' + str(sqnr)) -- GitLab