diff --git a/.gitignore b/.gitignore index 08c6e7f8350fec54634e88f4f63ae9d34652d3fb..c7866876efb3bf00acc2546207d2393b27efbf6c 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,9 @@ mace/examples/android/macelibrary/src/main/cpp/mace/ *swp *~ .python-version + +mace/examples/android/macelibrary/src/main/cpp/include/mace/public/ + +mace/examples/android/macelibrary/src/main/cpp/lib/arm64-v8a/ + +mace/examples/android/macelibrary/src/main/jniLibs/arm64-v8a/ diff --git a/mace/ops/arm/deconv_2d_neon.h b/mace/ops/arm/deconv_2d_neon.h index abed4926c00e247c3f89a05f80aa702c5953eeb1..916670447af91e35ccf27a8377d849a6172df0fb 100644 --- a/mace/ops/arm/deconv_2d_neon.h +++ b/mace/ops/arm/deconv_2d_neon.h @@ -21,6 +21,18 @@ namespace mace { namespace ops { +void Deconv2dNeonK2x2S1(const float *input, + const float *filter, + const index_t *in_shape, + const index_t *out_shape, + float *output); + +void Deconv2dNeonK2x2S2(const float *input, + const float *filter, + const index_t *in_shape, + const index_t *out_shape, + float *output); + void Deconv2dNeonK3x3S1(const float *input, const float *filter, const index_t *in_shape, diff --git a/mace/ops/arm/deconv_2d_neon_2x2.cc b/mace/ops/arm/deconv_2d_neon_2x2.cc new file mode 100644 index 0000000000000000000000000000000000000000..001ab01be369f4b3f880c457073be754b7ef1eb9 --- /dev/null +++ b/mace/ops/arm/deconv_2d_neon_2x2.cc @@ -0,0 +1,262 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/macros.h" +#include "mace/ops/arm/deconv_2d_neon.h" + +namespace mace { +namespace ops { + +void Deconv2dNeonK2x2S1(const float *input, + const float *filter, + const index_t *in_shape, + const index_t *out_shape, + float *output) { + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + + const index_t out_img_size = outh * outw; + +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t oc = 0; oc < outch; oc += 2) { + if (oc + 1 < outch) { + float *out_base0 = output + (b * outch + oc) * out_img_size; + float *out_base1 = out_base0 + out_img_size; + for (index_t ic = 0; ic < inch; ++ic) { + const float *input_base = input + (b * inch + ic) * h * w; + const float *kernel_base0 = filter + (oc * inch + ic) * 4; + const float *kernel_base1 = kernel_base0 + inch * 4; + const float *in = input_base; + // output channel 0 + const float *k0 = kernel_base0; + // output channel 1 + const float *k1 = kernel_base1; +#if defined(MACE_ENABLE_NEON) + // load filter + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); +#endif + for (index_t i = 0; i < h; ++i) { + float *out_row_base0 = out_base0 + i * outw; + float *out_row0_0 = out_row_base0; + float *out_row0_1 = out_row_base0 + outw; + + float *out_row_base1 = out_base1 + i * outw; + float *out_row1_0 = out_row_base1; + float *out_row1_1 = out_row_base1 + outw; + + index_t j = 0; +#if defined(MACE_ENABLE_NEON) + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00, out01, out02, out03; + float32x4_t out10, out11, out12, out13; + + out00 = vld1q_f32(out_row0_0); + out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); + vst1q_f32(out_row0_0, out00); + + out01 = vld1q_f32(out_row0_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); + vst1q_f32(out_row0_0 + 1, out01); + + out02 = vld1q_f32(out_row0_1); + out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); + vst1q_f32(out_row0_1, out02); + + out03 = vld1q_f32(out_row0_1 + 1); + out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); + vst1q_f32(out_row0_1 + 1, out03); + + out10 = vld1q_f32(out_row1_0); + out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); + vst1q_f32(out_row1_0, out10); + + out11 = vld1q_f32(out_row1_0 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); + vst1q_f32(out_row1_0 + 1, out11); + + out12 = vld1q_f32(out_row1_1); + out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); + vst1q_f32(out_row1_1, out12); + + out13 = vld1q_f32(out_row1_1 + 1); + out13 = neon_vfma_lane_3(out13, in_vec, k1_vec); + vst1q_f32(out_row1_1 + 1, out13); + + in += 4; + out_row0_0 += 4; + out_row0_1 += 4; + out_row1_0 += 4; + out_row1_1 += 4; + } +#endif + for (; j < w; ++j) { + float val = in[0]; + for (int k = 0; k < 2; ++k) { + out_row0_0[k] += val * k0[k]; + out_row0_1[k] += val * k0[k + 2]; + out_row1_0[k] += val * k1[k]; + out_row1_1[k] += val * k1[k + 2]; + } + in++; + out_row0_0++; + out_row0_1++; + out_row1_0++; + out_row1_1++; + } + } + } + } else { + float *out_base0 = output + (b * outch + oc) * outh * outw; + for (index_t ic = 0; ic < inch; ++ic) { + const float *input_base = input + (b * inch + ic) * h * w; + const float *kernel_base0 = filter + (oc * inch + ic) * 4; + const float *in = input_base; + const float *k0 = kernel_base0; + +#if defined(MACE_ENABLE_NEON) + // load filter + float32x4_t k0_vec = vld1q_f32(k0); +#endif + for (index_t i = 0; i < h; ++i) { + float *out_row_base0 = out_base0 + i * outw; + float *out_row0_0 = out_row_base0; + float *out_row0_1 = out_row_base0 + outw; + index_t j = 0; +#if defined(MACE_ENABLE_NEON) + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + float32x4_t out00, out01, out02, out03; + + out00 = vld1q_f32(out_row0_0); + out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); + vst1q_f32(out_row0_0, out00); + + out01 = vld1q_f32(out_row0_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); + vst1q_f32(out_row0_0 + 1, out01); + + out02 = vld1q_f32(out_row0_1); + out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); + vst1q_f32(out_row0_1, out02); + + out03 = vld1q_f32(out_row0_1 + 1); + out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); + vst1q_f32(out_row0_1 + 1, out03); + + in += 4; + out_row0_0 += 4; + out_row0_1 += 4; + } +#endif + for (; j < w; ++j) { + float val = in[0]; + for (int k = 0; k < 2; ++k) { + out_row0_0[k] += val * k0[k]; + out_row0_1[k] += val * k0[k + 2]; + } + in++; + out_row0_0++; + out_row0_1++; + } + } + } + } + } + } +} + +void Deconv2dNeonK2x2S2(const float *input, + const float *filter, + const index_t *in_shape, + const index_t *out_shape, + float *output) { + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + const index_t out_img_size = outh * outw; + +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t oc = 0; oc < outch; ++oc) { + float *out_base = output + (b * outch + oc) * out_img_size; + for (index_t ic = 0; ic < inch; ++ic) { + const float *input_base = input + (b * inch + ic) * h * w; + const float *kernel_base = filter + (oc * inch + ic) * 4; + const float *in = input_base; + const float *k0 = kernel_base; +#if defined(MACE_ENABLE_NEON) + float32x4_t k0_vec = vld1q_f32(k0); +#endif + for (index_t i = 0; i < h; ++i) { + float *out_row_base = out_base + i * 2 * outw; + float *out_row_0 = out_row_base; + float *out_row_1 = out_row_0 + outw; + + index_t j = 0; +#if defined(MACE_ENABLE_NEON) + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + // out row 0 + float32x4x2_t out00 = vld2q_f32(out_row_0); + out00.val[0] = + neon_vfma_lane_0(out00.val[0], in_vec, k0_vec); + out00.val[1] = + neon_vfma_lane_1(out00.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0, out00); + + // out row 1 + float32x4x2_t out10 = vld2q_f32(out_row_1); + out10.val[0] = + neon_vfma_lane_2(out10.val[0], in_vec, k0_vec); + out10.val[1] = + neon_vfma_lane_3(out10.val[1], in_vec, k0_vec); + vst2q_f32(out_row_1, out10); + + in += 4; + out_row_0 += 8; + out_row_1 += 8; + } +#endif + for (; j < w; ++j) { + float val = in[0]; + for (int k = 0; k < 2; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k0[k + 2]; + } + in++; + out_row_0 += 2; + out_row_1 += 2; + } + } + } + } + } +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc index 001b38a91426d638a84abdcf0cccfd747234517b..183885295f8ac780ad14b170d0c298e9902c4b48 100644 --- a/mace/ops/deconv_2d.cc +++ b/mace/ops/deconv_2d.cc @@ -25,11 +25,9 @@ #include #include "mace/core/future.h" -#include "mace/core/operator.h" #include "mace/core/tensor.h" #include "mace/ops/activation.h" #include "mace/ops/arm/deconv_2d_neon.h" -#include "mace/ops/conv_pool_2d_util.h" #include "mace/utils/utils.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/deconv_2d.h" @@ -38,134 +36,6 @@ namespace mace { namespace ops { -class Deconv2dOpBase : public Operation { - public: - explicit Deconv2dOpBase(OpConstructContext *context) - : Operation(context), - strides_(Operation::GetRepeatedArgs("strides")), - padding_type_(static_cast(Operation::GetOptionalArg( - "padding", static_cast(SAME)))), - paddings_(Operation::GetRepeatedArgs("padding_values")), - model_type_(static_cast( - Operation::GetOptionalArg("framework_type", 0))), - activation_(ops::StringToActivationType( - Operation::GetOptionalArg("activation", - "NOOP"))), - relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)) {} - - - static void CalcDeconvOutputSize( - const index_t *input_shape, // NHWC - const index_t *filter_shape, // OIHW - const int *strides, - index_t *output_shape, - const int *padding_size, - int *input_padding, - const bool isNCHW = false) { - MACE_CHECK_NOTNULL(output_shape); - MACE_CHECK_NOTNULL(padding_size); - MACE_CHECK_NOTNULL(input_shape); - MACE_CHECK_NOTNULL(filter_shape); - MACE_CHECK_NOTNULL(strides); - - const index_t output_channel = filter_shape[0]; - - const index_t in_height = isNCHW ? input_shape[2] : input_shape[1]; - const index_t in_width = isNCHW ? input_shape[3] : input_shape[2]; - - const index_t kernel_h = filter_shape[2]; - const index_t kernel_w = filter_shape[3]; - - input_padding[0] = static_cast((kernel_h -1) * 2 - padding_size[0]); - input_padding[1] = static_cast((kernel_w -1) * 2 - padding_size[1]); - input_padding[0] = std::max(0, input_padding[0]); - input_padding[1] = std::max(0, input_padding[1]); - - index_t out_height = - (in_height - 1) * strides[0] + kernel_h - padding_size[0]; - index_t out_width = - (in_width - 1) * strides[1] + kernel_w - padding_size[1]; - - output_shape[0] = input_shape[0]; - if (isNCHW) { - output_shape[1] = output_channel; - output_shape[2] = out_height; - output_shape[3] = out_width; - } else { - output_shape[1] = out_height; - output_shape[2] = out_width; - output_shape[3] = output_channel; - } - } - - static void CalcDeconvPaddingAndInputSize( - const index_t *input_shape, // NHWC - const index_t *filter_shape, // OIHW - const int *strides, - Padding padding, - const index_t *output_shape, - int *padding_size, - const bool isNCHW = false) { - MACE_CHECK_NOTNULL(output_shape); - MACE_CHECK_NOTNULL(padding_size); - MACE_CHECK_NOTNULL(input_shape); - MACE_CHECK_NOTNULL(filter_shape); - MACE_CHECK_NOTNULL(strides); - - const index_t in_height = isNCHW ? input_shape[2] : input_shape[1]; - const index_t in_width = isNCHW ? input_shape[3] : input_shape[2]; - - const index_t out_height = isNCHW ? output_shape[2] : output_shape[1]; - const index_t out_width = isNCHW ? output_shape[3] : output_shape[2]; - - const index_t extended_input_height = (in_height - 1) * strides[0] + 1; - const index_t extended_input_width = (in_width - 1) * strides[1] + 1; - - const index_t filter_h = filter_shape[2]; - const index_t filter_w = filter_shape[3]; - - index_t expected_input_height = 0, expected_input_width = 0; - - switch (padding) { - case VALID: - expected_input_height = - (out_height - filter_h + strides[0]) / strides[0]; - expected_input_width = - (out_width - filter_w + strides[1]) / strides[1]; - break; - case SAME: - expected_input_height = - (out_height + strides[0] - 1) / strides[0]; - expected_input_width = - (out_width + strides[1] - 1) / strides[1]; - break; - default: - MACE_CHECK(false, "Unsupported padding type: ", padding); - } - - MACE_CHECK(expected_input_height == in_height, - expected_input_height, "!=", in_height); - MACE_CHECK(expected_input_width == in_width, - expected_input_width, "!=", in_width); - - const int p_h = static_cast(out_height + - filter_h - 1 - extended_input_height); - const int p_w = static_cast(out_width + - filter_w - 1 - extended_input_width); - - padding_size[0] = std::max(0, p_h); - padding_size[1] = std::max(0, p_w); - } - - protected: - std::vector strides_; // [stride_h, stride_w] - const Padding padding_type_; - std::vector paddings_; - const FrameworkType model_type_; - const ActivationType activation_; - const float relux_max_limit_; -}; - template class Deconv2dOp; @@ -193,56 +63,65 @@ class Deconv2dOp : public Deconv2dOpBase { MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(output); - std::vector paddings(2); - std::vector out_paddings(2); - std::vector output_shape(4); + std::vector in_paddings(2, 0); + std::vector out_paddings(2, 0); + std::vector out_shape(4, 0); + std::vector padded_out_shape(4, 0); + if (model_type_ == FrameworkType::TENSORFLOW) { // tensorflow - paddings = std::vector(2, 0); MACE_CHECK_NOTNULL(output_shape_tensor); MACE_CHECK(output_shape_tensor->size() == 4); Tensor::MappingGuard output_shape_mapper(output_shape_tensor); auto output_shape_data = output_shape_tensor->data(); - output_shape = + out_shape = std::vector(output_shape_data, output_shape_data + 4); - const index_t t = output_shape[1]; - output_shape[1] = output_shape[3]; - output_shape[3] = output_shape[2]; - output_shape[2] = t; + const index_t t = out_shape[1]; + out_shape[1] = out_shape[3]; + out_shape[3] = out_shape[2]; + out_shape[2] = t; - CalcDeconvPaddingAndInputSize( + CalcDeconvShape_TF( input->shape().data(), filter->shape().data(), - strides_.data(), padding_type_, - output_shape.data(), - paddings.data(), true); + out_shape.data(), + strides_.data(), + 1, + padding_type_, + in_paddings.data(), + out_paddings.data(), + padded_out_shape.data(), + true); } else { // caffe - out_paddings = paddings_; - output_shape = std::vector(4, 0); - CalcDeconvOutputSize(input->shape().data(), - filter->shape().data(), - strides_.data(), - output_shape.data(), - out_paddings.data(), - paddings.data(), - true); + if (!paddings_.empty()) out_paddings = paddings_; + CalcDeconvShape_Caffe( + input->shape().data(), + filter->shape().data(), + strides_.data(), + out_paddings.data(), + 1, + in_paddings.data(), + out_shape.data(), + padded_out_shape.data(), + true); } - MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + MACE_RETURN_IF_ERROR(output->Resize(out_shape)); + output->Clear(); index_t kernel_h = filter->dim(2); index_t kernel_w = filter->dim(3); const index_t *in_shape = input->shape().data(); - MACE_CHECK(filter->dim(0) == output_shape[1], filter->dim(0), " != ", - output_shape[1]); + MACE_CHECK(filter->dim(0) == out_shape[1], filter->dim(0), " != ", + out_shape[1]); MACE_CHECK(filter->dim(1) == in_shape[1], filter->dim(1), " != ", in_shape[1]); - MACE_CHECK(in_shape[0] == output_shape[0], + MACE_CHECK(in_shape[0] == out_shape[0], "Input/Output batch size mismatch"); std::function deconv_func; Tensor::MappingGuard input_mapper(input); @@ -254,13 +133,9 @@ class Deconv2dOp : public Deconv2dOpBase { auto bias_data = bias == nullptr ? nullptr : bias->data(); auto output_data = output->mutable_data(); - const index_t padded_out_h = (in_shape[2] - 1) * strides_[0] + kernel_h; - const index_t padded_out_w = (in_shape[3] - 1) * strides_[1] + kernel_w; - const index_t pad_h = (padded_out_h - output_shape[2]) / 2; - const index_t pad_w = (padded_out_w - output_shape[3]) / 2; + const index_t pad_h = out_paddings[0] / 2; + const index_t pad_w = out_paddings[1] / 2; - std::vector padded_out_shape({output_shape[0], output_shape[1], - padded_out_h, padded_out_w}); index_t padded_out_size = std::accumulate(padded_out_shape.begin(), padded_out_shape.end(), @@ -274,6 +149,11 @@ class Deconv2dOp : public Deconv2dOpBase { padded_out.Clear(); auto *padded_out_data = padded_out.mutable_data(); + bool use_neon_2x2_s1 = kernel_h == kernel_w && kernel_h == 2 && + strides_[0] == strides_[1] && strides_[0] == 1; + bool use_neon_2x2_s2 = kernel_h == kernel_w && kernel_h == 2 && + strides_[0] == strides_[1] && strides_[0] == 2; + bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 && strides_[0] == strides_[1] && strides_[0] == 1; bool use_neon_3x3_s2 = kernel_h == kernel_w && kernel_h == 3 && @@ -284,73 +164,98 @@ class Deconv2dOp : public Deconv2dOpBase { bool use_neon_4x4_s2 = kernel_h == kernel_w && kernel_h == 4 && strides_[0] == strides_[1] && strides_[0] == 2; - if (use_neon_3x3_s1) { + if (use_neon_2x2_s1) { deconv_func = [=](const float *input, const float *filter, - const index_t *in_shape, - const index_t *padded_out_shape, + const index_t *input_shape, + const index_t *padded_output_shape, + float *padded_output) { + Deconv2dNeonK2x2S1(input, + filter, + input_shape, + padded_output_shape, + padded_output); + }; + } else if (use_neon_2x2_s2) { + deconv_func = [=](const float *input, + const float *filter, + const index_t *input_shape, + const index_t *padded_output_shape, + float *padded_output) { + Deconv2dNeonK2x2S2(input, + filter, + input_shape, + padded_output_shape, + padded_output); + }; + } else if (use_neon_3x3_s1) { + deconv_func = [=](const float *input, + const float *filter, + const index_t *input_shape, + const index_t *padded_output_shape, float *padded_output) { Deconv2dNeonK3x3S1(input, filter, - in_shape, - padded_out_shape, + input_shape, + padded_output_shape, padded_output); }; } else if (use_neon_3x3_s2) { deconv_func = [=](const float *input, const float *filter, - const index_t *in_shape, - const index_t *padded_out_shape, + const index_t *input_shape, + const index_t *padded_output_shape, float *padded_output) { Deconv2dNeonK3x3S2(input, filter, - in_shape, - padded_out_shape, + input_shape, + padded_output_shape, padded_output); }; } else if (use_neon_4x4_s1) { deconv_func = [=](const float *input, const float *filter, - const index_t *in_shape, - const index_t *padded_out_shape, + const index_t *input_shape, + const index_t *padded_output_shape, float *padded_output) { Deconv2dNeonK4x4S1(input, filter, - in_shape, - padded_out_shape, + input_shape, + padded_output_shape, padded_output); }; } else if (use_neon_4x4_s2) { deconv_func = [=](const float *input, const float *filter, - const index_t *in_shape, - const index_t *padded_out_shape, + const index_t *input_shape, + const index_t *padded_output_shape, float *padded_output) { Deconv2dNeonK4x4S2(input, filter, - in_shape, - padded_out_shape, + input_shape, + padded_output_shape, padded_output); }; } else { deconv_func = [=](const float *input, const float *filter, - const index_t *in_shape, - const index_t *padded_out_shape, + const index_t *input_shape, + const index_t *padded_output_shape, float *padded_output) { Deconv2dGeneral(input, filter, kernel_h, kernel_w, strides_.data(), - in_shape, - padded_out_shape, + input_shape, + padded_output_shape, padded_output); }; } bool no_pad = - padded_out_h == output_shape[2] && padded_out_w == output_shape[3]; + (padded_out_shape[2] == out_shape[2]) && + (padded_out_shape[3] == out_shape[3]); float *out_data = no_pad ? output_data : padded_out_data; deconv_func(input_data, @@ -361,16 +266,16 @@ class Deconv2dOp : public Deconv2dOpBase { if (!no_pad) { CropPadOut(out_data, padded_out_shape.data(), - output_shape.data(), + out_shape.data(), pad_h, pad_w, output_data); } if (bias_data != nullptr) { - const index_t batch = output_shape[0]; - const index_t channels = output_shape[1]; - const index_t img_size = output_shape[2] * output_shape[3]; + const index_t batch = out_shape[0]; + const index_t channels = out_shape[1]; + const index_t img_size = out_shape[2] * out_shape[3]; #pragma omp parallel for collapse(3) for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channels; ++c) { @@ -476,39 +381,46 @@ class Deconv2dOp : public Deconv2dOpBase { MACE_CHECK_NOTNULL(input); MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(output); - std::vector paddings(2); - std::vector out_paddings(2); - std::vector output_shape(4); + + std::vector in_paddings(2, 0); + std::vector out_shape(4, 0); + if (model_type_ == FrameworkType::TENSORFLOW) { - paddings = std::vector(2, 0); MACE_CHECK_NOTNULL(output_shape_tensor); MACE_CHECK(output_shape_tensor->size() == 4); Tensor::MappingGuard output_shape_mapper(output_shape_tensor); auto output_shape_data = output_shape_tensor->data(); - output_shape = + out_shape = std::vector(output_shape_data, output_shape_data + 4); - CalcDeconvPaddingAndInputSize(input->shape().data(), - filter->shape().data(), - strides_.data(), - padding_type_, - output_shape.data(), - paddings.data()); + + CalcDeconvShape_TF( + input->shape().data(), + filter->shape().data(), + out_shape.data(), + strides_.data(), + 1, + padding_type_, + in_paddings.data(), + nullptr, + nullptr); } else { - out_paddings = paddings_; - paddings = std::vector(2, 0); - output_shape = std::vector(4, 0); - CalcDeconvOutputSize(input->shape().data(), - filter->shape().data(), - strides_.data(), - output_shape.data(), - out_paddings.data(), - paddings.data()); + std::vector out_paddings(2, 0); + if (!paddings_.empty()) out_paddings = paddings_; + CalcDeconvShape_Caffe( + input->shape().data(), + filter->shape().data(), + strides_.data(), + out_paddings.data(), + 1, + in_paddings.data(), + out_shape.data(), + nullptr); } return kernel_->Compute(context, input, filter, bias, - strides_.data(), paddings.data(), activation_, - relux_max_limit_, output_shape, output); + strides_.data(), in_paddings.data(), activation_, + relux_max_limit_, out_shape, output); } private: diff --git a/mace/ops/deconv_2d.h b/mace/ops/deconv_2d.h index 1af7362bd19cf642d4be923152495f6352cbede4..f6a4200ced1f867f03778a15b61c443ba7899971 100644 --- a/mace/ops/deconv_2d.h +++ b/mace/ops/deconv_2d.h @@ -15,7 +15,14 @@ #ifndef MACE_OPS_DECONV_2D_H_ #define MACE_OPS_DECONV_2D_H_ +#include +#include +#include + +#include "mace/core/operator.h" #include "mace/core/types.h" +#include "mace/ops/activation.h" +#include "mace/ops/conv_pool_2d_util.h" namespace mace { namespace ops { @@ -25,6 +32,167 @@ enum FrameworkType { CAFFE = 1, }; +class Deconv2dOpBase : public Operation { + public: + explicit Deconv2dOpBase(OpConstructContext *context) + : Operation(context), + strides_(Operation::GetRepeatedArgs("strides")), + padding_type_(static_cast(Operation::GetOptionalArg( + "padding", static_cast(SAME)))), + paddings_(Operation::GetRepeatedArgs("padding_values")), + group_(Operation::GetOptionalArg("group", 1)), + model_type_(static_cast( + Operation::GetOptionalArg("framework_type", 0))), + activation_(ops::StringToActivationType( + Operation::GetOptionalArg("activation", + "NOOP"))), + relux_max_limit_( + Operation::GetOptionalArg("max_limit", 0.0f)) {} + + static void CalcDeconvShape_Caffe( + const index_t *input_shape, // NHWC + const index_t *filter_shape, // OIHW + const int *strides, + const int *out_paddings, + const int group, + int *in_paddings, + index_t *out_shape, + index_t *padded_out_shape, + const bool isNCHW = false) { + MACE_CHECK_NOTNULL(out_paddings); + MACE_CHECK_NOTNULL(input_shape); + MACE_CHECK_NOTNULL(filter_shape); + MACE_CHECK_NOTNULL(strides); + + const index_t in_height = isNCHW ? input_shape[2] : input_shape[1]; + const index_t in_width = isNCHW ? input_shape[3] : input_shape[2]; + + const index_t output_channel = filter_shape[0] * group; + + const index_t kernel_h = filter_shape[2]; + const index_t kernel_w = filter_shape[3]; + + index_t padded_out_height = + (in_height - 1) * strides[0] + kernel_h; + index_t padded_out_width = + (in_width - 1) * strides[1] + kernel_w; + + if (in_paddings != nullptr) { + in_paddings[0] = static_cast((kernel_h - 1) * 2 - out_paddings[0]); + in_paddings[1] = static_cast((kernel_w - 1) * 2 - out_paddings[1]); + in_paddings[0] = std::max(0, in_paddings[0]); + in_paddings[1] = std::max(0, in_paddings[1]); + } + + if (padded_out_shape != nullptr) { + padded_out_shape[0] = input_shape[0]; + padded_out_shape[1] = isNCHW ? output_channel : padded_out_height; + padded_out_shape[2] = isNCHW ? padded_out_height : padded_out_width; + padded_out_shape[3] = isNCHW ? padded_out_width : output_channel; + } + + if (out_shape != nullptr) { + index_t out_height = padded_out_height - out_paddings[0]; + index_t out_width = padded_out_width - out_paddings[1]; + out_shape[0] = input_shape[0]; + out_shape[1] = isNCHW ? output_channel : out_height; + out_shape[2] = isNCHW ? out_height : out_width; + out_shape[3] = isNCHW ? out_width : output_channel; + } + } + + static void CalcDeconvShape_TF( + const index_t *input_shape, // NHWC + const index_t *filter_shape, // OIHW + const index_t *output_shape, + const int *strides, + const int group, + Padding padding_type, + int *in_paddings, + int *out_paddings, + index_t *padded_out_shape, + const bool isNCHW = false) { + MACE_CHECK_NOTNULL(output_shape); + MACE_CHECK_NOTNULL(input_shape); + MACE_CHECK_NOTNULL(filter_shape); + MACE_CHECK_NOTNULL(strides); + + const index_t in_height = isNCHW ? input_shape[2] : input_shape[1]; + const index_t in_width = isNCHW ? input_shape[3] : input_shape[2]; + + const index_t out_height = isNCHW ? output_shape[2] : output_shape[1]; + const index_t out_width = isNCHW ? output_shape[3] : output_shape[2]; + + const index_t extended_in_height = (in_height - 1) * strides[0] + 1; + const index_t extended_in_width = (in_width - 1) * strides[1] + 1; + + const index_t kernel_h = filter_shape[2]; + const index_t kernel_w = filter_shape[3]; + + index_t expected_input_height = 0, expected_input_width = 0; + + switch (padding_type) { + case VALID: + expected_input_height = + (out_height - kernel_h + strides[0]) / strides[0]; + expected_input_width = + (out_width - kernel_w + strides[1]) / strides[1]; + break; + case SAME: + expected_input_height = + (out_height + strides[0] - 1) / strides[0]; + expected_input_width = + (out_width + strides[1] - 1) / strides[1]; + break; + default: + MACE_CHECK(false, "Unsupported padding type: ", padding_type); + } + + MACE_CHECK(expected_input_height == in_height, + expected_input_height, "!=", in_height); + MACE_CHECK(expected_input_width == in_width, + expected_input_width, "!=", in_width); + + const index_t padded_out_height = + (in_height - 1) * strides[0] + kernel_h; + const index_t padded_out_width = + (in_width - 1) * strides[1] + kernel_w; + + if (in_paddings != nullptr) { + const int p_h = + static_cast(out_height + kernel_h - 1 - extended_in_height); + const int p_w = + static_cast(out_width + kernel_w - 1 - extended_in_width); + in_paddings[0] = std::max(0, p_h); + in_paddings[1] = std::max(0, p_w); + } + + if (out_paddings != nullptr) { + const int o_p_h = static_cast(padded_out_height - out_height); + const int o_p_w = static_cast(padded_out_width - out_width); + out_paddings[0] = std::max(0, o_p_h); + out_paddings[1] = std::max(0, o_p_w); + } + + if (padded_out_shape != nullptr) { + index_t output_channel = filter_shape[0] * group; + padded_out_shape[0] = output_shape[0]; + padded_out_shape[1] = isNCHW ? output_channel : padded_out_height; + padded_out_shape[2] = isNCHW ? padded_out_height : padded_out_width; + padded_out_shape[3] = isNCHW ? padded_out_width : output_channel; + } + } + + protected: + std::vector strides_; // [stride_h, stride_w] + const Padding padding_type_; + std::vector paddings_; + const int group_; + const FrameworkType model_type_; + const ActivationType activation_; + const float relux_max_limit_; +}; + template void CropPadOut(const T *input, const index_t *in_shape, diff --git a/mace/ops/deconv_2d_test.cc b/mace/ops/deconv_2d_test.cc index a33b2f7baf6db29172228a072f537c2710690147..9aadd42c0c345da3bac268f1c645639850fafc80 100644 --- a/mace/ops/deconv_2d_test.cc +++ b/mace/ops/deconv_2d_test.cc @@ -370,9 +370,9 @@ TEST_F(Deconv2dOpTest, OPENCLSimple3X3PaddingValid_S2) { namespace { template -void TestComplexDeconvNxNS12(const int batch, - const std::vector &shape, - const int stride) { +void TestComplexDeconvNxN(const int batch, + const std::vector &shape, + const int stride) { testing::internal::LogToStderr(); auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, Padding type, int padding) { @@ -415,8 +415,6 @@ void TestComplexDeconvNxNS12(const int batch, output_shape.push_back(output_channels); net.AddInputFromArray("OutputShape", {4}, output_shape); } else { - // out_h = (height - 1) * stride + 1 + padding - kernel_h + 1; - // out_w = (width -1) * stride + 1 + padding - kernel_w + 1; paddings.push_back(padding); paddings.push_back(padding); } @@ -497,38 +495,42 @@ void TestComplexDeconvNxNS12(const int batch, 1e-4); }; - for (int kernel_size : {3, 4, 5, 7}) { - func(kernel_size, kernel_size, stride, stride, VALID, -1); - func(kernel_size, kernel_size, stride, stride, SAME, -1); - func(kernel_size, kernel_size, stride, stride, VALID, 2); - func(kernel_size, kernel_size, stride, stride, VALID, 3); + for (int kernel_size : {2, 3, 4, 5, 7}) { + if (kernel_size >= stride) { + func(kernel_size, kernel_size, stride, stride, VALID, -1); + func(kernel_size, kernel_size, stride, stride, SAME, -1); + func(kernel_size, kernel_size, stride, stride, VALID, 1); + func(kernel_size, kernel_size, stride, stride, VALID, 2); + func(kernel_size, kernel_size, stride, stride, VALID, 3); + } } } } // namespace + TEST_F(Deconv2dOpTest, OPENCLAlignedDeconvNxNS12) { - TestComplexDeconvNxNS12(1, {32, 16, 16, 32}, 1); - TestComplexDeconvNxNS12(1, {32, 16, 16, 32}, 2); + TestComplexDeconvNxN(1, {32, 16, 16, 32}, 1); + TestComplexDeconvNxN(1, {32, 16, 16, 32}, 2); } TEST_F(Deconv2dOpTest, OPENCLAlignedDeconvNxNS34) { - TestComplexDeconvNxNS12(1, {32, 16, 16, 32}, 3); - TestComplexDeconvNxNS12(1, {32, 16, 16, 32}, 4); + TestComplexDeconvNxN(1, {32, 16, 16, 32}, 3); + TestComplexDeconvNxN(1, {32, 16, 16, 32}, 4); } TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNS12) { - TestComplexDeconvNxNS12(1, {17, 113, 5, 7}, 1); - TestComplexDeconvNxNS12(1, {17, 113, 5, 7}, 2); + TestComplexDeconvNxN(1, {17, 113, 5, 7}, 1); + TestComplexDeconvNxN(1, {17, 113, 5, 7}, 2); } TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNS34) { - TestComplexDeconvNxNS12(1, {17, 113, 5, 7}, 3); - TestComplexDeconvNxNS12(1, {17, 113, 5, 7}, 4); + TestComplexDeconvNxN(1, {17, 113, 5, 7}, 3); + TestComplexDeconvNxN(1, {17, 113, 5, 7}, 4); } TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNMultiBatch) { - TestComplexDeconvNxNS12(3, {17, 113, 5, 7}, 1); - TestComplexDeconvNxNS12(5, {17, 113, 5, 7}, 2); + TestComplexDeconvNxN(3, {17, 113, 5, 7}, 1); + TestComplexDeconvNxN(5, {17, 113, 5, 7}, 2); } } // namespace test diff --git a/mace/ops/depthwise_deconv2d.cc b/mace/ops/depthwise_deconv2d.cc index a4b2ba1c9f9689c5591bc1d81c697798bd800592..e3dcb1e00bcdb24208fd2e501c9a12439677f8cb 100644 --- a/mace/ops/depthwise_deconv2d.cc +++ b/mace/ops/depthwise_deconv2d.cc @@ -24,11 +24,8 @@ #include #include "mace/core/future.h" -#include "mace/core/operator.h" #include "mace/core/tensor.h" -#include "mace/ops/activation.h" #include "mace/ops/arm/depthwise_deconv2d_neon.h" -#include "mace/ops/conv_pool_2d_util.h" #include "mace/utils/utils.h" #include "mace/public/mace.h" #ifdef MACE_ENABLE_OPENCL @@ -38,90 +35,15 @@ namespace mace { namespace ops { -class DepthwiseDeconv2dOpBase : public Operation { - public: - explicit DepthwiseDeconv2dOpBase(OpConstructContext *context) - : Operation(context), - strides_(Operation::GetRepeatedArgs("strides")), - paddings_(Operation::GetRepeatedArgs("padding_values")), - group_(Operation::GetOptionalArg("group", 1)), - activation_(ops::StringToActivationType( - Operation::GetOptionalArg("activation", - "NOOP"))), - relux_max_limit_(Operation::GetOptionalArg("max_limit", - 0.0f)) {} - - static void CalcGroupDeconvOutputSize( - const index_t *input_shape, // NHWC - const index_t *filter_shape, // OIHW - const int group, - const int *strides, - const int *paddings, - int *pre_paddings, - index_t *out_shape, - index_t *padded_out_shape, - const bool isNCHW = false) { - MACE_CHECK_NOTNULL(paddings); - MACE_CHECK_NOTNULL(input_shape); - MACE_CHECK_NOTNULL(filter_shape); - MACE_CHECK_NOTNULL(strides); - - const index_t in_height = isNCHW ? input_shape[2] : input_shape[1]; - const index_t in_width = isNCHW ? input_shape[3] : input_shape[2]; - - const index_t output_channel = filter_shape[0] * group; - - const index_t kernel_h = filter_shape[2]; - const index_t kernel_w = filter_shape[3]; - - index_t padded_out_height = - (in_height - 1) * strides[0] + kernel_h; - index_t padded_out_width = - (in_width - 1) * strides[1] + kernel_w; - - if (pre_paddings != nullptr) { - pre_paddings[0] = static_cast((kernel_h - 1) * 2 - paddings[0]); - pre_paddings[1] = static_cast((kernel_w - 1) * 2 - paddings[1]); - pre_paddings[0] = std::max(0, pre_paddings[0]); - pre_paddings[1] = std::max(0, pre_paddings[1]); - } - - if (padded_out_shape != nullptr) { - padded_out_shape[0] = input_shape[0]; - padded_out_shape[1] = isNCHW ? output_channel : padded_out_height; - padded_out_shape[2] = isNCHW ? padded_out_height : padded_out_width; - padded_out_shape[3] = isNCHW ? padded_out_width : output_channel; - } - - if (out_shape != nullptr) { - index_t out_height = padded_out_height - paddings[0]; - index_t out_width = padded_out_width - paddings[1]; - out_shape[0] = input_shape[0]; - out_shape[1] = isNCHW ? output_channel : out_height; - out_shape[2] = isNCHW ? out_height : out_width; - out_shape[3] = isNCHW ? out_width : output_channel; - } - } - - protected: - std::vector strides_; // [stride_h, stride_w] - std::vector paddings_; - const int group_; - const ActivationType activation_; - const float relux_max_limit_; -}; - - - template class DepthwiseDeconv2dOp; template<> class DepthwiseDeconv2dOp - : public DepthwiseDeconv2dOpBase { + : public Deconv2dOpBase { public: explicit DepthwiseDeconv2dOp(OpConstructContext *context) - : DepthwiseDeconv2dOpBase(context) {} + : Deconv2dOpBase(context) {} MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(0); @@ -138,15 +60,17 @@ class DepthwiseDeconv2dOp std::vector padded_out_shape(4, 0); if (!paddings_.empty()) out_paddings = paddings_; - CalcGroupDeconvOutputSize(input->shape().data(), - filter->shape().data(), - group_, - strides_.data(), - out_paddings.data(), - nullptr, - out_shape.data(), - padded_out_shape.data(), - true); + CalcDeconvShape_Caffe( + input->shape().data(), + filter->shape().data(), + strides_.data(), + out_paddings.data(), + group_, + nullptr, + out_shape.data(), + padded_out_shape.data(), + true); + MACE_RETURN_IF_ERROR(output->Resize(out_shape)); output->Clear(); index_t kernel_h = filter->dim(2); @@ -480,10 +404,10 @@ class DepthwiseDeconv2dOp #ifdef MACE_ENABLE_OPENCL template -class DepthwiseDeconv2dOp : public DepthwiseDeconv2dOpBase { +class DepthwiseDeconv2dOp : public Deconv2dOpBase { public: explicit DepthwiseDeconv2dOp(OpConstructContext *context) - : DepthwiseDeconv2dOpBase(context) { + : Deconv2dOpBase(context) { if (context->device()->opencl_runtime()->UseImageMemory()) { kernel_.reset(new opencl::image::DepthwiseDeconv2dKernel); } else { @@ -501,16 +425,18 @@ class DepthwiseDeconv2dOp : public DepthwiseDeconv2dOpBase { MACE_CHECK_NOTNULL(output); std::vector in_paddings(2, 0); + std::vector out_paddings(2, 0); std::vector out_shape(4, 0); - CalcGroupDeconvOutputSize(input->shape().data(), - filter->shape().data(), - group_, - strides_.data(), - paddings_.data(), - in_paddings.data(), - out_shape.data(), - nullptr); + if (!paddings_.empty()) out_paddings = paddings_; + CalcDeconvShape_Caffe(input->shape().data(), + filter->shape().data(), + strides_.data(), + out_paddings.data(), + group_, + in_paddings.data(), + out_shape.data(), + nullptr); return kernel_->Compute(context, input,