diff --git a/mace/ops/arm/depthwise_conv2d_neon.h b/mace/ops/arm/depthwise_conv2d_neon.h deleted file mode 100644 index b610178c54fd097beb92bf0135d152cd4a96ed29..0000000000000000000000000000000000000000 --- a/mace/ops/arm/depthwise_conv2d_neon.h +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_ARM_DEPTHWISE_CONV2D_NEON_H_ -#define MACE_OPS_ARM_DEPTHWISE_CONV2D_NEON_H_ - -#include "mace/core/types.h" - -namespace mace { -namespace ops { - -void DepthwiseConv2dNeonK3x3S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - const int *pad_hw, - const index_t valid_h_start, - const index_t valid_h_stop, - const index_t valid_w_start, - const index_t valid_w_stop, - float *output); - -void DepthwiseConv2dNeonK3x3S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - const int *pad_hw, - const index_t valid_h_start, - const index_t valid_h_stop, - const index_t valid_w_start, - const index_t valid_w_stop, - float *output); - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ARM_DEPTHWISE_CONV2D_NEON_H_ diff --git a/mace/ops/arm/fp32/conv_2d.cc b/mace/ops/arm/fp32/conv_2d.cc index 799ee521b83dc22b3e192dc364f486b929b7df7f..2602279423dc753be085c66bf67bb4cbee86bcc7 100644 --- a/mace/ops/arm/fp32/conv_2d.cc +++ b/mace/ops/arm/fp32/conv_2d.cc @@ -24,6 +24,49 @@ namespace ops { namespace arm { namespace fp32 { +void Conv2dBase::CalOutputShapeAndInputPadSize( + const std::vector &input_shape, + const std::vector &filter_shape, + std::vector *output_shape, + std::vector *in_pad_size) { + if (paddings_.empty()) { + CalcNCHWPaddingAndOutputSize(input_shape.data(), + filter_shape.data(), + dilations_.data(), + strides_.data(), + padding_type_, + output_shape->data(), + in_pad_size->data()); + } else { + *in_pad_size = paddings_; + CalcNCHWOutputSize(input_shape.data(), + filter_shape.data(), + paddings_.data(), + dilations_.data(), + strides_.data(), + RoundType::FLOOR, + output_shape->data()); + } +} + +void Conv2dBase::CalOutputBoundaryWithoutUsingInputPad( + const std::vector &output_shape, + const std::vector in_pad_size, + std::vector *out_bound) { + const int pad_top = in_pad_size[0] >> 1; + const int pad_bottom = in_pad_size[0] - pad_top; + const int pad_left = in_pad_size[1] >> 1; + const int pad_right = in_pad_size[1] - pad_left; + const index_t height = output_shape[2]; + const index_t width = output_shape[3]; + *out_bound = { + pad_top == 0 ? 0 : (pad_top - 1) / strides_[0] + 1, + pad_bottom == 0 ? height : height - ((pad_bottom - 1) / strides_[0] + 1), + pad_left == 0 ? 0 : (pad_left - 1) / strides_[1] + 1, + pad_right == 0 ? width : width - ((pad_right - 1) / strides_[1] + 1), + }; +} + void Conv2dBase::CalOutputShapeAndPadSize(const Tensor *input, const Tensor *filter, const int out_tile_height, @@ -46,24 +89,11 @@ void Conv2dBase::CalOutputShapeAndPadSize(const Tensor *input, const index_t filter_w = filter->dim(3); std::vector paddings(2); - if (paddings_.empty()) { - CalcNCHWPaddingAndOutputSize(input->shape().data(), - filter->shape().data(), - dilations_.data(), - strides_.data(), - padding_type_, - output_shape->data(), - paddings.data()); - } else { - paddings = paddings_; - CalcNCHWOutputSize(input->shape().data(), - filter->shape().data(), - paddings_.data(), - dilations_.data(), - strides_.data(), - RoundType::FLOOR, - output_shape->data()); - } + CalOutputShapeAndInputPadSize(input->shape(), + filter->shape(), + output_shape, + &paddings); + const index_t out_height = (*output_shape)[2]; const index_t out_width = (*output_shape)[3]; const index_t @@ -96,9 +126,9 @@ MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context, const int out_tile_height, const int out_tile_width, std::unique_ptr - *padded_input, + *padded_input, std::unique_ptr - *padded_output) { + *padded_output) { std::vector output_shape; std::vector in_pad_size; std::vector out_pad_size; @@ -152,8 +182,9 @@ MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context, } if (is_out_padded) { std::unique_ptr - padded_out = make_unique(scratch_buffer->Scratch(padded_out_size), - DataType::DT_FLOAT); + padded_out = + make_unique(scratch_buffer->Scratch(padded_out_size), + DataType::DT_FLOAT); padded_out->Resize({batch, out_channels, padded_out_height, padded_out_width}); *padded_output = std::move(padded_out); diff --git a/mace/ops/arm/fp32/conv_2d.h b/mace/ops/arm/fp32/conv_2d.h index 832f6f2fa35d999ee6192e61f340f070776f5d1f..1383767bf278f1f6c11aec8047732aca98afa45a 100644 --- a/mace/ops/arm/fp32/conv_2d.h +++ b/mace/ops/arm/fp32/conv_2d.h @@ -49,6 +49,18 @@ class Conv2dBase { Tensor *output) = 0; protected: + void CalOutputShapeAndInputPadSize(const std::vector &input_shape, + const std::vector &filter_shape, + std::vector *output_shape, + std::vector *in_pad_size); + + void CalOutputBoundaryWithoutUsingInputPad(const std::vector + &output_shape, + const std::vector + in_pad_size, + std::vector + *out_bound); + void CalOutputShapeAndPadSize(const Tensor *input, const Tensor *filter, const int out_tile_height, diff --git a/mace/ops/arm/depthwise_conv2d_neon_3x3.cc b/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc similarity index 79% rename from mace/ops/arm/depthwise_conv2d_neon_3x3.cc rename to mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc index ced509e0d87d796b7ff2ecedc5ae187a926502af..3ac8eb5de20503a89b9b25202b91ddbf8e305031 100644 --- a/mace/ops/arm/depthwise_conv2d_neon_3x3.cc +++ b/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc @@ -12,15 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(MACE_ENABLE_NEON) #include -#endif - -#include "mace/utils/macros.h" -#include "mace/ops/arm/depthwise_conv2d_neon.h" +#include "mace/ops/arm/fp32/depthwise_conv_2d_3x3.h" namespace mace { namespace ops { +namespace arm { +namespace fp32 { namespace { void DepthwiseConv2dPixel(const float *in_base, @@ -49,42 +47,58 @@ void DepthwiseConv2dPixel(const float *in_base, } } // namespace -// Ho = 2, Wo = 4, Co = 1 -void DepthwiseConv2dNeonK3x3S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - const int *pad_hw, - const index_t valid_h_start, - const index_t valid_h_stop, - const index_t valid_w_start, - const index_t valid_w_stop, - float *output) { -#if !defined(MACE_ENABLE_NEON) - MACE_UNUSED(valid_w_start); - MACE_UNUSED(valid_w_stop); -#endif +MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, + const mace::Tensor *input, + const mace::Tensor *filter, + mace::Tensor *output) { + MACE_UNUSED(context); + std::vector out_shape(4); + std::vector paddings(2); + auto &in_shape = input->shape(); + auto &filter_shape = filter->shape(); + CalOutputShapeAndInputPadSize(in_shape, filter_shape, &out_shape, &paddings); + out_shape[1] *= filter_shape[1]; + MACE_RETURN_IF_ERROR(output->Resize(out_shape)); + output->Clear(); + + const int pad_top = paddings[0] / 2; + const int pad_left = paddings[1] / 2; + const index_t multiplier = out_shape[1] / in_shape[1]; const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; + std::vector out_bounds; + CalOutputBoundaryWithoutUsingInputPad(out_shape, paddings, &out_bounds); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = input->data(); + auto output_data = output->mutable_data(); + #pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < in_shape[0]; ++b) { for (index_t m = 0; m < out_shape[1]; ++m) { - index_t c = m / multiplier; - index_t multi_index = m % multiplier; - const float *in_base = input + b * in_batch_size + c * in_image_size; - const float *filter_ptr = filter + multi_index * in_shape[1] * 9 + c * 9; - float *out_base = output + b * out_batch_size + m * out_image_size; + const index_t c = m / multiplier; + const index_t multi_index = m % multiplier; + const float *in_base = input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr = filter_data + multi_index * in_shape[1] * 9 + c * 9; + float *out_base = output_data + b * out_batch_size + m * out_image_size; index_t h, w; - const index_t pad_top = pad_hw[0]; - const index_t pad_left = pad_hw[1]; const index_t out_width = out_shape[3]; const index_t in_height = in_shape[2]; const index_t in_width = in_shape[3]; + const index_t valid_h_start = out_bounds[0]; + const index_t valid_h_stop = out_bounds[1]; + const index_t valid_w_start = out_bounds[2]; + const index_t valid_w_stop = out_bounds[3]; + // top for (h = 0; h < valid_h_start; ++h) { for (w = 0; w < out_shape[3]; ++w) { @@ -94,7 +108,6 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, } } -#if defined(MACE_ENABLE_NEON) // load filter (1 outch x 3 height x 3 width): vf_outch_height float32x4_t vf00, vf01, vf02; vf00 = vld1q_f32(filter_ptr); @@ -208,15 +221,7 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, 3, out_base); } } // h -#else - for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) { - for (index_t iw = 0; iw < out_shape[3]; ++iw) { - DepthwiseConv2dPixel(in_base, filter_ptr, ih, iw, ih - pad_top, - iw - pad_left, out_width, in_height, in_width, 3, - 3, out_base); - } - } -#endif + // bottom for (; h < out_shape[2]; ++h) { @@ -228,42 +233,64 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, } } // m } // b + + return MaceStatus::MACE_SUCCESS; } -void DepthwiseConv2dNeonK3x3S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - const int *pad_hw, - const index_t valid_h_start, - const index_t valid_h_stop, - const index_t valid_w_start, - const index_t valid_w_stop, - float *output) { -#if !defined(MACE_ENABLE_NEON) - MACE_UNUSED(valid_w_start); - MACE_UNUSED(valid_w_stop); -#endif +MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, + const mace::Tensor *input, + const mace::Tensor *filter, + mace::Tensor *output) { + MACE_UNUSED(context); + + std::vector out_shape(4); + std::vector paddings(2); + auto &in_shape = input->shape(); + auto &filter_shape = filter->shape(); + + CalOutputShapeAndInputPadSize(in_shape, filter_shape, &out_shape, &paddings); + out_shape[1] *= in_shape[1]; + MACE_RETURN_IF_ERROR(output->Resize(out_shape)); + output->Clear(); + + const int pad_top = paddings[0] / 2; + const int pad_left = paddings[1] / 2; + const index_t multiplier = out_shape[1] / in_shape[1]; const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; + std::vector out_bounds; + CalOutputBoundaryWithoutUsingInputPad(out_shape, paddings, &out_bounds); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = input->data(); + auto output_data = output->mutable_data(); + #pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < in_shape[0]; ++b) { for (index_t m = 0; m < out_shape[1]; ++m) { index_t c = m / multiplier; index_t multi_index = m % multiplier; - const float *in_base = input + b * in_batch_size + c * in_image_size; - const float *filter_ptr = filter + multi_index * in_shape[1] * 9 + c * 9; - float *out_base = output + b * out_batch_size + m * out_image_size; + const float *in_base = input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr = filter_data + multi_index * in_shape[1] * 9 + c * 9; + float *out_base = output_data + b * out_batch_size + m * out_image_size; index_t h, w; - const index_t pad_top = pad_hw[0]; - const index_t pad_left = pad_hw[1]; const index_t out_width = out_shape[3]; const index_t in_height = in_shape[2]; const index_t in_width = in_shape[3]; + + const index_t valid_h_start = out_bounds[0]; + const index_t valid_h_stop = out_bounds[1]; + const index_t valid_w_start = out_bounds[2]; + const index_t valid_w_stop = out_bounds[3]; + // top for (h = 0; h < valid_h_start; ++h) { for (w = 0; w < out_width; ++w) { @@ -273,7 +300,6 @@ void DepthwiseConv2dNeonK3x3S2(const float *input, } } -#if defined(MACE_ENABLE_NEON) // load filter (1 outch x 3 height x 3 width): vf_outch_height float32x4_t vf00, vf01, vf02; vf00 = vld1q_f32(filter_ptr); @@ -359,15 +385,7 @@ void DepthwiseConv2dNeonK3x3S2(const float *input, 3, 3, out_base); } } // h -#else - for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) { - for (index_t iw = 0; iw < out_width; ++iw) { - DepthwiseConv2dPixel(in_base, filter_ptr, ih, iw, ih * 2 - pad_top, - iw * 2 - pad_left, out_width, in_height, - in_width, 3, 3, out_base); - } - } -#endif + // bottom for (; h < out_shape[2]; ++h) { @@ -379,7 +397,11 @@ void DepthwiseConv2dNeonK3x3S2(const float *input, } } // m } // b + + return MaceStatus::MACE_SUCCESS; } +} // namespace fp32 +} // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.h b/mace/ops/arm/fp32/depthwise_conv_2d_3x3.h new file mode 100644 index 0000000000000000000000000000000000000000..e89ae2b969cdbdb7b32f34eeeada62d3ec14af3b --- /dev/null +++ b/mace/ops/arm/fp32/depthwise_conv_2d_3x3.h @@ -0,0 +1,62 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_CONV_2D_3X3_H_ +#define MACE_OPS_ARM_FP32_DEPTHWISE_CONV_2D_3X3_H_ + +#include +#include "mace/public/mace.h" +#include "mace/core/tensor.h" +#include "mace/core/op_context.h" +#include "mace/ops/arm/fp32/conv_2d.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +class DepthwiseConv2dK3x3S1 : public Conv2dBase { + public: + DepthwiseConv2dK3x3S1(const std::vector paddings, + const Padding padding_type) + : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} + virtual ~DepthwiseConv2dK3x3S1() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output); +}; + +class DepthwiseConv2dK3x3S2 : public Conv2dBase { + public: + DepthwiseConv2dK3x3S2(const std::vector paddings, + const Padding padding_type) + : Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {} + virtual ~DepthwiseConv2dK3x3S2() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output); +}; + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_FP32_DEPTHWISE_CONV_2D_3X3_H_ diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc index 22b13c268de07a10ffc4dfc06fdad69c953a37f5..24b5d4192265a35397c54cc58e009e870943ad64 100644 --- a/mace/ops/depthwise_conv2d.cc +++ b/mace/ops/depthwise_conv2d.cc @@ -12,14 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) -#include -#endif #include #include #include #include +#include "mace/ops/ref/depthwise_conv_2d.h" + +#if defined(MACE_ENABLE_NEON) +#include "mace/ops/arm/fp32/depthwise_conv_2d_3x3.h" +#endif // MACE_ENABLE_NEON + #ifdef MACE_ENABLE_QUANTIZE #include "mace/ops/quantization_util.h" // We reuse TensorFlow Lite's optimized depthwiseconv_uint8 and parallelized it @@ -30,7 +33,6 @@ #include "mace/core/future.h" #include "mace/core/operator.h" #include "mace/ops/activation.h" -#include "mace/ops/arm/depthwise_conv2d_neon.h" #include "mace/ops/conv_pool_2d_base.h" #include "mace/public/mace.h" #include "mace/utils/memory.h" @@ -50,20 +52,20 @@ class DepthwiseConv2dOpBase : public ConvPool2dOpBase { : ConvPool2dOpBase(context), activation_(ops::StringToActivationType( Operation::GetOptionalArg("activation", - "NOOP"))), + "NOOP"))), relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)), leakyrelu_coefficient_(Operation::GetOptionalArg( - "leakyrelu_coefficient", 0.0f)) {} + "leakyrelu_coefficient", 0.0f)) {} protected: const ActivationType activation_; const float relux_max_limit_; const float leakyrelu_coefficient_; }; -template +template class DepthwiseConv2dOp; -template <> +template<> class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { public: explicit DepthwiseConv2dOp(OpConstructContext *context) @@ -82,133 +84,60 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(output); - std::vector output_shape(4); - std::vector paddings(2); - std::vector filter_shape - {filter->dim(0) * filter->dim(1), filter->dim(1), filter->dim(2), - filter->dim(3)}; +#ifdef MACE_ENABLE_NEON + const index_t filter_h = filter->dim(2); + const index_t filter_w = filter->dim(3); + const index_t stride_h = strides_[0]; + const index_t stride_w = strides_[1]; + const index_t dilation_h = dilations_[0]; + const index_t dilation_w = dilations_[1]; - if (paddings_.empty()) { - CalcNCHWPaddingAndOutputSize(input->shape().data(), - filter_shape.data(), - dilations_.data(), - strides_.data(), - padding_type_, - output_shape.data(), - paddings.data()); + if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1 + && dilation_h == 1 && dilation_w == 1) { + if (conv2d_delegator_.get() == nullptr) { + conv2d_delegator_ = + make_unique(paddings_, + padding_type_); + } + conv2d_delegator_->Compute(context, input, filter, output); + } else if (filter_h == 3 && filter_w == 3 && stride_h == 2 && stride_w == 2 + && dilation_h == 1 && dilation_w == 1) { + if (conv2d_delegator_.get() == nullptr) { + conv2d_delegator_ = + make_unique(paddings_, + padding_type_); + } + conv2d_delegator_->Compute(context, input, filter, output); } else { - paddings = paddings_; - CalcNCHWOutputSize(input->shape().data(), - filter_shape.data(), - paddings_.data(), - dilations_.data(), - strides_.data(), - RoundType::FLOOR, - output_shape.data()); + if (ref_conv2d_delegator_.get() == nullptr) { + ref_conv2d_delegator_ = + make_unique>(strides_, + dilations_, + paddings_, + padding_type_); + } + ref_conv2d_delegator_->Compute(context, input, filter, output); } - MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - output->Clear(); - - index_t batch = output->dim(0); - index_t channels = output->dim(1); - index_t height = output->dim(2); - index_t width = output->dim(3); - - index_t input_batch = input->dim(0); - index_t input_channels = input->dim(1); - index_t input_height = input->dim(2); - index_t input_width = input->dim(3); - - index_t filter_h = filter_shape[2]; - index_t filter_w = filter_shape[3]; - MACE_CHECK(filter_shape[0] == channels, filter_shape[0], " != ", channels); - MACE_CHECK(filter_shape[1] == input_channels, filter_shape[1], " != ", - input_channels); - - index_t stride_h = strides_[0]; - index_t stride_w = strides_[1]; - - index_t dilation_h = dilations_[0]; - index_t dilation_w = dilations_[1]; - - MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch"); - - int pad_top = paddings[0] >> 1; - int pad_bottom = paddings[0] - pad_top; - int pad_left = paddings[1] >> 1; - int pad_right = paddings[1] - pad_left; - - index_t valid_h_start = pad_top == 0 ? 0 : (pad_top - 1) / stride_h + 1; - index_t valid_h_stop = pad_bottom == 0 - ? height - : height - ((pad_bottom - 1) / stride_h + 1); - index_t valid_w_start = pad_left == 0 ? 0 : (pad_left - 1) / stride_w + 1; - index_t valid_w_stop = pad_right == 0 - ? width - : width - ((pad_right - 1) / stride_w + 1); - - std::function conv_func; +#else + if (ref_conv2d_delegator_.get() == nullptr) { + ref_conv2d_delegator_ = + make_unique>(strides_, + dilations_, + paddings_, + padding_type_); + } + ref_conv2d_delegator_->Compute(context, input, filter, output); +#endif // MACE_ENABLE_NEON - Tensor::MappingGuard input_guard(input); - Tensor::MappingGuard filter_guard(filter); Tensor::MappingGuard bias_guard(bias); Tensor::MappingGuard output_guard(output); - auto input_data = input->data(); - auto filter_data = filter->data(); auto bias_data = bias == nullptr ? nullptr : bias->data(); auto output_data = output->mutable_data(); - const int pad_hw[2] = {pad_top, pad_left}; - const index_t input_shape[4] = - {batch, input_channels, input_height, input_width}; - - // make host compiler happy - MACE_UNUSED(pad_hw); - MACE_UNUSED(input_shape); - - if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1 - && dilation_h == 1 && dilation_w == 1) { - conv_func = [=](const float *input, float *output) { - DepthwiseConv2dNeonK3x3S1(input, - filter_data, - input_shape, - output_shape.data(), - pad_hw, - valid_h_start, - valid_h_stop, - valid_w_start, - valid_w_stop, - output); - }; - } else if (filter_h == 3 && filter_w == 3 && stride_h == 2 && stride_w == 2 - && dilation_h == 1 && dilation_w == 1) { - conv_func = [=](const float *input, float *output) { - DepthwiseConv2dNeonK3x3S2(input, - filter_data, - input_shape, - output_shape.data(), - pad_hw, - valid_h_start, - valid_h_stop, - valid_w_start, - valid_w_stop, - output); - }; - } else { - conv_func = [=](const float *input, float *output) { - DepthwiseConv2dGeneral(input, - filter_data, - input_shape, - output_shape.data(), - filter_shape.data(), - strides_.data(), - dilations_.data(), - pad_hw, - output); - }; - } - - conv_func(input_data, output_data); + const index_t batch = output->dim(0); + const index_t channels = output->dim(1); + const index_t height = output->dim(2); + const index_t width = output->dim(3); if (bias_data != nullptr) { #pragma omp parallel for collapse(2) @@ -229,55 +158,10 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { } private: - void DepthwiseConv2dGeneral(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - const index_t *filter_shape, - const int *stride_hw, - const int *dilation_hw, - const int *pad_hw, - float *output) { - const index_t multiplier = filter_shape[0] / filter_shape[1]; -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < in_shape[0]; ++b) { - for (index_t m = 0; m < filter_shape[0]; ++m) { - for (index_t h = 0; h < out_shape[2]; ++h) { - for (index_t w = 0; w < out_shape[3]; ++w) { - const index_t out_channels = filter_shape[0]; - const index_t in_channels = filter_shape[1]; - const index_t filter_height = filter_shape[2]; - const index_t filter_width = filter_shape[3]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - index_t out_offset = - ((b * out_channels + m) * out_height + h) * out_width + w; - index_t c = m / multiplier; - index_t o = m % multiplier; - float sum = 0; - for (index_t kh = 0; kh < filter_height; ++kh) { - for (index_t kw = 0; kw < filter_width; ++kw) { - index_t ih = h * stride_hw[0] + kh * dilation_hw[0] - pad_hw[0]; - index_t iw = w * stride_hw[1] + kw * dilation_hw[1] - pad_hw[1]; - if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) { - index_t in_offset = - ((b * in_channels + c) * in_height + ih) * in_width + iw; - index_t filter_offset = - (((o * in_channels) + c) * filter_height + kh) - * filter_width + kw; - - sum += input[in_offset] * filter[filter_offset]; - } - } - } - output[out_offset] = sum; - } - } - } - } - } +#ifdef MACE_ENABLE_NEON + std::unique_ptr conv2d_delegator_; +#endif // MACE_ENABLE_NEON + std::unique_ptr> ref_conv2d_delegator_; protected: MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); @@ -542,7 +426,6 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { }; #endif // MACE_ENABLE_OPENCL - void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", DepthwiseConv2dOp, DeviceType::CPU, float); diff --git a/mace/ops/ref/conv_2d.h b/mace/ops/ref/conv_2d.h index 10baac8cb86abcdd1f88993ae12fb752f589fcb7..c04eff0fdecef6579f8065f1eb91a0dfef60a8b2 100644 --- a/mace/ops/ref/conv_2d.h +++ b/mace/ops/ref/conv_2d.h @@ -64,7 +64,7 @@ class Conv2d { paddings_(paddings), padding_type_(padding_type) {} ~Conv2d() {} - // Always row-major after transpose + MaceStatus Compute( const OpContext *context, const Tensor *input, diff --git a/mace/ops/ref/depthwise_conv_2d.cc b/mace/ops/ref/depthwise_conv_2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..b9f8b31f6ad517ae07ae15295dcc1f7688584861 --- /dev/null +++ b/mace/ops/ref/depthwise_conv_2d.cc @@ -0,0 +1,123 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include "mace/ops/ref/depthwise_conv_2d.h" + +#include + +namespace mace { +namespace ops { +namespace ref { + +MaceStatus DepthwiseConv2d::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + MACE_UNUSED(context); + + const std::vector in_shape = input->shape(); + const std::vector filter_shape = filter->shape(); + std::vector out_shape(4); + + std::vector paddings(2); + if (paddings_.empty()) { + CalcNCHWPaddingAndOutputSize(input->shape().data(), + filter->shape().data(), + dilations_.data(), + strides_.data(), + padding_type_, + out_shape.data(), + paddings.data()); + } else { + paddings = paddings_; + CalcNCHWOutputSize(input->shape().data(), + filter->shape().data(), + paddings_.data(), + dilations_.data(), + strides_.data(), + RoundType::FLOOR, + out_shape.data()); + } + out_shape[1] *= in_shape[1]; + const index_t pad_top = paddings[0] >> 1; + const index_t pad_left = paddings[1] >> 1; + output->Resize(out_shape); + + const index_t multiplier = filter_shape[0]; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; + const index_t filter_size = filter_shape[2] * filter_shape[3]; + + Tensor::MappingGuard input_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard output_guard(output); + auto input_data = input->data(); + auto filter_data = filter->data(); + auto output_data = output->mutable_data(); + +#pragma omp parallel for collapse(2) schedule(runtime) + for (index_t b = 0; b < in_shape[0]; b++) { + for (index_t m = 0; m < out_shape[1]; ++m) { + const index_t c = m / multiplier; + const index_t multi_index = m % multiplier; + + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + + float *out_ptr_base = + output_data + b * out_batch_size + m * out_image_size; + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w < out_width; ++w) { + float sum = 0; + + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr = + filter_data + multi_index * in_channels * filter_size + + c * filter_size; + + for (index_t kh = 0; kh < filter_shape[2]; ++kh) { + for (index_t kw = 0; kw < filter_shape[3]; ++kw) { + const index_t + ih = -pad_top + h * strides_[0] + kh * dilations_[0]; + const index_t + iw = -pad_left + w * strides_[1] + kw * dilations_[1]; + if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) { + sum += in_ptr_base[ih * in_width + iw] * filter_ptr[kw]; + } + } // kw + filter_ptr += filter_shape[3]; + } // kh + + + out_ptr_base[h * out_width + w] = sum; + } // w + } // h + } // m + } // b + return MaceStatus::MACE_SUCCESS; +} + +} // namespace ref +} // namespace ops +} // namespace mace + + diff --git a/mace/ops/ref/depthwise_conv_2d.h b/mace/ops/ref/depthwise_conv_2d.h new file mode 100644 index 0000000000000000000000000000000000000000..ad493eb207ac8a8edaaada7589aa364d080e5b16 --- /dev/null +++ b/mace/ops/ref/depthwise_conv_2d.h @@ -0,0 +1,86 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#ifndef MACE_OPS_REF_DEPTHWISE_CONV_2D_H_ +#define MACE_OPS_REF_DEPTHWISE_CONV_2D_H_ + +#include + +#include "mace/public/mace.h" +#include "mace/core/tensor.h" +#include "mace/core/op_context.h" +#include "mace/ops/common/conv_pool_2d_util.h" + +namespace mace { +namespace ops { +namespace ref { + +template +class DepthwiseConv2d { + public: + DepthwiseConv2d(const std::vector strides, + const std::vector dilations, + const std::vector paddings, + const Padding padding_type) + : strides_(strides), + dilations_(dilations), + paddings_(paddings), + padding_type_(padding_type) {} + ~DepthwiseConv2d() {} + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output); + + private: + const std::vector strides_; + const std::vector dilations_; + const std::vector paddings_; + const Padding padding_type_; +}; + +template<> +class DepthwiseConv2d { + public: + DepthwiseConv2d(const std::vector strides, + const std::vector dilations, + const std::vector paddings, + const Padding padding_type) + : strides_(strides), + dilations_(dilations), + paddings_(paddings), + padding_type_(padding_type) {} + ~DepthwiseConv2d() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output); + + private: + const std::vector strides_; + const std::vector dilations_; + const std::vector paddings_; + const Padding padding_type_; +}; + +} // namespace ref +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_REF_DEPTHWISE_CONV_2D_H_ +