diff --git a/mace/core/registry/op_delegator_registry.cc b/mace/core/registry/op_delegator_registry.cc
index 7aed0cb08254c98cdde3b8faf2d7811e46a80d63..f853527ff23af6398c02909f5472d16df8c0e2db 100644
--- a/mace/core/registry/op_delegator_registry.cc
+++ b/mace/core/registry/op_delegator_registry.cc
@@ -60,6 +60,7 @@ MaceStatus OpDelegatorRegistry::Register(const DelegatorInfo &key,
 DelegatorCreator OpDelegatorRegistry::GetCreator(
     const DelegatorInfo &key) const {
   if (registry_.count(key) > 0) {
+    VLOG(3) << "Found delegator creator: " << key.ToString();
     return registry_.at(key);
   }
diff --git a/mace/ops/BUILD.bazel b/mace/ops/BUILD.bazel
index a3b8ec79e3ea8537f0b8ebc8f002bc0ff2249a23..32226af45b1d7798e9a8abc90f6c6381ad0e6b03 100644
--- a/mace/ops/BUILD.bazel
+++ b/mace/ops/BUILD.bazel
@@ -105,6 +105,7 @@ cc_library(
     name = "arm_neon_kernels",
     srcs = glob(
         [
+            "arm/base/*.cc",
            "arm/fp32/*.cc",
            "arm/fp16/gemv.h",
        ],
@@ -121,6 +122,7 @@ cc_library(
     )),
     hdrs = glob(
         [
+            "arm/base/*.h",
            "arm/fp32/*.h",
        ],
    ) + if_quantize_enabled(glob(
diff --git a/mace/ops/CMakeLists.txt b/mace/ops/CMakeLists.txt
index 7de9661d61d05cd6e4ac9d551cbccbb38904f7d4..61b3b15390caa9413201020066febfc888506035 100644
--- a/mace/ops/CMakeLists.txt
+++ b/mace/ops/CMakeLists.txt
@@ -5,6 +5,9 @@ file(GLOB OPS_REF_Q8_KERNELS_SRCS
   ref/q8/*.cc
 )
+file(GLOB OPS_ARM_NEON_BASE_KERNELS_SRCS
+  arm/base/*.cc
+)
 file(GLOB OPS_ARM_NEON_FP32_KERNELS_SRCS
   arm/fp32/*.cc
 )
@@ -32,7 +35,7 @@ if(MACE_ENABLE_QUANTIZE)
 endif(MACE_ENABLE_QUANTIZE)
 
 if(MACE_ENABLE_NEON)
-  set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_FP32_KERNELS_SRCS})
+  set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_BASE_KERNELS_SRCS} ${OPS_ARM_NEON_FP32_KERNELS_SRCS})
   if(MACE_ENABLE_QUANTIZE)
     set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_Q8_KERNELS_SRCS})
   endif(MACE_ENABLE_QUANTIZE)
diff --git a/mace/ops/arm/base/activation.cc b/mace/ops/arm/base/activation.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6531616ae0ab8b2b749e886a3e2f4431ceb50856
--- /dev/null
+++ b/mace/ops/arm/base/activation.cc
@@ -0,0 +1,91 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/base/activation.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+MaceStatus Activation<T>::Compute(const OpContext *context,
+                                  const Tensor *input, Tensor *output) {
+  Tensor::MappingGuard input_guard(input);
+  if (input != output) {
+    MACE_RETURN_IF_ERROR(output->ResizeLike(input));
+    Tensor::MappingGuard output_guard(output);
+    DoActivation(context, input, output);
+  } else {
+    DoActivation(context, input, output);
+  }
+
+  return MaceStatus::MACE_SUCCESS;
+}
+
+template<typename T>
+void Activation<T>::DoActivation(const OpContext *context,
+                                 const Tensor *input,
+                                 Tensor *output) {
+  const T *input_data = input->data<T>();
+  T *output_data = output->mutable_data<T>();
+  const index_t size = input->size();
+
+  utils::ThreadPool &thread_pool =
+      context->device()->cpu_runtime()->thread_pool();
+
+  switch (type_) {
+    case RELU: {
+      ActivateRelu(&thread_pool, input_data, size, output_data);
+      break;
+    }
+
+    case RELUX: {
+      ActivateRelux(&thread_pool, input_data, size, output_data);
+      break;
+    }
+
+    case LEAKYRELU: {
+      ActivateLeakyRelu(&thread_pool, input_data, size, output_data);
+      break;
+    }
+
+    case TANH: {
+      ActivateTanh(&thread_pool, input_data, size, output_data);
+      break;
+    }
+
+    case SIGMOID: {
+      ActivateSigmoid(&thread_pool, input_data, size, output_data);
+      break;
+    }
+
+    case NOOP: {
+      break;
+    }
+
+    default: {
+      MACE_NOT_IMPLEMENTED;
+    }
+  }
+}
+
+void RegisterActivationDelegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, Activation<float>, delegator::ActivationParam,
+      MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, float, ImplType::NEON));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/arm/base/activation.h b/mace/ops/arm/base/activation.h
new file mode 100644
index 0000000000000000000000000000000000000000..aac917c9b37642ae8b452331cf86d5b0e51407f4
--- /dev/null
+++ b/mace/ops/arm/base/activation.h
@@ -0,0 +1,54 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_BASE_ACTIVATION_H_
+#define MACE_OPS_ARM_BASE_ACTIVATION_H_
+
+#include "mace/ops/delegator/activation.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+class Activation : public delegator::Activation {
+ public:
+  explicit Activation(const delegator::ActivationParam &param)
+      : delegator::Activation(param) {}
+  ~Activation() = default;
+
+  MaceStatus Compute(const OpContext *context,
+                     const Tensor *input, Tensor *output) override;
+
+ private:
+  void DoActivation(const OpContext *context,
+                    const Tensor *input, Tensor *output);
+
+  void ActivateRelu(utils::ThreadPool *thread_pool, const T *input_data,
+                    const index_t input_size, T *output_data);
+  void ActivateRelux(utils::ThreadPool *thread_pool, const T *input_data,
+                     const index_t input_size, T *output_data);
+  void ActivateLeakyRelu(utils::ThreadPool *thread_pool, const T *input_data,
+                         const index_t input_size, T *output_data);
+  void ActivateTanh(utils::ThreadPool *thread_pool, const T *input_data,
+                    const index_t input_size, T *output_data);
+  void ActivateSigmoid(utils::ThreadPool *thread_pool, const T *input_data,
+                       const index_t input_size, T *output_data);
+};
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_BASE_ACTIVATION_H_
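The per-type leaf routines declared in activation.h (ActivateRelu and friends) are not defined in this patch; they are expected to be specialized in per-architecture files. As a rough sketch only — assuming the existing utils::ThreadPool::Compute1D tiling API and <algorithm> — a scalar float specialization could look like:

    template <>
    void Activation<float>::ActivateRelu(utils::ThreadPool *thread_pool,
                                         const float *input_data,
                                         const index_t input_size,
                                         float *output_data) {
      // Split the flat element range across worker threads.
      thread_pool->Compute1D(
          [=](index_t start, index_t end, index_t step) {
            for (index_t i = start; i < end; i += step) {
              output_data[i] = std::max(input_data[i], 0.f);  // ReLU
            }
          },
          0, input_size, 1);
    }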
diff --git a/mace/ops/arm/base/bias_add.cc b/mace/ops/arm/base/bias_add.cc
new file mode 100644
index 0000000000000000000000000000000000000000..42357a48e8ce04f5199c39e0c428abcd1562f6e6
--- /dev/null
+++ b/mace/ops/arm/base/bias_add.cc
@@ -0,0 +1,79 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/base/bias_add.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+MaceStatus BiasAdd<T>::Compute(const OpContext *context, const Tensor *input,
+                               const Tensor *bias, Tensor *output) {
+  if (input != output) {
+    if (bias == nullptr) {
+      output->Copy(*input);
+    } else {
+      MACE_RETURN_IF_ERROR(output->ResizeLike(input));
+      Tensor::MappingGuard input_guard(input);
+      Tensor::MappingGuard bias_guard(bias);
+      Tensor::MappingGuard output_guard(output);
+      AddBias(context, input, bias, output);
+    }
+  } else {
+    if (bias != nullptr) {
+      Tensor::MappingGuard input_guard(input);
+      Tensor::MappingGuard bias_guard(bias);
+      AddBias(context, input, bias, output);
+    }
+  }
+
+  return MaceStatus::MACE_SUCCESS;
+}
+
+template<typename T>
+void BiasAdd<T>::AddBias(const OpContext *context, const Tensor *input,
+                         const Tensor *bias, mace::Tensor *output) {
+  auto input_data = input->data<T>();
+  auto bias_data = bias->data<T>();
+  auto output_data = output->mutable_data<T>();
+
+  const index_t batch = input->dim(0);
+  const index_t channels = input->dim(1);
+
+  const index_t height = input->dim(2);
+  const index_t width = input->dim(3);
+  const index_t image_size = height * width;
+
+  utils::ThreadPool
+      &thread_pool = context->device()->cpu_runtime()->thread_pool();
+
+  if (bias->dim_size() == 1) {
+    Add1DimBias(&thread_pool, input_data, bias_data,
+                output_data, batch, channels, image_size);
+  } else {
+    Add2DimsBias(&thread_pool, input_data, bias_data,
+                 output_data, batch, channels, image_size);
+  }
+}
+
+void RegisterBiasAddDelegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, BiasAdd<float>, DelegatorParam,
+      MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, float, ImplType::NEON));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/arm/base/bias_add.h b/mace/ops/arm/base/bias_add.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0e2e1c09ef19a0d77a817bf55f2992282973b31
--- /dev/null
+++ b/mace/ops/arm/base/bias_add.h
@@ -0,0 +1,52 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_BASE_BIAS_ADD_H_
+#define MACE_OPS_ARM_BASE_BIAS_ADD_H_
+
+#include "mace/ops/delegator/bias_add.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+class BiasAdd : public delegator::BiasAdd {
+ public:
+  explicit BiasAdd(const DelegatorParam &param) : delegator::BiasAdd(param) {}
+  ~BiasAdd() = default;
+
+  MaceStatus Compute(const OpContext *context, const Tensor *input,
+                     const Tensor *bias, Tensor *output) override;
+
+ private:
+  void AddBias(const OpContext *context, const Tensor *input,
+               const Tensor *bias, Tensor *output);
+
+  void Add1DimBias(utils::ThreadPool *thread_pool, const T *input_data,
+                   const T *bias_data, T *output_data,
+                   const index_t batch, const index_t channels,
+                   const index_t image_size);
+
+  void Add2DimsBias(utils::ThreadPool *thread_pool, const T *input_data,
+                    const T *bias_data, T *output_data,
+                    const index_t batch, const index_t channels,
+                    const index_t image_size);
+};
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_BASE_BIAS_ADD_H_
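Compute() dispatches on bias->dim_size() between a per-channel (1-D) bias and a batched (2-D) bias. A minimal scalar sketch of the 1-D case under the NCHW layout used here (the real NEON kernels live in per-architecture files; Compute2D is the existing MACE thread-pool API):

    template <>
    void BiasAdd<float>::Add1DimBias(
        utils::ThreadPool *thread_pool, const float *input_data,
        const float *bias_data, float *output_data, const index_t batch,
        const index_t channels, const index_t image_size) {
      // One bias value per channel, broadcast over the H*W image.
      thread_pool->Compute2D(
          [=](index_t start0, index_t end0, index_t step0,
              index_t start1, index_t end1, index_t step1) {
            for (index_t b = start0; b < end0; b += step0) {
              for (index_t c = start1; c < end1; c += step1) {
                const index_t offset = (b * channels + c) * image_size;
                for (index_t i = 0; i < image_size; ++i) {
                  output_data[offset + i] =
                      input_data[offset + i] + bias_data[c];
                }
              }
            }
          },
          0, batch, 1, 0, channels, 1);
    }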
diff --git a/mace/ops/arm/fp32/conv_2d.cc b/mace/ops/arm/base/conv_2d.cc
similarity index 67%
rename from mace/ops/arm/fp32/conv_2d.cc
rename to mace/ops/arm/base/conv_2d.cc
index 357b47754b0b9bf814302be042f56651883594a5..c5a69ac9704e1de3ea25537ca30c74eac629ddf3 100644
--- a/mace/ops/arm/fp32/conv_2d.cc
+++ b/mace/ops/arm/base/conv_2d.cc
@@ -1,4 +1,4 @@
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,18 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/ops/arm/fp32/conv_2d.h"
+#include "mace/ops/arm/base/conv_2d.h"
 
+#include <algorithm>
 #include <utility>
 #include <vector>
-#include <memory>
 
 #include "mace/utils/memory.h"
 
 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {
 
 void Conv2dBase::CalOutputShapeAndInputPadSize(
     const std::vector<index_t> &input_shape,
@@ -164,10 +163,10 @@ MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context,
   auto scratch_buffer = context->device()->scratch_buffer();
   const index_t padded_in_size = MACE_EXTRA_BUFFER_PAD_SIZE
       + (is_in_padded ? PadAlignSize(
-      sizeof(float) * batch * in_channels * padded_in_height
+      type_size_ * batch * in_channels * padded_in_height
           * padded_in_width) : 0);
   const index_t padded_out_size = is_out_padded ? PadAlignSize(
-      sizeof(float) * batch * out_channels * padded_out_height
+      type_size_ * batch * out_channels * padded_out_height
          * padded_out_width) : 0;
   scratch_buffer->Rewind();
@@ -176,7 +175,7 @@ MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context,
     std::unique_ptr<Tensor> padded_in =
         make_unique<Tensor>(scratch_buffer->Scratch(padded_in_size),
-                            DataType::DT_FLOAT);
+                            input->dtype());
     padded_in->Resize({batch, in_channels, padded_in_height, padded_in_width});
     PadInput(*input, in_pad_size[0], in_pad_size[2], padded_in.get());
     *padded_input = std::move(padded_in);
@@ -185,7 +184,7 @@ MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context,
     std::unique_ptr<Tensor> padded_out =
         make_unique<Tensor>(scratch_buffer->Scratch(padded_out_size),
-                            DataType::DT_FLOAT);
+                            output->dtype());
     padded_out->Resize({batch, out_channels, padded_out_height,
                         padded_out_width});
     *padded_output = std::move(padded_out);
@@ -206,8 +205,8 @@ void Conv2dBase::PadInput(const Tensor &src,
   const index_t padded_width = dst->dim(3);
   const int pad_bottom = static_cast<int>(padded_height - height - pad_top);
   const int pad_right = static_cast<int>(padded_width - width - pad_left);
-  auto in_data = src.data<float>();
-  auto padded_in_data = dst->mutable_data<float>();
+  auto in_data = src.data<uint8_t>();
+  auto padded_in_data = dst->mutable_data<uint8_t>();
 
   const index_t img_size = height * width;
   const index_t padded_img_size = padded_height * padded_width;
@@ -215,25 +214,26 @@
   for (index_t b = 0; b < batch; ++b) {
     for (index_t c = 0; c < channels; ++c) {
       const index_t bc = b * channels + c;
-      const float *in_base = in_data + bc * img_size;
-      float *padded_in_base = padded_in_data + bc * padded_img_size;
+      const uint8_t *in_base = in_data + bc * img_size * type_size_;
+      uint8_t *padded_in_base =
+          padded_in_data + bc * padded_img_size * type_size_;
 
-      memset(padded_in_base, 0, sizeof(float) * pad_top * padded_width);
-      padded_in_base += pad_top * padded_width;
+      memset(padded_in_base, 0, type_size_ * pad_top * padded_width);
+      padded_in_base += pad_top * padded_width * type_size_;
       for (index_t h = 0; h < height; ++h) {
         memset(padded_in_base,
                0,
-               sizeof(float) * pad_left);
-        memcpy(padded_in_base + pad_left,
+               type_size_ * pad_left);
+        memcpy(padded_in_base + pad_left * type_size_,
                in_base,
-               sizeof(float) * width);
-        memset(padded_in_base + pad_left + width,
+               type_size_ * width);
+        memset(padded_in_base + (pad_left + width) * type_size_,
                0,
-               sizeof(float) * pad_right);
-        in_base += width;
-        padded_in_base += padded_width;
+               type_size_ * pad_right);
+        in_base += width * type_size_;
+        padded_in_base += padded_width * type_size_;
       }
-      memset(padded_in_base, 0, sizeof(float) * pad_bottom * padded_width);
+      memset(padded_in_base, 0, type_size_ * pad_bottom * padded_width);
     }
   }
 }
@@ -247,8 +247,8 @@ void Conv2dBase::UnPadOutput(const Tensor &src, Tensor *dst) {
   const index_t padded_height = src.dim(2);
   const index_t padded_width = src.dim(3);
 
-  auto padded_out_data = src.data<float>();
-  auto out_data = dst->mutable_data<float>();
+  auto padded_out_data = src.data<uint8_t>();
+  auto out_data = dst->mutable_data<uint8_t>();
 
   const index_t img_size = height * width;
   const index_t padded_img_size = padded_height * padded_width;
@@ -256,21 +256,93 @@
   for (index_t b = 0; b < batch; ++b) {
     for (index_t c = 0; c < channels; ++c) {
       const index_t bc = (b * channels + c);
-      float *out_base = out_data + bc * img_size;
-      const float *padded_out_base = padded_out_data + bc * padded_img_size;
+      uint8_t *out_base = out_data + bc * img_size * type_size_;
+      const uint8_t *padded_out_base =
+          padded_out_data + bc * padded_img_size * type_size_;
 
       for (index_t h = 0; h < height; ++h) {
-        memcpy(out_base,
-               padded_out_base,
-               sizeof(float) * width);
-        out_base += width;
-        padded_out_base += padded_width;
+        memcpy(out_base, padded_out_base, type_size_ * width);
+        out_base += width * type_size_;
+        padded_out_base += padded_width * type_size_;
       }  // h
     }  // c
   }  // b
 }
 
-}  // namespace fp32
+ConvComputeParam Conv2dBase::PreWorkAndGetConv2DParam(
+    const OpContext *context, const Tensor *in_tensor, Tensor *out_tensor) {
+  auto &in_shape = in_tensor->shape();
+  auto &out_shape = out_tensor->shape();
+
+  const index_t batch = in_shape[0];
+  const index_t in_channels = in_shape[1];
+  const index_t in_height = in_shape[2];
+  const index_t in_width = in_shape[3];
+  const index_t out_channels = out_shape[1];
+  const index_t out_height = out_shape[2];
+  const index_t out_width = out_shape[3];
+
+  const index_t in_image_size = in_height * in_width;
+  const index_t out_image_size = out_height * out_width;
+  const index_t in_batch_size = in_channels * in_image_size;
+  const index_t out_batch_size = out_channels * out_image_size;
+
+  utils::ThreadPool
+      &thread_pool = context->device()->cpu_runtime()->thread_pool();
+
+  return ConvComputeParam(batch, in_channels, in_height, in_width,
+                          out_channels, out_height, out_width,
+                          in_image_size, out_image_size,
+                          in_batch_size, out_batch_size, &thread_pool);
+}
+
+DepthwiseConvComputeParam Conv2dBase::PreWorkAndGetDepthwiseConv2DParam(
+    const OpContext *context, const Tensor *input,
+    const Tensor *filter, Tensor *output) {
+  std::vector<index_t> out_shape(4);
+  std::vector<int> paddings(2);
+  auto &in_shape = input->shape();
+  auto &filter_shape = filter->shape();
+  CalOutputShapeAndInputPadSize(in_shape, filter_shape, &out_shape, &paddings);
+  out_shape[1] *= filter_shape[1];
+  MACE_CHECK(output->Resize(out_shape) == MaceStatus::MACE_SUCCESS,
+             "Resize failed.");
+  output->Clear();
+
+  const int pad_top = paddings[0] / 2;
+  const int pad_left = paddings[1] / 2;
+
+  const index_t batch = in_shape[0];
+  const index_t in_channels = in_shape[1];
+  const index_t in_height = in_shape[2];
+  const index_t in_width = in_shape[3];
+  const index_t out_channels = out_shape[1];
+  const index_t out_height = out_shape[2];
+  const index_t out_width = out_shape[3];
+
+  const index_t in_image_size = in_height * in_width;
+  const index_t out_image_size = out_height * out_width;
+  const index_t in_batch_size = in_channels * in_image_size;
+  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t multiplier = out_channels / in_channels;
+
+  std::vector<index_t> out_bounds;
+  CalOutputBoundaryWithoutUsingInputPad(out_shape, paddings, &out_bounds);
+  const index_t valid_h_start = out_bounds[0];
+  const index_t valid_h_stop = out_bounds[1];
+  const index_t valid_w_start = out_bounds[2];
+  const index_t valid_w_stop = out_bounds[3];
+
+  utils::ThreadPool
+      &thread_pool = context->device()->cpu_runtime()->thread_pool();
+
+  return DepthwiseConvComputeParam(
+      batch, in_channels, in_height, in_width, out_channels, out_height,
+      out_width, in_image_size, out_image_size, in_batch_size, out_batch_size,
+      &thread_pool, pad_top, pad_left, multiplier, valid_h_start, valid_h_stop,
+      valid_w_start, valid_w_stop);
+}
+
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace
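PadInput and UnPadOutput now address tensors as raw bytes (uint8_t plus a type_size_ stride), so a single implementation serves float, fp16, or bf16 data. For illustration, the per-row pattern reduced to a self-contained helper:

    #include <cstdint>
    #include <cstring>

    // Copy one image row into a zero-padded destination row,
    // counting in bytes rather than elements.
    void CopyRowWithPad(const uint8_t *src, uint8_t *dst, int width,
                        int pad_left, int pad_right, int type_size) {
      std::memset(dst, 0, type_size * pad_left);            // left border
      std::memcpy(dst + pad_left * type_size, src,
                  type_size * width);                       // row payload
      std::memset(dst + (pad_left + width) * type_size, 0,
                  type_size * pad_right);                   // right border
    }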
diff --git a/mace/ops/arm/base/conv_2d.h b/mace/ops/arm/base/conv_2d.h
new file mode 100644
index 0000000000000000000000000000000000000000..e1cd0947031952aecdd9799653eff7a1e4989679
--- /dev/null
+++ b/mace/ops/arm/base/conv_2d.h
@@ -0,0 +1,159 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_BASE_CONV_2D_H_
+#define MACE_OPS_ARM_BASE_CONV_2D_H_
+
+#include <memory>
+#include <vector>
+
+#include "mace/core/ops/op_context.h"
+#include "mace/core/tensor.h"
+#include "mace/ops/arm/base/gemm.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
+#include "mace/ops/delegator/conv_2d.h"
+#include "mace/public/mace.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+struct ConvComputeParam {
+  const index_t batch;
+  const index_t in_channels;
+  const index_t in_height;
+  const index_t in_width;
+  const index_t out_channels;
+  const index_t out_height;
+  const index_t out_width;
+
+  const index_t in_image_size;
+  const index_t out_image_size;
+  const index_t in_batch_size;
+  const index_t out_batch_size;
+
+  utils::ThreadPool &thread_pool;
+
+  ConvComputeParam(const index_t b,
+                   const index_t in_c,
+                   const index_t in_h,
+                   const index_t in_w,
+                   const index_t out_c,
+                   const index_t out_h,
+                   const index_t out_w,
+                   const index_t in_size,
+                   const index_t out_size,
+                   const index_t in_b_size,
+                   const index_t out_b_size,
+                   utils::ThreadPool *thrd_pool)
+      : batch(b), in_channels(in_c), in_height(in_h), in_width(in_w),
+        out_channels(out_c), out_height(out_h), out_width(out_w),
+        in_image_size(in_size), out_image_size(out_size),
+        in_batch_size(in_b_size), out_batch_size(out_b_size),
+        thread_pool(*thrd_pool) {}
+};
+
+struct DepthwiseConvComputeParam : public ConvComputeParam {
+  const int pad_top;
+  const int pad_left;
+  const index_t multiplier;
+  const index_t valid_h_start;
+  const index_t valid_h_stop;
+  const index_t valid_w_start;
+  const index_t valid_w_stop;
+  DepthwiseConvComputeParam(const index_t b,
+                            const index_t in_c,
+                            const index_t in_h,
+                            const index_t in_w,
+                            const index_t out_c,
+                            const index_t out_h,
+                            const index_t out_w,
+                            const index_t in_size,
+                            const index_t out_size,
+                            const index_t in_b_size,
+                            const index_t out_b_size,
+                            utils::ThreadPool *thrd_pool,
+                            const int pad_top_data,
+                            const int pad_left_data,
+                            const index_t multiplier_data,
+                            const index_t valid_height_start,
+                            const index_t valid_height_stop,
+                            const index_t valid_width_start,
+                            const index_t valid_width_stop)
+      : ConvComputeParam(b, in_c, in_h, in_w, out_c, out_h, out_w,
+                         in_size, out_size, in_b_size, out_b_size, thrd_pool),
+        pad_top(pad_top_data), pad_left(pad_left_data),
+        multiplier(multiplier_data),
+        valid_h_start(valid_height_start), valid_h_stop(valid_height_stop),
+        valid_w_start(valid_width_start), valid_w_stop(valid_width_stop) {}
+};
+
+class Conv2dBase : public delegator::Conv2d {
+ public:
+  explicit Conv2dBase(const delegator::Conv2dParam &param, int type_size)
+      : delegator::Conv2d(param), type_size_(type_size) {}
+
+  virtual ~Conv2dBase() = default;
+
+ protected:
+  void CalOutputShapeAndInputPadSize(const std::vector<index_t> &input_shape,
+                                     const std::vector<index_t> &filter_shape,
+                                     std::vector<index_t> *output_shape,
+                                     std::vector<int> *in_pad_size);
+
+  void CalOutputBoundaryWithoutUsingInputPad(const std::vector<index_t>
+                                             &output_shape,
+                                             const std::vector<int>
+                                             in_pad_size,
+                                             std::vector<index_t>
+                                             *out_bound);
+
+  void CalOutputShapeAndPadSize(const Tensor *input,
+                                const Tensor *filter,
+                                const int out_tile_height,
+                                const int out_tile_width,
+                                std::vector<index_t> *output_shape,
+                                std::vector<int> *in_pad_size,
+                                std::vector<int> *out_pad_size);
+
+  MaceStatus ResizeOutAndPadInOut(const OpContext *context,
+                                  const Tensor *input,
+                                  const Tensor *filter,
+                                  Tensor *output,
+                                  const int out_tile_height,
+                                  const int out_tile_width,
+                                  std::unique_ptr<Tensor> *padded_input,
+                                  std::unique_ptr<Tensor> *padded_output);
+
+  void PadInput(const Tensor &src,
+                const int pad_top,
+                const int pad_left,
+                Tensor *dst);
+  void UnPadOutput(const Tensor &src, Tensor *dst);
+
+  ConvComputeParam PreWorkAndGetConv2DParam(
+      const OpContext *context, const Tensor *in_tensor, Tensor *out_tensor);
+  DepthwiseConvComputeParam PreWorkAndGetDepthwiseConv2DParam(
+      const OpContext *context, const Tensor *input,
+      const Tensor *filter, Tensor *output);
+
+ private:
+  int type_size_;
+};
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_BASE_CONV_2D_H_
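ConvComputeParam caches every shape-derived constant once so the leaf kernels only read them. For an assumed 1x64x56x56 -> 1x128x56x56 convolution, the fields work out to:

    in_image_size  = 56 * 56    = 3136    // elements per input channel
    out_image_size = 56 * 56    = 3136    // elements per output channel
    in_batch_size  = 64 * 3136  = 200704  // elements per input image
    out_batch_size = 128 * 3136 = 401408  // elements per output image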
-#include "mace/ops/arm/fp32/conv_2d.h" -#include "mace/ops/arm/fp32/gemm.h" -#include "mace/ops/delegator/conv_2d.h" +#include "mace/ops/arm/base/conv_2d_1x1.h" + +#include namespace mace { namespace ops { namespace arm { -namespace fp32 { - -class Conv2dK1x1 : public Conv2dBase { - public: - explicit Conv2dK1x1(const delegator::Conv2dParam ¶m) - : Conv2dBase(param), - gemm_(delegator::GemmParam()) {} - virtual ~Conv2dK1x1() {} - - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; - - private: - Gemm gemm_; -}; -MaceStatus Conv2dK1x1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { +template +MaceStatus Conv2dK1x1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { index_t batch = input->dim(0); index_t in_height = input->dim(2); index_t in_width = input->dim(3); @@ -50,13 +33,8 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context, std::vector output_shape; std::vector in_pad_size; std::vector out_pad_size; - CalOutputShapeAndPadSize(input, - filter, - 1, - 1, - &output_shape, - &in_pad_size, - &out_pad_size); + CalOutputShapeAndPadSize(input, filter, 1, 1, + &output_shape, &in_pad_size, &out_pad_size); MACE_RETURN_IF_ERROR(output->Resize(output_shape)); const index_t out_channels = output_shape[1]; @@ -70,16 +48,16 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context, in_height != padded_in_height || in_width != padded_in_width; auto scratch_buffer = context->device()->scratch_buffer(); const index_t padded_in_size = is_in_padded ? PadAlignSize( - sizeof(float) * batch * in_channels * padded_in_height + sizeof(T) * batch * in_channels * padded_in_height * padded_in_width) : 0; const index_t pack_filter_size = - PadAlignSize(sizeof(float) * out_channels * in_channels); + PadAlignSize(sizeof(T) * out_channels * in_channels); const index_t pack_input_size = PadAlignSize( - sizeof(float) * in_channels * padded_in_height * padded_in_width); + sizeof(T) * in_channels * padded_in_height * padded_in_width); const index_t pack_output_size = PadAlignSize( - sizeof(float) * out_channels * padded_in_height * padded_in_width); + sizeof(T) * out_channels * padded_in_height * padded_in_width); const index_t gemm_pack_size = pack_filter_size + pack_input_size + pack_output_size; @@ -115,12 +93,11 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context, void RegisterConv2dK1x1Delegator(OpDelegatorRegistry *registry) { MACE_REGISTER_DELEGATOR( - registry, Conv2dK1x1, delegator::Conv2dParam, + registry, Conv2dK1x1, delegator::Conv2dParam, MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, float, ImplType::NEON, K1x1)); } -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/base/conv_2d_1x1.h b/mace/ops/arm/base/conv_2d_1x1.h new file mode 100644 index 0000000000000000000000000000000000000000..197e98e9464e36fb2a24acb0103ede98605e4a59 --- /dev/null +++ b/mace/ops/arm/base/conv_2d_1x1.h @@ -0,0 +1,47 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
diff --git a/mace/ops/arm/base/conv_2d_1x1.h b/mace/ops/arm/base/conv_2d_1x1.h
new file mode 100644
index 0000000000000000000000000000000000000000..197e98e9464e36fb2a24acb0103ede98605e4a59
--- /dev/null
+++ b/mace/ops/arm/base/conv_2d_1x1.h
@@ -0,0 +1,47 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_BASE_CONV_2D_1X1_H_
+#define MACE_OPS_ARM_BASE_CONV_2D_1X1_H_
+
+#include "mace/ops/arm/base/conv_2d.h"
+#include "mace/ops/arm/base/gemm.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+class Conv2dK1x1 : public Conv2dBase {
+ public:
+  explicit Conv2dK1x1(const delegator::Conv2dParam &param)
+      : Conv2dBase(param, sizeof(T)),
+        gemm_(delegator::GemmParam()) {}
+  virtual ~Conv2dK1x1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output) override;
+
+ private:
+  Gemm<T> gemm_;
+};
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_BASE_CONV_2D_1X1_H_
diff --git a/mace/ops/arm/base/conv_2d_1xn.cc b/mace/ops/arm/base/conv_2d_1xn.cc
new file mode 100644
index 0000000000000000000000000000000000000000..417bec8a53376378654c34c2932e18424d811efb
--- /dev/null
+++ b/mace/ops/arm/base/conv_2d_1xn.cc
@@ -0,0 +1,45 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/base/conv_2d_1xn.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+void RegisterConv2dK1xNDelegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, Conv2dK1x7S1<float>, delegator::Conv2dParam,
+      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K1x7S1));
+
+  MACE_REGISTER_DELEGATOR(
+      registry, Conv2dK7x1S1<float>, delegator::Conv2dParam,
+      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K7x1S1));
+
+  MACE_REGISTER_DELEGATOR(
+      registry, Conv2dK1x15S1<float>, delegator::Conv2dParam,
+      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K1x15S1));
+
+  MACE_REGISTER_DELEGATOR(
+      registry, Conv2dK15x1S1<float>, delegator::Conv2dParam,
+      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K15x1S1));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/arm/fp32/conv_2d_1xn.h b/mace/ops/arm/base/conv_2d_1xn.h
similarity index 50%
rename from mace/ops/arm/fp32/conv_2d_1xn.h
rename to mace/ops/arm/base/conv_2d_1xn.h
index c0a6da637e3ecffd74da458c71730a8646e365c3..ef18e0f7aacfc5f97dab15b70b39b71e8e4870cc 100644
--- a/mace/ops/arm/fp32/conv_2d_1xn.h
+++ b/mace/ops/arm/base/conv_2d_1xn.h
@@ -1,4 +1,4 @@
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,76 +12,66 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
-#define MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
+#ifndef MACE_OPS_ARM_BASE_CONV_2D_1XN_H_
+#define MACE_OPS_ARM_BASE_CONV_2D_1XN_H_
 
 #include <vector>
 
 #include "mace/core/ops/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/arm/fp32/conv_2d.h"
+#include "mace/ops/arm/base/conv_2d_mxn.h"
 #include "mace/public/mace.h"
 
 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {
 
-class Conv2dK1x7S1 : public Conv2dBase {
+template<typename T>
+class Conv2dK1x7S1 : public Conv2dKMxN<T> {
  public:
   explicit Conv2dK1x7S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
+      : Conv2dKMxN<T>(param, 1, 4) {}
   virtual ~Conv2dK1x7S1() {}
 
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
+  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
+                       const T *input_data, T *output_data) override;
 };
 
-class Conv2dK7x1S1 : public Conv2dBase {
+template<typename T>
+class Conv2dK7x1S1 : public Conv2dKMxN<T> {
  public:
   explicit Conv2dK7x1S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
+      : Conv2dKMxN<T>(param, 4, 1) {}
   virtual ~Conv2dK7x1S1() {}
 
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
+  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
+                       const T *input_data, T *output_data) override;
 };
 
-class Conv2dK1x15S1 : public Conv2dBase {
+template<typename T>
+class Conv2dK1x15S1 : public Conv2dKMxN<T> {
  public:
   explicit Conv2dK1x15S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
+      : Conv2dKMxN<T>(param, 1, 4) {}
   virtual ~Conv2dK1x15S1() {}
 
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
+  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
+                       const T *input_data, T *output_data) override;
 };
 
-class Conv2dK15x1S1 : public Conv2dBase {
+template<typename T>
+class Conv2dK15x1S1 : public Conv2dKMxN<T> {
  public:
   explicit Conv2dK15x1S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
+      : Conv2dKMxN<T>(param, 4, 1) {}
   virtual ~Conv2dK15x1S1() {}
 
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
+  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
+                       const T *input_data, T *output_data) override;
 };
 
-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
+#endif  // MACE_OPS_ARM_BASE_CONV_2D_1XN_H_
+ +#include "mace/ops/arm/base/conv_2d_3x3.h" + +namespace mace { +namespace ops { +namespace arm { + +void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Conv2dK3x3S1, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S1)); + MACE_REGISTER_DELEGATOR( + registry, Conv2dK3x3S2, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S2)); +} + +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/conv_2d_3x3.h b/mace/ops/arm/base/conv_2d_3x3.h similarity index 58% rename from mace/ops/arm/fp32/conv_2d_3x3.h rename to mace/ops/arm/base/conv_2d_3x3.h index e64d061e3e6103f78901c144d9866d047e8dfc96..9aaf66f0b7dc84e95d204b90c9aab2ee658eedec 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3.h +++ b/mace/ops/arm/base/conv_2d_3x3.h @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,50 +12,44 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_ARM_FP32_CONV_2D_3X3_H_ -#define MACE_OPS_ARM_FP32_CONV_2D_3X3_H_ +#ifndef MACE_OPS_ARM_BASE_CONV_2D_3X3_H_ +#define MACE_OPS_ARM_BASE_CONV_2D_3X3_H_ #include #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/arm/fp32/conv_2d.h" +#include "mace/ops/arm/base/conv_2d_mxn.h" #include "mace/public/mace.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { -class Conv2dK3x3S1 : public Conv2dBase { +template +class Conv2dK3x3S1 : public Conv2dKMxN { public: explicit Conv2dK3x3S1(const delegator::Conv2dParam ¶m) - : Conv2dBase(param) {} + : Conv2dKMxN(param, 2, 4) {} virtual ~Conv2dK3x3S1() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; + MaceStatus DoCompute(const ConvComputeParam &p, const T *filter, + const T *input_data, T *output_data) override; }; -class Conv2dK3x3S2 : public Conv2dBase { +template +class Conv2dK3x3S2 : public Conv2dKMxN { public: explicit Conv2dK3x3S2(const delegator::Conv2dParam ¶m) - : Conv2dBase(param) {} + : Conv2dKMxN(param, 1, 4) {} virtual ~Conv2dK3x3S2() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; + MaceStatus DoCompute(const ConvComputeParam &p, const T *filter, + const T *input_data, T *output_data) override; }; -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_FP32_CONV_2D_3X3_H_ +#endif // MACE_OPS_ARM_BASE_CONV_2D_3X3_H_ diff --git a/mace/ops/arm/base/conv_2d_5x5.cc b/mace/ops/arm/base/conv_2d_5x5.cc new file mode 100644 index 0000000000000000000000000000000000000000..5db15881552dd2e044353ee118080afd9fc6b54f --- /dev/null +++ b/mace/ops/arm/base/conv_2d_5x5.cc @@ -0,0 +1,30 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
diff --git a/mace/ops/arm/base/conv_2d_5x5.cc b/mace/ops/arm/base/conv_2d_5x5.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5db15881552dd2e044353ee118080afd9fc6b54f
--- /dev/null
+++ b/mace/ops/arm/base/conv_2d_5x5.cc
@@ -0,0 +1,30 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/base/conv_2d_5x5.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+void RegisterConv2dK5x5Delegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, Conv2dK5x5S1<float>, delegator::Conv2dParam,
+      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K5x5S1));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/arm/base/conv_2d_5x5.h b/mace/ops/arm/base/conv_2d_5x5.h
new file mode 100644
index 0000000000000000000000000000000000000000..1528927e39f77a64c673da8c14a6fd1724fa98ac
--- /dev/null
+++ b/mace/ops/arm/base/conv_2d_5x5.h
@@ -0,0 +1,44 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_BASE_CONV_2D_5X5_H_
+#define MACE_OPS_ARM_BASE_CONV_2D_5X5_H_
+
+#include <vector>
+
+#include "mace/core/ops/op_context.h"
+#include "mace/core/tensor.h"
+#include "mace/ops/arm/base/conv_2d_mxn.h"
+#include "mace/public/mace.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+class Conv2dK5x5S1 : public Conv2dKMxN<T> {
+ public:
+  explicit Conv2dK5x5S1(const delegator::Conv2dParam &param)
+      : Conv2dKMxN<T>(param, 1, 4) {}
+  virtual ~Conv2dK5x5S1() {}
+
+  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
+                       const T *input_data, T *output_data) override;
+};
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_BASE_CONV_2D_5X5_H_
+ +#include "mace/ops/arm/base/conv_2d_7x7.h" + +namespace mace { +namespace ops { +namespace arm { + +void RegisterConv2dK7x7Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Conv2dK7x7S1, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float, ImplType::NEON, K7x7S1)); + MACE_REGISTER_DELEGATOR( + registry, Conv2dK7x7S2, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float, ImplType::NEON, K7x7S2)); + MACE_REGISTER_DELEGATOR( + registry, Conv2dK7x7S3, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float, ImplType::NEON, K7x7S3)); +} + +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/conv_2d_7x7.h b/mace/ops/arm/base/conv_2d_7x7.h similarity index 53% rename from mace/ops/arm/fp32/conv_2d_7x7.h rename to mace/ops/arm/base/conv_2d_7x7.h index 0d0467fc5b38a354bab744503dafbe28b5f180f3..f9b8374287000e7ab391f522b4b447daf65faba3 100644 --- a/mace/ops/arm/fp32/conv_2d_7x7.h +++ b/mace/ops/arm/base/conv_2d_7x7.h @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,63 +12,55 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_ARM_FP32_CONV_2D_7X7_H_ -#define MACE_OPS_ARM_FP32_CONV_2D_7X7_H_ +#ifndef MACE_OPS_ARM_BASE_CONV_2D_7X7_H_ +#define MACE_OPS_ARM_BASE_CONV_2D_7X7_H_ #include #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/arm/fp32/conv_2d.h" +#include "mace/ops/arm/base/conv_2d_mxn.h" #include "mace/public/mace.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { -class Conv2dK7x7S1 : public Conv2dBase { +template +class Conv2dK7x7S1 : public Conv2dKMxN { public: explicit Conv2dK7x7S1(const delegator::Conv2dParam ¶m) - : Conv2dBase(param) {} + : Conv2dKMxN(param, 1, 4) {} virtual ~Conv2dK7x7S1() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; + MaceStatus DoCompute(const ConvComputeParam &p, const T *filter, + const T *input_data, T *output_data) override; }; -class Conv2dK7x7S2 : public Conv2dBase { +template +class Conv2dK7x7S2 : public Conv2dKMxN { public: explicit Conv2dK7x7S2(const delegator::Conv2dParam ¶m) - : Conv2dBase(param) {} + : Conv2dKMxN(param, 1, 4) {} virtual ~Conv2dK7x7S2() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; + MaceStatus DoCompute(const ConvComputeParam &p, const T *filter, + const T *input_data, T *output_data) override; }; -class Conv2dK7x7S3 : public Conv2dBase { +template +class Conv2dK7x7S3 : public Conv2dKMxN { public: explicit Conv2dK7x7S3(const delegator::Conv2dParam ¶m) - : Conv2dBase(param) {} + : Conv2dKMxN(param, 1, 4) {} virtual ~Conv2dK7x7S3() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; + MaceStatus DoCompute(const ConvComputeParam &p, const T *filter, + const T *input_data, T *output_data) override; }; -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_FP32_CONV_2D_7X7_H_ +#endif // MACE_OPS_ARM_BASE_CONV_2D_7X7_H_ diff --git 
diff --git a/mace/ops/arm/base/conv_2d_general.cc b/mace/ops/arm/base/conv_2d_general.cc
new file mode 100644
index 0000000000000000000000000000000000000000..04121b8c003c47d4596c96586ccb3071aeeae171
--- /dev/null
+++ b/mace/ops/arm/base/conv_2d_general.cc
@@ -0,0 +1,68 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/base/conv_2d_general.h"
+
+#include <memory>
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+MaceStatus Conv2dGeneral<T>::Compute(const OpContext *context,
+                                     const Tensor *input,
+                                     const Tensor *filter,
+                                     Tensor *output) {
+  std::unique_ptr<Tensor> padded_input;
+  std::unique_ptr<Tensor> padded_output;
+  ResizeOutAndPadInOut(context, input, filter, output, 1, 4,
+                       &padded_input, &padded_output);
+  const Tensor *in_tensor = input;
+  if (padded_input != nullptr) {
+    in_tensor = padded_input.get();
+  }
+  Tensor *out_tensor = output;
+  if (padded_output != nullptr) {
+    out_tensor = padded_output.get();
+  }
+  out_tensor->Clear();
+
+  Tensor::MappingGuard in_guard(input);
+  Tensor::MappingGuard filter_guard(filter);
+  Tensor::MappingGuard out_guard(output);
+
+  const T *filter_data = filter->data<T>();
+  const T *input_data = in_tensor->data<T>();
+  T *output_data = out_tensor->mutable_data<T>();
+
+  const ConvComputeParam p =
+      PreWorkAndGetConv2DParam(context, in_tensor, out_tensor);
+  auto &filter_shape = filter->shape();
+
+  DoCompute(p, filter_data, input_data, output_data, filter_shape);
+
+  UnPadOutput(*out_tensor, output);
+  return MaceStatus::MACE_SUCCESS;
+}
+
+void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, Conv2dGeneral<float>, delegator::Conv2dParam,
+      MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, float, ImplType::NEON));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/arm/base/conv_2d_general.h b/mace/ops/arm/base/conv_2d_general.h
new file mode 100644
index 0000000000000000000000000000000000000000..f0944d9b1056cb3a6762009c768fb643ef064f2b
--- /dev/null
+++ b/mace/ops/arm/base/conv_2d_general.h
@@ -0,0 +1,50 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_BASE_CONV_2D_GENERAL_H_
+#define MACE_OPS_ARM_BASE_CONV_2D_GENERAL_H_
+
+#include <vector>
+
+#include "mace/core/ops/op_context.h"
+#include "mace/core/tensor.h"
+#include "mace/ops/arm/base/conv_2d.h"
+#include "mace/public/mace.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+class Conv2dGeneral : public Conv2dBase {
+ public:
+  explicit Conv2dGeneral(const delegator::Conv2dParam &param)
+      : Conv2dBase(param, sizeof(T)) {}
+  virtual ~Conv2dGeneral() {}
+
+  MaceStatus Compute(const OpContext *context, const Tensor *input,
+                     const Tensor *filter, Tensor *output) override;
+
+ protected:
+  MaceStatus DoCompute(
+      const ConvComputeParam &p, const T *filter_data,
+      const T *input_data, T *output_data,
+      const std::vector<index_t> &filter_shape);
+};
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_BASE_CONV_2D_GENERAL_H_
diff --git a/mace/ops/arm/base/conv_2d_mxn.h b/mace/ops/arm/base/conv_2d_mxn.h
new file mode 100644
index 0000000000000000000000000000000000000000..0941cfa71f1f3513612b5b45d8448f23e4b19d51
--- /dev/null
+++ b/mace/ops/arm/base/conv_2d_mxn.h
@@ -0,0 +1,85 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_BASE_CONV_2D_MXN_H_
+#define MACE_OPS_ARM_BASE_CONV_2D_MXN_H_
+
+#include <memory>
+#include <vector>
+
+#include "mace/core/ops/op_context.h"
+#include "mace/core/tensor.h"
+#include "mace/ops/arm/base/conv_2d.h"
+#include "mace/public/mace.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+class Conv2dKMxN : public Conv2dBase {
+ public:
+  explicit Conv2dKMxN(const delegator::Conv2dParam &param,
+                      const int tile_h, const int tile_w)
+      : Conv2dBase(param, sizeof(T)),
+        out_tile_h_(tile_h), out_tile_w_(tile_w) {}
+
+  virtual ~Conv2dKMxN() {}
+
+  MaceStatus Compute(const OpContext *context, const Tensor *input,
+                     const Tensor *filter, Tensor *output) override {
+    std::unique_ptr<Tensor> padded_input;
+    std::unique_ptr<Tensor> padded_output;
+    ResizeOutAndPadInOut(context, input, filter, output, out_tile_h_,
+                         out_tile_w_, &padded_input, &padded_output);
+    const Tensor *in_tensor = input;
+    if (padded_input != nullptr) {
+      in_tensor = padded_input.get();
+    }
+    Tensor *out_tensor = output;
+    if (padded_output != nullptr) {
+      out_tensor = padded_output.get();
+    }
+    out_tensor->Clear();
+
+    Tensor::MappingGuard in_guard(input);
+    Tensor::MappingGuard filter_guard(filter);
+    Tensor::MappingGuard out_guard(output);
+
+    const T *filter_data = filter->data<T>();
+    const T *input_data = in_tensor->data<T>();
+    T *output_data = out_tensor->mutable_data<T>();
+
+    const ConvComputeParam p =
+        PreWorkAndGetConv2DParam(context, in_tensor, out_tensor);
+
+    DoCompute(p, filter_data, input_data, output_data);
+
+    UnPadOutput(*out_tensor, output);
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+  virtual MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
+                               const T *input_data, T *output_data) = 0;
+
+ private:
+  const int out_tile_h_;
+  const int out_tile_w_;
+};
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_BASE_CONV_2D_MXN_H_
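Conv2dKMxN is a template-method base: the non-virtual Compute() above fixes the resize/pad, map, DoCompute, unpad sequence, and each kernel supplies only the tiled arithmetic. Adding another tiled kernel then reduces to a declaration plus per-type DoCompute definitions. A hypothetical 9x9/stride-1 kernel, assuming the same 1x4 output tile as K7x7S1:

    template<typename T>
    class Conv2dK9x9S1 : public Conv2dKMxN<T> {
     public:
      explicit Conv2dK9x9S1(const delegator::Conv2dParam &param)
          : Conv2dKMxN<T>(param, 1, 4) {}  // out_tile_h = 1, out_tile_w = 4
      virtual ~Conv2dK9x9S1() {}

      // Only the inner arithmetic is kernel-specific.
      MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
                           const T *input_data, T *output_data) override;
    };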
diff --git a/mace/ops/arm/fp32/deconv_2d.cc b/mace/ops/arm/base/deconv_2d.cc
similarity index 53%
rename from mace/ops/arm/fp32/deconv_2d.cc
rename to mace/ops/arm/base/deconv_2d.cc
index 41a01a6ca3c653e3412c6c1f27403c0d4c04bd11..1fc14db618cef8468a2e6b6c16c582bae2891afb 100644
--- a/mace/ops/arm/fp32/deconv_2d.cc
+++ b/mace/ops/arm/base/deconv_2d.cc
@@ -1,4 +1,4 @@
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,17 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/ops/arm/fp32/deconv_2d.h"
+#include "mace/ops/arm/base/deconv_2d.h"
 
-#include <algorithm>
 #include <functional>
-#include "mace/utils/memory.h"
+#include <utility>
+
 #include "mace/ops/common/conv_pool_2d_util.h"
+#include "mace/utils/memory.h"
 
 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {
 
 MaceStatus Deconv2dBase::ResizeOutAndPadOut(
     const OpContext *context,
@@ -67,7 +67,7 @@ MaceStatus Deconv2dBase::ResizeOutAndPadOut(
       std::accumulate(padded_out_shape.begin(),
                       padded_out_shape.end(),
                       1,
-                      std::multiplies<index_t>()) * sizeof(float);
+                      std::multiplies<index_t>()) * type_size_;
   ScratchBuffer *scratch = context->device()->scratch_buffer();
   scratch->Rewind();
   index_t scratch_size = PadAlignSize(padded_out_size);
@@ -75,7 +75,7 @@ MaceStatus Deconv2dBase::ResizeOutAndPadOut(
 
     std::unique_ptr<Tensor> padded_out
-        (make_unique<Tensor>(scratch->Scratch(scratch_size), DT_FLOAT));
+        (make_unique<Tensor>(scratch->Scratch(scratch_size), output->dtype()));
     padded_out->Reshape(padded_out_shape);
     *padded_output = std::move(padded_out);
   }
@@ -97,24 +97,97 @@ void Deconv2dBase::UnPadOutput(const Tensor &src,
   const index_t padded_height = src.dim(2);
   const index_t padded_width = src.dim(3);
 
-  auto padded_out_data = src.data<float>();
-  auto out_data = dst->mutable_data<float>();
+  auto padded_out_data = src.data<uint8_t>();
+  auto out_data = dst->mutable_data<uint8_t>();
 
   for (index_t i = 0; i < batch; ++i) {
     for (index_t j = 0; j < channels; ++j) {
       for (index_t k = 0; k < height; ++k) {
-        const float *input_base =
+        const uint8_t *input_base =
             padded_out_data + ((i * channels + j) * padded_height
-                + (k + pad_h)) * padded_width;
-        float *output_base =
-            out_data + ((i * channels + j) * height + k) * width;
-        memcpy(output_base, input_base + pad_w, width * sizeof(float));
+                + (k + pad_h)) * padded_width * type_size_;
+        uint8_t *output_base =
+            out_data + ((i * channels + j) * height + k) * width * type_size_;
+        memcpy(output_base,
+               input_base + pad_w * type_size_,
+               width * type_size_);
       }
     }
   }
 }
 
-}  // namespace fp32
+DeconvComputeParam Deconv2dBase::PreWorkAndGetDeconvParam(
+    const OpContext *context, const Tensor *input, Tensor *out_tensor) {
+
+  auto &in_shape = input->shape();
+  auto &out_shape = out_tensor->shape();
+
+  const index_t batch = in_shape[0];
+  const index_t inch = in_shape[1];
+  const index_t h = in_shape[2];
+  const index_t w = in_shape[3];
+
+  const index_t outch = out_shape[1];
+  const index_t outh = out_shape[2];
+  const index_t outw = out_shape[3];
+  const index_t out_img_size = outh * outw;
+
+  utils::ThreadPool
+      &thread_pool = context->device()->cpu_runtime()->thread_pool();
+
+  return DeconvComputeParam(batch, inch, h, w, outch, outh, outw,
+                            out_img_size, &thread_pool);
+}
+
+DepthwiseDeconvComputeParam Deconv2dBase::PreWorkAndGetDepthwiseDeconvParam(
+    const OpContext *context, const Tensor *input, Tensor *out_tensor) {
+  auto &in_shape = input->shape();
+  auto &out_shape = out_tensor->shape();
+
+  const index_t batch = in_shape[0];
+  const index_t channels = in_shape[1];
+  const index_t h = in_shape[2];
+  const index_t w = in_shape[3];
+  const index_t in_img_size = h * w;
+  const index_t outh = out_shape[2];
+  const index_t outw = out_shape[3];
+  const index_t out_img_size = outh * outw;
+
+  utils::ThreadPool
+      &thread_pool = context->device()->cpu_runtime()->thread_pool();
+
+  return DepthwiseDeconvComputeParam(batch, channels, h, w, in_img_size,
+                                     outh, outw, out_img_size, &thread_pool);
+}
+
+GroupDeconvComputeParam Deconv2dBase::PreWorkAndGetGroupDeconvParam(
+    const OpContext *context, const Tensor *input, Tensor *out_tensor) {
+  auto &in_shape = input->shape();
+  auto &out_shape = out_tensor->shape();
+
+  const index_t batch = in_shape[0];
+  const index_t inch = in_shape[1];
+  const index_t h = in_shape[2];
+  const index_t w = in_shape[3];
+
+  const index_t outch = out_shape[1];
+  const index_t outh = out_shape[2];
+  const index_t outw = out_shape[3];
+
+  const index_t in_img_size = h * w;
+  const index_t out_img_size = outh * outw;
+
+  const index_t inch_g = inch / group_;
+  const index_t outch_g = outch / group_;
+
+  utils::ThreadPool
+      &thread_pool = context->device()->cpu_runtime()->thread_pool();
+
+  return GroupDeconvComputeParam(batch, inch, h, w, outch, outh, outw,
+                                 in_img_size, out_img_size, inch_g,
+                                 outch_g, &thread_pool);
+}
+
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace
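The deconv scratch buffer follows the same byte-based sizing: the padded output is accumulate(padded_out_shape) * type_size_ bytes instead of a hard-coded sizeof(float). Worked example for an assumed padded output shape of {1, 32, 130, 130}:

    float (4-byte): 1 * 32 * 130 * 130 * 4 = 2,163,200 bytes
    bf16  (2-byte): 1 * 32 * 130 * 130 * 2 = 1,081,600 bytes

plus PadAlignSize() rounding.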
+ +#ifndef MACE_OPS_ARM_BASE_DECONV_2D_H_ +#define MACE_OPS_ARM_BASE_DECONV_2D_H_ + +#include +#include + +#include "mace/core/ops/op_context.h" +#include "mace/core/tensor.h" +#include "mace/core/types.h" +#include "mace/ops/arm/base/gemm.h" +#include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/ops/delegator/deconv_2d.h" +#include "mace/public/mace.h" + +namespace mace { +namespace ops { +namespace arm { + +struct DeconvComputeParam { + const index_t batch; + const index_t in_channels; + const index_t in_height; + const index_t in_width; + const index_t out_channels; + const index_t out_height; + const index_t out_width; + const index_t out_img_size; + + utils::ThreadPool &thread_pool; + + DeconvComputeParam(const index_t b, + const index_t in_c, + const index_t in_h, + const index_t in_w, + const index_t out_c, + const index_t out_h, + const index_t out_w, + const index_t out_size, + utils::ThreadPool *thrd_pool) + : batch(b), in_channels(in_c), in_height(in_h), in_width(in_w), + out_channels(out_c), out_height(out_h), out_width(out_w), + out_img_size(out_size), thread_pool(*thrd_pool) {} +}; + +struct DepthwiseDeconvComputeParam { + const index_t batch; + const index_t in_channels; + const index_t in_height; + const index_t in_width; + const index_t in_img_size; + const index_t out_height; + const index_t out_width; + const index_t out_img_size; + utils::ThreadPool &thread_pool; + + DepthwiseDeconvComputeParam(const index_t b, + const index_t in_c, + const index_t in_h, + const index_t in_w, + const index_t in_size, + const index_t out_h, + const index_t out_w, + const index_t out_size, + utils::ThreadPool *thrd_pool) + : batch(b), + in_channels(in_c), + in_height(in_h), + in_width(in_w), + in_img_size(in_size), + out_height(out_h), + out_width(out_w), + out_img_size(out_size), + thread_pool(*thrd_pool) {} +}; + +struct GroupDeconvComputeParam { + const index_t batch; + const index_t in_channels; + const index_t in_height; + const index_t in_width; + + const index_t out_channels; + const index_t out_height; + const index_t out_width; + + const index_t in_img_size; + const index_t out_img_size; + + const index_t inch_g; + const index_t outch_g; + utils::ThreadPool &thread_pool; + + GroupDeconvComputeParam(const index_t in_b, + const index_t in_ch, + const index_t in_h, + const index_t in_w, + const index_t out_ch, + const index_t out_h, + const index_t out_w, + const index_t in_size, + const index_t out_size, + const index_t in_ch_g, + const index_t out_ch_g, + utils::ThreadPool *thrd_pool) + : batch(in_b), + in_channels(in_ch), + in_height(in_h), + in_width(in_w), + out_channels(out_ch), + out_height(out_h), + out_width(out_w), + in_img_size(in_size), + out_img_size(out_size), + inch_g(in_ch_g), + outch_g(out_ch_g), + thread_pool(*thrd_pool) {} +}; + +class Deconv2dBase : public delegator::Deconv2d { + public: + explicit Deconv2dBase(const delegator::Deconv2dParam ¶m, int type_size) + : delegator::Deconv2d(param), + group_(param.group_), type_size_(type_size) {} + + virtual ~Deconv2dBase() = default; + + protected: + MaceStatus ResizeOutAndPadOut(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output, + std::vector *out_pad_size, + std::unique_ptr *padded_output); + + void UnPadOutput(const Tensor &src, + const std::vector &out_pad_size, + Tensor *dst); + + DeconvComputeParam PreWorkAndGetDeconvParam( + const OpContext *context, const Tensor *input, Tensor *out_tensor); + DepthwiseDeconvComputeParam 
PreWorkAndGetDepthwiseDeconvParam( + const OpContext *context, const Tensor *input, Tensor *out_tensor); + GroupDeconvComputeParam PreWorkAndGetGroupDeconvParam( + const OpContext *context, const Tensor *input, Tensor *out_tensor); + + protected: + index_t group_; + + private: + int type_size_; +}; + +} // namespace arm +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_BASE_DECONV_2D_H_ diff --git a/mace/ops/arm/base/deconv_2d_2x2.cc b/mace/ops/arm/base/deconv_2d_2x2.cc new file mode 100644 index 0000000000000000000000000000000000000000..d8181034b38835401c9c2047bc3d661311a9b90b --- /dev/null +++ b/mace/ops/arm/base/deconv_2d_2x2.cc @@ -0,0 +1,34 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/arm/base/deconv_2d_2x2.h" + +namespace mace { +namespace ops { +namespace arm { + +void RegisterDeconv2dK2x2Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Deconv2dK2x2S1, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, + float, ImplType::NEON, K2x2S1)); + MACE_REGISTER_DELEGATOR( + registry, Deconv2dK2x2S2, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, + float, ImplType::NEON, K2x2S2)); +} + +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/deconv_2d_2x2.h b/mace/ops/arm/base/deconv_2d_2x2.h similarity index 58% rename from mace/ops/arm/fp32/deconv_2d_2x2.h rename to mace/ops/arm/base/deconv_2d_2x2.h index 6fd533444a2e1a1e910c2d527987112940ddb4cc..6d1a416a114f3771f5ed1d120b3be1542b7aa1c2 100644 --- a/mace/ops/arm/fp32/deconv_2d_2x2.h +++ b/mace/ops/arm/base/deconv_2d_2x2.h @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
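
RegisterDeconv2dK2x2Delegator above binds each kernel to a composite key built from op name, device, data type, impl type, and a kernel tag such as K2x2S1. A simplified, self-contained sketch of that kind of keyed factory (the types and names below are illustrative stand-ins, not the actual MACE declarations):

#include <functional>
#include <map>
#include <memory>
#include <string>

struct Delegator {};  // stand-in for the delegator base class
using Creator = std::function<std::unique_ptr<Delegator>()>;

class MiniRegistry {
 public:
  void Register(const std::string &key, Creator creator) {
    registry_.emplace(key, std::move(creator));
  }
  Creator GetCreator(const std::string &key) const {
    auto it = registry_.find(key);
    return it != registry_.end() ? it->second : Creator();  // empty if absent
  }
 private:
  std::map<std::string, Creator> registry_;
};
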
-#ifndef MACE_OPS_ARM_FP32_DECONV_2D_2X2_H_ -#define MACE_OPS_ARM_FP32_DECONV_2D_2X2_H_ +#ifndef MACE_OPS_ARM_BASE_DECONV_2D_2X2_H_ +#define MACE_OPS_ARM_BASE_DECONV_2D_2X2_H_ #include #include @@ -21,46 +21,38 @@ #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/types.h" -#include "mace/ops/arm/fp32/deconv_2d.h" +#include "mace/ops/arm/base/deconv_2d_mxn.h" #include "mace/ops/common/conv_pool_2d_util.h" #include "mace/public/mace.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { -class Deconv2dK2x2S1 : public Deconv2dBase { +template +class Deconv2dK2x2S1 : public Deconv2dKMxN { public: explicit Deconv2dK2x2S1(const delegator::Deconv2dParam ¶m) - : Deconv2dBase(param) {} + : Deconv2dKMxN(param) {} virtual ~Deconv2dK2x2S1() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -class Deconv2dK2x2S2 : public Deconv2dBase { +template +class Deconv2dK2x2S2 : public Deconv2dKMxN { public: explicit Deconv2dK2x2S2(const delegator::Deconv2dParam ¶m) - : Deconv2dBase(param) {} + : Deconv2dKMxN(param) {} virtual ~Deconv2dK2x2S2() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_FP32_DECONV_2D_2X2_H_ +#endif // MACE_OPS_ARM_BASE_DECONV_2D_2X2_H_ diff --git a/mace/ops/arm/base/deconv_2d_3x3.cc b/mace/ops/arm/base/deconv_2d_3x3.cc new file mode 100644 index 0000000000000000000000000000000000000000..05d936c29b793d326d8d30e845a3df94c3a5dec4 --- /dev/null +++ b/mace/ops/arm/base/deconv_2d_3x3.cc @@ -0,0 +1,34 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
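
Note how the 2x2 kernels above no longer override the five-argument Compute; each implements only DoCompute, with the tensor plumbing hoisted into Deconv2dKMxN. The resulting template-method shape, in miniature (illustrative names, not the MACE classes):

struct Param { int n = 0; };  // stand-in for DeconvComputeParam

class KernelBase {
 public:
  virtual ~KernelBase() = default;
  int Compute() {         // shared path: resize, map, clear, unpad
    Param p;              // bounds and thread pool computed once
    return DoCompute(p);  // only the hot loop is kernel-specific
  }
 protected:
  virtual int DoCompute(const Param &p) = 0;
};

class K2x2S1 : public KernelBase {
 protected:
  int DoCompute(const Param &p) override { return p.n; }  // NEON loop here
};
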
+ +#include "mace/ops/arm/base/deconv_2d_3x3.h" + +namespace mace { +namespace ops { +namespace arm { + +void RegisterDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Deconv2dK3x3S1, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S1)); + MACE_REGISTER_DELEGATOR( + registry, Deconv2dK3x3S2, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S2)); +} + +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/deconv_2d_3x3.h b/mace/ops/arm/base/deconv_2d_3x3.h similarity index 58% rename from mace/ops/arm/fp32/deconv_2d_3x3.h rename to mace/ops/arm/base/deconv_2d_3x3.h index 65cc23e6f365d9809d983c94bc12855760046a17..00b33b429a28af55f802dedc85d67b65c614d82e 100644 --- a/mace/ops/arm/fp32/deconv_2d_3x3.h +++ b/mace/ops/arm/base/deconv_2d_3x3.h @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_ARM_FP32_DECONV_2D_3X3_H_ -#define MACE_OPS_ARM_FP32_DECONV_2D_3X3_H_ +#ifndef MACE_OPS_ARM_BASE_DECONV_2D_3X3_H_ +#define MACE_OPS_ARM_BASE_DECONV_2D_3X3_H_ #include #include @@ -21,46 +21,38 @@ #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/types.h" -#include "mace/ops/arm/fp32/deconv_2d.h" +#include "mace/ops/arm/base/deconv_2d_mxn.h" #include "mace/ops/common/conv_pool_2d_util.h" #include "mace/public/mace.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { -class Deconv2dK3x3S1 : public Deconv2dBase { +template +class Deconv2dK3x3S1 : public Deconv2dKMxN { public: explicit Deconv2dK3x3S1(const delegator::Deconv2dParam ¶m) - : Deconv2dBase(param) {} + : Deconv2dKMxN(param) {} virtual ~Deconv2dK3x3S1() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -class Deconv2dK3x3S2 : public Deconv2dBase { +template +class Deconv2dK3x3S2 : public Deconv2dKMxN { public: explicit Deconv2dK3x3S2(const delegator::Deconv2dParam ¶m) - : Deconv2dBase(param) {} + : Deconv2dKMxN(param) {} virtual ~Deconv2dK3x3S2() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_FP32_DECONV_2D_3X3_H_ +#endif // MACE_OPS_ARM_BASE_DECONV_2D_3X3_H_ diff --git a/mace/ops/arm/base/deconv_2d_4x4.cc b/mace/ops/arm/base/deconv_2d_4x4.cc new file mode 100644 index 0000000000000000000000000000000000000000..9cc42b3ddfd12843a405292fc65d283384e38cda --- /dev/null +++ b/mace/ops/arm/base/deconv_2d_4x4.cc @@ -0,0 +1,34 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/arm/base/deconv_2d_4x4.h" + +namespace mace { +namespace ops { +namespace arm { + +void RegisterDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Deconv2dK4x4S1, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, + float, ImplType::NEON, K4x4S1)); + MACE_REGISTER_DELEGATOR( + registry, Deconv2dK4x4S2, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, + float, ImplType::NEON, K4x4S2)); +} + +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/deconv_2d_4x4.h b/mace/ops/arm/base/deconv_2d_4x4.h similarity index 58% rename from mace/ops/arm/fp32/deconv_2d_4x4.h rename to mace/ops/arm/base/deconv_2d_4x4.h index bf86a62ab4575ef20072dc6f1fd648f2bd65da14..692ff73865a5ac6e7ef5651634874190ed85c964 100644 --- a/mace/ops/arm/fp32/deconv_2d_4x4.h +++ b/mace/ops/arm/base/deconv_2d_4x4.h @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,55 +12,47 @@ // See the License for the specific language governing permissions and // limitations under the License. 
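
With 2x2, 3x3 and 4x4 variants registered under distinct tags, a caller can pick a specialized kernel from filter geometry and fall back to the general one. A hedged sketch of that selection logic (the helper below is illustrative only; the real choice happens inside the deconv op, not through this function):

const char *SelectDeconvKernelTag(int kernel_size, int stride) {
  const bool s1 = (stride == 1), s2 = (stride == 2);
  if (kernel_size == 2 && (s1 || s2)) return s1 ? "K2x2S1" : "K2x2S2";
  if (kernel_size == 3 && (s1 || s2)) return s1 ? "K3x3S1" : "K3x3S2";
  if (kernel_size == 4 && (s1 || s2)) return s1 ? "K4x4S1" : "K4x4S2";
  return "general";  // anything else goes through Deconv2dGeneral
}
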
-#ifndef MACE_OPS_ARM_FP32_DECONV_2D_4X4_H_ -#define MACE_OPS_ARM_FP32_DECONV_2D_4X4_H_ +#ifndef MACE_OPS_ARM_BASE_DECONV_2D_4X4_H_ +#define MACE_OPS_ARM_BASE_DECONV_2D_4X4_H_ -#include #include +#include #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/types.h" -#include "mace/ops/arm/fp32/deconv_2d.h" +#include "mace/ops/arm/base/deconv_2d_mxn.h" #include "mace/ops/common/conv_pool_2d_util.h" #include "mace/public/mace.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { -class Deconv2dK4x4S1 : public Deconv2dBase { +template +class Deconv2dK4x4S1 : public Deconv2dKMxN { public: explicit Deconv2dK4x4S1(const delegator::Deconv2dParam ¶m) - : Deconv2dBase(param) {} + : Deconv2dKMxN(param) {} virtual ~Deconv2dK4x4S1() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -class Deconv2dK4x4S2 : public Deconv2dBase { +template +class Deconv2dK4x4S2 : public Deconv2dKMxN { public: explicit Deconv2dK4x4S2(const delegator::Deconv2dParam ¶m) - : Deconv2dBase(param) {} + : Deconv2dKMxN(param) {} virtual ~Deconv2dK4x4S2() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_FP32_DECONV_2D_4X4_H_ +#endif // MACE_OPS_ARM_BASE_DECONV_2D_4X4_H_ diff --git a/mace/ops/arm/fp32/deconv_2d_general.cc b/mace/ops/arm/base/deconv_2d_general.cc similarity index 76% rename from mace/ops/arm/fp32/deconv_2d_general.cc rename to mace/ops/arm/base/deconv_2d_general.cc index d090ba23104869712fa2af1e9fc9e6dc203f0276..ec95f186aef7d29f9789d8a53a69e773fca8cb6d 100644 --- a/mace/ops/arm/fp32/deconv_2d_general.cc +++ b/mace/ops/arm/base/deconv_2d_general.cc @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,34 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. 
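
Every DoCompute above receives its loop bounds through a const DeconvComputeParam built once by PreWorkAndGetDeconvParam, rather than re-deriving shapes inside each kernel. The parameter-object idiom, reduced to its essentials (names and values illustrative):

namespace utils { class ThreadPool {}; }  // stand-in for mace::utils::ThreadPool

struct ComputeParam {
  const int batch, out_channels, out_img_size;
  utils::ThreadPool &thread_pool;  // shared pool, bound once
  ComputeParam(int b, int oc, int osize, utils::ThreadPool *pool)
      : batch(b), out_channels(oc), out_img_size(osize),
        thread_pool(*pool) {}
};

ComputeParam MakeParam(utils::ThreadPool *pool) {
  return ComputeParam(1, 32, 16 * 16, pool);  // bounds computed once, by value
}

Holding the pool by reference keeps the struct cheap to return by value, while the const dimensions make accidental mutation inside a kernel a compile error.
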
-#include "mace/ops/arm/fp32/deconv_2d.h" +#include "mace/ops/arm/base/deconv_2d_general.h" -// TODO(liutuo): optimize it +#include +#include namespace mace { namespace ops { namespace arm { -namespace fp32 { - -class Deconv2dGeneral : public Deconv2dBase { - public: - explicit Deconv2dGeneral(const delegator::Deconv2dParam ¶m) - : Deconv2dBase(param) {} - virtual ~Deconv2dGeneral() {} - - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; -}; - -MaceStatus Deconv2dGeneral::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { + +template +MaceStatus Deconv2dGeneral::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { std::unique_ptr padded_out; std::vector out_pad_size; ResizeOutAndPadOut(context, @@ -60,9 +47,9 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context, Tensor::MappingGuard filter_mapper(filter); Tensor::MappingGuard output_mapper(output); - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); auto &in_shape = input->shape(); auto &out_shape = out_tensor->shape(); @@ -95,7 +82,7 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context, index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t oc = start1; oc < end1; oc += step1) { - float *out_base = + T *out_base = padded_out_data + (b * out_channels + oc) * out_img_size; for (index_t i = 0; i < in_height; ++i) { for (index_t j = 0; j < in_width; ++j) { @@ -104,7 +91,7 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context, for (int ic = 0; ic < in_channels; ++ic) { const index_t input_idx = (b * in_channels + ic) * in_img_size + i * in_width + j; - const float val = input_data[input_idx]; + const T val = input_data[input_idx]; const index_t kernel_offset = (oc * in_channels + ic) * kernel_size; for (int k = 0; k < kernel_size; ++k) { @@ -126,11 +113,10 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context, void RegisterDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) { MACE_REGISTER_DELEGATOR( - registry, Deconv2dGeneral, delegator::Deconv2dParam, + registry, Deconv2dGeneral, delegator::Deconv2dParam, MACE_DELEGATOR_KEY(Deconv2d, DeviceType::CPU, float, ImplType::NEON)); } -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/base/deconv_2d_general.h b/mace/ops/arm/base/deconv_2d_general.h new file mode 100644 index 0000000000000000000000000000000000000000..fe1786dd96a62447ae8cfe6c4dfa99123c6432fc --- /dev/null +++ b/mace/ops/arm/base/deconv_2d_general.h @@ -0,0 +1,46 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_ARM_BASE_DECONV_2D_GENERAL_H_ +#define MACE_OPS_ARM_BASE_DECONV_2D_GENERAL_H_ + +#include "mace/ops/arm/base/deconv_2d.h" + +// TODO(liutuo): optimize it + +namespace mace { +namespace ops { +namespace arm { + +template +class Deconv2dGeneral : public Deconv2dBase { + public: + explicit Deconv2dGeneral(const delegator::Deconv2dParam ¶m) + : Deconv2dBase(param, sizeof(T)) {} + virtual ~Deconv2dGeneral() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) override; +}; + +} // namespace arm +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_BASE_DECONV_2D_GENERAL_H_ + diff --git a/mace/ops/arm/base/deconv_2d_mxn.h b/mace/ops/arm/base/deconv_2d_mxn.h new file mode 100644 index 0000000000000000000000000000000000000000..89775dae9d5d159258ef572f56bee96fc12202bc --- /dev/null +++ b/mace/ops/arm/base/deconv_2d_mxn.h @@ -0,0 +1,77 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_ARM_BASE_DECONV_2D_MXN_H_ +#define MACE_OPS_ARM_BASE_DECONV_2D_MXN_H_ + +#include +#include + +#include "mace/core/ops/op_context.h" +#include "mace/core/tensor.h" +#include "mace/ops/arm/base/deconv_2d.h" +#include "mace/public/mace.h" + +namespace mace { +namespace ops { +namespace arm { + +template +class Deconv2dKMxN : public Deconv2dBase { + public: + explicit Deconv2dKMxN(const delegator::Deconv2dParam ¶m) + : Deconv2dBase(param, sizeof(T)) {} + virtual ~Deconv2dKMxN() {} + + MaceStatus Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, input, filter, output_shape, + output, &out_pad_size, &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + const T *input_data = input->data(); + const T *filter_data = filter->data(); + T *padded_out_data = out_tensor->mutable_data(); + + const DeconvComputeParam p = + PreWorkAndGetDeconvParam(context, input, out_tensor); + DoCompute(p, filter_data, input_data, padded_out_data); + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; + } + + virtual MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) = 0; +}; + +} // namespace arm +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_BASE_DECONV_2D_MXN_H_ diff --git a/mace/ops/arm/base/depthwise_conv_2d_3x3.cc b/mace/ops/arm/base/depthwise_conv_2d_3x3.cc new file mode 100644 index 
0000000000000000000000000000000000000000..f94239596acca1027ddfdb685099e3d43d0326f5 --- /dev/null +++ b/mace/ops/arm/base/depthwise_conv_2d_3x3.cc @@ -0,0 +1,34 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/arm/base/depthwise_conv_2d_3x3.h" + +namespace mace { +namespace ops { +namespace arm { + +void RegisterDepthwiseConv2dK3x3Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, DepthwiseConv2dK3x3S1, delegator::DepthwiseConv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S1)); + MACE_REGISTER_DELEGATOR( + registry, DepthwiseConv2dK3x3S2, delegator::DepthwiseConv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S2)); +} + +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.h b/mace/ops/arm/base/depthwise_conv_2d_3x3.h similarity index 58% rename from mace/ops/arm/fp32/depthwise_conv_2d_3x3.h rename to mace/ops/arm/base/depthwise_conv_2d_3x3.h index 49412b808dde686c26fff1b80137ab86c78d65f9..c9edf26e9318de5d4a0baa5eb88a11f97c840e04 100644 --- a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.h +++ b/mace/ops/arm/base/depthwise_conv_2d_3x3.h @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,51 +12,47 @@ // See the License for the specific language governing permissions and // limitations under the License. 
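
For orientation: Deconv2dGeneral, a few hunks up, scatters every input pixel into a kernel-sized window of the padded output and accumulates, which is why the padded output is cleared before the loop. The core of that accumulation as a scalar sketch, for one channel with unit stride and no dilation (assumed row-major layout; padded_out must be zero-initialized and at least (in_h + 2) x out_w for a 3x3 filter):

void DeconvScatterOneChannel(const float *input, const float *filter3x3,
                             float *padded_out, int in_h, int in_w,
                             int out_w) {
  for (int i = 0; i < in_h; ++i) {
    for (int j = 0; j < in_w; ++j) {
      const float val = input[i * in_w + j];
      for (int ki = 0; ki < 3; ++ki) {    // scatter into a 3x3 window
        for (int kj = 0; kj < 3; ++kj) {
          padded_out[(i + ki) * out_w + (j + kj)] +=
              val * filter3x3[ki * 3 + kj];
        }
      }
    }
  }
}
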
-#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_CONV_2D_3X3_H_ -#define MACE_OPS_ARM_FP32_DEPTHWISE_CONV_2D_3X3_H_ +#ifndef MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_3X3_H_ +#define MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_3X3_H_ #include #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/arm/fp32/conv_2d.h" +#include "mace/ops/arm/base/depthwise_conv_2d_mxn.h" #include "mace/ops/delegator/depthwise_conv_2d.h" #include "mace/public/mace.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { -class DepthwiseConv2dK3x3S1 : public Conv2dBase { +template +class DepthwiseConv2dK3x3S1 : public DepthwiseConv2dKMxN { public: explicit DepthwiseConv2dK3x3S1(const delegator::DepthwiseConv2dParam ¶m) - : Conv2dBase(param) {} + : DepthwiseConv2dKMxN(param) {} virtual ~DepthwiseConv2dK3x3S1() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; + MaceStatus DoCompute( + const DepthwiseConvComputeParam &p, const T *filter, + const T *input_data, T *output_data) override; }; -class DepthwiseConv2dK3x3S2 : public Conv2dBase { +template +class DepthwiseConv2dK3x3S2 : public DepthwiseConv2dKMxN { public: explicit DepthwiseConv2dK3x3S2(const delegator::DepthwiseConv2dParam ¶m) - : Conv2dBase(param) {} + : DepthwiseConv2dKMxN(param) {} virtual ~DepthwiseConv2dK3x3S2() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; + MaceStatus DoCompute( + const DepthwiseConvComputeParam &p, const T *filter, + const T *input_data, T *output_data) override; }; -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_FP32_DEPTHWISE_CONV_2D_3X3_H_ +#endif // MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_3X3_H_ diff --git a/mace/ops/arm/base/depthwise_conv_2d_mxn.h b/mace/ops/arm/base/depthwise_conv_2d_mxn.h new file mode 100644 index 0000000000000000000000000000000000000000..5f59802be83bce02e872c2e8836f2609f84eb9e5 --- /dev/null +++ b/mace/ops/arm/base/depthwise_conv_2d_mxn.h @@ -0,0 +1,64 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
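
Unlike the dense kernels, the depthwise variants convolve each channel with its own single filter plane. A reference scalar version of the 3x3, stride-1 case for one channel (valid region only; this illustrates the semantics, not the NEON path):

void DepthwiseConv3x3S1(const float *in, const float *k3x3, float *out,
                        int in_h, int in_w) {
  const int out_h = in_h - 2, out_w = in_w - 2;  // valid convolution
  for (int y = 0; y < out_h; ++y) {
    for (int x = 0; x < out_w; ++x) {
      float acc = 0.f;
      for (int ky = 0; ky < 3; ++ky)
        for (int kx = 0; kx < 3; ++kx)
          acc += in[(y + ky) * in_w + (x + kx)] * k3x3[ky * 3 + kx];
      out[y * out_w + x] = acc;
    }
  }
}
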
+ +#ifndef MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_MXN_H_ +#define MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_MXN_H_ + +#include + +#include "mace/core/ops/op_context.h" +#include "mace/core/tensor.h" +#include "mace/ops/arm/base/conv_2d.h" +#include "mace/ops/delegator/depthwise_conv_2d.h" +#include "mace/public/mace.h" + +namespace mace { +namespace ops { +namespace arm { + +template +class DepthwiseConv2dKMxN : public Conv2dBase { + public: + explicit DepthwiseConv2dKMxN(const delegator::DepthwiseConv2dParam ¶m) + : Conv2dBase(param, sizeof(T)) {} + virtual ~DepthwiseConv2dKMxN() {} + + MaceStatus Compute(const OpContext *context, const Tensor *input, + const Tensor *filter, Tensor *output) { + DepthwiseConvComputeParam p = + PreWorkAndGetDepthwiseConv2DParam(context, input, filter, output); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + const T *filter_data = filter->data(); + const T *input_data = input->data(); + T *output_data = output->mutable_data(); + + DoCompute(p, filter_data, input_data, output_data); + + return MaceStatus::MACE_SUCCESS; + } + + protected: + virtual MaceStatus DoCompute( + const DepthwiseConvComputeParam &p, const T *filter, + const T *input_data, T *output_data) = 0; +}; + +} // namespace arm +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_MXN_H_ diff --git a/mace/ops/arm/base/depthwise_deconv_2d_3x3.cc b/mace/ops/arm/base/depthwise_deconv_2d_3x3.cc new file mode 100644 index 0000000000000000000000000000000000000000..c9a70467dbaafb39b8d76716c3ffa4ca201ea61b --- /dev/null +++ b/mace/ops/arm/base/depthwise_deconv_2d_3x3.cc @@ -0,0 +1,47 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
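
DepthwiseConv2dKMxN::Compute above maps all three tensors with Tensor::MappingGuard before taking raw pointers, so buffers stay mapped for exactly the lifetime of the computation. A sketch of the RAII shape such a guard has (Buffer and MappingGuard here are simplified stand-ins, not the MACE classes):

class Buffer {
 public:
  void Map() { /* make the host pointer valid */ }
  void Unmap() { /* release the mapping */ }
};

class MappingGuard {
 public:
  explicit MappingGuard(Buffer *b) : buffer_(b) { buffer_->Map(); }
  ~MappingGuard() { buffer_->Unmap(); }  // unmapped on every exit path
  MappingGuard(const MappingGuard &) = delete;
  MappingGuard &operator=(const MappingGuard &) = delete;
 private:
  Buffer *buffer_;
};
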
+ +#include "mace/ops/arm/base/depthwise_deconv_2d_3x3.h" + +namespace mace { +namespace ops { +namespace arm { + +void RegisterDepthwiseDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, DepthwiseDeconv2dK3x3S1, + delegator::DepthwiseDeconv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S1)); + MACE_REGISTER_DELEGATOR( + registry, DepthwiseDeconv2dK3x3S2, + delegator::DepthwiseDeconv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S2)); +} + +void RegisterGroupDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, GroupDeconv2dK3x3S1, delegator::GroupDeconv2dParam, + MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S1)); + MACE_REGISTER_DELEGATOR( + registry, GroupDeconv2dK3x3S2, delegator::GroupDeconv2dParam, + MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S2)); +} + +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h b/mace/ops/arm/base/depthwise_deconv_2d_3x3.h similarity index 51% rename from mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h rename to mace/ops/arm/base/depthwise_deconv_2d_3x3.h index eeb21d6c3c5d50502b268e61f3b0726066a963cb..afe9356eb33887ea850d35657605fa8bf2689ed4 100644 --- a/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h +++ b/mace/ops/arm/base/depthwise_deconv_2d_3x3.h @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_3X3_H_ -#define MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_3X3_H_ +#ifndef MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_3X3_H_ +#define MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_3X3_H_ #include #include @@ -21,7 +21,7 @@ #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/types.h" -#include "mace/ops/arm/fp32/deconv_2d.h" +#include "mace/ops/arm/base/depthwise_deconv_2d_mxn.h" #include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/delegator/depthwise_deconv_2d.h" #include "mace/public/mace.h" @@ -29,70 +29,56 @@ namespace mace { namespace ops { namespace arm { -namespace fp32 { -class DepthwiseDeconv2dK3x3S1 : public Deconv2dBase { +template +class DepthwiseDeconv2dK3x3S1 : public DepthwiseDeconv2dKMxN { public: explicit DepthwiseDeconv2dK3x3S1( const delegator::DepthwiseDeconv2dParam ¶m) - : Deconv2dBase(param) {} + : DepthwiseDeconv2dKMxN(param) {} virtual ~DepthwiseDeconv2dK3x3S1() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const DepthwiseDeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -class DepthwiseDeconv2dK3x3S2 : public Deconv2dBase { +template +class DepthwiseDeconv2dK3x3S2 : public DepthwiseDeconv2dKMxN { public: explicit DepthwiseDeconv2dK3x3S2( const delegator::DepthwiseDeconv2dParam ¶m) - : Deconv2dBase(param) {} + : DepthwiseDeconv2dKMxN(param) {} virtual ~DepthwiseDeconv2dK3x3S2() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const DepthwiseDeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -class GroupDeconv2dK3x3S1 : public Deconv2dBase { +template +class GroupDeconv2dK3x3S1 : public GroupDeconv2dKMxN { public: explicit GroupDeconv2dK3x3S1( const delegator::GroupDeconv2dParam ¶m) - : Deconv2dBase(param) {} + : GroupDeconv2dKMxN(param) {} virtual ~GroupDeconv2dK3x3S1() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const GroupDeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -class GroupDeconv2dK3x3S2 : public Deconv2dBase { +template +class GroupDeconv2dK3x3S2 : public GroupDeconv2dKMxN { public: explicit GroupDeconv2dK3x3S2(const delegator::GroupDeconv2dParam ¶m) - : Deconv2dBase(param) {} + : GroupDeconv2dKMxN(param) {} virtual ~GroupDeconv2dK3x3S2() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const GroupDeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_3X3_H_ +#endif // MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_3X3_H_ diff --git a/mace/ops/arm/base/depthwise_deconv_2d_4x4.cc b/mace/ops/arm/base/depthwise_deconv_2d_4x4.cc new file mode 100644 index 0000000000000000000000000000000000000000..c6d0605a4c82e34f4681b9f07b610dd1cd477e1b --- /dev/null +++ b/mace/ops/arm/base/depthwise_deconv_2d_4x4.cc @@ -0,0 +1,48 @@ +// Copyright 2020 The MACE Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/arm/base/depthwise_deconv_2d_4x4.h" + +namespace mace { +namespace ops { +namespace arm { + +void RegisterDepthwiseDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, DepthwiseDeconv2dK4x4S1, + delegator::DepthwiseDeconv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, + float, ImplType::NEON, K4x4S1)); + MACE_REGISTER_DELEGATOR( + registry, DepthwiseDeconv2dK4x4S2, + delegator::DepthwiseDeconv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, + float, ImplType::NEON, K4x4S2)); +} + +void RegisterGroupDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, GroupDeconv2dK4x4S1, delegator::GroupDeconv2dParam, + MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, + float, ImplType::NEON, K4x4S1)); + MACE_REGISTER_DELEGATOR( + registry, GroupDeconv2dK4x4S2, delegator::GroupDeconv2dParam, + MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, + float, ImplType::NEON, K4x4S2)); +} + +} // namespace arm +} // namespace ops +} // namespace mace + diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h b/mace/ops/arm/base/depthwise_deconv_2d_4x4.h similarity index 51% rename from mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h rename to mace/ops/arm/base/depthwise_deconv_2d_4x4.h index 31d5bd99ed5cfe287026f99ac89d3721c7fed8bb..c543b94af75910734595e92856649dc836228556 100644 --- a/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h +++ b/mace/ops/arm/base/depthwise_deconv_2d_4x4.h @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
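
Each kernel family exposes its own Register*Delegator entry point; a build then aggregates them into a single registration call. A hypothetical umbrella for the deconv family (the wrapper name is an assumption; MACE assembles its own registration list, and the declarations are repeated here only to keep the sketch standalone):

class OpDelegatorRegistry;
void RegisterDeconv2dK2x2Delegator(OpDelegatorRegistry *registry);
void RegisterDeconv2dK3x3Delegator(OpDelegatorRegistry *registry);
void RegisterDeconv2dK4x4Delegator(OpDelegatorRegistry *registry);
void RegisterDeconv2dGeneralDelegator(OpDelegatorRegistry *registry);

// Hypothetical aggregator calling the entry points defined in this patch.
void RegisterAllDeconvDelegators(OpDelegatorRegistry *registry) {
  RegisterDeconv2dK2x2Delegator(registry);
  RegisterDeconv2dK3x3Delegator(registry);
  RegisterDeconv2dK4x4Delegator(registry);
  RegisterDeconv2dGeneralDelegator(registry);
}
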
-#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_4X4_H_ -#define MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_4X4_H_ +#ifndef MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_4X4_H_ +#define MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_4X4_H_ #include #include @@ -21,7 +21,7 @@ #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/types.h" -#include "mace/ops/arm/fp32/deconv_2d.h" +#include "mace/ops/arm/base/depthwise_deconv_2d_mxn.h" #include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/delegator/depthwise_deconv_2d.h" #include "mace/public/mace.h" @@ -29,69 +29,55 @@ namespace mace { namespace ops { namespace arm { -namespace fp32 { -class DepthwiseDeconv2dK4x4S1 : public Deconv2dBase { +template +class DepthwiseDeconv2dK4x4S1 : public DepthwiseDeconv2dKMxN { public: explicit DepthwiseDeconv2dK4x4S1( const delegator::DepthwiseDeconv2dParam ¶m) - : Deconv2dBase(param) {} + : DepthwiseDeconv2dKMxN(param) {} virtual ~DepthwiseDeconv2dK4x4S1() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const DepthwiseDeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -class DepthwiseDeconv2dK4x4S2 : public Deconv2dBase { +template +class DepthwiseDeconv2dK4x4S2 : public DepthwiseDeconv2dKMxN { public: explicit DepthwiseDeconv2dK4x4S2( const delegator::DepthwiseDeconv2dParam ¶m) - : Deconv2dBase(param) {} + : DepthwiseDeconv2dKMxN(param) {} virtual ~DepthwiseDeconv2dK4x4S2() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const DepthwiseDeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -class GroupDeconv2dK4x4S1 : public Deconv2dBase { +template +class GroupDeconv2dK4x4S1 : public GroupDeconv2dKMxN { public: explicit GroupDeconv2dK4x4S1(const delegator::GroupDeconv2dParam ¶m) - : Deconv2dBase(param) {} + : GroupDeconv2dKMxN(param) {} virtual ~GroupDeconv2dK4x4S1() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const GroupDeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -class GroupDeconv2dK4x4S2 : public Deconv2dBase { +template +class GroupDeconv2dK4x4S2 : public GroupDeconv2dKMxN { public: explicit GroupDeconv2dK4x4S2(const delegator::GroupDeconv2dParam ¶m) - : Deconv2dBase(param) {} + : GroupDeconv2dKMxN(param) {} virtual ~GroupDeconv2dK4x4S2() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const GroupDeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_4X4_H_ +#endif // MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_4X4_H_ diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_general.cc b/mace/ops/arm/base/depthwise_deconv_2d_general.cc similarity index 84% rename from mace/ops/arm/fp32/depthwise_deconv_2d_general.cc rename to mace/ops/arm/base/depthwise_deconv_2d_general.cc index 33d9cb01a377757358757576564d8131eb3c3e48..222706b56e8f5abb1a67ee820b4aae1d50bbd787 
100644 --- a/mace/ops/arm/fp32/depthwise_deconv_2d_general.cc +++ b/mace/ops/arm/base/depthwise_deconv_2d_general.cc @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,18 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/ops/arm/fp32/depthwise_deconv_2d_general.h" +#include "mace/ops/arm/base/depthwise_deconv_2d_general.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { -MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { +template +MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { std::unique_ptr padded_out; std::vector out_pad_size; group_ = input->dim(1); @@ -46,9 +46,9 @@ MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context, Tensor::MappingGuard filter_mapper(filter); Tensor::MappingGuard output_mapper(output); - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); + const T *input_data = input->data(); + const T *filter_data = filter->data(); + T *padded_out_data = out_tensor->mutable_data(); auto &in_shape = input->shape(); auto &out_shape = out_tensor->shape(); @@ -79,7 +79,7 @@ MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context, index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t c = start1; c < end1; c += step1) { - float *out_base = + T *out_base = padded_out_data + (b * channels + c) * out_img_size; for (index_t i = 0; i < in_height; ++i) { for (index_t j = 0; j < in_width; ++j) { @@ -105,11 +105,12 @@ MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { +template +MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { std::unique_ptr padded_out; std::vector out_pad_size; ResizeOutAndPadOut(context, @@ -131,9 +132,9 @@ MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context, Tensor::MappingGuard filter_mapper(filter); Tensor::MappingGuard output_mapper(output); - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); + const T *input_data = input->data(); + const T *filter_data = filter->data(); + T *padded_out_data = out_tensor->mutable_data(); auto &in_shape = input->shape(); auto &out_shape = out_tensor->shape(); @@ -209,19 +210,19 @@ MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context, void RegisterDepthwiseDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) { MACE_REGISTER_DELEGATOR( - registry, DepthwiseDeconv2dGeneral, delegator::DepthwiseDeconv2dParam, + registry, DepthwiseDeconv2dGeneral, + delegator::DepthwiseDeconv2dParam, MACE_DELEGATOR_KEY(DepthwiseDeconv2d, DeviceType::CPU, float, ImplType::NEON)); } void 
RegisterGroupDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) { MACE_REGISTER_DELEGATOR( - registry, GroupDeconv2dGeneral, delegator::GroupDeconv2dParam, + registry, GroupDeconv2dGeneral, delegator::GroupDeconv2dParam, MACE_DELEGATOR_KEY(GroupDeconv2d, DeviceType::CPU, float, ImplType::NEON)); } -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_general.h b/mace/ops/arm/base/depthwise_deconv_2d_general.h similarity index 80% rename from mace/ops/arm/fp32/depthwise_deconv_2d_general.h rename to mace/ops/arm/base/depthwise_deconv_2d_general.h index 924924498301592de6dd1c9af6473eb61d289407..3fa6d4543a0b4432e4c88a5aa1c5df5644e89505 100644 --- a/mace/ops/arm/fp32/depthwise_deconv_2d_general.h +++ b/mace/ops/arm/base/depthwise_deconv_2d_general.h @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_GENERAL_H_ -#define MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_GENERAL_H_ +#ifndef MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_GENERAL_H_ +#define MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_GENERAL_H_ #include #include @@ -21,7 +21,7 @@ #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/types.h" -#include "mace/ops/arm/fp32/deconv_2d.h" +#include "mace/ops/arm/base/deconv_2d.h" #include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/delegator/depthwise_deconv_2d.h" #include "mace/public/mace.h" @@ -29,13 +29,13 @@ namespace mace { namespace ops { namespace arm { -namespace fp32 { +template class DepthwiseDeconv2dGeneral : public Deconv2dBase { public: explicit DepthwiseDeconv2dGeneral( const delegator::DepthwiseDeconv2dParam ¶m) - : Deconv2dBase(param) {} + : Deconv2dBase(param, sizeof(T)) {} virtual ~DepthwiseDeconv2dGeneral() {} MaceStatus Compute( @@ -46,10 +46,11 @@ class DepthwiseDeconv2dGeneral : public Deconv2dBase { Tensor *output) override; }; +template class GroupDeconv2dGeneral : public Deconv2dBase { public: explicit GroupDeconv2dGeneral(const delegator::GroupDeconv2dParam ¶m) - : Deconv2dBase(param) {} + : Deconv2dBase(param, sizeof(T)) {} virtual ~GroupDeconv2dGeneral() {} MaceStatus Compute( @@ -60,9 +61,8 @@ class GroupDeconv2dGeneral : public Deconv2dBase { Tensor *output) override; }; -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_GENERAL_H_ +#endif // MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_GENERAL_H_ diff --git a/mace/ops/arm/base/depthwise_deconv_2d_mxn.h b/mace/ops/arm/base/depthwise_deconv_2d_mxn.h new file mode 100644 index 0000000000000000000000000000000000000000..416551c88c9845737846706806d8cd5b5b176533 --- /dev/null +++ b/mace/ops/arm/base/depthwise_deconv_2d_mxn.h @@ -0,0 +1,136 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_MXN_H_ +#define MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_MXN_H_ + +#include +#include + +#include "mace/core/ops/op_context.h" +#include "mace/core/tensor.h" +#include "mace/core/types.h" +#include "mace/ops/arm/base/deconv_2d.h" +#include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/ops/delegator/depthwise_deconv_2d.h" +#include "mace/public/mace.h" + +namespace mace { +namespace ops { +namespace arm { + +template +class DepthwiseDeconv2dKMxN : public Deconv2dBase { + public: + explicit DepthwiseDeconv2dKMxN( + const delegator::DepthwiseDeconv2dParam ¶m) + : Deconv2dBase(param, sizeof(T)) {} + virtual ~DepthwiseDeconv2dKMxN() {} + + MaceStatus Compute( + const OpContext *context, const Tensor *input, const Tensor *filter, + const Tensor *output_shape, Tensor *output) override { + std::unique_ptr padded_out; + std::vector out_pad_size; + group_ = input->dim(1); + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + const T *input_data = input->data(); + const T *filter_data = filter->data(); + T *padded_out_data = out_tensor->mutable_data(); + + DepthwiseDeconvComputeParam p = + PreWorkAndGetDepthwiseDeconvParam(context, input, out_tensor); + DoCompute(p, filter_data, input_data, padded_out_data); + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; + } + + virtual MaceStatus DoCompute( + const DepthwiseDeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) = 0; +}; + +template +class GroupDeconv2dKMxN : public Deconv2dBase { + public: + explicit GroupDeconv2dKMxN( + const delegator::DepthwiseDeconv2dParam ¶m) + : Deconv2dBase(param, sizeof(T)) {} + virtual ~GroupDeconv2dKMxN() {} + + MaceStatus Compute( + const OpContext *context, const Tensor *input, const Tensor *filter, + const Tensor *output_shape, Tensor *output) override { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + GroupDeconvComputeParam p = + PreWorkAndGetGroupDeconvParam(context, input, out_tensor); + DoCompute(p, filter_data, input_data, padded_out_data); + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; + } + + virtual MaceStatus DoCompute( + const GroupDeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) = 
0; +}; + +} // namespace arm +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_MXN_H_ diff --git a/mace/ops/arm/base/gemm.cc b/mace/ops/arm/base/gemm.cc new file mode 100644 index 0000000000000000000000000000000000000000..437f767e6956644473ce865f910841a1df9ccb9f --- /dev/null +++ b/mace/ops/arm/base/gemm.cc @@ -0,0 +1,29 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/arm/base/gemm.h" + +namespace mace { +namespace ops { +namespace arm { + +void RegisterGemmDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Gemm, delegator::GemmParam, + MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float, ImplType::NEON)); +} + +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/gemm.h b/mace/ops/arm/base/gemm.h similarity index 65% rename from mace/ops/arm/fp32/gemm.h rename to mace/ops/arm/base/gemm.h index 4910ae358347bf94eef076e63934f9365aa1ef79..b2320a71d95842c96fd562413f116516bd0c0c87 100644 --- a/mace/ops/arm/fp32/gemm.h +++ b/mace/ops/arm/base/gemm.h @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_ARM_FP32_GEMM_H_ -#define MACE_OPS_ARM_FP32_GEMM_H_ +#ifndef MACE_OPS_ARM_BASE_GEMM_H_ +#define MACE_OPS_ARM_BASE_GEMM_H_ #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" @@ -28,8 +28,10 @@ namespace mace { namespace ops { namespace arm { -namespace fp32 { +enum { kNoCache, kCacheLhs, kCacheRhs }; + +template class Gemm : public delegator::Gemm { public: explicit Gemm(const delegator::GemmParam ¶m) @@ -68,26 +70,49 @@ class Gemm : public delegator::Gemm { const bool transpose_out, const bool lhs_batched, const bool rhs_batched, - Tensor *output) override; + Tensor *output) override { + index_t rows = transpose_lhs ? lhs_cols : lhs_rows; + index_t depth = transpose_lhs ? lhs_rows : lhs_cols; + index_t cols = transpose_rhs ? rhs_rows : rhs_cols; + index_t depth2 = transpose_rhs ? rhs_cols : rhs_rows; + MACE_CHECK(depth == depth2, + "Matrices that multiply have inconsistent depth dim: ", + depth, + " vs. ", + depth2); + + return Compute(context, + lhs, + rhs, + batch, + rows, + cols, + depth, + transpose_lhs ? ColMajor : RowMajor, + transpose_rhs ? ColMajor : RowMajor, + transpose_out ? 
ColMajor : RowMajor, + lhs_batched, + rhs_batched, + output); + } - private: - void ComputeBlock(const float *packed_lhs_data, - const float *packed_rhs_data, + protected: + void ComputeBlock(const T *packed_lhs_data, + const T *packed_rhs_data, const index_t depth_padded, - float *packed_output_data); - - void PackLhs(const MatrixMap &lhs, - float *packed_lhs); + T *packed_output_data); - void PackRhs(const MatrixMap &rhs, - float *packed_rhs); + void PackLhs(const MatrixMap &lhs, + T *packed_lhs); - void UnpackOutput(const float *packed_output, - MatrixMap *output); + void PackRhs(const MatrixMap &rhs, + T *packed_rhs); + void UnpackOutput(const T *packed_output, + MatrixMap *output); template - void Unpack(const float *packed_output, - MatrixMap *output) { + void Unpack(const T *packed_output, + MatrixMap *output) { const index_t rows = output->rows(); const index_t cols = output->cols(); for (index_t r = 0; r < rows; ++r) { @@ -98,9 +123,9 @@ class Gemm : public delegator::Gemm { } template - void Pack(const MatrixMap &matrix, + void Pack(const MatrixMap &matrix, MatrixMajor dst_major, - float *packed_matrix) { + T *packed_matrix) { const index_t rows = matrix.rows(); const index_t cols = matrix.cols(); index_t depth = cols; @@ -109,7 +134,7 @@ class Gemm : public delegator::Gemm { depth = rows; } const index_t depth_padded = RoundUp(depth, static_cast(4)); - memset(packed_matrix, 0, sizeof(float) * WidthBlockSize * depth_padded); + memset(packed_matrix, 0, sizeof(T) * WidthBlockSize * depth_padded); if (dst_major == ColMajor) { for (index_t c = 0; c < cols; ++c) { for (index_t r = 0; r < rows; ++r) { @@ -125,31 +150,14 @@ class Gemm : public delegator::Gemm { } } + private: Buffer pack_cache_; - bool should_cache_pack_; int cached_; }; -template<> -void Gemm::Pack<4, 4>(const MatrixMap &matrix, - MatrixMajor dst_major, - float *packed_matrix); - -template<> -void Gemm::Pack<8, 4>(const MatrixMap &matrix, - MatrixMajor dst_major, - float *packed_matrix); - -template<> -void Gemm::Unpack<4, 8>(const float *packed_output, MatrixMap *output); - -template<> -void Gemm::Unpack<8, 8>(const float *packed_output, MatrixMap *output); - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_FP32_GEMM_H_ +#endif // MACE_OPS_ARM_BASE_GEMM_H_ diff --git a/mace/ops/arm/base/gemv.cc b/mace/ops/arm/base/gemv.cc new file mode 100644 index 0000000000000000000000000000000000000000..eb62314a6774906cd884175d8e32afe1f75f4438 --- /dev/null +++ b/mace/ops/arm/base/gemv.cc @@ -0,0 +1,30 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
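
Gemm::Pack above zero-fills a WidthBlockSize x depth_padded panel, with depth rounded up to a multiple of 4, before copying the matrix tile in, so the NEON microkernels can consume fixed 4-deep slices without edge checks. The same idea as a freestanding sketch, simplified to one panel with a column-major destination and row-major source (an assumption for illustration):

#include <cstring>

void PackPanelColMajor(const float *src, int rows, int depth,
                       int depth_padded, float *packed) {
  // packed holds rows * depth_padded floats; the zeroed tail pads depth to 4.
  std::memset(packed, 0, sizeof(float) * rows * depth_padded);
  for (int d = 0; d < depth; ++d)
    for (int r = 0; r < rows; ++r)
      packed[d * rows + r] = src[r * depth + d];
}
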
+ + +#include "mace/ops/arm/base/gemv.h" + +namespace mace { +namespace ops { +namespace arm { + +void RegisterGemvDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Gemv, DelegatorParam, + MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, float, ImplType::NEON)); +} + +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/gemv.h b/mace/ops/arm/base/gemv.h similarity index 86% rename from mace/ops/arm/fp32/gemv.h rename to mace/ops/arm/base/gemv.h index 9933cf42b817e20945517588a87dfca2232e7411..b3cbf19ec4e980903114ebb254290f3ab044cad0 100644 --- a/mace/ops/arm/fp32/gemv.h +++ b/mace/ops/arm/base/gemv.h @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_ARM_FP32_GEMV_H_ -#define MACE_OPS_ARM_FP32_GEMV_H_ +#ifndef MACE_OPS_ARM_BASE_GEMV_H_ +#define MACE_OPS_ARM_BASE_GEMV_H_ #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" @@ -23,8 +23,8 @@ namespace mace { namespace ops { namespace arm { -namespace fp32 { +template class Gemv : public delegator::Gemv { public: explicit Gemv(const DelegatorParam ¶m) : delegator::Gemv(param) {} @@ -43,9 +43,8 @@ class Gemv : public delegator::Gemv { Tensor *output) override; }; -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_FP32_GEMV_H_ +#endif // MACE_OPS_ARM_BASE_GEMV_H_ diff --git a/mace/ops/arm/fp32/activation.cc b/mace/ops/arm/fp32/activation.cc index 5d8d6984bd04fe7ae1ea9626e409388475505fbb..add68ad01e3b0ea93fcce29ba05768ee3d696ae7 100644 --- a/mace/ops/arm/fp32/activation.cc +++ b/mace/ops/arm/fp32/activation.cc @@ -12,186 +12,139 @@ // See the License for the specific language governing permissions and // limitations under the License. 
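The gemv.h rename above shows the pattern this whole patch applies: the class template moves to arm/base with the type-agnostic interface, the arm/fp32 translation units keep only float specializations of the hot loops, and the registry still registers just the float/NEON instantiation. A condensed sketch of the split, using a hypothetical Scale delegator rather than any class in this patch:

#include <cstdint>

// arm/base/scale.h -- type-agnostic template; control flow only.
template <typename T>
class Scale {
 public:
  void Compute(const T *in, std::int64_t n, T *out) {
    ScaleImpl(in, n, out);  // resolved by a per-type specialization
  }

 private:
  void ScaleImpl(const T *in, std::int64_t n, T *out);
};

// arm/fp32/scale.cc -- body provided only for float; other instantiations
// would fail to link, which matches registering only the float/NEON key.
template <>
void Scale<float>::ScaleImpl(const float *in, std::int64_t n, float *out) {
  for (std::int64_t i = 0; i < n; ++i) out[i] = in[i];  // stand-in for the NEON loop
}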
-#include "mace/ops/delegator/activation.h" - #include #include +#include "mace/ops/arm/base/activation.h" + namespace mace { namespace ops { namespace arm { -namespace fp32 { - -class Activation : public delegator::Activation { - public: - explicit Activation(const delegator::ActivationParam ¶m) - : delegator::Activation(param) {} - ~Activation() = default; - - MaceStatus Compute(const OpContext *context, - const Tensor *input, Tensor *output) override; - - private: - void DoActivation(const OpContext *context, - const Tensor *input, Tensor *output); -}; - -MaceStatus Activation::Compute(const OpContext *context, - const Tensor *input, Tensor *output) { - Tensor::MappingGuard input_guard(input); - if (input != output) { - MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - Tensor::MappingGuard output_guard(output); - DoActivation(context, input, output); - } else { - DoActivation(context, input, output); + +template<> +void Activation::ActivateRelu(utils::ThreadPool *thread_pool, + const float *input_data, + const index_t input_size, + float *output_data) { + const float32x4_t vzero = vdupq_n_f32(0.f); + const index_t block_count = input_size / 4; + + thread_pool->Compute1D( + [=](index_t start, index_t end, index_t step) { + auto input_ptr = input_data + start * 4; + auto output_ptr = output_data + start * 4; + + for (index_t i = start; i < end; i += step) { + float32x4_t v = vld1q_f32(input_ptr); + v = vmaxq_f32(v, vzero); + vst1q_f32(output_ptr, v); + + input_ptr += 4; + output_ptr += 4; + } + }, + 0, block_count, 1); + + // remain + for (index_t i = block_count * 4; i < input_size; ++i) { + output_data[i] = std::max(0.f, input_data[i]); } +} - return MaceStatus::MACE_SUCCESS; +template<> +void Activation::ActivateRelux(utils::ThreadPool *thread_pool, + const float *input_data, + const index_t input_size, + float *output_data) { + const float32x4_t vzero = vdupq_n_f32(0.f); + const float32x4_t vlimit = vdupq_n_f32(limit_); + const index_t block_count = input_size / 4; + + thread_pool->Compute1D( + [=](index_t start, index_t end, index_t step) { + auto input_ptr = input_data + start * 4; + auto output_ptr = output_data + start * 4; + + for (index_t i = start; i < end; i += step) { + float32x4_t v = vld1q_f32(input_ptr); + v = vmaxq_f32(v, vzero); + v = vminq_f32(v, vlimit); + vst1q_f32(output_ptr, v); + + input_ptr += 4; + output_ptr += 4; + } + }, + 0, block_count, 1); + + // remain + for (index_t i = block_count * 4; i < input_size; ++i) { + output_data[i] = std::max(0.f, std::min(limit_, input_data[i])); + } } -void Activation::DoActivation(const OpContext *context, - const Tensor *input, - Tensor *output) { - auto input_data = input->data(); - auto output_data = output->mutable_data(); - const index_t size = input->size(); - - utils::ThreadPool &thread_pool = - context->device()->cpu_runtime()->thread_pool(); - - switch (type_) { - case RELU: { - const float32x4_t vzero = vdupq_n_f32(0.f); - const index_t block_count = size / 4; - - thread_pool.Compute1D( - [=](index_t start, index_t end, index_t step) { - auto input_ptr = input_data + start * 4; - auto output_ptr = output_data + start * 4; - - for (index_t i = start; i < end; i += step) { - float32x4_t v = vld1q_f32(input_ptr); - v = vmaxq_f32(v, vzero); - vst1q_f32(output_ptr, v); - - input_ptr += 4; - output_ptr += 4; - } - }, - 0, block_count, 1); - - // remain - for (index_t i = block_count * 4; i < size; ++i) { - output_data[i] = std::max(0.f, input_data[i]); - } - - break; - } - - case RELUX: { - const float32x4_t vzero = 
vdupq_n_f32(0.f); - const float32x4_t vlimit = vdupq_n_f32(limit_); - const index_t block_count = size / 4; - - thread_pool.Compute1D( - [=](index_t start, index_t end, index_t step) { - auto input_ptr = input_data + start * 4; - auto output_ptr = output_data + start * 4; - - for (index_t i = start; i < end; i += step) { - float32x4_t v = vld1q_f32(input_ptr); - v = vmaxq_f32(v, vzero); - v = vminq_f32(v, vlimit); - vst1q_f32(output_ptr, v); - - input_ptr += 4; - output_ptr += 4; - } - }, - 0, block_count, 1); - - // remain - for (index_t i = block_count * 4; i < size; ++i) { - output_data[i] = std::max(0.f, std::min(limit_, input_data[i])); - } - - break; - } - - case LEAKYRELU: { - const float32x4_t vzero = vdupq_n_f32(0.f); - const float32x4_t valpha = vdupq_n_f32(leakyrelu_coefficient_); - const index_t block_count = size / 4; - - thread_pool.Compute1D( - [=](index_t start, index_t end, index_t step) { - auto input_ptr = input_data + start * 4; - auto output_ptr = output_data + start * 4; - - for (index_t i = start; i < end; i += step) { - float32x4_t v = vld1q_f32(input_ptr); - float32x4_t u = vminq_f32(v, vzero); - v = vmaxq_f32(v, vzero); - v = vmlaq_f32(v, valpha, u); - vst1q_f32(output_ptr, v); - - input_ptr += 4; - output_ptr += 4; - } - }, - 0, block_count, 1); - - // remain - for (index_t i = block_count * 4; i < size; ++i) { - output_data[i] = std::max(input_data[i], 0.f) + - std::min(input_data[i], 0.f) * leakyrelu_coefficient_; - } - - break; - } - - case TANH: { - thread_pool.Compute1D( - [=](index_t start, index_t end, index_t step) { - for (index_t i = start; i < end; i += step) { - output_data[i] = std::tanh(input_data[i]); - } - }, - 0, size, 1); - - break; - } - - case SIGMOID: { - thread_pool.Compute1D( - [=](index_t start, index_t end, index_t step) { - for (index_t i = start; i < end; i += step) { - output_data[i] = 1 / (1 + std::exp(-(input_data[i]))); - } - }, - 0, size, 1); - - break; - } - - case NOOP: { - break; - } - - default: { - MACE_NOT_IMPLEMENTED; - } +template<> +void Activation::ActivateLeakyRelu(utils::ThreadPool *thread_pool, + const float *input_data, + const index_t input_size, + float *output_data) { + const float32x4_t vzero = vdupq_n_f32(0.f); + const float32x4_t valpha = vdupq_n_f32(leakyrelu_coefficient_); + const index_t block_count = input_size / 4; + + thread_pool->Compute1D( + [=](index_t start, index_t end, index_t step) { + auto input_ptr = input_data + start * 4; + auto output_ptr = output_data + start * 4; + + for (index_t i = start; i < end; i += step) { + float32x4_t v = vld1q_f32(input_ptr); + float32x4_t u = vminq_f32(v, vzero); + v = vmaxq_f32(v, vzero); + v = vmlaq_f32(v, valpha, u); + vst1q_f32(output_ptr, v); + + input_ptr += 4; + output_ptr += 4; + } + }, + 0, block_count, 1); + + // remain + for (index_t i = block_count * 4; i < input_size; ++i) { + output_data[i] = std::max(input_data[i], 0.f) + + std::min(input_data[i], 0.f) * leakyrelu_coefficient_; } } -void RegisterActivationDelegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, Activation, delegator::ActivationParam, - MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, float, ImplType::NEON)); +template<> +void Activation::ActivateTanh(utils::ThreadPool *thread_pool, + const float *input_data, + const index_t input_size, + float *output_data) { + thread_pool->Compute1D( + [=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + output_data[i] = std::tanh(input_data[i]); + } + }, + 0, input_size, 1); +} + 
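Every float specialization above follows the same shape: split the tensor into size / 4 four-lane blocks, hand the block range to Compute1D for threading, then sweep the fewer-than-four leftover elements with scalar code. LeakyRelu additionally uses the branch-free identity f(x) = max(x, 0) + alpha * min(x, 0), which maps directly onto vmaxq/vminq/vmlaq. A single-threaded sketch of that block-plus-remainder structure (hypothetical free function, no thread pool):

#include <algorithm>
#include <arm_neon.h>
#include <cstdint>

void LeakyReluNeon(const float *in, std::int64_t size, float alpha,
                   float *out) {
  const float32x4_t vzero = vdupq_n_f32(0.f);
  const float32x4_t valpha = vdupq_n_f32(alpha);
  const std::int64_t block_count = size / 4;

  for (std::int64_t i = 0; i < block_count; ++i) {
    float32x4_t v = vld1q_f32(in + i * 4);
    float32x4_t u = vminq_f32(v, vzero);  // negative part
    v = vmaxq_f32(v, vzero);              // positive part
    v = vmlaq_f32(v, valpha, u);          // v += alpha * u
    vst1q_f32(out + i * 4, v);
  }
  // remainder, exactly as in the specializations above
  for (std::int64_t i = block_count * 4; i < size; ++i) {
    out[i] = std::max(in[i], 0.f) + alpha * std::min(in[i], 0.f);
  }
}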
+template<> +void Activation::ActivateSigmoid(utils::ThreadPool *thread_pool, + const float *input_data, + const index_t input_size, + float *output_data) { + thread_pool->Compute1D( + [=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + output_data[i] = 1 / (1 + std::exp(-(input_data[i]))); + } + }, + 0, input_size, 1); } -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/bias_add.cc b/mace/ops/arm/fp32/bias_add.cc index 7edafec327692d736cc66ec22e82808031819e05..042d306d8475ca850ee61cdc0d14185038543ecb 100644 --- a/mace/ops/arm/fp32/bias_add.cc +++ b/mace/ops/arm/fp32/bias_add.cc @@ -13,129 +13,81 @@ // limitations under the License. #include -#include "mace/ops/delegator/bias_add.h" + +#include "mace/ops/arm/base/bias_add.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { - -class BiasAdd : public delegator::BiasAdd { - public: - explicit BiasAdd(const DelegatorParam ¶m) : delegator::BiasAdd(param) {} - ~BiasAdd() = default; - - MaceStatus Compute(const OpContext *context, const Tensor *input, - const Tensor *bias, Tensor *output) override; - private: - void AddBias(const OpContext *context, const Tensor *input, - const Tensor *bias, Tensor *output); -}; - -MaceStatus BiasAdd::Compute(const OpContext *context, - const Tensor *input, - const Tensor *bias, - Tensor *output) { - Tensor::MappingGuard input_guard(input); - Tensor::MappingGuard bias_guard(bias); - if (input != output) { - MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - if (bias == nullptr) { - output->Copy(*input); - } else { - Tensor::MappingGuard output_guard(output); - AddBias(context, input, bias, output); - } - } else { - if (bias != nullptr) { - AddBias(context, input, bias, output); +template<> +void BiasAdd::Add1DimBias( + utils::ThreadPool *thread_pool, const float *input_data, + const float *bias_data, float *output_data, const index_t batch, + const index_t channels, const index_t image_size) { + const index_t block_count = image_size / 4; + const index_t remain = image_size % 4; + thread_pool->Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + const index_t b_offset = b * channels; + for (index_t c = start1; c < end1; c += step1) { + const index_t offset = (b_offset + c) * image_size; + auto input_ptr = input_data + offset; + auto output_ptr = output_data + offset; + const float bias = bias_data[c]; + float32x4_t vbias = vdupq_n_f32(bias); + + for (index_t i = 0; i < block_count; ++i) { + float32x4_t v = vld1q_f32(input_ptr); + v = vaddq_f32(v, vbias); + vst1q_f32(output_ptr, v); + + input_ptr += 4; + output_ptr += 4; + } + for (index_t i = 0; i < remain; ++i) { + (*output_ptr++) = (*input_ptr++) + bias; + } + } } - } - - return MaceStatus::MACE_SUCCESS; + }, 0, batch, 1, 0, channels, 1); } -void BiasAdd::AddBias(const OpContext *context, - const Tensor *input, - const Tensor *bias, - mace::Tensor *output) { - auto input_data = input->data(); - auto bias_data = bias->data(); - auto output_data = output->mutable_data(); - - const index_t batch = input->dim(0); - const index_t channels = input->dim(1); - const index_t height = output->dim(2); - const index_t width = output->dim(3); - const index_t image_size = height * width; +template<> +void BiasAdd::Add2DimsBias( + utils::ThreadPool *thread_pool, const float *input_data, + const float *bias_data, float *output_data, const index_t 
batch, + const index_t channels, const index_t image_size) { const index_t block_count = image_size / 4; const index_t remain = image_size % 4; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - if (bias->dim_size() == 1) { - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { - for (index_t b = start0; b < end0; b += step0) { - const index_t b_offset = b * channels; - for (index_t c = start1; c < end1; c += step1) { - const index_t offset = (b_offset + c) * image_size; - auto input_ptr = input_data + offset; - auto output_ptr = output_data + offset; - const float bias = bias_data[c]; - float32x4_t vbias = vdupq_n_f32(bias); - - for (index_t i = 0; i < block_count; ++i) { - float32x4_t v = vld1q_f32(input_ptr); - v = vaddq_f32(v, vbias); - vst1q_f32(output_ptr, v); - - input_ptr += 4; - output_ptr += 4; - } - for (index_t i = 0; i < remain; ++i) { - (*output_ptr++) = (*input_ptr++) + bias; - } + thread_pool->Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + const index_t b_offset = b * channels; + for (index_t c = start1; c < end1; c += step1) { + const index_t offset = (b_offset + c) * image_size; + auto input_ptr = input_data + offset; + auto output_ptr = output_data + offset; + const float bias = bias_data[b * channels + c]; + float32x4_t vbias = vdupq_n_f32(bias); + + for (index_t i = 0; i < block_count; ++i) { + float32x4_t v = vld1q_f32(input_ptr); + v = vaddq_f32(v, vbias); + vst1q_f32(output_ptr, v); + + input_ptr += 4; + output_ptr += 4; } - } - }, 0, batch, 1, 0, channels, 1); - } else { - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { - for (index_t b = start0; b < end0; b += step0) { - const index_t b_offset = b * channels; - for (index_t c = start1; c < end1; c += step1) { - const index_t offset = (b_offset + c) * image_size; - auto input_ptr = input_data + offset; - auto output_ptr = output_data + offset; - const float bias = bias_data[b * channels + c]; - float32x4_t vbias = vdupq_n_f32(bias); - - for (index_t i = 0; i < block_count; ++i) { - float32x4_t v = vld1q_f32(input_ptr); - v = vaddq_f32(v, vbias); - vst1q_f32(output_ptr, v); - - input_ptr += 4; - output_ptr += 4; - } - for (index_t i = 0; i < remain; ++i) { - (*output_ptr++) = (*input_ptr++) + bias; - } + for (index_t i = 0; i < remain; ++i) { + (*output_ptr++) = (*input_ptr++) + bias; } } - }, 0, batch, 1, 0, channels, 1); - } -} - -void RegisterBiasAddDelegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, BiasAdd, DelegatorParam, - MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, float, ImplType::NEON)); + } + }, 0, batch, 1, 0, channels, 1); } -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/common_neon.h b/mace/ops/arm/fp32/common_neon.h index d4e61add21872e991c1947307f733ac404136738..502ffc393c0601259ee60bd4a7e0b8bcae4e73b2 100644 --- a/mace/ops/arm/fp32/common_neon.h +++ b/mace/ops/arm/fp32/common_neon.h @@ -21,7 +21,6 @@ namespace mace { namespace ops { namespace arm { -namespace fp32 { inline float32x4_t neon_vfma_lane_0(float32x4_t a, float32x4_t b, @@ -63,7 +62,6 @@ inline float32x4_t neon_vfma_lane_3(float32x4_t a, #endif } -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git 
a/mace/ops/arm/fp32/conv_2d.h b/mace/ops/arm/fp32/conv_2d.h deleted file mode 100644 index a143f5f84c2092c614d60576e27e26ec69d7e3a3..0000000000000000000000000000000000000000 --- a/mace/ops/arm/fp32/conv_2d.h +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_ARM_FP32_CONV_2D_H_ -#define MACE_OPS_ARM_FP32_CONV_2D_H_ - -#include -#include - -#include "mace/core/ops/op_context.h" -#include "mace/core/tensor.h" -#include "mace/ops/delegator/conv_2d.h" -#include "mace/ops/arm/fp32/gemm.h" -#include "mace/ops/common/conv_pool_2d_util.h" -#include "mace/public/mace.h" - -namespace mace { -namespace ops { -namespace arm { -namespace fp32 { - -class Conv2dBase : public delegator::Conv2d { - public: - explicit Conv2dBase(const delegator::Conv2dParam ¶m) - : delegator::Conv2d(param) {} - - virtual ~Conv2dBase() = default; - - protected: - void CalOutputShapeAndInputPadSize(const std::vector &input_shape, - const std::vector &filter_shape, - std::vector *output_shape, - std::vector *in_pad_size); - - void CalOutputBoundaryWithoutUsingInputPad(const std::vector - &output_shape, - const std::vector - in_pad_size, - std::vector - *out_bound); - - void CalOutputShapeAndPadSize(const Tensor *input, - const Tensor *filter, - const int out_tile_height, - const int out_tile_width, - std::vector *output_shape, - std::vector *in_pad_size, - std::vector *out_pad_size); - - MaceStatus ResizeOutAndPadInOut(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output, - const int out_tile_height, - const int out_tile_width, - std::unique_ptr *padded_input, - std::unique_ptr *padded_output); - - void PadInput(const Tensor &src, - const int pad_top, - const int pad_left, - Tensor *dst); - void UnPadOutput(const Tensor &src, Tensor *dst); -}; - -} // namespace fp32 -} // namespace arm -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ARM_FP32_CONV_2D_H_ diff --git a/mace/ops/arm/fp32/conv_2d_1xn.cc b/mace/ops/arm/fp32/conv_2d_1xn.cc index 0b5d335a69753c705a49180c5e005f6bbff125b2..527ac0980caf189765322c470227d3d32c189e9d 100644 --- a/mace/ops/arm/fp32/conv_2d_1xn.cc +++ b/mace/ops/arm/fp32/conv_2d_1xn.cc @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,93 +12,44 @@ // See the License for the specific language governing permissions and // limitations under the License. 
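The rewritten kernels below take a const ConvComputeParam &p instead of raw tensors: padding, output resizing, and shape bookkeeping move into the shared base class, so each DoCompute specialization is reduced to its NEON loops. ConvComputeParam itself is defined outside this diff; judging by the fields the kernels read, it presumably bundles roughly the following (a sketch, not the actual definition):

// Presumed contents of ConvComputeParam: the per-call constants that every
// Conv2dK*::Compute used to derive by hand, plus the thread pool.
struct ConvComputeParam {
  const index_t batch;
  const index_t in_channels, in_height, in_width;
  const index_t out_channels, out_height, out_width;
  const index_t in_image_size;   // in_height * in_width
  const index_t out_image_size;  // out_height * out_width
  const index_t in_batch_size;   // in_channels * in_image_size
  const index_t out_batch_size;  // out_channels * out_image_size
  utils::ThreadPool &thread_pool;
};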
-#include "mace/ops/arm/fp32/conv_2d_1xn.h" - #include #include +#include "mace/ops/arm/base/conv_2d_1xn.h" #include "mace/ops/delegator/conv_2d.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { - -MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { - std::unique_ptr padded_input; - std::unique_ptr padded_output; - - ResizeOutAndPadInOut(context, - input, - filter, - output, - 1, - 4, - &padded_input, - &padded_output); - const Tensor *in_tensor = input; - if (padded_input != nullptr) { - in_tensor = padded_input.get(); - } - Tensor *out_tensor = output; - if (padded_output != nullptr) { - out_tensor = padded_output.get(); - } - out_tensor->Clear(); - - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = in_tensor->data(); - auto output_data = out_tensor->mutable_data(); - - auto &in_shape = in_tensor->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { + +template<> +MaceStatus Conv2dK1x7S1::DoCompute( + const ConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { - if (m + 3 < out_channels) { + if (m + 3 < p.out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; + output_data + b * p.out_batch_size + m * p.out_image_size; float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * p.out_batch_size + (m + 1) * p.out_image_size; float *out_ptr2_base = - output_data + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * p.out_batch_size + (m + 2) * p.out_image_size; float *out_ptr3_base = - output_data + b * out_batch_size + (m + 3) * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + (m + 3) * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr0 = filter_data + m * in_channels * 7 + c * 7; - const float - *filter_ptr1 = filter_data + (m + 1) * in_channels * 7 + c * 7; - const float - *filter_ptr2 = filter_data + (m + 2) * in_channels * 7 + c * 7; - const float - *filter_ptr3 = filter_data + (m + 3) * in_channels * 7 + c * 7; + input_data + b * p.in_batch_size + c * p.in_image_size; + const float *filter_ptr0 = + filter_data + m * p.in_channels * 7 + c * 7; + const float *filter_ptr1 = + filter_data + (m + 1) * 
p.in_channels * 7 + c * 7; + const float *filter_ptr2 = + filter_data + (m + 2) * p.in_channels * 7 + c * 7; + const float *filter_ptr3 = + filter_data + (m + 3) * p.in_channels * 7 + c * 7; /* load filter (4 outch x 1 height x 4 width) */ float32x4_t vf00, vf01; float32x4_t vf10, vf11; @@ -113,12 +64,12 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, vf30 = vld1q_f32(filter_ptr3); vf31 = vld1q_f32(filter_ptr3 + 3); - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // output (4 outch x 1 height x 4 width): vo_outch_height float32x4_t vo0, vo1, vo2, vo3; // load output - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); vo1 = vld1q_f32(out_ptr1_base + out_offset); vo2 = vld1q_f32(out_ptr2_base + out_offset); @@ -127,7 +78,7 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, // input (3 slide) float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; // input offset - index_t in_offset = h * in_width + w; + index_t in_offset = h * p.in_width + w; // load input vi0 = vld1q_f32(in_ptr_base + in_offset); vi4 = vld1q_f32(in_ptr_base + in_offset + 4); @@ -214,31 +165,31 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, } // h } // c } else { - for (index_t mm = m; mm < out_channels; ++mm) { + for (index_t mm = m; mm < p.out_channels; ++mm) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + mm * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr0 = filter_data + mm * in_channels * 7 + c * 7; + input_data + b * p.in_batch_size + c * p.in_image_size; + const float *filter_ptr0 = + filter_data + mm * p.in_channels * 7 + c * 7; /* load filter (1 outch x 1 height x 4 width) */ float32x4_t vf00, vf01; vf00 = vld1q_f32(filter_ptr0); vf01 = vld1q_f32(filter_ptr0 + 3); - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // output (1 outch x 1 height x 4 width): vo_outch_height float32x4_t vo0; // load output - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); // input (3 slide) float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; // input offset - index_t in_offset = h * in_width + w; + index_t in_offset = h * p.in_width + w; // load input vi0 = vld1q_f32(in_ptr_base + in_offset); vi4 = vld1q_f32(in_ptr_base + in_offset + 4); @@ -275,87 +226,39 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, } // if } // m } // b - }, 0, batch, 1, 0, out_channels, 4); + }, 0, p.batch, 1, 0, p.out_channels, 4); - UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } -MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { - std::unique_ptr padded_input; - std::unique_ptr padded_output; - - ResizeOutAndPadInOut(context, - input, - filter, - output, - 4, - 1, - &padded_input, - &padded_output); - const Tensor *in_tensor = input; - if (padded_input != nullptr) { - in_tensor = padded_input.get(); - } - Tensor *out_tensor = 
output; - if (padded_output != nullptr) { - out_tensor = padded_output.get(); - } - out_tensor->Clear(); - - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = in_tensor->data(); - auto output_data = out_tensor->mutable_data(); - - auto &in_shape = in_tensor->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { +template<> +MaceStatus Conv2dK7x1S1::DoCompute( + const ConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { - if (m + 3 < out_channels) { + if (m + 3 < p.out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; + output_data + b * p.out_batch_size + m * p.out_image_size; float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * p.out_batch_size + (m + 1) * p.out_image_size; float *out_ptr2_base = - output_data + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * p.out_batch_size + (m + 2) * p.out_image_size; float *out_ptr3_base = - output_data + b * out_batch_size + (m + 3) * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + (m + 3) * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr0 = filter_data + m * in_channels * 7 + c * 7; - const float - *filter_ptr1 = filter_data + (m + 1) * in_channels * 7 + c * 7; - const float - *filter_ptr2 = filter_data + (m + 2) * in_channels * 7 + c * 7; - const float - *filter_ptr3 = filter_data + (m + 3) * in_channels * 7 + c * 7; + input_data + b * p.in_batch_size + c * p.in_image_size; + const float *filter_ptr0 = + filter_data + m * p.in_channels * 7 + c * 7; + const float *filter_ptr1 = + filter_data + (m + 1) * p.in_channels * 7 + c * 7; + const float *filter_ptr2 = + filter_data + (m + 2) * p.in_channels * 7 + c * 7; + const float *filter_ptr3 = + filter_data + (m + 3) * p.in_channels * 7 + c * 7; /* load filter (4 outch x 4 height x 1 width) */ float32x4_t vf00, vf01; float32x4_t vf10, vf11; @@ -370,41 +273,41 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, vf30 = vld1q_f32(filter_ptr3); vf31 = vld1q_f32(filter_ptr3 + 3); - for (index_t h = 0; h + 3 < out_height; h += 4) { - for (index_t w = 0; w < out_width; ++w) { + for (index_t h = 0; h + 3 < p.out_height; h += 4) { + for (index_t w = 0; w < p.out_width; ++w) { // load output - index_t 
out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; // output (4 outch x 4 height x 1 width): vo_outch_height float32x4_t vo0 = {out_ptr0_base[out_offset], - out_ptr0_base[out_offset + out_width], - out_ptr0_base[out_offset + 2 * out_width], - out_ptr0_base[out_offset + 3 * out_width]}; + out_ptr0_base[out_offset + p.out_width], + out_ptr0_base[out_offset + 2 * p.out_width], + out_ptr0_base[out_offset + 3 * p.out_width]}; float32x4_t vo1 = {out_ptr1_base[out_offset], - out_ptr1_base[out_offset + out_width], - out_ptr1_base[out_offset + 2 * out_width], - out_ptr1_base[out_offset + 3 * out_width]}; + out_ptr1_base[out_offset + p.out_width], + out_ptr1_base[out_offset + 2 * p.out_width], + out_ptr1_base[out_offset + 3 * p.out_width]}; float32x4_t vo2 = {out_ptr2_base[out_offset], - out_ptr2_base[out_offset + out_width], - out_ptr2_base[out_offset + 2 * out_width], - out_ptr2_base[out_offset + 3 * out_width]}; + out_ptr2_base[out_offset + p.out_width], + out_ptr2_base[out_offset + 2 * p.out_width], + out_ptr2_base[out_offset + 3 * p.out_width]}; float32x4_t vo3 = {out_ptr3_base[out_offset], - out_ptr3_base[out_offset + out_width], - out_ptr3_base[out_offset + 2 * out_width], - out_ptr3_base[out_offset + 3 * out_width]}; + out_ptr3_base[out_offset + p.out_width], + out_ptr3_base[out_offset + 2 * p.out_width], + out_ptr3_base[out_offset + 3 * p.out_width]}; // input offset - index_t in_offset = h * in_width + w; + index_t in_offset = h * p.in_width + w; // input (3 slide) float32x4_t vi0 = {in_ptr_base[in_offset], - in_ptr_base[in_offset + in_width], - in_ptr_base[in_offset + 2 * in_width], - in_ptr_base[in_offset + 3 * in_width]}; - float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], - in_ptr_base[in_offset + 5 * in_width], - in_ptr_base[in_offset + 6 * in_width], - in_ptr_base[in_offset + 7 * in_width]}; - float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], - in_ptr_base[in_offset + 9 * in_width]}; + in_ptr_base[in_offset + p.in_width], + in_ptr_base[in_offset + 2 * p.in_width], + in_ptr_base[in_offset + 3 * p.in_width]}; + float32x4_t vi4 = {in_ptr_base[in_offset + 4 * p.in_width], + in_ptr_base[in_offset + 5 * p.in_width], + in_ptr_base[in_offset + 6 * p.in_width], + in_ptr_base[in_offset + 7 * p.in_width]}; + float32x4_t vi8 = {in_ptr_base[in_offset + 8 * p.in_width], + in_ptr_base[in_offset + 9 * p.in_width]}; float32x4_t vi1 = vextq_f32(vi0, vi4, 1); float32x4_t vi2 = vextq_f32(vi0, vi4, 2); float32x4_t vi3 = vextq_f32(vi0, vi4, 3); @@ -480,63 +383,65 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, #endif out_ptr0_base[out_offset] = vo0[0]; - out_ptr0_base[out_offset + out_width] = vo0[1]; - out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; - out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; + out_ptr0_base[out_offset + p.out_width] = vo0[1]; + out_ptr0_base[out_offset + 2 * p.out_width] = vo0[2]; + out_ptr0_base[out_offset + 3 * p.out_width] = vo0[3]; out_ptr1_base[out_offset] = vo1[0]; - out_ptr1_base[out_offset + out_width] = vo1[1]; - out_ptr1_base[out_offset + 2 * out_width] = vo1[2]; - out_ptr1_base[out_offset + 3 * out_width] = vo1[3]; + out_ptr1_base[out_offset + p.out_width] = vo1[1]; + out_ptr1_base[out_offset + 2 * p.out_width] = vo1[2]; + out_ptr1_base[out_offset + 3 * p.out_width] = vo1[3]; out_ptr2_base[out_offset] = vo2[0]; - out_ptr2_base[out_offset + out_width] = vo2[1]; - out_ptr2_base[out_offset + 2 * out_width] = vo2[2]; - out_ptr2_base[out_offset + 3 * out_width] = vo2[3]; + out_ptr2_base[out_offset + p.out_width] 
= vo2[1]; + out_ptr2_base[out_offset + 2 * p.out_width] = vo2[2]; + out_ptr2_base[out_offset + 3 * p.out_width] = vo2[3]; out_ptr3_base[out_offset] = vo3[0]; - out_ptr3_base[out_offset + out_width] = vo3[1]; - out_ptr3_base[out_offset + 2 * out_width] = vo3[2]; - out_ptr3_base[out_offset + 3 * out_width] = vo3[3]; + out_ptr3_base[out_offset + p.out_width] = vo3[1]; + out_ptr3_base[out_offset + 2 * p.out_width] = vo3[2]; + out_ptr3_base[out_offset + 3 * p.out_width] = vo3[3]; } // w } // h } // c } else { - for (index_t mm = m; mm < out_channels; ++mm) { + for (index_t mm = m; mm < p.out_channels; ++mm) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + mm * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr0 = filter_data + mm * in_channels * 7 + c * 7; + input_data + b * p.in_batch_size + c * p.in_image_size; + const float *filter_ptr0 = + filter_data + mm * p.in_channels * 7 + c * 7; /* load filter (1 outch x 4 height x 1 width) */ float32x4_t vf00, vf01; vf00 = vld1q_f32(filter_ptr0); vf01 = vld1q_f32(filter_ptr0 + 3); - for (index_t h = 0; h + 3 < out_height; h += 4) { - for (index_t w = 0; w < out_width; ++w) { + for (index_t h = 0; h + 3 < p.out_height; h += 4) { + for (index_t w = 0; w < p.out_width; ++w) { // load output - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; // output (1 outch x 4 height x 1 width): vo_outch_height float32x4_t vo0 = {out_ptr0_base[out_offset], - out_ptr0_base[out_offset + out_width], - out_ptr0_base[out_offset + 2 * out_width], - out_ptr0_base[out_offset + 3 * out_width]}; + out_ptr0_base[out_offset + p.out_width], + out_ptr0_base[out_offset + + 2 * p.out_width], + out_ptr0_base[out_offset + + 3 * p.out_width]}; // input offset - index_t in_offset = h * in_width + w; + index_t in_offset = h * p.in_width + w; // input (3 slide) float32x4_t vi0 = {in_ptr_base[in_offset], - in_ptr_base[in_offset + in_width], - in_ptr_base[in_offset + 2 * in_width], - in_ptr_base[in_offset + 3 * in_width]}; - float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], - in_ptr_base[in_offset + 5 * in_width], - in_ptr_base[in_offset + 6 * in_width], - in_ptr_base[in_offset + 7 * in_width]}; - float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], - in_ptr_base[in_offset + 9 * in_width], - in_ptr_base[in_offset + 10 * in_width], - in_ptr_base[in_offset + 11 * in_width]}; + in_ptr_base[in_offset + p.in_width], + in_ptr_base[in_offset + 2 * p.in_width], + in_ptr_base[in_offset + 3 * p.in_width]}; + float32x4_t vi4 = {in_ptr_base[in_offset + 4 * p.in_width], + in_ptr_base[in_offset + 5 * p.in_width], + in_ptr_base[in_offset + 6 * p.in_width], + in_ptr_base[in_offset + 7 * p.in_width]}; + float32x4_t vi8 = {in_ptr_base[in_offset + 8 * p.in_width], + in_ptr_base[in_offset + 9 * p.in_width], + in_ptr_base[in_offset + 10 * p.in_width], + in_ptr_base[in_offset + 11 * p.in_width]}; float32x4_t vi1 = vextq_f32(vi0, vi4, 1); float32x4_t vi2 = vextq_f32(vi0, vi4, 2); float32x4_t vi3 = vextq_f32(vi0, vi4, 3); @@ -562,9 +467,9 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, #endif out_ptr0_base[out_offset] = vo0[0]; - out_ptr0_base[out_offset + out_width] = vo0[1]; - out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; - out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; + out_ptr0_base[out_offset + 
p.out_width] = vo0[1]; + out_ptr0_base[out_offset + 2 * p.out_width] = vo0[2]; + out_ptr0_base[out_offset + 3 * p.out_width] = vo0[3]; } // w } // h } // c @@ -572,78 +477,30 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, } // if } // m } // b - }, 0, batch, 1, 0, out_channels, 4); + }, 0, p.batch, 1, 0, p.out_channels, 4); - UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } -MaceStatus Conv2dK1x15S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { - std::unique_ptr padded_input; - std::unique_ptr padded_output; - - ResizeOutAndPadInOut(context, - input, - filter, - output, - 1, - 4, - &padded_input, - &padded_output); - const Tensor *in_tensor = input; - if (padded_input.get() != nullptr) { - in_tensor = padded_input.get(); - } - Tensor *out_tensor = output; - if (padded_output.get() != nullptr) { - out_tensor = padded_output.get(); - } - out_tensor->Clear(); - - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = in_tensor->data(); - auto output_data = out_tensor->mutable_data(); - - auto &in_shape = in_tensor->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - +template<> +MaceStatus Conv2dK1x15S1::DoCompute( + const ConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data) { const index_t tile_height = - out_channels < 4 ? RoundUpDiv4(out_height) : out_height; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); + p.out_channels < 4 ? 
RoundUpDiv4(p.out_height) : p.out_height; - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { - for (index_t h = 0; h < out_height; h += tile_height) { + for (index_t h = 0; h < p.out_height; h += tile_height) { float *out_ptr_base = - output_data + b * out_batch_size + m * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + m * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr = filter_data + m * in_channels * 15 + c * 15; + input_data + b * p.in_batch_size + c * p.in_image_size; + const float *filter_ptr = + filter_data + m * p.in_channels * 15 + c * 15; /* load filter (1 outch x 4 height x 1 width) */ float32x4_t vf0, vf1, vf2, vf3; vf0 = vld1q_f32(filter_ptr); @@ -651,20 +508,20 @@ MaceStatus Conv2dK1x15S1::Compute(const OpContext *context, vf2 = vld1q_f32(filter_ptr + 8); vf3 = vld1q_f32(filter_ptr + 11); - for (index_t ht = 0; ht < tile_height && h + ht < out_height; + for (index_t ht = 0; ht < tile_height && h + ht < p.out_height; ++ht) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // output (1 outch x 1 height x 4 width): vo_outch_height float32x4_t vo; // load output - index_t out_offset = (h + ht) * out_width + w; + index_t out_offset = (h + ht) * p.out_width + w; vo = vld1q_f32(out_ptr_base + out_offset); // input (3 slide) float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9, vi10, vi11, vi12, vi13, vi14, vi16; // input offset - index_t in_offset = (h + ht) * in_width + w; + index_t in_offset = (h + ht) * p.in_width + w; // load input vi0 = vld1q_f32(in_ptr_base + in_offset); vi4 = vld1q_f32(in_ptr_base + in_offset + 4); @@ -706,78 +563,30 @@ MaceStatus Conv2dK1x15S1::Compute(const OpContext *context, } // h } // m } // b - }, 0, batch, 1, 0, out_channels, 1); + }, 0, p.batch, 1, 0, p.out_channels, 1); - UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } -MaceStatus Conv2dK15x1S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { - std::unique_ptr padded_input; - std::unique_ptr padded_output; - - ResizeOutAndPadInOut(context, - input, - filter, - output, - 4, - 1, - &padded_input, - &padded_output); - const Tensor *in_tensor = input; - if (padded_input.get() != nullptr) { - in_tensor = padded_input.get(); - } - Tensor *out_tensor = output; - if (padded_output.get() != nullptr) { - out_tensor = padded_output.get(); - } - out_tensor->Clear(); - - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = in_tensor->data(); - auto output_data = out_tensor->mutable_data(); - - auto &in_shape = in_tensor->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t 
in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; +template<> +MaceStatus Conv2dK15x1S1::DoCompute( + const ConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data) { const index_t tile_width = - out_channels < 4 ? RoundUpDiv4(out_width) : out_width; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { + p.out_channels < 4 ? RoundUpDiv4(p.out_width) : p.out_width; + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { - for (index_t w = 0; w < out_width; w += tile_width) { + for (index_t w = 0; w < p.out_width; w += tile_width) { float *out_ptr_base = - output_data + b * out_batch_size + m * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + m * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr = filter_data + m * in_channels * 15 + c * 15; + input_data + b * p.in_batch_size + c * p.in_image_size; + const float *filter_ptr = + filter_data + m * p.in_channels * 15 + c * 15; /* load filter (1 outch x 4 height x 1 width) */ float32x4_t vf0, vf1, vf2, vf3; vf0 = vld1q_f32(filter_ptr); @@ -785,38 +594,38 @@ MaceStatus Conv2dK15x1S1::Compute(const OpContext *context, vf2 = vld1q_f32(filter_ptr + 8); vf3 = vld1q_f32(filter_ptr + 11); - for (index_t h = 0; h + 3 < out_height; h += 4) { - for (index_t wt = 0; wt < tile_width && w + wt < out_width; + for (index_t h = 0; h + 3 < p.out_height; h += 4) { + for (index_t wt = 0; wt < tile_width && w + wt < p.out_width; ++wt) { // load output - index_t out_offset = h * out_width + w + wt; + index_t out_offset = h * p.out_width + w + wt; // output (1 outch x 4 height x 1 width): vo_outch_height float32x4_t vo = {out_ptr_base[out_offset], - out_ptr_base[out_offset + out_width], - out_ptr_base[out_offset + 2 * out_width], - out_ptr_base[out_offset + 3 * out_width]}; + out_ptr_base[out_offset + p.out_width], + out_ptr_base[out_offset + 2 * p.out_width], + out_ptr_base[out_offset + 3 * p.out_width]}; // input offset - index_t in_offset = h * in_width + w + wt; + index_t in_offset = h * p.in_width + w + wt; // input (3 slide) float32x4_t vi0 = {in_ptr_base[in_offset], - in_ptr_base[in_offset + in_width], - in_ptr_base[in_offset + 2 * in_width], - in_ptr_base[in_offset + 3 * in_width]}; - float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], - in_ptr_base[in_offset + 5 * in_width], - in_ptr_base[in_offset + 6 * in_width], - in_ptr_base[in_offset + 7 * in_width]}; - float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], - in_ptr_base[in_offset + 9 * in_width], - in_ptr_base[in_offset + 10 * in_width], - in_ptr_base[in_offset + 11 * in_width]}; - float32x4_t vi12 = {in_ptr_base[in_offset + 12 * in_width], - in_ptr_base[in_offset + 13 * in_width], - in_ptr_base[in_offset + 14 * in_width], - in_ptr_base[in_offset + 15 * in_width]}; - float32x4_t vi16 = {in_ptr_base[in_offset + 16 * in_width], - in_ptr_base[in_offset + 17 * in_width]}; + in_ptr_base[in_offset + 
p.in_width], + in_ptr_base[in_offset + 2 * p.in_width], + in_ptr_base[in_offset + 3 * p.in_width]}; + float32x4_t vi4 = {in_ptr_base[in_offset + 4 * p.in_width], + in_ptr_base[in_offset + 5 * p.in_width], + in_ptr_base[in_offset + 6 * p.in_width], + in_ptr_base[in_offset + 7 * p.in_width]}; + float32x4_t vi8 = {in_ptr_base[in_offset + 8 * p.in_width], + in_ptr_base[in_offset + 9 * p.in_width], + in_ptr_base[in_offset + 10 * p.in_width], + in_ptr_base[in_offset + 11 * p.in_width]}; + float32x4_t vi12 = {in_ptr_base[in_offset + 12 * p.in_width], + in_ptr_base[in_offset + 13 * p.in_width], + in_ptr_base[in_offset + 14 * p.in_width], + in_ptr_base[in_offset + 15 * p.in_width]}; + float32x4_t vi16 = {in_ptr_base[in_offset + 16 * p.in_width], + in_ptr_base[in_offset + 17 * p.in_width]}; float32x4_t vi1 = vextq_f32(vi0, vi4, 1); float32x4_t vi2 = vextq_f32(vi0, vi4, 2); float32x4_t vi3 = vextq_f32(vi0, vi4, 3); @@ -846,44 +655,20 @@ MaceStatus Conv2dK15x1S1::Compute(const OpContext *context, vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1); out_ptr_base[out_offset] = vo[0]; - out_ptr_base[out_offset + out_width] = vo[1]; - out_ptr_base[out_offset + 2 * out_width] = vo[2]; - out_ptr_base[out_offset + 3 * out_width] = vo[3]; + out_ptr_base[out_offset + p.out_width] = vo[1]; + out_ptr_base[out_offset + 2 * p.out_width] = vo[2]; + out_ptr_base[out_offset + 3 * p.out_width] = vo[3]; } // wt } // h } // c } // w } // m } // b - }, 0, batch, 1, 0, out_channels, 1); + }, 0, p.batch, 1, 0, p.out_channels, 1); - UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } -void RegisterConv2dK1xNDelegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, Conv2dK1x7S1, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, - float, ImplType::NEON, K1x7S1)); - - MACE_REGISTER_DELEGATOR( - registry, Conv2dK7x1S1, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, - float, ImplType::NEON, K7x1S1)); - - MACE_REGISTER_DELEGATOR( - registry, Conv2dK1x15S1, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, - float, ImplType::NEON, K1x15S1)); - - MACE_REGISTER_DELEGATOR( - registry, Conv2dK15x1S1, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, - float, ImplType::NEON, K15x1S1)); -} - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/conv_2d_3x3.cc b/mace/ops/arm/fp32/conv_2d_3x3.cc index 84635c7cac26d7c76bd82cd181716c2f5b987ecd..d058e0780b0cd621f8fb348c268717e2445257b0 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3.cc +++ b/mace/ops/arm/fp32/conv_2d_3x3.cc @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,95 +12,47 @@ // See the License for the specific language governing permissions and // limitations under the License. 
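The 1xN/Nx1 kernels above, and the 3x3 kernels that follow, lean on one register-level idiom: load two adjacent four-lane vectors, then build every shifted input window with vextq_f32 instead of issuing overlapping loads (the stride-2 3x3 kernel instead de-interleaves even/odd columns in a single vld2q_f32). A minimal sketch of the sliding window for a 3-tap horizontal filter (hypothetical helper, not from this patch):

#include <arm_neon.h>

// Four outputs of a 3-tap filter from just two loads: vextq_f32 derives the
// x+1 and x+2 windows in registers. Taps live in lanes 0..2 of vf.
inline float32x4_t Filter3TapX4(const float *in, float32x4_t vf) {
  float32x4_t vi0 = vld1q_f32(in);      // x[0..3]
  float32x4_t vi4 = vld1q_f32(in + 4);  // x[4..7]
  float32x4_t vi1 = vextq_f32(vi0, vi4, 1);  // x[1..4]
  float32x4_t vi2 = vextq_f32(vi0, vi4, 2);  // x[2..5]
  float32x4_t vo = vdupq_n_f32(0.f);
  vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf), 0);   // + tap0 * x[i]
  vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf), 1);   // + tap1 * x[i+1]
  vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf), 0);  // + tap2 * x[i+2]
  return vo;
}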
-#include "mace/ops/arm/fp32/conv_2d_3x3.h" - #include #include +#include "mace/ops/arm/base/conv_2d_3x3.h" #include "mace/ops/delegator/conv_2d.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { - -MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { - std::unique_ptr padded_input; - std::unique_ptr padded_output; - ResizeOutAndPadInOut(context, - input, - filter, - output, - 2, - 4, - &padded_input, - &padded_output); - const Tensor *in_tensor = input; - if (padded_input != nullptr) { - in_tensor = padded_input.get(); - } - Tensor *out_tensor = output; - if (padded_output != nullptr) { - out_tensor = padded_output.get(); - } - out_tensor->Clear(); - - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = in_tensor->data(); - auto output_data = out_tensor->mutable_data(); - - auto &in_shape = in_tensor->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { + +template<> +MaceStatus Conv2dK3x3S1::DoCompute( + const ConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { - if (m + 1 < out_channels) { + if (m + 1 < p.out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; + output_data + b * p.out_batch_size + m * p.out_image_size; float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float - *in_ptr0 = input_data + b * in_batch_size + c * in_image_size; + output_data + b * p.out_batch_size + (m + 1) * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { + const float *in_ptr0 = + input_data + b * p.in_batch_size + c * p.in_image_size; const float - *filter_ptr0 = filter_data + m * in_channels * 9 + c * 9; + *filter_ptr0 = filter_data + m * p.in_channels * 9 + c * 9; float *out_ptr1 = out_ptr1_base; const float *in_ptr1 = - input_data + b * in_batch_size + c * in_image_size - + 1 * in_width; + input_data + b * p.in_batch_size + c * p.in_image_size + + 1 * p.in_width; const float *in_ptr2 = - input_data + b * in_batch_size + c * in_image_size - + 2 * in_width; + input_data + b * p.in_batch_size + c * p.in_image_size + + 2 * p.in_width; const float *in_ptr3 = - input_data + b * in_batch_size + c * in_image_size - + 3 * in_width; - const float - *filter_ptr1 = filter_data + (m + 1) * in_channels * 9 + c * 9; + input_data + b * p.in_batch_size + c * 
p.in_image_size + + 3 * p.in_width; + const float *filter_ptr1 = + filter_data + (m + 1) * p.in_channels * 9 + c * 9; #if defined(__aarch64__) float *out_ptr0 = out_ptr0_base; @@ -116,8 +68,8 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, vf11 = vld1q_f32(filter_ptr1 + 3); vf12 = vld1q_f32(filter_ptr1 + 6); - for (index_t h = 0; h + 1 < out_height; h += 2) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + for (index_t h = 0; h + 1 < p.out_height; h += 2) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input (4 height x 3 slide): vi_height_slide float32x4_t vi00, vi01, vi02; // reg count: 14 float32x4_t vi10, vi11, vi12; @@ -150,9 +102,9 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, // load ouptut vo00 = vld1q_f32(out_ptr0); - vo01 = vld1q_f32(out_ptr0 + out_width); + vo01 = vld1q_f32(out_ptr0 + p.out_width); vo10 = vld1q_f32(out_ptr1); - vo11 = vld1q_f32(out_ptr1 + out_width); + vo11 = vld1q_f32(out_ptr1 + p.out_width); // outch 0, height 0 vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); // reg count: 18 @@ -199,9 +151,9 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, vo11 = vfmaq_laneq_f32(vo11, vi32, vf12, 2); vst1q_f32(out_ptr0, vo00); - vst1q_f32(out_ptr0 + out_width, vo01); + vst1q_f32(out_ptr0 + p.out_width, vo01); vst1q_f32(out_ptr1, vo10); - vst1q_f32(out_ptr1 + out_width, vo11); + vst1q_f32(out_ptr1 + p.out_width, vo11); in_ptr0 += 4; in_ptr1 += 4; @@ -212,13 +164,13 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, out_ptr1 += 4; } // w - in_ptr0 += 2 + in_width; - in_ptr1 += 2 + in_width; - in_ptr2 += 2 + in_width; - in_ptr3 += 2 + in_width; + in_ptr0 += 2 + p.in_width; + in_ptr1 += 2 + p.in_width; + in_ptr2 += 2 + p.in_width; + in_ptr3 += 2 + p.in_width; - out_ptr0 += out_width; - out_ptr1 += out_width; + out_ptr0 += p.out_width; + out_ptr1 += p.out_width; } // h #else // arm v7 float *out_ptr0 = out_ptr0_base; @@ -238,8 +190,8 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, vf167 = vld1_f32(filter_ptr1 + 6); vf189 = vld1_f32(filter_ptr1 + 8); - for (index_t h = 0; h + 1 < out_height; h += 2) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + for (index_t h = 0; h + 1 < p.out_height; h += 2) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input (4 height x 3 slide): vi_height_slide float32x4_t vi00, vi01, vi02; // reg count: 14 float32x4_t vi10, vi11, vi12; @@ -272,9 +224,9 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, // load ouptut vo00 = vld1q_f32(out_ptr0); - vo01 = vld1q_f32(out_ptr0 + out_width); + vo01 = vld1q_f32(out_ptr0 + p.out_width); vo10 = vld1q_f32(out_ptr1); - vo11 = vld1q_f32(out_ptr1 + out_width); + vo11 = vld1q_f32(out_ptr1 + p.out_width); // outch 0, height 0 vo00 = vmlaq_lane_f32(vo00, vi00, vf001, 0); @@ -321,9 +273,9 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, vo11 = vmlaq_lane_f32(vo11, vi32, vf189, 0); vst1q_f32(out_ptr0, vo00); - vst1q_f32(out_ptr0 + out_width, vo01); + vst1q_f32(out_ptr0 + p.out_width, vo01); vst1q_f32(out_ptr1, vo10); - vst1q_f32(out_ptr1 + out_width, vo11); + vst1q_f32(out_ptr1 + p.out_width, vo11); in_ptr0 += 4; in_ptr1 += 4; @@ -334,34 +286,34 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, out_ptr1 += 4; } // w - in_ptr0 += 2 + in_width; - in_ptr1 += 2 + in_width; - in_ptr2 += 2 + in_width; - in_ptr3 += 2 + in_width; + in_ptr0 += 2 + p.in_width; + in_ptr1 += 2 + p.in_width; + in_ptr2 += 2 + p.in_width; + in_ptr3 += 2 + p.in_width; - out_ptr0 += out_width; - out_ptr1 += out_width; + 
out_ptr0 += p.out_width; + out_ptr1 += p.out_width; } // h #endif } // c } else { - for (index_t mm = m; mm < out_channels; ++mm) { + for (index_t mm = m; mm < p.out_channels; ++mm) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + mm * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr0 = - input_data + b * in_batch_size + c * in_image_size; + input_data + b * p.in_batch_size + c * p.in_image_size; const float *in_ptr1 = - input_data + b * in_batch_size + c * in_image_size - + 1 * in_width; + input_data + b * p.in_batch_size + c * p.in_image_size + + 1 * p.in_width; const float *in_ptr2 = - input_data + b * in_batch_size + c * in_image_size - + 2 * in_width; + input_data + b * p.in_batch_size + c * p.in_image_size + + 2 * p.in_width; const float *in_ptr3 = - input_data + b * in_batch_size + c * in_image_size - + 3 * in_width; + input_data + b * p.in_batch_size + c * p.in_image_size + + 3 * p.in_width; const float - *filter_ptr0 = filter_data + mm * in_channels * 9 + c * 9; + *filter_ptr0 = filter_data + mm * p.in_channels * 9 + c * 9; #if defined(__aarch64__) float *out_ptr0 = out_ptr0_base; @@ -372,8 +324,8 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, vf01 = vld1q_f32(filter_ptr0 + 3); vf02 = vld1q_f32(filter_ptr0 + 5); - for (index_t h = 0; h + 1 < out_height; h += 2) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + for (index_t h = 0; h + 1 < p.out_height; h += 2) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input (4 height x 3 slide): vi_height_slide float32x4_t vi00, vi01, vi02, vi0n; float32x4_t vi10, vi11, vi12, vi1n; @@ -404,7 +356,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, // load ouptut vo00 = vld1q_f32(out_ptr0); - vo01 = vld1q_f32(out_ptr0 + out_width); + vo01 = vld1q_f32(out_ptr0 + p.out_width); // outch 0, height 0 vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); @@ -429,7 +381,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 3); vst1q_f32(out_ptr0, vo00); - vst1q_f32(out_ptr0 + out_width, vo01); + vst1q_f32(out_ptr0 + p.out_width, vo01); in_ptr0 += 4; in_ptr1 += 4; @@ -439,12 +391,12 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, out_ptr0 += 4; } // w - in_ptr0 += 2 + in_width; - in_ptr1 += 2 + in_width; - in_ptr2 += 2 + in_width; - in_ptr3 += 2 + in_width; + in_ptr0 += 2 + p.in_width; + in_ptr1 += 2 + p.in_width; + in_ptr2 += 2 + p.in_width; + in_ptr3 += 2 + p.in_width; - out_ptr0 += out_width; + out_ptr0 += p.out_width; } // h #else // arm v7 float *out_ptr0 = out_ptr0_base; @@ -457,8 +409,8 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, vf67 = vld1_f32(filter_ptr0 + 6); vf78 = vld1_f32(filter_ptr0 + 7); - for (index_t h = 0; h + 1 < out_height; h += 2) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + for (index_t h = 0; h + 1 < p.out_height; h += 2) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input (4 height x 3 slide): vi_height_slide float32x4_t vi00, vi01, vi02, vi0n; float32x4_t vi10, vi11, vi12, vi1n; @@ -489,7 +441,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, // load ouptut vo00 = vld1q_f32(out_ptr0); - vo01 = vld1q_f32(out_ptr0 + out_width); + vo01 = vld1q_f32(out_ptr0 + p.out_width); // outch 0, height 0 vo00 = vmlaq_lane_f32(vo00, vi00, vf01, 0); @@ -514,7 +466,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, vo01 = 
vmlaq_lane_f32(vo01, vi32, vf78, 1); vst1q_f32(out_ptr0, vo00); - vst1q_f32(out_ptr0 + out_width, vo01); + vst1q_f32(out_ptr0 + p.out_width, vo01); in_ptr0 += 4; in_ptr1 += 4; @@ -524,12 +476,12 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, out_ptr0 += 4; } // w - in_ptr0 += 2 + in_width; - in_ptr1 += 2 + in_width; - in_ptr2 += 2 + in_width; - in_ptr3 += 2 + in_width; + in_ptr0 += 2 + p.in_width; + in_ptr1 += 2 + p.in_width; + in_ptr2 += 2 + p.in_width; + in_ptr3 += 2 + p.in_width; - out_ptr0 += out_width; + out_ptr0 += p.out_width; } // h #endif } // c @@ -537,73 +489,25 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, } // if } // m } // b - }, 0, batch, 1, 0, out_channels, 2); + }, 0, p.batch, 1, 0, p.out_channels, 2); - UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } -MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { - std::unique_ptr padded_input; - std::unique_ptr padded_output; - - ResizeOutAndPadInOut(context, - input, - filter, - output, - 1, - 4, - &padded_input, - &padded_output); - const Tensor *in_tensor = input; - if (padded_input != nullptr) { - in_tensor = padded_input.get(); - } - Tensor *out_tensor = output; - if (padded_output != nullptr) { - out_tensor = padded_output.get(); - } - out_tensor->Clear(); - - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = in_tensor->data(); - auto output_data = out_tensor->mutable_data(); - - auto &in_shape = in_tensor->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { +template<> +MaceStatus Conv2dK3x3S2::DoCompute( + const ConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { - for (index_t c = 0; c < in_channels; ++c) { + for (index_t c = 0; c < p.in_channels; ++c) { const float - *in_base = input_data + b * in_batch_size + c * in_image_size; - const float *filter_ptr = filter_data + m * in_channels * 9 + c * 9; - float - *out_base = output_data + b * out_batch_size + m * out_image_size; + *in_base = input_data + b * p.in_batch_size + c * p.in_image_size; + const float *filter_ptr = filter_data + m * p.in_channels * 9 + c * 9; + float *out_base = + output_data + b * p.out_batch_size + m * p.out_image_size; #if defined(__aarch64__) // load filter (1 outch x 3 height x 3 width): vf_outch_height @@ -612,8 +516,8 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, vf01 = 
vld1q_f32(filter_ptr + 3); vf02 = vld1q_f32(filter_ptr + 5); - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { float32x4x2_t vi0, vi1, vi2; float32x4_t vi0n, vi1n, vi2n; @@ -628,17 +532,17 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, // load input index_t in_h = h * 2; index_t in_w = w * 2; - index_t in_offset = in_h * in_width + in_w; + index_t in_offset = in_h * p.in_width + in_w; vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] - vi1 = vld2q_f32(in_base + in_offset + in_width); - vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); + vi1 = vld2q_f32(in_base + in_offset + p.in_width); + vi2 = vld2q_f32(in_base + in_offset + 2 * p.in_width); vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] - vi1n = vld1q_f32(in_base + in_offset + in_width + 8); - vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); + vi1n = vld1q_f32(in_base + in_offset + p.in_width + 8); + vi2n = vld1q_f32(in_base + in_offset + 2 * p.in_width + 8); // load ouptut - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo = vld1q_f32(out_base + out_offset); vi00 = vi0.val[0]; // [0.2.4.6] @@ -674,8 +578,8 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, vf67 = vld1_f32(filter_ptr + 6); vf78 = vld1_f32(filter_ptr + 7); - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { float32x4x2_t vi0, vi1, vi2; float32x4_t vi0n, vi1n, vi2n; @@ -690,17 +594,17 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, // load input index_t in_h = h * 2; index_t in_w = w * 2; - index_t in_offset = in_h * in_width + in_w; + index_t in_offset = in_h * p.in_width + in_w; vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] - vi1 = vld2q_f32(in_base + in_offset + in_width); - vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); + vi1 = vld2q_f32(in_base + in_offset + p.in_width); + vi2 = vld2q_f32(in_base + in_offset + 2 * p.in_width); vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] - vi1n = vld1q_f32(in_base + in_offset + in_width + 8); - vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); + vi1n = vld1q_f32(in_base + in_offset + p.in_width + 8); + vi2n = vld1q_f32(in_base + in_offset + 2 * p.in_width + 8); // load ouptut - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo = vld1q_f32(out_base + out_offset); vi00 = vi0.val[0]; // [0.2.4.6] @@ -731,24 +635,11 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, } // c } // m } // b - }, 0, batch, 1, 0, out_channels, 1); + }, 0, p.batch, 1, 0, p.out_channels, 1); - UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } -void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, Conv2dK3x3S1, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, - float, ImplType::NEON, K3x3S1)); - MACE_REGISTER_DELEGATOR( - registry, Conv2dK3x3S2, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, - float, ImplType::NEON, K3x3S2)); -} - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc b/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc index 
1ec5205735e9564e5c7516768c77491a394c391d..051d558797730c6e42389db03275c99e2e03c655 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc +++ b/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc @@ -18,8 +18,8 @@ #include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/delegator/conv_2d.h" -#include "mace/utils/memory.h" #include "mace/utils/math.h" +#include "mace/utils/memory.h" namespace mace { namespace ops { diff --git a/mace/ops/arm/fp32/conv_2d_3x3_winograd.h b/mace/ops/arm/fp32/conv_2d_3x3_winograd.h index ec4db81bb2d552615430b81e330ef0ff862c563f..513cc99d4a9eb6538aecf299f5a7e6aaf8b8a309 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3_winograd.h +++ b/mace/ops/arm/fp32/conv_2d_3x3_winograd.h @@ -20,8 +20,8 @@ #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/arm/fp32/conv_2d.h" -#include "mace/ops/arm/fp32/gemm.h" +#include "mace/ops/arm/base/conv_2d.h" +#include "mace/ops/arm/base/gemm.h" #include "mace/public/mace.h" namespace mace { @@ -32,7 +32,7 @@ namespace fp32 { class Conv2dK3x3Winograd : public Conv2dBase { public: explicit Conv2dK3x3Winograd(const delegator::Conv2dParam ¶m) - : Conv2dBase(param), + : Conv2dBase(param, sizeof(float)), gemm_(delegator::GemmParam()), transformed_filter_(nullptr), out_tile_size_(0) {} @@ -94,7 +94,7 @@ class Conv2dK3x3Winograd : public Conv2dBase { index_t tile_count, float *output); - Gemm gemm_; + Gemm gemm_; std::unique_ptr transformed_filter_; index_t out_tile_size_; }; diff --git a/mace/ops/arm/fp32/conv_2d_5x5.cc b/mace/ops/arm/fp32/conv_2d_5x5.cc index 2bfb762520f49cf0a5b5cb82dea11bc2f55fc6a0..4751b0d15439fcd724d2562f96cd33bc07d9d600 100644 --- a/mace/ops/arm/fp32/conv_2d_5x5.cc +++ b/mace/ops/arm/fp32/conv_2d_5x5.cc @@ -15,26 +15,12 @@ #include #include -#include "mace/ops/arm/fp32/conv_2d.h" +#include "mace/ops/arm/base/conv_2d_5x5.h" #include "mace/ops/delegator/conv_2d.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { - -class Conv2dK5x5S1 : public Conv2dBase { - public: - explicit Conv2dK5x5S1(const delegator::Conv2dParam ¶m) - : Conv2dBase(param) {} - virtual ~Conv2dK5x5S1() {} - - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; -}; #define MACE_Conv2dNeonK5x5SnLoadCalc4 \ /* load filter (4 outch x 1 height x 4 width) */ \ @@ -91,89 +77,43 @@ class Conv2dK5x5S1 : public Conv2dBase { vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \ vo0 = vmlaq_lane_f32(vo0, vi4, vf01, 1); -MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { - std::unique_ptr padded_input; - std::unique_ptr padded_output; - ResizeOutAndPadInOut(context, - input, - filter, - output, - 1, - 4, - &padded_input, - &padded_output); - const Tensor *in_tensor = input; - if (padded_input != nullptr) { - in_tensor = padded_input.get(); - } - Tensor *out_tensor = output; - if (padded_output != nullptr) { - out_tensor = padded_output.get(); - } - out_tensor->Clear(); - - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = in_tensor->data(); - auto output_data = out_tensor->mutable_data(); - - auto &in_shape = in_tensor->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const 
index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { +template<> +MaceStatus Conv2dK5x5S1::DoCompute( + const ConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { - if (m + 3 < out_channels) { + if (m + 3 < p.out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; + output_data + b * p.out_batch_size + m * p.out_image_size; float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * p.out_batch_size + (m + 1) * p.out_image_size; float *out_ptr2_base = - output_data + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * p.out_batch_size + (m + 2) * p.out_image_size; float *out_ptr3_base = - output_data + b * out_batch_size + (m + 3) * out_image_size; + output_data + b * p.out_batch_size + (m + 3) * p.out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; + input_data + b * p.in_batch_size + c * p.in_image_size; const float - *filter_ptr0 = filter_data + m * in_channels * 25 + c * 25; + *filter_ptr0 = filter_data + m * p.in_channels * 25 + c * 25; const float *filter_ptr1 = - filter_data + (m + 1) * in_channels * 25 + c * 25; + filter_data + (m + 1) * p.in_channels * 25 + c * 25; const float *filter_ptr2 = - filter_data + (m + 2) * in_channels * 25 + c * 25; + filter_data + (m + 2) * p.in_channels * 25 + c * 25; const float *filter_ptr3 = - filter_data + (m + 3) * in_channels * 25 + c * 25; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + filter_data + (m + 3) * p.in_channels * 25 + c * 25; + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input offset - index_t in_offset = h * in_width + w; + index_t in_offset = h * p.in_width + w; // output (4 outch x 1 height x 4 width): vo_outch_height float32x4_t vo0, vo1, vo2, vo3; // load output - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); vo1 = vld1q_f32(out_ptr1_base + out_offset); vo2 = vld1q_f32(out_ptr2_base + out_offset); @@ -190,7 +130,7 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, MACE_Conv2dNeonK5x5SnLoadCalc4; - in_offset += in_width; + in_offset += p.in_width; filter_ptr0 += 5; filter_ptr1 += 5; filter_ptr2 += 5; @@ -210,22 +150,22 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, } // h } // c } else { - for (index_t mm = m; mm < out_channels; ++mm) { + for (index_t mm = m; mm < p.out_channels; ++mm) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { 
+ output_data + b * p.out_batch_size + mm * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; + input_data + b * p.in_batch_size + c * p.in_image_size; const float - *filter_ptr0 = filter_data + mm * in_channels * 25 + c * 25; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + *filter_ptr0 = filter_data + mm * p.in_channels * 25 + c * 25; + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input offset - index_t in_offset = h * in_width + w; + index_t in_offset = h * p.in_width + w; // output (1 outch x 1 height x 4 width): vo_outch_height float32x4_t vo0; // load output - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); for (index_t r = 0; r < 5; ++r) { // input (3 slide) @@ -239,7 +179,7 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, MACE_Conv2dNeonK5x5SnLoadCalc1; - in_offset += in_width; + in_offset += p.in_width; filter_ptr0 += 5; } // r @@ -252,20 +192,11 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, } // if } // m } // b - }, 0, batch, 1, 0, out_channels, 4); + }, 0, p.batch, 1, 0, p.out_channels, 4); - UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } -void RegisterConv2dK5x5Delegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, Conv2dK5x5S1, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, - float, ImplType::NEON, K5x5S1)); -} - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/conv_2d_7x7.cc b/mace/ops/arm/fp32/conv_2d_7x7.cc index d1f69967a21dd7393dafb196fd02b0c9e0322e4b..1ebb052113388d00bbb1e2191c91580ce3a3e299 100644 --- a/mace/ops/arm/fp32/conv_2d_7x7.cc +++ b/mace/ops/arm/fp32/conv_2d_7x7.cc @@ -12,17 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
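The conv hunks above all follow one refactoring: each per-kernel Compute() body, which repeated the pad/resize, tensor mapping, and shape bookkeeping before its NEON loops, collapses into a DoCompute() that receives a prepared ConvComputeParam. The struct itself is defined in mace/ops/arm/base/conv_2d.h, which is not part of this diff; the sketch below reconstructs its likely shape purely from the p.* accesses in these hunks, so field order and exact types are assumptions.

    // Reconstructed from usage only (p.batch, p.in_channels, p.in_width,
    // p.in_image_size, p.in_batch_size, p.out_channels, p.out_height,
    // p.out_width, p.out_image_size, p.out_batch_size, p.thread_pool).
    // The authoritative definition lives in mace/ops/arm/base/conv_2d.h.
    struct ConvComputeParam {
      const index_t batch;
      const index_t in_channels;
      const index_t in_height;       // implied, though unread in these hunks
      const index_t in_width;
      const index_t out_channels;
      const index_t out_height;
      const index_t out_width;
      const index_t in_image_size;   // in_height * in_width
      const index_t out_image_size;  // out_height * out_width
      const index_t in_batch_size;   // in_channels * in_image_size
      const index_t out_batch_size;  // out_channels * out_image_size
      utils::ThreadPool &thread_pool;
    };

Each kernel previously recomputed these values from in_shape/out_shape; hoisting them into the base removes roughly forty lines of identical boilerplate per Compute() body, as the deletions above show.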
-#include "mace/ops/arm/fp32/conv_2d_7x7.h" - #include #include +#include "mace/ops/arm/base/conv_2d_7x7.h" #include "mace/ops/delegator/conv_2d.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { #define MACE_Conv2dArmv8NeonK7x7SnLoadCalc4 \ /* load filter (4 outch x 1 height x 4 width) */ \ @@ -156,88 +154,43 @@ namespace fp32 { vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); \ vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); -MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { - std::unique_ptr padded_input; - std::unique_ptr padded_output; - ResizeOutAndPadInOut(context, - input, - filter, - output, - 1, - 4, - &padded_input, - &padded_output); - const Tensor *in_tensor = input; - if (padded_input != nullptr) { - in_tensor = padded_input.get(); - } - Tensor *out_tensor = output; - if (padded_output != nullptr) { - out_tensor = padded_output.get(); - } - out_tensor->Clear(); - - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = in_tensor->data(); - auto output_data = out_tensor->mutable_data(); - - auto &in_shape = in_tensor->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { +template<> +MaceStatus Conv2dK7x7S1::DoCompute( + const ConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data) { + + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { - if (m + 3 < out_channels) { + if (m + 3 < p.out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; + output_data + b * p.out_batch_size + m * p.out_image_size; float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * p.out_batch_size + (m + 1) * p.out_image_size; float *out_ptr2_base = - output_data + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * p.out_batch_size + (m + 2) * p.out_image_size; float *out_ptr3_base = - output_data + b * out_batch_size + (m + 3) * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + (m + 3) * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; + input_data + b * p.in_batch_size + c * p.in_image_size; const float - *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; + *filter_ptr0 = filter_data + m * p.in_channels * 49 + c * 49; const float *filter_ptr1 = - filter_data + (m + 1) * in_channels * 
49 + c * 49; + filter_data + (m + 1) * p.in_channels * 49 + c * 49; const float *filter_ptr2 = - filter_data + (m + 2) * in_channels * 49 + c * 49; + filter_data + (m + 2) * p.in_channels * 49 + c * 49; const float *filter_ptr3 = - filter_data + (m + 3) * in_channels * 49 + c * 49; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + filter_data + (m + 3) * p.in_channels * 49 + c * 49; + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input offset - index_t in_offset = h * in_width + w; + index_t in_offset = h * p.in_width + w; // output (4 outch x 1 height x 4 width): vo_outch_height float32x4_t vo0, vo1, vo2, vo3; // load output - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); vo1 = vld1q_f32(out_ptr1_base + out_offset); vo2 = vld1q_f32(out_ptr2_base + out_offset); @@ -262,7 +215,7 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; #endif - in_offset += in_width; + in_offset += p.in_width; filter_ptr0 += 7; filter_ptr1 += 7; filter_ptr2 += 7; @@ -282,22 +235,22 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, } // h } // c } else { - for (index_t mm = m; mm < out_channels; ++mm) { + for (index_t mm = m; mm < p.out_channels; ++mm) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + mm * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; + input_data + b * p.in_batch_size + c * p.in_image_size; const float - *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + *filter_ptr0 = filter_data + mm * p.in_channels * 49 + c * 49; + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input offset - index_t in_offset = h * in_width + w; + index_t in_offset = h * p.in_width + w; // output (1 outch x 1 height x 4 width): vo_outch_height float32x4_t vo0; // load output - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); for (index_t r = 0; r < 7; ++r) { // input (3 slide) @@ -319,7 +272,7 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; #endif - in_offset += in_width; + in_offset += p.in_width; filter_ptr0 += 7; } // r @@ -332,96 +285,49 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, } // if } // m } // b - }, 0, batch, 1, 0, out_channels, 4); + }, 0, p.batch, 1, 0, p.out_channels, 4); - UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } -MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { - std::unique_ptr padded_input; - std::unique_ptr padded_output; - ResizeOutAndPadInOut(context, - input, - filter, - output, - 1, - 4, - &padded_input, - &padded_output); - const Tensor *in_tensor = input; - if (padded_input != nullptr) { - in_tensor = padded_input.get(); - } - Tensor *out_tensor = output; - if (padded_output != nullptr) { - out_tensor = padded_output.get(); - } - out_tensor->Clear(); - - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - 
Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = in_tensor->data(); - auto output_data = out_tensor->mutable_data(); - - auto &in_shape = in_tensor->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { +template<> +MaceStatus Conv2dK7x7S2::DoCompute( + const ConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { - if (m + 3 < out_channels) { + if (m + 3 < p.out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; + output_data + b * p.out_batch_size + m * p.out_image_size; float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * p.out_batch_size + (m + 1) * p.out_image_size; float *out_ptr2_base = - output_data + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * p.out_batch_size + (m + 2) * p.out_image_size; float *out_ptr3_base = - output_data + b * out_batch_size + (m + 3) * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + (m + 3) * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; + input_data + b * p.in_batch_size + c * p.in_image_size; const float - *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; + *filter_ptr0 = filter_data + m * p.in_channels * 49 + c * 49; const float *filter_ptr1 = - filter_data + (m + 1) * in_channels * 49 + c * 49; + filter_data + (m + 1) * p.in_channels * 49 + c * 49; const float *filter_ptr2 = - filter_data + (m + 2) * in_channels * 49 + c * 49; + filter_data + (m + 2) * p.in_channels * 49 + c * 49; const float *filter_ptr3 = - filter_data + (m + 3) * in_channels * 49 + c * 49; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + filter_data + (m + 3) * p.in_channels * 49 + c * 49; + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input offset index_t in_h = h * 2; index_t in_w = w * 2; - index_t in_offset = in_h * in_width + in_w; + index_t in_offset = in_h * p.in_width + in_w; // output (4 outch x 1 height x 4 width): vo_outch_height float32x4_t vo0, vo1, vo2, vo3; // load output - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); vo1 = vld1q_f32(out_ptr1_base + out_offset); vo2 = vld1q_f32(out_ptr2_base + out_offset); @@ -449,7 +355,7 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext 
*context, MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; #endif - in_offset += in_width; + in_offset += p.in_width; filter_ptr0 += 7; filter_ptr1 += 7; filter_ptr2 += 7; @@ -469,24 +375,24 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, } // h } // c } else { - for (index_t mm = m; mm < out_channels; ++mm) { + for (index_t mm = m; mm < p.out_channels; ++mm) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + mm * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; + input_data + b * p.in_batch_size + c * p.in_image_size; const float - *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + *filter_ptr0 = filter_data + mm * p.in_channels * 49 + c * 49; + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input offset index_t in_h = h * 2; index_t in_w = w * 2; - index_t in_offset = in_h * in_width + in_w; + index_t in_offset = in_h * p.in_width + in_w; // output (1 outch x 1 height x 4 width): vo_outch_height float32x4_t vo0; // load ouput - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); for (index_t r = 0; r < 7; ++r) { // input (3 slide) @@ -511,7 +417,7 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; #endif - in_offset += in_width; + in_offset += p.in_width; filter_ptr0 += 7; } // r @@ -524,96 +430,49 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, } // if } // m } // b - }, 0, batch, 1, 0, out_channels, 4); + }, 0, p.batch, 1, 0, p.out_channels, 4); - UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } -MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { - std::unique_ptr padded_input; - std::unique_ptr padded_output; - ResizeOutAndPadInOut(context, - input, - filter, - output, - 1, - 4, - &padded_input, - &padded_output); - const Tensor *in_tensor = input; - if (padded_input != nullptr) { - in_tensor = padded_input.get(); - } - Tensor *out_tensor = output; - if (padded_output != nullptr) { - out_tensor = padded_output.get(); - } - out_tensor->Clear(); - - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = in_tensor->data(); - auto output_data = out_tensor->mutable_data(); - - auto &in_shape = in_tensor->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t 
end1, index_t step1) { +template<> +MaceStatus Conv2dK7x7S3::DoCompute( + const ConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { - if (m + 3 < out_channels) { + if (m + 3 < p.out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; + output_data + b * p.out_batch_size + m * p.out_image_size; float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * p.out_batch_size + (m + 1) * p.out_image_size; float *out_ptr2_base = - output_data + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * p.out_batch_size + (m + 2) * p.out_image_size; float *out_ptr3_base = - output_data + b * out_batch_size + (m + 3) * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + (m + 3) * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; + input_data + b * p.in_batch_size + c * p.in_image_size; const float - *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; + *filter_ptr0 = filter_data + m * p.in_channels * 49 + c * 49; const float *filter_ptr1 = - filter_data + (m + 1) * in_channels * 49 + c * 49; + filter_data + (m + 1) * p.in_channels * 49 + c * 49; const float *filter_ptr2 = - filter_data + (m + 2) * in_channels * 49 + c * 49; + filter_data + (m + 2) * p.in_channels * 49 + c * 49; const float *filter_ptr3 = - filter_data + (m + 3) * in_channels * 49 + c * 49; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + filter_data + (m + 3) * p.in_channels * 49 + c * 49; + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input offset index_t in_h = h * 3; index_t in_w = w * 3; - index_t in_offset = in_h * in_width + in_w; + index_t in_offset = in_h * p.in_width + in_w; // output (4 outch x 1 height x 4 width): vo_outch_height float32x4_t vo0, vo1, vo2, vo3; // load output - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); vo1 = vld1q_f32(out_ptr1_base + out_offset); vo2 = vld1q_f32(out_ptr2_base + out_offset); @@ -641,7 +500,7 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; #endif - in_offset += in_width; + in_offset += p.in_width; filter_ptr0 += 7; filter_ptr1 += 7; filter_ptr2 += 7; @@ -661,24 +520,24 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, } // h } // c } else { - for (index_t mm = m; mm < out_channels; ++mm) { + for (index_t mm = m; mm < p.out_channels; ++mm) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + mm * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; + input_data + b * p.in_batch_size + c * p.in_image_size; const float - *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + *filter_ptr0 = filter_data + mm * p.in_channels * 49 + c * 
49; + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input offset index_t in_h = h * 3; index_t in_w = w * 3; - index_t in_offset = in_h * in_width + in_w; + index_t in_offset = in_h * p.in_width + in_w; // output (1 outch x 1 height x 4 width): vo_outch_height float32x4_t vo0; // load output - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); for (index_t r = 0; r < 7; ++r) { // input (3 slide) @@ -703,7 +562,7 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; #endif - in_offset += in_width; + in_offset += p.in_width; filter_ptr0 += 7; } // r @@ -716,28 +575,11 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, } // if } // m } // b - }, 0, batch, 1, 0, out_channels, 4); + }, 0, p.batch, 1, 0, p.out_channels, 4); - UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } -void RegisterConv2dK7x7Delegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, Conv2dK7x7S1, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, - float, ImplType::NEON, K7x7S1)); - MACE_REGISTER_DELEGATOR( - registry, Conv2dK7x7S2, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, - float, ImplType::NEON, K7x7S2)); - MACE_REGISTER_DELEGATOR( - registry, Conv2dK7x7S3, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, - float, ImplType::NEON, K7x7S3)); -} - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/conv_general.cc b/mace/ops/arm/fp32/conv_2d_general.cc similarity index 61% rename from mace/ops/arm/fp32/conv_general.cc rename to mace/ops/arm/fp32/conv_2d_general.cc index d58a1725e507e27af12bcb0b0d64821c36769829..6f6a1ff5693a60c3cdf6a754a531fbac14d4be01 100644 --- a/mace/ops/arm/fp32/conv_general.cc +++ b/mace/ops/arm/fp32/conv_2d_general.cc @@ -12,118 +12,59 @@ // See the License for the specific language governing permissions and // limitations under the License. 
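With the boilerplate hoisted, adding a new stride variant reduces to one specialization plus the NEON loop body. A minimal skeleton follows, assuming the kernel classes are now templated on the element type with these hunks supplying the float specializations; Conv2dK9x9S1 is a made-up name for illustration, not part of this patch.

    template <>
    MaceStatus Conv2dK9x9S1<float>::DoCompute(
        const ConvComputeParam &p, const float *filter_data,
        const float *input_data, float *output_data) {
      p.thread_pool.Compute2D(
          [=](index_t start0, index_t end0, index_t step0,
              index_t start1, index_t end1, index_t step1) {
            for (index_t b = start0; b < end0; b += step0) {
              for (index_t m = start1; m < end1; m += step1) {
                // NEON inner loops go here, indexing through the p.*
                // strides exactly as the K3x3/K5x5/K7x7 kernels above do.
              }
            }
          },
          0, p.batch, 1, 0, p.out_channels, 4);  // tile 4 output channels
      return MaceStatus::MACE_SUCCESS;
    }

Conv2dGeneral, refactored next, is the one exception: since its filter extent is not fixed at compile time, its DoCompute additionally receives filter_shape and derives filter_height, filter_width, and filter_size from it.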
-#include "mace/ops/arm/fp32/conv_2d.h" - #include +#include "mace/ops/arm/base/conv_2d_general.h" #include "mace/ops/delegator/conv_2d.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { - -class Conv2dGeneral : public Conv2dBase { - public: - explicit Conv2dGeneral(const delegator::Conv2dParam ¶m) - : Conv2dBase(param) {} - virtual ~Conv2dGeneral() {} - - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; -}; - -MaceStatus Conv2dGeneral::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { - std::unique_ptr padded_input; - std::unique_ptr padded_output; - - ResizeOutAndPadInOut(context, - input, - filter, - output, - 1, - 4, - &padded_input, - &padded_output); - - const Tensor *in_tensor = input; - if (padded_input != nullptr) { - in_tensor = padded_input.get(); - } - Tensor *out_tensor = output; - if (padded_output != nullptr) { - out_tensor = padded_output.get(); - } - out_tensor->Clear(); - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = in_tensor->data(); - auto output_data = out_tensor->mutable_data(); - - auto &in_shape = in_tensor->shape(); - auto &out_shape = out_tensor->shape(); - auto &filter_shape = filter->shape(); - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; +template<> +MaceStatus Conv2dGeneral::DoCompute( + const ConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data, + const std::vector &filter_shape) { const index_t filter_height = filter_shape[2]; const index_t filter_width = filter_shape[3]; - - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; const index_t filter_size = filter_height * filter_width; - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { const int stride_h = strides_[0]; const int stride_w = strides_[1]; const int dilation_h = dilations_[0]; const int dilation_w = dilations_[1]; - if (m + 3 < out_channels) { + if (m + 3 < p.out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = out_ptr0_base + out_image_size; - float *out_ptr2_base = out_ptr1_base + out_image_size; - float *out_ptr3_base = out_ptr2_base + out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + m * p.out_image_size; + float *out_ptr1_base = out_ptr0_base + p.out_image_size; + float *out_ptr2_base = out_ptr1_base + p.out_image_size; + float *out_ptr3_base = out_ptr2_base + p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b 
* in_batch_size + c * in_image_size; + input_data + b * p.in_batch_size + c * p.in_image_size; const float *filter_ptr0 = - filter_data + m * in_channels * filter_size + c * filter_size; - const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size; - const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size; - const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + filter_data + m * p.in_channels * filter_size + c * filter_size; + const float *filter_ptr1 = + filter_ptr0 + p.in_channels * filter_size; + const float *filter_ptr2 = + filter_ptr1 + p.in_channels * filter_size; + const float *filter_ptr3 = + filter_ptr2 + p.in_channels * filter_size; + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input offset index_t ih = h * stride_h; index_t iw = w * stride_w; - index_t in_offset = ih * in_width + iw; + index_t in_offset = ih * p.in_width + iw; // output (4 outch x 1 height x 4 width): vo_outch_height float vo0[4], vo1[4], vo2[4], vo3[4]; // load output - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; for (index_t ow = 0; ow < 4; ++ow) { vo0[ow] = out_ptr0_base[out_offset + ow]; vo1[ow] = out_ptr1_base[out_offset + ow]; @@ -171,7 +112,7 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, + kw * dilation_w] * filter_ptr3[kw]; } // kw - in_offset += dilation_h * in_width; + in_offset += dilation_h * p.in_width; filter_ptr0 += filter_width; filter_ptr1 += filter_width; filter_ptr2 += filter_width; @@ -193,26 +134,26 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, } // h } // c } else { - for (index_t mm = m; mm < out_channels; ++mm) { + for (index_t mm = m; mm < p.out_channels; ++mm) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + mm * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; + input_data + b * p.in_batch_size + c * p.in_image_size; const float *filter_ptr0 = - filter_data + mm * in_channels * filter_size + filter_data + mm * p.in_channels * filter_size + c * filter_size; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input offset index_t ih = h * stride_h; index_t iw = w * stride_w; - index_t in_offset = ih * in_width + iw; + index_t in_offset = ih * p.in_width + iw; // output (1 outch x 1 height x 4 width): vo_outch_height float vo0[4]; // load output - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; for (index_t ow = 0; ow < 4; ++ow) { vo0[ow] = out_ptr0_base[out_offset + ow]; } @@ -231,7 +172,7 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, + kw * dilation_w] * filter_ptr0[kw]; } // kw - in_offset += dilation_h * in_width; + in_offset += dilation_h * p.in_width; filter_ptr0 += filter_width; } // kh @@ -246,19 +187,11 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, } // if } // m } // b - }, 0, batch, 1, 0, out_channels, 4); + }, 0, p.batch, 1, 0, p.out_channels, 4); - UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } -void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry) { - 
MACE_REGISTER_DELEGATOR( - registry, Conv2dGeneral, delegator::Conv2dParam, - MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, float, ImplType::NEON)); -} - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/deconv_2d.h b/mace/ops/arm/fp32/deconv_2d.h deleted file mode 100644 index 128d5858beee4a8530ed3f775536fb3d1652c44b..0000000000000000000000000000000000000000 --- a/mace/ops/arm/fp32/deconv_2d.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_ARM_FP32_DECONV_2D_H_ -#define MACE_OPS_ARM_FP32_DECONV_2D_H_ - -#include -#include - -#include "mace/core/ops/op_context.h" -#include "mace/core/tensor.h" -#include "mace/core/types.h" -#include "mace/ops/arm/fp32/gemm.h" -#include "mace/ops/common/conv_pool_2d_util.h" -#include "mace/ops/delegator/deconv_2d.h" -#include "mace/public/mace.h" - -namespace mace { -namespace ops { -namespace arm { -namespace fp32 { - -class Deconv2dBase : public delegator::Deconv2d { - public: - explicit Deconv2dBase(const delegator::Deconv2dParam ¶m) - : delegator::Deconv2d(param), - group_(param.group_) {} - - virtual ~Deconv2dBase() = default; - - protected: - MaceStatus ResizeOutAndPadOut(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output, - std::vector *out_pad_size, - std::unique_ptr *padded_output); - - void UnPadOutput(const Tensor &src, - const std::vector &out_pad_size, - Tensor *dst); - index_t group_; -}; - -} // namespace fp32 -} // namespace arm -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ARM_FP32_DECONV_2D_H_ diff --git a/mace/ops/arm/fp32/deconv_2d_2x2.cc b/mace/ops/arm/fp32/deconv_2d_2x2.cc index 57784e638f0da27575020b50a63e3080674c5c6f..2a6ca40d624e27a1d0cd531745685a45de1c5264 100644 --- a/mace/ops/arm/fp32/deconv_2d_2x2.cc +++ b/mace/ops/arm/fp32/deconv_2d_2x2.cc @@ -12,74 +12,33 @@ // See the License for the specific language governing permissions and // limitations under the License. 
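Each fp32 translation unit also loses its Register*Delegator function (RegisterConv2dK3x3Delegator, RegisterConv2dK5x5Delegator, RegisterConv2dK7x7Delegator, and RegisterConv2dGeneralDelegator in the deletions above). The diff does not show where these registrations land; a consolidated registrar under the new arm/base layer would plausibly look like the sketch below, where the function name and location are assumptions and only the macro invocations are taken from the deleted code.

    // Hypothetical consolidated registrar; only the MACE_REGISTER_DELEGATOR /
    // MACE_DELEGATOR_KEY(_EX) calls are carried over from the deletions above.
    void RegisterConv2dDelegators(OpDelegatorRegistry *registry) {
      MACE_REGISTER_DELEGATOR(
          registry, Conv2dGeneral<float>, delegator::Conv2dParam,
          MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, float, ImplType::NEON));
      MACE_REGISTER_DELEGATOR(
          registry, Conv2dK3x3S1<float>, delegator::Conv2dParam,
          MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
                                float, ImplType::NEON, K3x3S1));
      // ... likewise for K3x3S2, K5x5S1, K7x7S1, K7x7S2, K7x7S3.
    }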
-#include "mace/ops/arm/fp32/deconv_2d_2x2.h" - #include + +#include "mace/ops/arm/base/deconv_2d_2x2.h" #include "mace/ops/arm/fp32/common_neon.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { - -MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - - const index_t out_img_size = outh * outw; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { + +template<> +MaceStatus Deconv2dK2x2S1::DoCompute( + const DeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t oc = start1; oc < end1; oc += step1) { - if (oc + 1 < outch) { - float *out_base0 = padded_out_data + (b * outch + oc) * out_img_size; - float *out_base1 = out_base0 + out_img_size; - for (index_t ic = 0; ic < inch; ++ic) { - const float *input_base = input_data + (b * inch + ic) * h * w; - const float *kernel_base0 = filter_data + (oc * inch + ic) * 4; - const float *kernel_base1 = kernel_base0 + inch * 4; + if (oc + 1 < p.out_channels) { + float *out_base0 = + padded_out_data + (b * p.out_channels + oc) * p.out_img_size; + float *out_base1 = out_base0 + p.out_img_size; + for (index_t ic = 0; ic < p.in_channels; ++ic) { + const float *input_base = input_data + + (b * p.in_channels + ic) * p.in_height * p.in_width; + const float *kernel_base0 = + filter_data + (oc * p.in_channels + ic) * 4; + const float *kernel_base1 = kernel_base0 + p.in_channels * 4; const float *in = input_base; // output channel 0 const float *k0 = kernel_base0; @@ -89,18 +48,18 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context, float32x4_t k0_vec = vld1q_f32(k0); float32x4_t k1_vec = vld1q_f32(k1); - for (index_t i = 0; i < h; ++i) { - float *out_row_base0 = out_base0 + i * outw; + for (index_t i = 0; i < p.in_height; ++i) { + float *out_row_base0 = out_base0 + i * p.out_width; float *out_row0_0 = out_row_base0; - float *out_row0_1 = out_row_base0 + outw; + float *out_row0_1 = out_row_base0 + p.out_width; - float *out_row_base1 = out_base1 + i * outw; + float *out_row_base1 = out_base1 + i * p.out_width; float *out_row1_0 = out_row_base1; - float *out_row1_1 = out_row_base1 + outw; + float *out_row1_1 = out_row_base1 + p.out_width; index_t j = 0; - for (; j + 3 < w; j 
+= 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); float32x4_t out00, out01, out02, out03; @@ -145,7 +104,7 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context, out_row1_1 += 4; } - for (; j < w; ++j) { + for (; j < p.in_width; ++j) { float val = in[0]; for (int k = 0; k < 2; ++k) { out_row0_0[k] += val * k0[k]; @@ -162,23 +121,26 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context, } } } else { - float *out_base0 = padded_out_data + (b * outch + oc) * outh * outw; - for (index_t ic = 0; ic < inch; ++ic) { - const float *input_base = input_data + (b * inch + ic) * h * w; - const float *kernel_base0 = filter_data + (oc * inch + ic) * 4; + float *out_base0 = padded_out_data + + (b * p.out_channels + oc) * p.out_height * p.out_width; + for (index_t ic = 0; ic < p.in_channels; ++ic) { + const float *input_base = input_data + + (b * p.in_channels + ic) * p.in_height * p.in_width; + const float *kernel_base0 = + filter_data + (oc * p.in_channels + ic) * 4; const float *in = input_base; const float *k0 = kernel_base0; // load filter float32x4_t k0_vec = vld1q_f32(k0); - for (index_t i = 0; i < h; ++i) { - float *out_row_base0 = out_base0 + i * outw; + for (index_t i = 0; i < p.in_height; ++i) { + float *out_row_base0 = out_base0 + i * p.out_width; float *out_row0_0 = out_row_base0; - float *out_row0_1 = out_row_base0 + outw; + float *out_row0_1 = out_row_base0 + p.out_width; index_t j = 0; - for (; j + 3 < w; j += 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); float32x4_t out00, out01, out02, out03; @@ -203,7 +165,7 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context, out_row0_1 += 4; } - for (; j < w; ++j) { + for (; j < p.in_width; ++j) { float val = in[0]; for (int k = 0; k < 2; ++k) { out_row0_0[k] += val * k0[k]; @@ -218,79 +180,39 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, outch, 2); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, p.out_channels, 2); return MaceStatus::MACE_SUCCESS; } -MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t out_img_size = outh * outw; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { +template<> +MaceStatus Deconv2dK2x2S2::DoCompute( + const DeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) 
{ + + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t oc = start1; oc < end1; oc += step1) { - float *out_base = padded_out_data + (b * outch + oc) * out_img_size; - for (index_t ic = 0; ic < inch; ++ic) { - const float *input_base = input_data + (b * inch + ic) * h * w; - const float *kernel_base = filter_data + (oc * inch + ic) * 4; + float *out_base = + padded_out_data + (b * p.out_channels + oc) * p.out_img_size; + for (index_t ic = 0; ic < p.in_channels; ++ic) { + const float *input_base = input_data + + (b * p.in_channels + ic) * p.in_height * p.in_width; + const float *kernel_base = + filter_data + (oc * p.in_channels + ic) * 4; const float *in = input_base; const float *k0 = kernel_base; float32x4_t k0_vec = vld1q_f32(k0); - for (index_t i = 0; i < h; ++i) { - float *out_row_base = out_base + i * 2 * outw; + for (index_t i = 0; i < p.in_height; ++i) { + float *out_row_base = out_base + i * 2 * p.out_width; float *out_row_0 = out_row_base; - float *out_row_1 = out_row_0 + outw; + float *out_row_1 = out_row_0 + p.out_width; index_t j = 0; - for (; j + 3 < w; j += 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); // out row 0 @@ -314,7 +236,7 @@ MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context, out_row_1 += 8; } - for (; j < w; ++j) { + for (; j < p.in_width; ++j) { float val = in[0]; for (int k = 0; k < 2; ++k) { out_row_0[k] += val * k0[k]; @@ -328,25 +250,11 @@ MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, outch, 1); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, p.out_channels, 1); return MaceStatus::MACE_SUCCESS; } -void RegisterDeconv2dK2x2Delegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, Deconv2dK2x2S1, delegator::Deconv2dParam, - MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, - float, ImplType::NEON, K2x2S1)); - MACE_REGISTER_DELEGATOR( - registry, Deconv2dK2x2S2, delegator::Deconv2dParam, - MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, - float, ImplType::NEON, K2x2S2)); -} - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/deconv_2d_3x3.cc b/mace/ops/arm/fp32/deconv_2d_3x3.cc index d0b49e0d296d89ca2dc12757dd8feda69ef25a67..4c00f07d28634254b6deef1479070054d07074c3 100644 --- a/mace/ops/arm/fp32/deconv_2d_3x3.cc +++ b/mace/ops/arm/fp32/deconv_2d_3x3.cc @@ -12,73 +12,33 @@ // See the License for the specific language governing permissions and // limitations under the License. 
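The deconvolution kernels get the same treatment through a DeconvComputeParam. As with the conv struct, its definition sits outside this diff; the sketch below infers it from the p.* accesses in the Deconv2dK2x2 hunks above. Note that deconv iterates over the input extent, so p.in_height and p.in_width are read in the loop bounds, and p.out_img_size replaces each kernel's local outh * outw.

    // Inferred from usage; the authoritative definition presumably sits in
    // mace/ops/arm/base/deconv_2d.h, replacing the deleted fp32 header.
    struct DeconvComputeParam {
      const index_t batch;
      const index_t in_channels;
      const index_t in_height;
      const index_t in_width;
      const index_t out_channels;
      const index_t out_height;
      const index_t out_width;
      const index_t out_img_size;  // out_height * out_width
      utils::ThreadPool &thread_pool;
    };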
-#include "mace/ops/arm/fp32/deconv_2d_3x3.h" - #include + +#include "mace/ops/arm/base/deconv_2d_3x3.h" #include "mace/ops/arm/fp32/common_neon.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { - -MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = out_shape[0]; - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - - const index_t out_img_size = outh * outw; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { + +template<> +MaceStatus Deconv2dK3x3S1::DoCompute( + const DeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t oc = start1; oc < end1; oc += step1) { - if (oc + 1 < outch) { - float *out_base0 = padded_out_data + (b * outch + oc) * out_img_size; - float *out_base1 = out_base0 + out_img_size; - for (index_t ic = 0; ic < inch; ++ic) { - const float *input_base = input_data + (b * inch + ic) * h * w; - const float *kernel_base0 = filter_data + (oc * inch + ic) * 9; - const float *kernel_base1 = kernel_base0 + inch * 9; + if (oc + 1 < p.out_channels) { + float *out_base0 = + padded_out_data + (b * p.out_channels + oc) * p.out_img_size; + float *out_base1 = out_base0 + p.out_img_size; + for (index_t ic = 0; ic < p.in_channels; ++ic) { + const float *input_base = input_data + + (b * p.in_channels + ic) * p.in_height * p.in_width; + const float *kernel_base0 = + filter_data + (oc * p.in_channels + ic) * 9; + const float *kernel_base1 = kernel_base0 + p.in_channels * 9; const float *in = input_base; // output channel 0 @@ -102,20 +62,20 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, k11_vec = vld1q_f32(k1_1); k12_vec = vld1q_f32(k1_2); - for (index_t i = 0; i < h; ++i) { - float *out_row_base0 = out_base0 + i * outw; + for (index_t i = 0; i < p.in_height; ++i) { + float *out_row_base0 = out_base0 + i * p.out_width; float *out_row0_0 = out_row_base0; - float *out_row0_1 = out_row_base0 + outw; - float *out_row0_2 = out_row_base0 + 2 * outw; + float *out_row0_1 = out_row_base0 + p.out_width; + float *out_row0_2 = out_row_base0 + 2 * p.out_width; - float *out_row_base1 = out_base1 + i * outw; + float *out_row_base1 = out_base1 + i * p.out_width; float *out_row1_0 = out_row_base1; - float *out_row1_1 = out_row_base1 + outw; - float *out_row1_2 = 
out_row_base1 + 2 * outw; + float *out_row1_1 = out_row_base1 + p.out_width; + float *out_row1_2 = out_row_base1 + 2 * p.out_width; index_t j = 0; - for (; j + 3 < w; j += 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); float32x4_t out00, out01, out02; @@ -203,7 +163,7 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, out_row1_2 += 4; } - for (; j < w; ++j) { + for (; j < p.in_width; ++j) { float val = in[0]; for (int k = 0; k < 3; ++k) { out_row0_0[k] += val * k0_0[k]; @@ -224,10 +184,13 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, } } } else { - float *out_base0 = padded_out_data + (b * outch + oc) * outh * outw; - for (index_t ic = 0; ic < inch; ++ic) { - const float *input_base = input_data + (b * inch + ic) * h * w; - const float *kernel_base0 = filter_data + (oc * inch + ic) * 9; + float *out_base0 = padded_out_data + + (b * p.out_channels + oc) * p.out_height * p.out_width; + for (index_t ic = 0; ic < p.in_channels; ++ic) { + const float *input_base = input_data + + (b * p.in_channels + ic) * p.in_height * p.in_width; + const float *kernel_base0 = + filter_data + (oc * p.in_channels + ic) * 9; const float *in = input_base; const float *k0_0 = kernel_base0; const float *k0_1 = kernel_base0 + 3; @@ -238,14 +201,14 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, float32x4_t k01_vec = vld1q_f32(k0_1); float32x4_t k02_vec = vld1q_f32(k0_2); - for (index_t i = 0; i < h; ++i) { - float *out_row_base0 = out_base0 + i * outw; + for (index_t i = 0; i < p.in_height; ++i) { + float *out_row_base0 = out_base0 + i * p.out_width; float *out_row0_0 = out_row_base0; - float *out_row0_1 = out_row_base0 + outw; - float *out_row0_2 = out_row_base0 + 2 * outw; + float *out_row0_1 = out_row_base0 + p.out_width; + float *out_row0_2 = out_row_base0 + 2 * p.out_width; index_t j = 0; - for (; j + 3 < w; j += 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); float32x4_t out00, out01, out02; @@ -294,7 +257,7 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, out_row0_2 += 4; } - for (; j < w; ++j) { + for (; j < p.in_width; ++j) { float val = in[0]; for (int k = 0; k < 3; ++k) { out_row0_0[k] += val * k0_0[k]; @@ -311,67 +274,26 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, outch, 2); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, p.out_channels, 2); return MaceStatus::MACE_SUCCESS; } -MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - 
const index_t out_img_size = outh * outw; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { +template<> +MaceStatus Deconv2dK3x3S2::DoCompute( + const DeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t oc = start1; oc < end1; oc += step1) { - float *out_base = padded_out_data + (b * outch + oc) * out_img_size; - for (index_t ic = 0; ic < inch; ++ic) { - const float *input_base = input_data + (b * inch + ic) * h * w; - const float *kernel_base = filter_data + (oc * inch + ic) * 9; + float *out_base = + padded_out_data + (b * p.out_channels + oc) * p.out_img_size; + for (index_t ic = 0; ic < p.in_channels; ++ic) { + const float *input_base = + input_data + (b * p.in_channels + ic) * p.in_height * p.in_width; + const float *kernel_base = + filter_data + (oc * p.in_channels + ic) * 9; const float *in = input_base; const float *k0 = kernel_base; @@ -382,15 +304,15 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context, float32x4_t k1_vec = vld1q_f32(k1); float32x4_t k2_vec = vld1q_f32(k2); - for (index_t i = 0; i < h; ++i) { - float *out_row_base = out_base + i * 2 * outw; + for (index_t i = 0; i < p.in_height; ++i) { + float *out_row_base = out_base + i * 2 * p.out_width; float *out_row_0 = out_row_base; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; + float *out_row_1 = out_row_0 + p.out_width; + float *out_row_2 = out_row_1 + p.out_width; index_t j = 0; - for (index_t n = 0; n + 9 < outw; n += 8) { + for (index_t n = 0; n + 9 < p.out_width; n += 8) { float32x4_t in_vec = vld1q_f32(in); // out row 0 @@ -439,7 +361,7 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context, j += 4; } - for (; j < w; ++j) { + for (; j < p.in_width; ++j) { float val = in[0]; for (int k = 0; k < 3; ++k) { @@ -457,25 +379,11 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, outch, 1); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, p.out_channels, 1); return MaceStatus::MACE_SUCCESS; } -void RegisterDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, Deconv2dK3x3S1, delegator::Deconv2dParam, - MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, - float, ImplType::NEON, K3x3S1)); - MACE_REGISTER_DELEGATOR( - registry, Deconv2dK3x3S2, delegator::Deconv2dParam, - MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, - float, ImplType::NEON, K3x3S2)); -} - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/deconv_2d_4x4.cc b/mace/ops/arm/fp32/deconv_2d_4x4.cc index 4a84e0394bf07764103c7c2c6c23f8cc79a31d5b..2dbe4d3e9f226c71a5ba0c3c26f6ef0f0e40210b 100644 --- a/mace/ops/arm/fp32/deconv_2d_4x4.cc +++ b/mace/ops/arm/fp32/deconv_2d_4x4.cc @@ -12,78 +12,39 @@ // See the License for the specific language governing permissions and // limitations under the License. 
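For reference, every NEON body in these deconv kernels computes the same scatter-add: each input pixel is multiplied by the full KxK filter and accumulated into a KxK window of the padded output, with the window origin advanced by the stride; the vector loops merely process four input columns (and, on the two-channel paths, two output channels) per iteration. A plain scalar sketch of the 3x3 stride-1 case, equivalent to the per-element tail loops above (index_t is MACE's int64 tensor index type):

inline void Deconv2dK3x3S1Scalar(const float *in, const float *filter3x3,
                                 index_t in_height, index_t in_width,
                                 index_t out_width, float *padded_out) {
  for (index_t i = 0; i < in_height; ++i) {
    for (index_t j = 0; j < in_width; ++j) {
      const float val = in[i * in_width + j];
      // One input pixel contributes val * filter to the 3x3 output window
      // whose top-left corner is (i, j); stride 2 would place it at (2i, 2j).
      for (int r = 0; r < 3; ++r) {
        for (int c = 0; c < 3; ++c) {
          padded_out[(i + r) * out_width + (j + c)] +=
              val * filter3x3[r * 3 + c];
        }
      }
    }
  }
}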
-#include "mace/ops/arm/fp32/deconv_2d_4x4.h" - #include + +#include "mace/ops/arm/base/deconv_2d_4x4.h" #include "mace/ops/arm/fp32/common_neon.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { - -MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - - const index_t out_img_size = outh * outw; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { + +template<> +MaceStatus Deconv2dK4x4S1::DoCompute( + const DeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t oc = start1; oc < end1; oc += step1) { - if (oc + 1 < outch) { - float *out_base = padded_out_data + (b * outch + oc) * out_img_size; - float *out_base1 = out_base + out_img_size; - for (index_t q = 0; q < inch; q++) { - const float *input_base = input_data + (b * inch + q) * h * w; + if (oc + 1 < p.out_channels) { + float *out_base = + padded_out_data + (b * p.out_channels + oc) * p.out_img_size; + float *out_base1 = out_base + p.out_img_size; + for (index_t q = 0; q < p.in_channels; q++) { + const float *input_base = input_data + + (b * p.in_channels + q) * p.in_height * p.in_width; const float *in = input_base; - const float *kernel_base = filter_data + (oc * inch + q) * 16; + const float *kernel_base = + filter_data + (oc * p.in_channels + q) * 16; const float *k0 = kernel_base; const float *k1 = kernel_base + 4; const float *k2 = kernel_base + 8; const float *k3 = kernel_base + 12; - const float *kernel_base1 = kernel_base + inch * 16; + const float *kernel_base1 = kernel_base + p.in_channels * 16; const float *k10 = kernel_base1; const float *k11 = kernel_base1 + 4; const float *k12 = kernel_base1 + 8; @@ -99,24 +60,24 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, float32x4_t k12_vec = vld1q_f32(k12); float32x4_t k13_vec = vld1q_f32(k13); - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + i * outw; + for (index_t i = 0; i < p.in_height; i++) { + float *out_row = out_base + i * p.out_width; float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; + float *out_row_1 = out_row_0 + p.out_width; + float *out_row_2 = out_row_1 + 
p.out_width; + float *out_row_3 = out_row_2 + p.out_width; - float *out_row1 = out_base1 + i * outw; + float *out_row1 = out_base1 + i * p.out_width; float *out_row1_0 = out_row1; - float *out_row1_1 = out_row1_0 + outw; - float *out_row1_2 = out_row1_1 + outw; - float *out_row1_3 = out_row1_2 + outw; + float *out_row1_1 = out_row1_0 + p.out_width; + float *out_row1_2 = out_row1_1 + p.out_width; + float *out_row1_3 = out_row1_2 + p.out_width; index_t j = 0; - for (; j + 3 < w; j += 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); float32x4_t out00, out01, out02, out03; float32x4_t out10, out11, out12, out13; @@ -260,7 +221,7 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, out_row1_3 += 4; } - for (; j < w; j++) { + for (; j < p.in_width; j++) { float val = in[0]; for (int k = 0; k < 4; ++k) { out_row_0[k] += val * k0[k]; @@ -285,10 +246,13 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, } } } else { - float *out_base = padded_out_data + (b * outch + oc) * out_img_size; - for (index_t q = 0; q < inch; q++) { - const float *input_base = input_data + (b * inch + q) * h * w; - const float *kernel_base = filter_data + (oc * inch + q) * 16; + float *out_base = + padded_out_data + (b * p.out_channels + oc) * p.out_img_size; + for (index_t q = 0; q < p.in_channels; q++) { + const float *input_base = input_data + + (b * p.in_channels + q) * p.in_height * p.in_width; + const float *kernel_base = + filter_data + (oc * p.in_channels + q) * 16; const float *in = input_base; const float *k0 = kernel_base; const float *k1 = kernel_base + 4; @@ -300,15 +264,15 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k3_vec = vld1q_f32(k3); - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + i * outw; + for (index_t i = 0; i < p.in_height; i++) { + float *out_row = out_base + i * p.out_width; float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; + float *out_row_1 = out_row_0 + p.out_width; + float *out_row_2 = out_row_1 + p.out_width; + float *out_row_3 = out_row_2 + p.out_width; int j = 0; - for (; j + 3 < w; j += 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); float32x4_t out00 = vld1q_f32(out_row_0); @@ -382,7 +346,7 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, out_row_3 += 4; } - for (; j < w; j++) { + for (; j < p.in_width; j++) { float val = in[0]; for (int k = 0; k < 4; ++k) { out_row_0[k] += val * k0[k]; @@ -401,65 +365,25 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, outch, 2); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, p.out_channels, 2); return MaceStatus::MACE_SUCCESS; } -MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = 
out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t out_img_size = outh * outw; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { +template<> +MaceStatus Deconv2dK4x4S2::DoCompute( + const DeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { - for (index_t p = start1; p < end1; p += step1) { - float *out_base = padded_out_data + (b * outch + p) * out_img_size; - for (index_t q = 0; q < inch; q++) { - const float *input_base = input_data + (b * inch + q) * h * w; - const float *kernel_base = filter_data + (p * inch + q) * 16; + for (index_t k = start1; k < end1; k += step1) { + float *out_base = + padded_out_data + (b * p.out_channels + k) * p.out_img_size; + for (index_t q = 0; q < p.in_channels; q++) { + const float *input_base = input_data + + (b * p.in_channels + q) * p.in_height * p.in_width; + const float *kernel_base = filter_data + (k * p.in_channels + q) * 16; const float *in = input_base; const float *k0 = kernel_base; @@ -472,17 +396,17 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context, float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k3_vec = vld1q_f32(k3); - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + 2 * i * outw; + for (index_t i = 0; i < p.in_height; i++) { + float *out_row = out_base + 2 * i * p.out_width; float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; + float *out_row_1 = out_row_0 + p.out_width; + float *out_row_2 = out_row_1 + p.out_width; + float *out_row_3 = out_row_2 + p.out_width; index_t j = 0; - for (index_t n = 0; n + 9 < outw; n += 8) { + for (index_t n = 0; n + 9 < p.out_width; n += 8) { float32x4_t in_vec = vld1q_f32(in); // row 0 @@ -549,7 +473,7 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context, j += 4; } - for (; j < w; j++) { + for (; j < p.in_width; j++) { float val = in[0]; for (int k = 0; k < 4; ++k) { out_row_0[k] += val * k0[k]; @@ -567,25 +491,11 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, outch, 1); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, p.out_channels, 1); return MaceStatus::MACE_SUCCESS; } -void RegisterDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, Deconv2dK4x4S1, delegator::Deconv2dParam, - MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, - float, ImplType::NEON, K4x4S1)); - MACE_REGISTER_DELEGATOR( - registry, Deconv2dK4x4S2, delegator::Deconv2dParam, - MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, - float, ImplType::NEON, K4x4S2)); -} - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc b/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc index 
cc0ab45a02425f5917eb9edc44d4d20122b57296..fa850e562b6a9ebf8def69b4ce8a193e1d929602 100644 --- a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc +++ b/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc @@ -12,14 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/ops/arm/fp32/depthwise_conv_2d_3x3.h" - #include +#include "mace/ops/arm/base/depthwise_conv_2d_3x3.h" + namespace mace { namespace ops { namespace arm { -namespace fp32 { namespace { void DepthwiseConv2dPixel(const float *in_base, @@ -48,79 +47,36 @@ void DepthwiseConv2dPixel(const float *in_base, } } // namespace -MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, - const mace::Tensor *input, - const mace::Tensor *filter, - mace::Tensor *output) { - MACE_UNUSED(context); - std::vector out_shape(4); - std::vector paddings(2); - auto &in_shape = input->shape(); - auto &filter_shape = filter->shape(); - CalOutputShapeAndInputPadSize(in_shape, filter_shape, &out_shape, &paddings); - out_shape[1] *= filter_shape[1]; - MACE_RETURN_IF_ERROR(output->Resize(out_shape)); - output->Clear(); - - const int pad_top = paddings[0] / 2; - const int pad_left = paddings[1] / 2; - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - const index_t multiplier = out_channels / in_channels; - - std::vector out_bounds; - CalOutputBoundaryWithoutUsingInputPad(out_shape, paddings, &out_bounds); - const index_t valid_h_start = out_bounds[0]; - const index_t valid_h_stop = out_bounds[1]; - const index_t valid_w_start = out_bounds[2]; - const index_t valid_w_stop = out_bounds[3]; - - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = input->data(); - auto output_data = output->mutable_data(); - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { +template<> +MaceStatus DepthwiseConv2dK3x3S1::DoCompute( + const DepthwiseConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { - const index_t c = m / multiplier; - const index_t multi_index = m % multiplier; + const index_t c = m / p.multiplier; + const index_t multi_index = m % p.multiplier; const float - *in_base = input_data + b * in_batch_size + c * in_image_size; + *in_base = input_data + b * p.in_batch_size + c * p.in_image_size; const float - *filter_ptr = filter_data + multi_index * in_channels * 9 + c * 9; - float *out_base = output_data + b * out_batch_size + m * out_image_size; + *filter_ptr = filter_data + multi_index * p.in_channels * 9 + c * 9; + float *out_base = + output_data + b * p.out_batch_size + 
m * p.out_image_size; index_t h, w; // top - for (h = 0; h < valid_h_start; ++h) { - for (w = 0; w < out_width; ++w) { + for (h = 0; h < p.valid_h_start; ++h) { + for (w = 0; w < p.out_width; ++w) { DepthwiseConv2dPixel(in_base, filter_ptr, h, w, - h - pad_top, - w - pad_left, - out_width, - in_height, - in_width, + h - p.pad_top, + w - p.pad_left, + p.out_width, + p.in_height, + p.in_width, 3, 3, out_base); @@ -133,18 +89,18 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, vf01 = vld1q_f32(filter_ptr + 3); vf02 = vld1q_f32(filter_ptr + 5); - for (h = valid_h_start; h + 1 < valid_h_stop; h += 2) { + for (h = p.valid_h_start; h + 1 < p.valid_h_stop; h += 2) { // left - for (w = 0; w < valid_w_start; ++w) { + for (w = 0; w < p.valid_w_start; ++w) { DepthwiseConv2dPixel(in_base, filter_ptr, h, w, - h - pad_top, - w - pad_left, - out_width, - in_height, - in_width, + h - p.pad_top, + w - p.pad_left, + p.out_width, + p.in_height, + p.in_width, 3, 3, out_base); @@ -152,17 +108,17 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, filter_ptr, h + 1, w, - h + 1 - pad_top, - w - pad_left, - out_width, - in_height, - in_width, + h + 1 - p.pad_top, + w - p.pad_left, + p.out_width, + p.in_height, + p.in_width, 3, 3, out_base); } - for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) { + for (w = p.valid_w_start; w + 3 < p.valid_w_stop; w += 4) { // input (4 height x 3 slide): vi_height_slide float32x4_t vi00, vi01, vi02, vi0n; float32x4_t vi10, vi11, vi12, vi1n; @@ -173,17 +129,17 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, float32x4_t vo00, vo01; // load input - index_t in_h = h - pad_top; - index_t in_w = w - pad_left; - index_t in_offset = in_h * in_width + in_w; + index_t in_h = h - p.pad_top; + index_t in_w = w - p.pad_left; + index_t in_offset = in_h * p.in_width + in_w; vi00 = vld1q_f32(in_base + in_offset); vi0n = vld1q_f32(in_base + in_offset + 4); - vi10 = vld1q_f32(in_base + in_offset + in_width); - vi1n = vld1q_f32(in_base + in_offset + in_width + 4); - vi20 = vld1q_f32(in_base + in_offset + 2 * in_width); - vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 4); - vi30 = vld1q_f32(in_base + in_offset + 3 * in_width); - vi3n = vld1q_f32(in_base + in_offset + 3 * in_width + 4); + vi10 = vld1q_f32(in_base + in_offset + p.in_width); + vi1n = vld1q_f32(in_base + in_offset + p.in_width + 4); + vi20 = vld1q_f32(in_base + in_offset + 2 * p.in_width); + vi2n = vld1q_f32(in_base + in_offset + 2 * p.in_width + 4); + vi30 = vld1q_f32(in_base + in_offset + 3 * p.in_width); + vi3n = vld1q_f32(in_base + in_offset + 3 * p.in_width + 4); vi01 = vextq_f32(vi00, vi0n, 1); vi02 = vextq_f32(vi00, vi0n, 2); @@ -195,9 +151,9 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, vi32 = vextq_f32(vi30, vi3n, 2); // load ouptut - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo00 = vld1q_f32(out_base + out_offset); - vo01 = vld1q_f32(out_base + out_offset + out_width); + vo01 = vld1q_f32(out_base + out_offset + p.out_width); #if defined(__aarch64__) // outch 0, height 0 @@ -245,20 +201,20 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, vo01 = vmlaq_lane_f32(vo01, vi32, vget_high_f32(vf02), 1); #endif vst1q_f32(out_base + out_offset, vo00); - vst1q_f32(out_base + out_offset + out_width, vo01); + vst1q_f32(out_base + out_offset + p.out_width, vo01); } // w // right - for (; w < out_width; ++w) { + for (; w < p.out_width; ++w) { 
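// Scalar tail for the right edge: w resumes where the 4-wide vector loop
// stopped, finishing both the vector remainder inside the valid region and
// the right padding columns, pixel by pixel for rows h and h + 1.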
DepthwiseConv2dPixel(in_base, filter_ptr, h, w, - h - pad_top, - w - pad_left, - out_width, - in_height, - in_width, + h - p.pad_top, + w - p.pad_left, + p.out_width, + p.in_height, + p.in_width, 3, 3, out_base); @@ -266,11 +222,11 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, filter_ptr, h + 1, w, - h + 1 - pad_top, - w - pad_left, - out_width, - in_height, - in_width, + h + 1 - p.pad_top, + w - p.pad_left, + p.out_width, + p.in_height, + p.in_width, 3, 3, out_base); @@ -279,17 +235,17 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, // bottom - for (; h < out_height; ++h) { - for (w = 0; w < out_width; ++w) { + for (; h < p.out_height; ++h) { + for (w = 0; w < p.out_width; ++w) { DepthwiseConv2dPixel(in_base, filter_ptr, h, w, - h - pad_top, - w - pad_left, - out_width, - in_height, - in_width, + h - p.pad_top, + w - p.pad_left, + p.out_width, + p.in_height, + p.in_width, 3, 3, out_base); @@ -297,86 +253,41 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, } } // m } // b - }, 0, batch, 1, 0, out_channels, 1); // threadpool + }, 0, p.batch, 1, 0, p.out_channels, 1); // threadpool return MaceStatus::MACE_SUCCESS; } -MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, - const mace::Tensor *input, - const mace::Tensor *filter, - mace::Tensor *output) { - MACE_UNUSED(context); - - std::vector out_shape(4); - std::vector paddings(2); - auto &in_shape = input->shape(); - auto &filter_shape = filter->shape(); - - CalOutputShapeAndInputPadSize(in_shape, filter_shape, &out_shape, &paddings); - out_shape[1] *= in_shape[1]; - MACE_RETURN_IF_ERROR(output->Resize(out_shape)); - output->Clear(); - - const int pad_top = paddings[0] / 2; - const int pad_left = paddings[1] / 2; - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - const index_t multiplier = out_channels / in_channels; - - std::vector out_bounds; - CalOutputBoundaryWithoutUsingInputPad(out_shape, paddings, &out_bounds); - const index_t valid_h_start = out_bounds[0]; - const index_t valid_h_stop = out_bounds[1]; - const index_t valid_w_start = out_bounds[2]; - const index_t valid_w_stop = out_bounds[3]; - - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = input->data(); - auto output_data = output->mutable_data(); - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { +template<> +MaceStatus DepthwiseConv2dK3x3S2::DoCompute( + const DepthwiseConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { - index_t c = m / 
multiplier; - index_t multi_index = m % multiplier; + index_t c = m / p.multiplier; + index_t multi_index = m % p.multiplier; const float - *in_base = input_data + b * in_batch_size + c * in_image_size; + *in_base = input_data + b * p.in_batch_size + c * p.in_image_size; const float - *filter_ptr = filter_data + multi_index * in_channels * 9 + c * 9; - float *out_base = output_data + b * out_batch_size + m * out_image_size; + *filter_ptr = filter_data + multi_index * p.in_channels * 9 + c * 9; + float *out_base = + output_data + b * p.out_batch_size + m * p.out_image_size; index_t h, w; // top - for (h = 0; h < valid_h_start; ++h) { - for (w = 0; w < out_width; ++w) { + for (h = 0; h < p.valid_h_start; ++h) { + for (w = 0; w < p.out_width; ++w) { DepthwiseConv2dPixel(in_base, filter_ptr, h, w, - h * 2 - pad_top, - w * 2 - pad_left, - out_width, - in_height, - in_width, + h * 2 - p.pad_top, + w * 2 - p.pad_left, + p.out_width, + p.in_height, + p.in_width, 3, 3, out_base); @@ -389,24 +300,24 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, vf01 = vld1q_f32(filter_ptr + 3); vf02 = vld1q_f32(filter_ptr + 5); - for (h = valid_h_start; h < valid_h_stop; ++h) { + for (h = p.valid_h_start; h < p.valid_h_stop; ++h) { // left - for (w = 0; w < valid_w_start; ++w) { + for (w = 0; w < p.valid_w_start; ++w) { DepthwiseConv2dPixel(in_base, filter_ptr, h, w, - h * 2 - pad_top, - w * 2 - pad_left, - out_width, - in_height, - in_width, + h * 2 - p.pad_top, + w * 2 - p.pad_left, + p.out_width, + p.in_height, + p.in_width, 3, 3, out_base); } - for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) { + for (w = p.valid_w_start; w + 3 < p.valid_w_stop; w += 4) { float32x4x2_t vi0, vi1, vi2; float32x4_t vi0n, vi1n, vi2n; @@ -419,19 +330,19 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, float32x4_t vo; // load input - index_t in_h = h * 2 - pad_top; - index_t in_w = w * 2 - pad_left; - index_t in_offset = in_h * in_width + in_w; + index_t in_h = h * 2 - p.pad_top; + index_t in_w = w * 2 - p.pad_left; + index_t in_offset = in_h * p.in_width + in_w; vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] - vi1 = vld2q_f32(in_base + in_offset + in_width); - vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); + vi1 = vld2q_f32(in_base + in_offset + p.in_width); + vi2 = vld2q_f32(in_base + in_offset + 2 * p.in_width); vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] - vi1n = vld1q_f32(in_base + in_offset + in_width + 8); - vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); + vi1n = vld1q_f32(in_base + in_offset + p.in_width + 8); + vi2n = vld1q_f32(in_base + in_offset + 2 * p.in_width + 8); // load ouptut - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo = vld1q_f32(out_base + out_offset); vi00 = vi0.val[0]; // [0.2.4.6] @@ -471,16 +382,16 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, } // w // right - for (; w < out_width; ++w) { + for (; w < p.out_width; ++w) { DepthwiseConv2dPixel(in_base, filter_ptr, h, w, - h * 2 - pad_top, - w * 2 - pad_left, - out_width, - in_height, - in_width, + h * 2 - p.pad_top, + w * 2 - p.pad_left, + p.out_width, + p.in_height, + p.in_width, 3, 3, out_base); @@ -489,17 +400,17 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, // bottom - for (; h < out_height; ++h) { - for (w = 0; w < out_width; ++w) { + for (; h < p.out_height; ++h) { + for (w = 0; w < p.out_width; ++w) { DepthwiseConv2dPixel(in_base, filter_ptr, 
h, w, - h * 2 - pad_top, - w * 2 - pad_left, - out_width, - in_height, - in_width, + h * 2 - p.pad_top, + w * 2 - p.pad_left, + p.out_width, + p.in_height, + p.in_width, 3, 3, out_base); @@ -507,23 +418,11 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, } } // m } // b - }, 0, batch, 1, 0, out_channels, 1); + }, 0, p.batch, 1, 0, p.out_channels, 1); return MaceStatus::MACE_SUCCESS; } -void RegisterDepthwiseConv2dK3x3Delegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, DepthwiseConv2dK3x3S1, delegator::DepthwiseConv2dParam, - MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU, - float, ImplType::NEON, K3x3S1)); - MACE_REGISTER_DELEGATOR( - registry, DepthwiseConv2dK3x3S2, delegator::DepthwiseConv2dParam, - MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU, - float, ImplType::NEON, K3x3S2)); -} - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.cc b/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.cc index 875e08fa5ed271d599b33d490b0211dcd1360254..99e9c9eb018a8817ce7096544bb565bb0c5e6e03 100644 --- a/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.cc +++ b/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.cc @@ -12,69 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h" - #include + +#include "mace/ops/arm/base/depthwise_deconv_2d_3x3.h" #include "mace/ops/arm/fp32/common_neon.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { - -MaceStatus DepthwiseDeconv2dK3x3S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - group_ = input->dim(1); - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t channels = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - const index_t in_img_size = h * w; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t out_img_size = outh * outw; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { + +template<> +MaceStatus DepthwiseDeconv2dK3x3S1::DoCompute( + const DepthwiseDeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t c = start1; c < end1; c += step1) { - const index_t offset = b * channels + c; - float *out_base = padded_out_data + offset * out_img_size; - const float *input_base = input_data + offset * in_img_size; + const index_t offset = b 
* p.in_channels + c; + float *out_base = padded_out_data + offset * p.out_img_size; + const float *input_base = input_data + offset * p.in_img_size; const float *kernel_base = filter_data + c * 9; const float *in = input_base; const float *k0 = kernel_base; @@ -86,14 +43,14 @@ MaceStatus DepthwiseDeconv2dK3x3S1::Compute(const OpContext *context, float32x4_t k1_vec = vld1q_f32(k1); float32x4_t k2_vec = vld1q_f32(k2); - for (index_t i = 0; i < h; ++i) { - float *out_row_base = out_base + i * outw; + for (index_t i = 0; i < p.in_height; ++i) { + float *out_row_base = out_base + i * p.out_width; float *out_row0 = out_row_base; - float *out_row1 = out_row_base + outw; - float *out_row2 = out_row_base + 2 * outw; + float *out_row1 = out_row_base + p.out_width; + float *out_row2 = out_row_base + 2 * p.out_width; index_t j = 0; - for (; j + 3 < w; j += 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); float32x4_t out00, out01, out02; @@ -142,7 +99,7 @@ MaceStatus DepthwiseDeconv2dK3x3S1::Compute(const OpContext *context, out_row2 += 4; } - for (; j < w; ++j) { + for (; j < p.in_width; ++j) { float val = in[0]; for (int k = 0; k < 3; ++k) { out_row0[k] += val * k0[k]; @@ -157,66 +114,22 @@ MaceStatus DepthwiseDeconv2dK3x3S1::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, channels, 1); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, p.in_channels, 1); return MaceStatus::MACE_SUCCESS; } -MaceStatus DepthwiseDeconv2dK3x3S2::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - group_ = input->dim(1); - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t channels = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - const index_t in_img_size = h * w; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t out_img_size = outh * outw; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { +template<> +MaceStatus DepthwiseDeconv2dK3x3S2::DoCompute( + const DepthwiseDeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t c = start1; c < end1; c += step1) { - const index_t offset = b * channels + c; - float *out_base = padded_out_data + offset * out_img_size; - const float *input_base = input_data + offset * in_img_size; + const index_t offset = b * p.in_channels + c; + float *out_base = padded_out_data + offset * p.out_img_size; + const float *input_base = input_data + offset * p.in_img_size; 
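// Depthwise: channel c uses its own 3x3 filter, so the weight offset is
// simply c * 9 and there is no accumulation across input channels.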
const float *kernel_base = filter_data + c * 9; const float *in = input_base; @@ -228,15 +141,15 @@ MaceStatus DepthwiseDeconv2dK3x3S2::Compute(const OpContext *context, float32x4_t k1_vec = vld1q_f32(k1); float32x4_t k2_vec = vld1q_f32(k2); - for (index_t i = 0; i < h; ++i) { - float *out_row_base = out_base + i * 2 * outw; + for (index_t i = 0; i < p.in_height; ++i) { + float *out_row_base = out_base + i * 2 * p.out_width; float *out_row_0 = out_row_base; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; + float *out_row_1 = out_row_0 + p.out_width; + float *out_row_2 = out_row_1 + p.out_width; index_t j = 0; - for (index_t n = 0; n + 9 < outw; n += 8) { + for (index_t n = 0; n + 9 < p.out_width; n += 8) { float32x4_t in_vec = vld1q_f32(in); // out row 0 @@ -285,7 +198,7 @@ MaceStatus DepthwiseDeconv2dK3x3S2::Compute(const OpContext *context, j += 4; } - for (; j < w; ++j) { + for (; j < p.in_width; ++j) { float val = in[0]; for (int k = 0; k < 3; ++k) { @@ -302,80 +215,31 @@ MaceStatus DepthwiseDeconv2dK3x3S2::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, channels, 1); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, p.in_channels, 1); return MaceStatus::MACE_SUCCESS; } -MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - - const index_t in_img_size = h * w; - const index_t out_img_size = outh * outw; - - const index_t inch_g = inch / group_; - const index_t outch_g = outch / group_; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1, - index_t start2, index_t end2, index_t step2) { +template<> +MaceStatus GroupDeconv2dK3x3S1::DoCompute( + const GroupDeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { for (index_t b = start0; b < end0; b += step0) { for (index_t g = start1; g < end1; g += step1) { for (index_t oc = start2; oc < end2; oc += step2) { - if (oc + 1 < outch_g) { - const index_t out_offset = b * outch + outch_g * g + oc; - float *out_base0 = padded_out_data + out_offset * out_img_size; - float *out_base1 = out_base0 + out_img_size; - for (index_t ic = 0; ic < inch_g; ++ic) { - const index_t in_offset = b * inch + inch_g * g + ic; - const float 
*input_base = input_data + in_offset * in_img_size; - const index_t kernel_offset = (oc * group_ + g) * inch_g + ic; + if (oc + 1 < p.outch_g) { + const index_t out_offset = b * p.out_channels + p.outch_g * g + oc; + float *out_base0 = padded_out_data + out_offset * p.out_img_size; + float *out_base1 = out_base0 + p.out_img_size; + for (index_t ic = 0; ic < p.inch_g; ++ic) { + const index_t in_offset = b * p.in_channels + p.inch_g * g + ic; + const float *input_base = input_data + in_offset * p.in_img_size; + const index_t kernel_offset = (oc * group_ + g) * p.inch_g + ic; const float *kernel_base0 = filter_data + kernel_offset * 9; - const float *kernel_base1 = kernel_base0 + inch * 9; + const float *kernel_base1 = kernel_base0 + p.in_channels * 9; const float *in = input_base; // output channel 0 @@ -399,20 +263,20 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, k11_vec = vld1q_f32(k1_1); k12_vec = vld1q_f32(k1_2); - for (index_t i = 0; i < h; ++i) { - float *out_row_base0 = out_base0 + i * outw; + for (index_t i = 0; i < p.in_height; ++i) { + float *out_row_base0 = out_base0 + i * p.out_width; float *out_row0_0 = out_row_base0; - float *out_row0_1 = out_row_base0 + outw; - float *out_row0_2 = out_row_base0 + 2 * outw; + float *out_row0_1 = out_row_base0 + p.out_width; + float *out_row0_2 = out_row_base0 + 2 * p.out_width; - float *out_row_base1 = out_base1 + i * outw; + float *out_row_base1 = out_base1 + i * p.out_width; float *out_row1_0 = out_row_base1; - float *out_row1_1 = out_row_base1 + outw; - float *out_row1_2 = out_row_base1 + 2 * outw; + float *out_row1_1 = out_row_base1 + p.out_width; + float *out_row1_2 = out_row_base1 + 2 * p.out_width; index_t j = 0; - for (; j + 3 < w; j += 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); float32x4_t out00, out01, out02; @@ -500,7 +364,7 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, out_row1_2 += 4; } - for (; j < w; ++j) { + for (; j < p.in_width; ++j) { float val = in[0]; for (int k = 0; k < 3; ++k) { out_row0_0[k] += val * k0_0[k]; @@ -521,12 +385,12 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, } } } else { - const index_t out_offset = b * outch + outch_g * g + oc; - float *out_base0 = padded_out_data + out_offset * out_img_size; - for (index_t ic = 0; ic < inch_g; ++ic) { - const index_t in_offset = (b * group_ + g) * inch_g + ic; - const float *input_base = input_data + in_offset * in_img_size; - const index_t kernel_offset = (oc * group_ + g) * inch_g + ic; + const index_t out_offset = b * p.out_channels + p.outch_g * g + oc; + float *out_base0 = padded_out_data + out_offset * p.out_img_size; + for (index_t ic = 0; ic < p.inch_g; ++ic) { + const index_t in_offset = (b * group_ + g) * p.inch_g + ic; + const float *input_base = input_data + in_offset * p.in_img_size; + const index_t kernel_offset = (oc * group_ + g) * p.inch_g + ic; const float *kernel_base0 = filter_data + kernel_offset * 9; const float *in = input_base; const float *k0_0 = kernel_base0; @@ -538,14 +402,14 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, float32x4_t k01_vec = vld1q_f32(k0_1); float32x4_t k02_vec = vld1q_f32(k0_2); - for (index_t i = 0; i < h; ++i) { - float *out_row_base0 = out_base0 + i * outw; + for (index_t i = 0; i < p.in_height; ++i) { + float *out_row_base0 = out_base0 + i * p.out_width; float *out_row0_0 = out_row_base0; - float *out_row0_1 = out_row_base0 + outw; - float *out_row0_2 = out_row_base0 + 2 * outw; + float 
*out_row0_1 = out_row_base0 + p.out_width; + float *out_row0_2 = out_row_base0 + 2 * p.out_width; index_t j = 0; - for (; j + 3 < w; j += 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); float32x4_t out00, out01, out02; @@ -594,7 +458,7 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, out_row0_2 += 4; } - for (; j < w; ++j) { + for (; j < p.in_width; ++j) { float val = in[0]; for (int k = 0; k < 3; ++k) { out_row0_0[k] += val * k0_0[k]; @@ -612,76 +476,27 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, group_, 1, 0, outch_g, 2); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, group_, 1, 0, p.outch_g, 2); return MaceStatus::MACE_SUCCESS; } -MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - - const index_t in_img_size = h * w; - const index_t out_img_size = outh * outw; - - const index_t inch_g = inch / group_; - const index_t outch_g = outch / group_; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1, - index_t start2, index_t end2, index_t step2) { +template<> +MaceStatus GroupDeconv2dK3x3S2::DoCompute( + const GroupDeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { for (index_t b = start0; b < end0; b += step0) { for (index_t g = start1; g < end1; g += step1) { for (index_t oc = start2; oc < end2; oc += step2) { - const index_t out_offset = b * outch + outch_g * g + oc; - float *out_base = padded_out_data + out_offset * out_img_size; - for (index_t ic = 0; ic < inch_g; ++ic) { - const index_t in_offset = b * inch + inch_g * g + ic; - const float *input_base = input_data + in_offset * in_img_size; - const index_t kernel_offset = (oc * group_ + g) * inch_g + ic; + const index_t out_offset = b * p.out_channels + p.outch_g * g + oc; + float *out_base = padded_out_data + out_offset * p.out_img_size; + for (index_t ic = 0; ic < p.inch_g; ++ic) { + const index_t in_offset = b * p.in_channels + p.inch_g * g + ic; + const float *input_base = input_data + in_offset * p.in_img_size; + const index_t kernel_offset = (oc * group_ + g) * p.inch_g + ic; const float *kernel_base = filter_data + kernel_offset * 9; const 
float *in = input_base; @@ -693,15 +508,15 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context, float32x4_t k1_vec = vld1q_f32(k1); float32x4_t k2_vec = vld1q_f32(k2); - for (index_t i = 0; i < h; ++i) { - float *out_row_base = out_base + i * 2 * outw; + for (index_t i = 0; i < p.in_height; ++i) { + float *out_row_base = out_base + i * 2 * p.out_width; float *out_row_0 = out_row_base; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; + float *out_row_1 = out_row_0 + p.out_width; + float *out_row_2 = out_row_1 + p.out_width; index_t j = 0; - for (index_t n = 0; n + 9 < outw; n += 8) { + for (index_t n = 0; n + 9 < p.out_width; n += 8) { float32x4_t in_vec = vld1q_f32(in); // out row 0 @@ -750,7 +565,7 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context, j += 4; } - for (; j < w; ++j) { + for (; j < p.in_width; ++j) { float val = in[0]; for (int k = 0; k < 3; ++k) { @@ -769,36 +584,11 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, group_, 1, 0, outch_g, 1); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, group_, 1, 0, p.outch_g, 1); return MaceStatus::MACE_SUCCESS; } -void RegisterDepthwiseDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, DepthwiseDeconv2dK3x3S1, delegator::DepthwiseDeconv2dParam, - MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, - float, ImplType::NEON, K3x3S1)); - MACE_REGISTER_DELEGATOR( - registry, DepthwiseDeconv2dK3x3S2, delegator::DepthwiseDeconv2dParam, - MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, - float, ImplType::NEON, K3x3S2)); -} - -void RegisterGroupDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, GroupDeconv2dK3x3S1, delegator::GroupDeconv2dParam, - MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, - float, ImplType::NEON, K3x3S1)); - MACE_REGISTER_DELEGATOR( - registry, GroupDeconv2dK3x3S2, delegator::GroupDeconv2dParam, - MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, - float, ImplType::NEON, K3x3S2)); -} - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.cc b/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.cc index 6f313c591212008b0c614cfebbf24d5dfebdc1a1..529b728fcb6baf2d6b04585d59025bea552d6ef5 100644 --- a/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.cc +++ b/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.cc @@ -12,69 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. 
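With the registration blocks (RegisterDepthwiseDeconv2dK3x3Delegator, RegisterGroupDeconv2dK3x3Delegator) and the duplicated Compute() bodies removed, this file keeps only the template<> float specializations of DoCompute; the class itself is declared once as a template in the new arm/base header. A sketch of what that declaration plausibly looks like — the header path, base-class name, and constructor are inferred from the signatures in this diff, not shown here:

// Assumed shape of mace/ops/arm/base/depthwise_deconv_2d_3x3.h.
template <typename T>
class DepthwiseDeconv2dK3x3S1 : public DepthwiseDeconv2d<T> {  // base assumed
 public:
  explicit DepthwiseDeconv2dK3x3S1(
      const delegator::DepthwiseDeconv2dParam &param)
      : DepthwiseDeconv2d<T>(param) {}

 protected:
  // Specialized for float (NEON) in this file; p carries batch, in_channels,
  // in_height/in_width, in_img_size/out_img_size and the thread pool.
  MaceStatus DoCompute(const DepthwiseDeconvComputeParam &p,
                       const T *filter_data, const T *input_data,
                       T *padded_out_data) override;
};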
-#include "mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h" - #include + +#include "mace/ops/arm/base/depthwise_deconv_2d_4x4.h" #include "mace/ops/arm/fp32/common_neon.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { - -MaceStatus DepthwiseDeconv2dK4x4S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - group_ = input->dim(1); - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t channels = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - const index_t in_img_size = h * w; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t out_img_size = outh * outw; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { + +template<> +MaceStatus DepthwiseDeconv2dK4x4S1::DoCompute( + const DepthwiseDeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t c = start1; c < end1; c += step1) { - const index_t offset = b * channels + c; - float *out_base = padded_out_data + offset * out_img_size; - const float *input_base = input_data + offset * in_img_size; + const index_t offset = b * p.in_channels + c; + float *out_base = padded_out_data + offset * p.out_img_size; + const float *input_base = input_data + offset * p.in_img_size; const float *kernel_base = filter_data + c * 16; const float *in = input_base; const float *k0 = kernel_base; @@ -87,15 +44,15 @@ MaceStatus DepthwiseDeconv2dK4x4S1::Compute(const OpContext *context, float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k3_vec = vld1q_f32(k3); - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + i * outw; + for (index_t i = 0; i < p.in_height; i++) { + float *out_row = out_base + i * p.out_width; float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; + float *out_row_1 = out_row_0 + p.out_width; + float *out_row_2 = out_row_1 + p.out_width; + float *out_row_3 = out_row_2 + p.out_width; index_t j = 0; - for (; j + 3 < w; j += 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); float32x4_t out00 = vld1q_f32(out_row_0); @@ -172,7 +129,7 @@ MaceStatus DepthwiseDeconv2dK4x4S1::Compute(const OpContext *context, out_row_3 += 4; } - for (; j < w; j++) { + for (; j < p.in_width; j++) { float val = in[0]; for (int k = 0; k < 4; ++k) { out_row_0[k] += val * k0[k]; @@ -189,66 +146,22 @@ MaceStatus DepthwiseDeconv2dK4x4S1::Compute(const OpContext *context, } } } - }, 
0, batch, 1, 0, channels, 1); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, p.in_channels, 1); return MaceStatus::MACE_SUCCESS; } -MaceStatus DepthwiseDeconv2dK4x4S2::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - group_ = input->dim(1); - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t channels = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - const index_t in_img_size = h * w; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t out_img_size = outh * outw; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { +template<> +MaceStatus DepthwiseDeconv2dK4x4S2::DoCompute( + const DepthwiseDeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t c = start1; c < end1; c += step1) { - const index_t offset = b * channels + c; - float *out_base = padded_out_data + offset * out_img_size; - const float *input_base = input_data + offset * in_img_size; + const index_t offset = b * p.in_channels + c; + float *out_base = padded_out_data + offset * p.out_img_size; + const float *input_base = input_data + offset * p.in_img_size; const float *kernel_base = filter_data + c * 16; const float *in = input_base; @@ -262,17 +175,17 @@ MaceStatus DepthwiseDeconv2dK4x4S2::Compute(const OpContext *context, float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k3_vec = vld1q_f32(k3); - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + 2 * i * outw; + for (index_t i = 0; i < p.in_height; i++) { + float *out_row = out_base + 2 * i * p.out_width; float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; + float *out_row_1 = out_row_0 + p.out_width; + float *out_row_2 = out_row_1 + p.out_width; + float *out_row_3 = out_row_2 + p.out_width; index_t j = 0; - for (index_t n = 0; n + 9 < outw; n += 8) { + for (index_t n = 0; n + 9 < p.out_width; n += 8) { float32x4_t in_vec = vld1q_f32(in); // row 0 @@ -339,7 +252,7 @@ MaceStatus DepthwiseDeconv2dK4x4S2::Compute(const OpContext *context, j += 4; } - for (; j < w; j++) { + for (; j < p.in_width; j++) { float val = in[0]; for (int k = 0; k < 4; ++k) { out_row_0[k] += val * k0[k]; @@ -356,89 +269,40 @@ MaceStatus DepthwiseDeconv2dK4x4S2::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, channels, 1); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, 
p.in_channels, 1); return MaceStatus::MACE_SUCCESS; } -MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - - const index_t in_img_size = h * w; - const index_t out_img_size = outh * outw; - - const index_t inch_g = inch / group_; - const index_t outch_g = outch / group_; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1, - index_t start2, index_t end2, index_t step2) { +template<> +MaceStatus GroupDeconv2dK4x4S1::DoCompute( + const GroupDeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { for (index_t b = start0; b < end0; b += step0) { for (index_t g = start1; g < end1; g += step1) { for (index_t oc = start2; oc < end2; oc += step2) { - if (oc + 1 < outch_g) { + if (oc + 1 < p.outch_g) { const index_t out_offset = - (b * outch + outch_g * g + oc) * out_img_size; + (b * p.out_channels + p.outch_g * g + oc) * p.out_img_size; float *out_base = padded_out_data + out_offset; - float *out_base1 = out_base + out_img_size; - for (index_t ic = 0; ic < inch_g; ic++) { + float *out_base1 = out_base + p.out_img_size; + for (index_t ic = 0; ic < p.inch_g; ic++) { const index_t in_offset = - (b * inch + inch_g * g + ic) * in_img_size; + (b * p.in_channels + p.inch_g * g + ic) * p.in_img_size; const float *input_base = input_data + in_offset; const float *in = input_base; const index_t kernel_offset = - ((oc * group_ + g) * inch_g + ic) * 16; + ((oc * group_ + g) * p.inch_g + ic) * 16; const float *kernel_base = filter_data + kernel_offset; const float *k0 = kernel_base; const float *k1 = kernel_base + 4; const float *k2 = kernel_base + 8; const float *k3 = kernel_base + 12; - const float *kernel_base1 = kernel_base + inch * 16; + const float *kernel_base1 = kernel_base + p.in_channels * 16; const float *k10 = kernel_base1; const float *k11 = kernel_base1 + 4; const float *k12 = kernel_base1 + 8; @@ -454,24 +318,24 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, float32x4_t k12_vec = vld1q_f32(k12); float32x4_t k13_vec = vld1q_f32(k13); - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + i * outw; + for (index_t i = 0; i < p.in_height; i++) { + float *out_row = out_base + i * p.out_width; float *out_row_0 = 
out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; + float *out_row_1 = out_row_0 + p.out_width; + float *out_row_2 = out_row_1 + p.out_width; + float *out_row_3 = out_row_2 + p.out_width; - float *out_row1 = out_base1 + i * outw; + float *out_row1 = out_base1 + i * p.out_width; float *out_row1_0 = out_row1; - float *out_row1_1 = out_row1_0 + outw; - float *out_row1_2 = out_row1_1 + outw; - float *out_row1_3 = out_row1_2 + outw; + float *out_row1_1 = out_row1_0 + p.out_width; + float *out_row1_2 = out_row1_1 + p.out_width; + float *out_row1_3 = out_row1_2 + p.out_width; index_t j = 0; - for (; j + 3 < w; j += 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); float32x4_t out00, out01, out02, out03; float32x4_t out10, out11, out12, out13; @@ -618,7 +482,7 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, out_row1_3 += 4; } - for (; j < w; j++) { + for (; j < p.in_width; j++) { float val = in[0]; for (int k = 0; k < 4; ++k) { out_row_0[k] += val * k0[k]; @@ -644,13 +508,13 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, } } else { const index_t out_offset = - (b * outch + outch_g * g + oc) * out_img_size; + (b * p.out_channels + p.outch_g * g + oc) * p.out_img_size; float *out_base = padded_out_data + out_offset; - for (index_t ic = 0; ic < inch_g; ++ic) { + for (index_t ic = 0; ic < p.inch_g; ++ic) { const index_t in_offset = - (b * inch + inch_g * g + ic) * in_img_size; + (b * p.in_channels + p.inch_g * g + ic) * p.in_img_size; const index_t kernel_offset = - ((oc * group_ + g) * inch_g + ic) * 16; + ((oc * group_ + g) * p.inch_g + ic) * 16; const float *input_base = input_data + in_offset; const float *kernel_base = filter_data + kernel_offset; @@ -665,15 +529,15 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k3_vec = vld1q_f32(k3); - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + i * outw; + for (index_t i = 0; i < p.in_height; i++) { + float *out_row = out_base + i * p.out_width; float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; + float *out_row_1 = out_row_0 + p.out_width; + float *out_row_2 = out_row_1 + p.out_width; + float *out_row_3 = out_row_2 + p.out_width; index_t j = 0; - for (; j + 3 < w; j += 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); float32x4_t out00 = vld1q_f32(out_row_0); @@ -750,7 +614,7 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, out_row_3 += 4; } - for (; j < w; j++) { + for (; j < p.in_width; j++) { float val = in[0]; for (int k = 0; k < 4; ++k) { out_row_0[k] += val * k0[k]; @@ -770,78 +634,29 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, group_, 1, 0, outch_g, 2); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, group_, 1, 0, p.outch_g, 2); return MaceStatus::MACE_SUCCESS; } -MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - - 
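
The Compute() boilerplate deleted in these hunks (resize/pad the output, Clear(), map the tensors, derive the batch/channel/image sizes) is not lost: judging from the DoCompute() specializations added in this file, it presumably now lives once in the shared arm/base deconv delegator, which fills a DepthwiseDeconvComputeParam / GroupDeconvComputeParam and then dispatches to the kernel-specific inner loop. A minimal, self-contained sketch of that template-method split, using simplified stand-in types rather than MACE's real signatures:

    #include <cstdio>

    // Simplified stand-in for MACE's *DeconvComputeParam (not the real type).
    struct DeconvComputeParam {
      int batch, in_channels, in_height, in_width;
      int out_height, out_width, in_img_size, out_img_size;
    };

    class DeconvKernel {
     public:
      virtual ~DeconvKernel() = default;
      // Shared wrapper: do the size bookkeeping once (the real code also
      // resizes, pads, clears, and maps the tensors here), then hand the
      // precomputed parameter bundle to the kernel-specific inner loop.
      int Compute(const float *filter, const float *input, float *output,
                  DeconvComputeParam p) {
        p.in_img_size = p.in_height * p.in_width;
        p.out_img_size = p.out_height * p.out_width;
        return DoCompute(p, filter, input, output);
      }

     protected:
      virtual int DoCompute(const DeconvComputeParam &p, const float *filter,
                            const float *input, float *padded_out) = 0;
    };

    class DepthwiseDeconvK4x4S1 : public DeconvKernel {
     protected:
      int DoCompute(const DeconvComputeParam &p, const float *, const float *,
                    float *) override {
        std::printf("4x4/s1 kernel over %dx%d input\n",
                    p.in_height, p.in_width);
        return 0;
      }
    };
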
out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - - const index_t in_img_size = h * w; - const index_t out_img_size = outh * outw; - - const index_t inch_g = inch / group_; - const index_t outch_g = outch / group_; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1, - index_t start2, index_t end2, index_t step2) { +template<> +MaceStatus GroupDeconv2dK4x4S2::DoCompute( + const GroupDeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { for (index_t b = start0; b < end0; b += step0) { for (index_t g = start1; g < end1; g += step1) { for (index_t oc = start2; oc < end2; oc += step2) { const index_t out_offset = - (b * outch + outch_g * g + oc) * out_img_size; + (b * p.out_channels + p.outch_g * g + oc) * p.out_img_size; float *out_base = padded_out_data + out_offset; - for (index_t ic = 0; ic < inch_g; ic++) { + for (index_t ic = 0; ic < p.inch_g; ic++) { const index_t in_offset = - (b * inch + inch_g * g + ic) * in_img_size; + (b * p.in_channels + p.inch_g * g + ic) * p.in_img_size; const index_t kernel_offset = - ((oc * group_ + g) * inch_g + ic) * 16; + ((oc * group_ + g) * p.inch_g + ic) * 16; const float *input_base = input_data + in_offset; const float *kernel_base = filter_data + kernel_offset; const float *in = input_base; @@ -856,17 +671,17 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context, float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k3_vec = vld1q_f32(k3); - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + 2 * i * outw; + for (index_t i = 0; i < p.in_height; i++) { + float *out_row = out_base + 2 * i * p.out_width; float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; + float *out_row_1 = out_row_0 + p.out_width; + float *out_row_2 = out_row_1 + p.out_width; + float *out_row_3 = out_row_2 + p.out_width; index_t j = 0; - for (index_t n = 0; n + 9 < outw; n += 8) { + for (index_t n = 0; n + 9 < p.out_width; n += 8) { float32x4_t in_vec = vld1q_f32(in); // row 0 @@ -933,7 +748,7 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context, j += 4; } - for (; j < w; j++) { + for (; j < p.in_width; j++) { float val = in[0]; for (int k = 0; k < 4; ++k) { out_row_0[k] += val * k0[k]; @@ -952,36 +767,11 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, group_, 1, 0, outch_g, 1); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, group_, 1, 0, p.outch_g, 1); return MaceStatus::MACE_SUCCESS; } -void RegisterDepthwiseDeconv2dK4x4Delegator(OpDelegatorRegistry 
*registry) { - MACE_REGISTER_DELEGATOR( - registry, DepthwiseDeconv2dK4x4S1, delegator::DepthwiseDeconv2dParam, - MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, - float, ImplType::NEON, K4x4S1)); - MACE_REGISTER_DELEGATOR( - registry, DepthwiseDeconv2dK4x4S2, delegator::DepthwiseDeconv2dParam, - MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, - float, ImplType::NEON, K4x4S2)); -} - -void RegisterGroupDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, GroupDeconv2dK4x4S1, delegator::GroupDeconv2dParam, - MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, - float, ImplType::NEON, K4x4S1)); - MACE_REGISTER_DELEGATOR( - registry, GroupDeconv2dK4x4S2, delegator::GroupDeconv2dParam, - MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, - float, ImplType::NEON, K4x4S2)); -} - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/gemm.cc b/mace/ops/arm/fp32/gemm.cc index d506d8b1dbec75121dc4d025b7e89eaf22da1ecf..123e3aaee0e46ded600cf8ff6182846eb23394b8 100644 --- a/mace/ops/arm/fp32/gemm.cc +++ b/mace/ops/arm/fp32/gemm.cc @@ -12,687 +12,498 @@ // See the License for the specific language governing permissions and // limitations under the License. - -#include "mace/ops/arm/fp32/gemm.h" - #include #include #include +#include "mace/ops/arm/base/gemm.h" #include "mace/port/env.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { - -enum { kNoCache, kCacheLhs, kCacheRhs }; - -MaceStatus Gemm::Compute(const OpContext *context, - const Tensor *lhs, - const Tensor *rhs, - const index_t batch, - const index_t rows, - const index_t cols, - const index_t depth, - const MatrixMajor lhs_major, - const MatrixMajor rhs_major, - const MatrixMajor output_major, - const bool lhs_batched, - const bool rhs_batched, - Tensor *output) { - MACE_CHECK(output->size() == batch * rows * cols, - "Need resize output tensor before call gemm."); - Tensor::MappingGuard lhs_guard(lhs); - Tensor::MappingGuard rhs_guard(rhs); - Tensor::MappingGuard output_guard(output); - const float *lhs_data = lhs->data(); - const float *rhs_data = rhs->data(); - float *output_data = output->mutable_data(); - -#ifdef __aarch64__ - const index_t row_block_size = 8; -#else - const index_t row_block_size = 4; -#endif - const index_t col_block_size = 8; - const index_t depth_block_size = 4; - const index_t row_block_count = RoundUpDiv(rows, row_block_size); - const index_t col_block_count = RoundUpDiv(cols, col_block_size); - const index_t rows_padded = RoundUp(rows, row_block_size); - const index_t cols_padded = RoundUp(cols, col_block_size); - const index_t depth_padded = RoundUp(depth, depth_block_size); - - ScratchBuffer *scratch = context->device()->scratch_buffer(); - index_t packed_lhs_size = - PadAlignSize(sizeof(float) * rows_padded * depth_padded); - index_t packed_rhs_size = - PadAlignSize(sizeof(float) * depth_padded * cols_padded); - index_t packed_output_size = - PadAlignSize(sizeof(float) * rows_padded * cols_padded); - // resize to the total size of lhs & rhs & output anyway, - // in case we do not cache const tensor for saving memory - MACE_RETURN_IF_ERROR(scratch->GrowSize( - packed_lhs_size + packed_rhs_size + packed_output_size)); - float *packed_lhs_data = - scratch->Scratch(packed_lhs_size).mutable_data(); - float *packed_rhs_data = - scratch->Scratch(packed_rhs_size).mutable_data(); - float *packed_output_data = - scratch->Scratch(packed_output_size).mutable_data(); +template<> 
+template<> +void Gemm::Pack<4, 4>(const MatrixMap &matrix, + MatrixMajor dst_major, + float *packed_matrix) { + const index_t rows = matrix.rows(); + const index_t cols = matrix.cols(); - int cache_side = kNoCache; - if (cached_ == kCacheLhs) { - packed_lhs_data = pack_cache_.mutable_data(); - } else if (cached_ == kCacheRhs) { - packed_rhs_data = pack_cache_.mutable_data(); - } else if (should_cache_pack_) { - if (lhs->is_weight() && (!lhs_batched || batch == 1)) { - cache_side = kCacheLhs; - pack_cache_.Resize(packed_lhs_size); - packed_lhs_data = pack_cache_.mutable_data(); - } else if (rhs->is_weight() && (!rhs_batched || batch == 1)) { - cache_side = kCacheRhs; - pack_cache_.Resize(packed_rhs_size); - packed_rhs_data = pack_cache_.mutable_data(); - } + // use the same terminology as GemmLowp: + // depth is depth, width is the opposite dim other than depth + // lhs + index_t width = rows; + index_t depth = cols; + index_t width_stride = matrix.rows_stride(); + index_t depth_stride = matrix.cols_stride(); + if (dst_major == RowMajor) { + // rhs + std::swap(width, depth); + std::swap(width_stride, depth_stride); } + const float *data = matrix.data(); + float *packed_ptr = packed_matrix; - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - for (index_t b = 0; b < batch; ++b) { - MatrixMap - lhs_matrix - (lhs_data + static_cast(lhs_batched) * b * rows * depth, - lhs_major, - rows, - depth); - MatrixMap - rhs_matrix - (rhs_data + static_cast(rhs_batched) * b * depth * cols, - rhs_major, - depth, - cols); - MatrixMap output_matrix - (output_data + b * rows * cols, output_major, rows, cols); + const index_t block_size = 4; + const index_t depth_padded = RoundUp(depth, static_cast(4)); - // pack lhs - if (cached_ != kCacheLhs) { - thread_pool.Compute1D([=, &lhs_matrix](index_t start, - index_t end, - index_t step) { - for (index_t row_block_idx = start; row_block_idx < end; - row_block_idx += step) { - const index_t start_row = row_block_idx * row_block_size; - const index_t - row_block_len = std::min(row_block_size, rows - start_row); - float *packed_lhs_data_block = - packed_lhs_data + row_block_idx * row_block_size * depth_padded; - PackLhs(lhs_matrix.block(start_row, 0, row_block_len, depth), - packed_lhs_data_block); - } - }, 0, row_block_count, 1); + if (depth_padded > depth) { + memset(packed_ptr + depth * block_size, + 0, + sizeof(float) * (depth_padded - depth) * block_size); + } - if (cache_side == kCacheLhs) { - cached_ = kCacheLhs; - if (lhs->UnderlyingBuffer()->OnHost()) { - AdviseFree(reinterpret_cast(const_cast(lhs->data< - float>())), - lhs->raw_size()); - } + if (dst_major == matrix.matrix_major()) { + if (width < block_size) { + const index_t width_remain = block_size - width; + for (index_t d = 0; d < depth; ++d) { + memcpy(packed_ptr, data, sizeof(float) * width); + memset(packed_ptr + width, 0, sizeof(float) * width_remain); + data += depth_stride; + packed_ptr += block_size; + } + } else { + for (index_t d = 0; d < depth; ++d) { + float32x4_t vi = vld1q_f32(data); + vst1q_f32(packed_ptr, vi); + data += depth_stride; + packed_ptr += block_size; } } + } else { + if (width < block_size) { + const index_t width_remain = block_size - width; + for (index_t d = 0; d < depth; ++d) { + for (index_t w = 0; w < width; ++w) { + packed_ptr[w] = data[w * width_stride + d]; + } // w + memset(packed_ptr + width, 0, sizeof(float) * width_remain); + packed_ptr += block_size; + } // d + } else { + const float *data0 = data; + const float *data1 = data + 
width_stride; + const float *data2 = data1 + width_stride; + const float *data3 = data2 + width_stride; - // pack rhs - if (cached_ != kCacheRhs) { - thread_pool.Compute1D([=, &rhs_matrix](index_t start, - index_t end, - index_t step) { - for (index_t col_block_idx = start; col_block_idx < end; - col_block_idx += step) { - const index_t start_col = col_block_idx * col_block_size; - const index_t - col_block_len = std::min(col_block_size, cols - start_col); - float *packed_rhs_data_block = - packed_rhs_data + col_block_idx * col_block_size * depth_padded; - PackRhs(rhs_matrix.block(0, start_col, depth, col_block_len), - packed_rhs_data_block); - } - }, 0, col_block_count, 1); + const index_t depth_block = depth / 4; + const index_t depth_remain = depth - depth_block * 4; + for (index_t depth_block_idx = 0; depth_block_idx < depth_block; + ++depth_block_idx) { + float32x4_t v0 = vld1q_f32(data0); + float32x4_t v1 = vld1q_f32(data1); + float32x4_t v2 = vld1q_f32(data2); + float32x4_t v3 = vld1q_f32(data3); + float32x4x2_t v02_intertwined = vzipq_f32(v0, v2); + float32x4x2_t v13_intertwined = vzipq_f32(v1, v3); + float32x4x2_t v0123_intertwined = + vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]); + float32x4x2_t v0123n_intertwined = + vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]); - if (cache_side == kCacheRhs) { - cached_ = kCacheRhs; - if (rhs->UnderlyingBuffer()->OnHost()) { - AdviseFree(reinterpret_cast(const_cast(rhs->data< - float>())), - rhs->raw_size()); - } - } - } + vst1q_f32(packed_ptr, v0123_intertwined.val[0]); + packed_ptr += 4; - // multiply lhs and rhs - thread_pool.Compute1D([=, &output_matrix](index_t start, - index_t end, - index_t step) { - for (index_t row_block_idx = start; row_block_idx < end; - row_block_idx += step) { - const index_t start_row = row_block_idx * row_block_size; - const index_t - row_block_len = std::min(row_block_size, rows - start_row); - const float *packed_lhs_data_block = - packed_lhs_data + row_block_idx * row_block_size * depth_padded; + vst1q_f32(packed_ptr, v0123_intertwined.val[1]); + packed_ptr += 4; - for (index_t col_block_idx = 0; col_block_idx < col_block_count; - ++col_block_idx) { - const index_t start_col = col_block_idx * col_block_size; - const index_t - col_block_len = std::min(col_block_size, cols - start_col); - const float *packed_rhs_data_block = - packed_rhs_data + col_block_idx * col_block_size * depth_padded; - float *packed_output_data_block = - packed_output_data + row_block_idx * row_block_size * cols_padded - + col_block_idx * col_block_size; - ComputeBlock(packed_lhs_data_block, - packed_rhs_data_block, - depth_padded, - packed_output_data_block); - MatrixMap output_block = output_matrix.block(start_row, - start_col, - row_block_len, - col_block_len); - UnpackOutput(packed_output_data_block, &output_block); - } // col_block_idx - } // row_block_idx - }, 0, row_block_count, 1); - } // b + vst1q_f32(packed_ptr, v0123n_intertwined.val[0]); + packed_ptr += 4; - return MaceStatus::MACE_SUCCESS; -} + vst1q_f32(packed_ptr, v0123n_intertwined.val[1]); + packed_ptr += 4; -void Gemm::ComputeBlock(const float *packed_lhs_data, - const float *packed_rhs_data, - const index_t depth_padded, - float *packed_output_data) { - /* Ref: - for (index_t r = 0; r < block_size; ++r) { - for (index_t c = 0; c < block_size; ++c) { - float sum = 0; - for (index_t d = 0; d < depth; ++d) { - // (r, d) * (d, c) - sum += packed_lhs_data[d * r_block_size + r] - * packed_rhs_data[d * c_block_size + c]; + data0 += 4; + data1 += 4; + 
data2 += 4; + data3 += 4; } - packed_output_data[r * c_block_size + c] = sum; - } - } - */ - const float *lhs_ptr = packed_lhs_data; - const float *rhs_ptr = packed_rhs_data; + for (index_t d = 0; d < depth_remain; ++d) { + float32x4_t vi = {*data0, *data1, *data2, *data3}; + vst1q_f32(packed_ptr, vi); + packed_ptr += 4; - const index_t depth_block_count = depth_padded / 4; + ++data0; + ++data1; + ++data2; + ++data3; + } // d + } + } +} -#ifdef __aarch64__ - // Register layout: (8x4) x (4,8) - // - // +--------+--------+ - // | v8 ... | v9 ... | - // Rhs +--------+--------+ - // | v10... | v11... | - // +--------+--------+ - // | v12... | v13... | - // +--------+--------+ - // | v14... | v15... | - // +--------+--------+ - // - // Lhs - // - // +----+----+----+----+ - - +--------+--------+ - // | v0 | v2 | v4 | v6 | | v16... | v17... | - // | . | | | | | v18... | v19... | - // | . | | | | | v20... | v21... | - // | . | | | | | v22... | v23... | - // +----+----|----+----+ +--------+--------+ - // | v1 | v3 | v5 | v7 | | v24... | v25... | - // | . | | | | | v26... | v27... | - // | . | | | | | v28... | v29... | - // | . | | | | | v30... | v31... | - // +----+----|----+----+ +--------+--------+ - // - // Accumulator - // +template<> +template<> +void Gemm::Pack<8, 4>(const MatrixMap &matrix, + MatrixMajor dst_major, + float *packed_matrix) { + const index_t rows = matrix.rows(); + const index_t cols = matrix.cols(); - if (depth_block_count > 0) { - index_t r_depth_block_count = depth_block_count; - // just make compiler happy - MACE_UNUSED(r_depth_block_count); + // use the same terminology as GemmLowp: + // depth is depth, width is the opposite dim other than depth + // lhs + index_t width = rows; + index_t depth = cols; + index_t width_stride = matrix.rows_stride(); + index_t depth_stride = matrix.cols_stride(); + if (dst_major == RowMajor) { + // rhs + std::swap(width, depth); + std::swap(width_stride, depth_stride); + } + const float *data = matrix.data(); + float *packed_ptr = packed_matrix; - asm volatile( - "dup v16.4s, wzr \n" - "dup v17.4s, wzr \n" - "dup v18.4s, wzr \n" - "dup v19.4s, wzr \n" - "dup v20.4s, wzr \n" - "dup v21.4s, wzr \n" - "dup v22.4s, wzr \n" - "dup v23.4s, wzr \n" - "dup v24.4s, wzr \n" - "dup v25.4s, wzr \n" - "dup v26.4s, wzr \n" - "dup v27.4s, wzr \n" - "dup v28.4s, wzr \n" - "dup v29.4s, wzr \n" - "dup v30.4s, wzr \n" - "dup v31.4s, wzr \n" + const index_t block_size = 8; + const index_t depth_padded = RoundUp(depth, static_cast(4)); - // prelogue - "ld1 {v0.4s}, [%[lhs_ptr]], #16 \n" - "ld1 {v1.4s}, [%[lhs_ptr]], #16 \n" - "ld1 {v2.4s}, [%[lhs_ptr]], #16 \n" - "ld1 {v3.4s}, [%[lhs_ptr]], #16 \n" - "ld1 {v4.4s}, [%[lhs_ptr]], #16 \n" - "ld1 {v5.4s}, [%[lhs_ptr]], #16 \n" - "ld1 {v6.4s}, [%[lhs_ptr]], #16 \n" - "ld1 {v7.4s}, [%[lhs_ptr]], #16 \n" + if (depth_padded > depth) { + memset(packed_ptr + depth * block_size, + 0, + sizeof(float) * (depth_padded - depth) * block_size); + } - "ld1 {v8.4s}, [%[rhs_ptr]], #16 \n" - "ld1 {v9.4s}, [%[rhs_ptr]], #16 \n" - "ld1 {v10.4s}, [%[rhs_ptr]], #16 \n" - "ld1 {v11.4s}, [%[rhs_ptr]], #16 \n" - "ld1 {v12.4s}, [%[rhs_ptr]], #16 \n" - "ld1 {v13.4s}, [%[rhs_ptr]], #16 \n" - "ld1 {v14.4s}, [%[rhs_ptr]], #16 \n" - "ld1 {v15.4s}, [%[rhs_ptr]], #16 \n" + if (dst_major == matrix.matrix_major()) { + if (width < block_size) { + const index_t width_remain = block_size - width; + for (index_t d = 0; d < depth; ++d) { + memcpy(packed_ptr, data, sizeof(float) * width); + memset(packed_ptr + width, 0, sizeof(float) * width_remain); + 
data += depth_stride; + packed_ptr += block_size; + } + } else { + for (index_t d = 0; d < depth; ++d) { + float32x4_t vi = vld1q_f32(data); + vst1q_f32(packed_ptr, vi); + float32x4_t vin = vld1q_f32(data + 4); + vst1q_f32(packed_ptr + 4, vin); + data += depth_stride; + packed_ptr += block_size; + } + } + } else { + if (width < block_size) { + const index_t width_remain = block_size - width; + for (index_t d = 0; d < depth; ++d) { + for (index_t w = 0; w < width; ++w) { + packed_ptr[w] = data[w * width_stride + d]; + } // w + memset(packed_ptr + width, 0, sizeof(float) * width_remain); + packed_ptr += block_size; + } // d + } else { + const float *data0 = data; + const float *data1 = data + width_stride; + const float *data2 = data1 + width_stride; + const float *data3 = data2 + width_stride; + const float *data4 = data3 + width_stride; + const float *data5 = data4 + width_stride; + const float *data6 = data5 + width_stride; + const float *data7 = data6 + width_stride; - "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" - "beq 1f\n" + const index_t depth_block = depth / 4; + const index_t depth_remain = depth - depth_block * 4; + for (index_t depth_block_idx = 0; depth_block_idx < depth_block; + ++depth_block_idx) { + float32x4_t v0 = vld1q_f32(data0); + float32x4_t v1 = vld1q_f32(data1); + float32x4_t v2 = vld1q_f32(data2); + float32x4_t v3 = vld1q_f32(data3); + float32x4x2_t v02_intertwined = vzipq_f32(v0, v2); + float32x4x2_t v13_intertwined = vzipq_f32(v1, v3); + float32x4x2_t v0123_intertwined = + vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]); + float32x4x2_t v0123n_intertwined = + vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]); - "0: \n" - "fmla v16.4s, v8.4s, v0.s[0] \n" - "fmla v17.4s, v9.4s, v0.s[0] \n" - "fmla v18.4s, v8.4s, v0.s[1] \n" - "fmla v19.4s, v9.4s, v0.s[1] \n" - "fmla v20.4s, v8.4s, v0.s[2] \n" - "fmla v21.4s, v9.4s, v0.s[2] \n" - "fmla v22.4s, v8.4s, v0.s[3] \n" - "fmla v23.4s, v9.4s, v0.s[3] \n" + float32x4_t v4 = vld1q_f32(data4); + float32x4_t v5 = vld1q_f32(data5); + float32x4_t v6 = vld1q_f32(data6); + float32x4_t v7 = vld1q_f32(data7); + float32x4x2_t v46_intertwined = vzipq_f32(v4, v6); + float32x4x2_t v57_intertwined = vzipq_f32(v5, v7); + float32x4x2_t v4567_intertwined = + vzipq_f32(v46_intertwined.val[0], v57_intertwined.val[0]); + float32x4x2_t v4567n_intertwined = + vzipq_f32(v46_intertwined.val[1], v57_intertwined.val[1]); - "ld1 {v0.4s}, [%[lhs_ptr]], #16 \n" + vst1q_f32(packed_ptr, v0123_intertwined.val[0]); + packed_ptr += 4; - "fmla v24.4s, v8.4s, v1.s[0] \n" - "fmla v25.4s, v9.4s, v1.s[0] \n" - "fmla v26.4s, v8.4s, v1.s[1] \n" - "fmla v27.4s, v9.4s, v1.s[1] \n" - "fmla v28.4s, v8.4s, v1.s[2] \n" - "fmla v29.4s, v9.4s, v1.s[2] \n" - "fmla v30.4s, v8.4s, v1.s[3] \n" - "fmla v31.4s, v9.4s, v1.s[3] \n" + vst1q_f32(packed_ptr, v4567_intertwined.val[0]); + packed_ptr += 4; - "ld1 {v1.4s}, [%[lhs_ptr]], #16 \n" - "ld1 {v8.4s}, [%[rhs_ptr]], #16 \n" - "ld1 {v9.4s}, [%[rhs_ptr]], #16 \n" + vst1q_f32(packed_ptr, v0123_intertwined.val[1]); + packed_ptr += 4; - "fmla v16.4s, v10.4s, v2.s[0] \n" - "fmla v17.4s, v11.4s, v2.s[0] \n" - "fmla v18.4s, v10.4s, v2.s[1] \n" - "fmla v19.4s, v11.4s, v2.s[1] \n" - "fmla v20.4s, v10.4s, v2.s[2] \n" - "fmla v21.4s, v11.4s, v2.s[2] \n" - "fmla v22.4s, v10.4s, v2.s[3] \n" - "fmla v23.4s, v11.4s, v2.s[3] \n" + vst1q_f32(packed_ptr, v4567_intertwined.val[1]); + packed_ptr += 4; - "ld1 {v2.4s}, [%[lhs_ptr]], #16 \n" + vst1q_f32(packed_ptr, v0123n_intertwined.val[0]); + packed_ptr += 4; - "fmla 
v24.4s, v10.4s, v3.s[0] \n" - "fmla v25.4s, v11.4s, v3.s[0] \n" - "fmla v26.4s, v10.4s, v3.s[1] \n" - "fmla v27.4s, v11.4s, v3.s[1] \n" - "fmla v28.4s, v10.4s, v3.s[2] \n" - "fmla v29.4s, v11.4s, v3.s[2] \n" - "fmla v30.4s, v10.4s, v3.s[3] \n" - "fmla v31.4s, v11.4s, v3.s[3] \n" + vst1q_f32(packed_ptr, v4567n_intertwined.val[0]); + packed_ptr += 4; - "ld1 {v3.4s}, [%[lhs_ptr]], #16 \n" - "ld1 {v10.4s}, [%[rhs_ptr]], #16 \n" - "ld1 {v11.4s}, [%[rhs_ptr]], #16 \n" + vst1q_f32(packed_ptr, v0123n_intertwined.val[1]); + packed_ptr += 4; - "fmla v16.4s, v12.4s, v4.s[0] \n" - "fmla v17.4s, v13.4s, v4.s[0] \n" - "fmla v18.4s, v12.4s, v4.s[1] \n" - "fmla v19.4s, v13.4s, v4.s[1] \n" - "fmla v20.4s, v12.4s, v4.s[2] \n" - "fmla v21.4s, v13.4s, v4.s[2] \n" - "fmla v22.4s, v12.4s, v4.s[3] \n" - "fmla v23.4s, v13.4s, v4.s[3] \n" + vst1q_f32(packed_ptr, v4567n_intertwined.val[1]); + packed_ptr += 4; - "ld1 {v4.4s}, [%[lhs_ptr]], #16 \n" + data0 += 4; + data1 += 4; + data2 += 4; + data3 += 4; + data4 += 4; + data5 += 4; + data6 += 4; + data7 += 4; + } + for (index_t d = 0; d < depth_remain; ++d) { + float32x4_t vi = {*data0, *data1, *data2, *data3}; + vst1q_f32(packed_ptr, vi); + packed_ptr += 4; - "fmla v24.4s, v12.4s, v5.s[0] \n" - "fmla v25.4s, v13.4s, v5.s[0] \n" - "fmla v26.4s, v12.4s, v5.s[1] \n" - "fmla v27.4s, v13.4s, v5.s[1] \n" - "fmla v28.4s, v12.4s, v5.s[2] \n" - "fmla v29.4s, v13.4s, v5.s[2] \n" - "fmla v30.4s, v12.4s, v5.s[3] \n" - "fmla v31.4s, v13.4s, v5.s[3] \n" + float32x4_t vin = {*data4, *data5, *data6, *data7}; + vst1q_f32(packed_ptr, vin); + packed_ptr += 4; - "ld1 {v5.4s}, [%[lhs_ptr]], #16 \n" - "ld1 {v12.4s}, [%[rhs_ptr]], #16 \n" - "ld1 {v13.4s}, [%[rhs_ptr]], #16 \n" - - "fmla v16.4s, v14.4s, v6.s[0] \n" - "fmla v17.4s, v15.4s, v6.s[0] \n" - "fmla v18.4s, v14.4s, v6.s[1] \n" - "fmla v19.4s, v15.4s, v6.s[1] \n" - "fmla v20.4s, v14.4s, v6.s[2] \n" - "fmla v21.4s, v15.4s, v6.s[2] \n" - "fmla v22.4s, v14.4s, v6.s[3] \n" - "fmla v23.4s, v15.4s, v6.s[3] \n" + ++data0; + ++data1; + ++data2; + ++data3; + ++data4; + ++data5; + ++data6; + ++data7; + } // d + } + } +} - "ld1 {v6.4s}, [%[lhs_ptr]], #16 \n" +template<> +template<> +void Gemm::Unpack<4, 8>(const float *packed_output, + MatrixMap *output) { + const index_t rows = output->rows(); + const index_t cols = output->cols(); + index_t row_stride = output->rows_stride(); + index_t col_stride = output->cols_stride(); - "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" + float *output_ptr = output->data(); + const float *packed_ptr = packed_output; - "fmla v24.4s, v14.4s, v7.s[0] \n" - "fmla v25.4s, v15.4s, v7.s[0] \n" - "fmla v26.4s, v14.4s, v7.s[1] \n" - "fmla v27.4s, v15.4s, v7.s[1] \n" - "fmla v28.4s, v14.4s, v7.s[2] \n" - "fmla v29.4s, v15.4s, v7.s[2] \n" - "fmla v30.4s, v14.4s, v7.s[3] \n" - "fmla v31.4s, v15.4s, v7.s[3] \n" + const index_t block_size = 8; - "ld1 {v7.4s}, [%[lhs_ptr]], #16 \n" - "ld1 {v14.4s}, [%[rhs_ptr]], #16 \n" - "ld1 {v15.4s}, [%[rhs_ptr]], #16 \n" + // packed_output always has row-major + if (output->matrix_major() == RowMajor) { + if (cols < block_size) { + for (index_t r = 0; r < rows; ++r) { + memcpy(output_ptr, packed_ptr, sizeof(float) * cols); + output_ptr += row_stride; + packed_ptr += block_size; + } + } else { + for (index_t r = 0; r < rows; ++r) { + float32x4_t vi = vld1q_f32(packed_ptr); + vst1q_f32(output_ptr, vi); + float32x4_t vin = vld1q_f32(packed_ptr + 4); + vst1q_f32(output_ptr + 4, vin); - "bne 0b \n" + output_ptr += row_stride; + packed_ptr += block_size; + } + } + } else { + // 
ColMajor + if (rows < block_size) { + for (index_t c = 0; c < cols; ++c) { + for (index_t r = 0; r < rows; ++r) { + output_ptr[c * col_stride + r] = packed_ptr[r * block_size + c]; + } // r + } // c + } else { + const float *data0 = packed_ptr; + const float *data1 = data0 + block_size; + const float *data2 = data1 + block_size; + const float *data3 = data2 + block_size; - // prologue - "1:\n" - "fmla v16.4s, v8.4s, v0.s[0] \n" - "fmla v17.4s, v9.4s, v0.s[0] \n" - "fmla v18.4s, v8.4s, v0.s[1] \n" - "fmla v19.4s, v9.4s, v0.s[1] \n" - "fmla v20.4s, v8.4s, v0.s[2] \n" - "fmla v21.4s, v9.4s, v0.s[2] \n" - "fmla v22.4s, v8.4s, v0.s[3] \n" - "fmla v23.4s, v9.4s, v0.s[3] \n" + index_t col_block = cols / 4; + index_t col_remain = cols - col_block * 4; + for (index_t col_block_idx = 0; col_block_idx < col_block; + ++col_block_idx) { + float32x4_t v0 = vld1q_f32(data0); + float32x4_t v1 = vld1q_f32(data1); + float32x4_t v2 = vld1q_f32(data2); + float32x4_t v3 = vld1q_f32(data3); + float32x4x2_t v02_intertwined = vzipq_f32(v0, v2); + float32x4x2_t v13_intertwined = vzipq_f32(v1, v3); + float32x4x2_t v0123_intertwined = + vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]); + float32x4x2_t v0123n_intertwined = + vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]); - "fmla v24.4s, v8.4s, v1.s[0] \n" - "fmla v25.4s, v9.4s, v1.s[0] \n" - "fmla v26.4s, v8.4s, v1.s[1] \n" - "fmla v27.4s, v9.4s, v1.s[1] \n" - "fmla v28.4s, v8.4s, v1.s[2] \n" - "fmla v29.4s, v9.4s, v1.s[2] \n" - "fmla v30.4s, v8.4s, v1.s[3] \n" - "fmla v31.4s, v9.4s, v1.s[3] \n" + vst1q_f32(output_ptr, v0123_intertwined.val[0]); + output_ptr += col_stride; - "fmla v16.4s, v10.4s, v2.s[0] \n" - "fmla v17.4s, v11.4s, v2.s[0] \n" - "fmla v18.4s, v10.4s, v2.s[1] \n" - "fmla v19.4s, v11.4s, v2.s[1] \n" - "fmla v20.4s, v10.4s, v2.s[2] \n" - "fmla v21.4s, v11.4s, v2.s[2] \n" - "fmla v22.4s, v10.4s, v2.s[3] \n" - "fmla v23.4s, v11.4s, v2.s[3] \n" + vst1q_f32(output_ptr, v0123_intertwined.val[1]); + output_ptr += col_stride; - "fmla v24.4s, v10.4s, v3.s[0] \n" - "fmla v25.4s, v11.4s, v3.s[0] \n" - "fmla v26.4s, v10.4s, v3.s[1] \n" - "fmla v27.4s, v11.4s, v3.s[1] \n" - "fmla v28.4s, v10.4s, v3.s[2] \n" - "fmla v29.4s, v11.4s, v3.s[2] \n" - "fmla v30.4s, v10.4s, v3.s[3] \n" - "fmla v31.4s, v11.4s, v3.s[3] \n" + vst1q_f32(output_ptr, v0123n_intertwined.val[0]); + output_ptr += col_stride; - "fmla v16.4s, v12.4s, v4.s[0] \n" - "fmla v17.4s, v13.4s, v4.s[0] \n" - "fmla v18.4s, v12.4s, v4.s[1] \n" - "fmla v19.4s, v13.4s, v4.s[1] \n" - "fmla v20.4s, v12.4s, v4.s[2] \n" - "fmla v21.4s, v13.4s, v4.s[2] \n" - "fmla v22.4s, v12.4s, v4.s[3] \n" - "fmla v23.4s, v13.4s, v4.s[3] \n" + vst1q_f32(output_ptr, v0123n_intertwined.val[1]); + output_ptr += col_stride; - "fmla v24.4s, v12.4s, v5.s[0] \n" - "fmla v25.4s, v13.4s, v5.s[0] \n" - "fmla v26.4s, v12.4s, v5.s[1] \n" - "fmla v27.4s, v13.4s, v5.s[1] \n" - "fmla v28.4s, v12.4s, v5.s[2] \n" - "fmla v29.4s, v13.4s, v5.s[2] \n" - "fmla v30.4s, v12.4s, v5.s[3] \n" - "fmla v31.4s, v13.4s, v5.s[3] \n" + data0 += 4; + data1 += 4; + data2 += 4; + data3 += 4; + } + for (index_t c = 0; c < col_remain; ++c) { + float32x4_t vi = {*data0, *data1, *data2, *data3}; + vst1q_f32(output_ptr, vi); + output_ptr += col_stride; - "fmla v16.4s, v14.4s, v6.s[0] \n" - "fmla v17.4s, v15.4s, v6.s[0] \n" - "fmla v18.4s, v14.4s, v6.s[1] \n" - "fmla v19.4s, v15.4s, v6.s[1] \n" - "fmla v20.4s, v14.4s, v6.s[2] \n" - "fmla v21.4s, v15.4s, v6.s[2] \n" - "fmla v22.4s, v14.4s, v6.s[3] \n" - "fmla v23.4s, v15.4s, v6.s[3] \n" + ++data0; + 
++data1; + ++data2; + ++data3; + } // d + } + } +} - "fmla v24.4s, v14.4s, v7.s[0] \n" - "fmla v25.4s, v15.4s, v7.s[0] \n" - "fmla v26.4s, v14.4s, v7.s[1] \n" - "fmla v27.4s, v15.4s, v7.s[1] \n" - "fmla v28.4s, v14.4s, v7.s[2] \n" - "fmla v29.4s, v15.4s, v7.s[2] \n" - "fmla v30.4s, v14.4s, v7.s[3] \n" - "fmla v31.4s, v15.4s, v7.s[3] \n" +template<> +template<> +void Gemm::Unpack<8, 8>(const float *packed_output, + MatrixMap *output) { + const index_t rows = output->rows(); + const index_t cols = output->cols(); + index_t row_stride = output->rows_stride(); + index_t col_stride = output->cols_stride(); - "st1 {v16.4s}, [%[packed_output_data]], #16 \n" - "st1 {v17.4s}, [%[packed_output_data]], #16 \n" - "st1 {v18.4s}, [%[packed_output_data]], #16 \n" - "st1 {v19.4s}, [%[packed_output_data]], #16 \n" - "st1 {v20.4s}, [%[packed_output_data]], #16 \n" - "st1 {v21.4s}, [%[packed_output_data]], #16 \n" - "st1 {v22.4s}, [%[packed_output_data]], #16 \n" - "st1 {v23.4s}, [%[packed_output_data]], #16 \n" - "st1 {v24.4s}, [%[packed_output_data]], #16 \n" - "st1 {v25.4s}, [%[packed_output_data]], #16 \n" - "st1 {v26.4s}, [%[packed_output_data]], #16 \n" - "st1 {v27.4s}, [%[packed_output_data]], #16 \n" - "st1 {v28.4s}, [%[packed_output_data]], #16 \n" - "st1 {v29.4s}, [%[packed_output_data]], #16 \n" - "st1 {v30.4s}, [%[packed_output_data]], #16 \n" - "st1 {v31.4s}, [%[packed_output_data]], #16 \n" - : // outputs - [lhs_ptr] "+r"(lhs_ptr), - [rhs_ptr] "+r"(rhs_ptr), - [packed_output_data] "+r"(packed_output_data), - [r_depth_block_count] "+r"(r_depth_block_count) - : // inputs - : // clabbers - "cc", "memory", - "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - } -#else // armeabi-v7a - // Register layout: (4x4) x (4,8) - // - // +--------+--------+ - // | q4 ... | q5 ... | - // Rhs +--------+--------+ - // | q6 ... | q7 ... | - // +--------+--------+ - // | q4 ... | q5 ... | - // +--------+--------+ - // | q6 ... | q7 ... | - // +--------+--------+ - // - // Lhs - // - // +----+----+----+----+ - - +--------+--------+ - // | q0 | q1 | q2 | q3 | | q8... | q9... | - // | . | | | | | q10... | q11... | - // | . | | | | | q12... | q13... | - // | . | | | | | q14... | q15... | - // +----+----+----+----+ +--------+--------+ - // - // Accumulator - // + float *output_ptr = output->data(); + const float *packed_ptr = packed_output; - if (depth_block_count > 0) { - index_t r_depth_block_count = depth_block_count; - // just make compiler happy - MACE_UNUSED(r_depth_block_count); + const index_t block_size = 8; - asm volatile( - "mov r0, #0\n" - "vdup.f32 q8, r0 \n" - "vdup.f32 q9, r0 \n" - "vdup.f32 q10, r0 \n" - "vdup.f32 q11, r0 \n" - "vdup.f32 q12, r0 \n" - "vdup.f32 q13, r0 \n" - "vdup.f32 q14, r0 \n" - "vdup.f32 q15, r0 \n" - - // prelogue - "vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n" - "vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n" - "vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n" - "vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n" - - "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" - "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" - "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" - "vld1.f32 {d14-d15}, [%[rhs_ptr]]! 
\n" - - "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" - "beq 1f\n" - - "0: \n" - - "vmla.f32 q8, q4, d0[0] \n" - "vmla.f32 q9, q5, d0[0] \n" - "vmla.f32 q10, q4, d0[1] \n" - "vmla.f32 q11, q5, d0[1] \n" - "vmla.f32 q12, q4, d1[0] \n" - "vmla.f32 q13, q5, d1[0] \n" - "vmla.f32 q14, q4, d1[1] \n" - "vmla.f32 q15, q5, d1[1] \n" - - "vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n" - "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" - "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" - - "vmla.f32 q8, q6, d2[0] \n" - "vmla.f32 q9, q7, d2[0] \n" - "vmla.f32 q10, q6, d2[1] \n" - "vmla.f32 q11, q7, d2[1] \n" - "vmla.f32 q12, q6, d3[0] \n" - "vmla.f32 q13, q7, d3[0] \n" - "vmla.f32 q14, q6, d3[1] \n" - "vmla.f32 q15, q7, d3[1] \n" - - "vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n" - "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" - "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" - - "vmla.f32 q8, q4, d4[0] \n" - "vmla.f32 q9, q5, d4[0] \n" - "vmla.f32 q10, q4, d4[1] \n" - "vmla.f32 q11, q5, d4[1] \n" - "vmla.f32 q12, q4, d5[0] \n" - "vmla.f32 q13, q5, d5[0] \n" - "vmla.f32 q14, q4, d5[1] \n" - "vmla.f32 q15, q5, d5[1] \n" - - "vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n" - "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" - "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" - - "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" - - "vmla.f32 q8, q6, d6[0] \n" - "vmla.f32 q9, q7, d6[0] \n" - "vmla.f32 q10, q6, d6[1] \n" - "vmla.f32 q11, q7, d6[1] \n" - "vmla.f32 q12, q6, d7[0] \n" - "vmla.f32 q13, q7, d7[0] \n" - "vmla.f32 q14, q6, d7[1] \n" - "vmla.f32 q15, q7, d7[1] \n" + // packed_output always has row-major + if (output->matrix_major() == RowMajor) { + if (cols < block_size) { + for (index_t r = 0; r < rows; ++r) { + memcpy(output_ptr, packed_ptr, sizeof(float) * cols); + output_ptr += row_stride; + packed_ptr += block_size; + } + } else { + for (index_t r = 0; r < rows; ++r) { + float32x4_t vi = vld1q_f32(packed_ptr); + vst1q_f32(output_ptr, vi); + float32x4_t vin = vld1q_f32(packed_ptr + 4); + vst1q_f32(output_ptr + 4, vin); - "vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n" - "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" - "vld1.f32 {d14-d15}, [%[rhs_ptr]]! 
\n" + output_ptr += row_stride; + packed_ptr += block_size; + } + } + } else { + // ColMajor + if (rows < block_size) { + for (index_t c = 0; c < cols; ++c) { + for (index_t r = 0; r < rows; ++r) { + output_ptr[c * col_stride + r] = packed_ptr[r * block_size + c]; + } // r + } // c + } else { + const float *data0 = packed_ptr; + const float *data1 = data0 + block_size; + const float *data2 = data1 + block_size; + const float *data3 = data2 + block_size; + const float *data4 = data3 + block_size; + const float *data5 = data4 + block_size; + const float *data6 = data5 + block_size; + const float *data7 = data6 + block_size; - "bne 0b \n" + index_t col_block = cols / 4; + index_t col_remain = cols - col_block * 4; + for (index_t col_block_idx = 0; col_block_idx < col_block; + ++col_block_idx) { + float32x4_t v0 = vld1q_f32(data0); + float32x4_t v1 = vld1q_f32(data1); + float32x4_t v2 = vld1q_f32(data2); + float32x4_t v3 = vld1q_f32(data3); + float32x4x2_t v02_intertwined = vzipq_f32(v0, v2); + float32x4x2_t v13_intertwined = vzipq_f32(v1, v3); + float32x4x2_t v0123_intertwined = + vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]); + float32x4x2_t v0123n_intertwined = + vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]); - // prologue - "1:\n" - "vmla.f32 q8, q4, d0[0] \n" - "vmla.f32 q9, q5, d0[0] \n" - "vmla.f32 q10, q4, d0[1] \n" - "vmla.f32 q11, q5, d0[1] \n" - "vmla.f32 q12, q4, d1[0] \n" - "vmla.f32 q13, q5, d1[0] \n" - "vmla.f32 q14, q4, d1[1] \n" - "vmla.f32 q15, q5, d1[1] \n" + float32x4_t v4 = vld1q_f32(data4); + float32x4_t v5 = vld1q_f32(data5); + float32x4_t v6 = vld1q_f32(data6); + float32x4_t v7 = vld1q_f32(data7); + float32x4x2_t v46_intertwined = vzipq_f32(v4, v6); + float32x4x2_t v57_intertwined = vzipq_f32(v5, v7); + float32x4x2_t v4567_intertwined = + vzipq_f32(v46_intertwined.val[0], v57_intertwined.val[0]); + float32x4x2_t v4567n_intertwined = + vzipq_f32(v46_intertwined.val[1], v57_intertwined.val[1]); - "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" - "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" + vst1q_f32(output_ptr, v0123_intertwined.val[0]); + vst1q_f32(output_ptr + 4, v4567_intertwined.val[0]); + output_ptr += col_stride; - "vmla.f32 q8, q6, d2[0] \n" - "vmla.f32 q9, q7, d2[0] \n" - "vmla.f32 q10, q6, d2[1] \n" - "vmla.f32 q11, q7, d2[1] \n" - "vmla.f32 q12, q6, d3[0] \n" - "vmla.f32 q13, q7, d3[0] \n" - "vmla.f32 q14, q6, d3[1] \n" - "vmla.f32 q15, q7, d3[1] \n" + vst1q_f32(output_ptr, v0123_intertwined.val[1]); + vst1q_f32(output_ptr + 4, v4567_intertwined.val[1]); + output_ptr += col_stride; - "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" - "vld1.f32 {d14-d15}, [%[rhs_ptr]]! 
\n" + vst1q_f32(output_ptr, v0123n_intertwined.val[0]); + vst1q_f32(output_ptr + 4, v4567n_intertwined.val[0]); + output_ptr += col_stride; - "vmla.f32 q8, q4, d4[0] \n" - "vmla.f32 q9, q5, d4[0] \n" - "vmla.f32 q10, q4, d4[1] \n" - "vmla.f32 q11, q5, d4[1] \n" - "vmla.f32 q12, q4, d5[0] \n" - "vmla.f32 q13, q5, d5[0] \n" - "vmla.f32 q14, q4, d5[1] \n" - "vmla.f32 q15, q5, d5[1] \n" + vst1q_f32(output_ptr, v0123n_intertwined.val[1]); + vst1q_f32(output_ptr + 4, v4567n_intertwined.val[1]); + output_ptr += col_stride; - "vmla.f32 q8, q6, d6[0] \n" - "vmla.f32 q9, q7, d6[0] \n" - "vmla.f32 q10, q6, d6[1] \n" - "vmla.f32 q11, q7, d6[1] \n" - "vmla.f32 q12, q6, d7[0] \n" - "vmla.f32 q13, q7, d7[0] \n" - "vmla.f32 q14, q6, d7[1] \n" - "vmla.f32 q15, q7, d7[1] \n" + data0 += 4; + data1 += 4; + data2 += 4; + data3 += 4; + data4 += 4; + data5 += 4; + data6 += 4; + data7 += 4; + } + for (index_t c = 0; c < col_remain; ++c) { + float32x4_t vi = {*data0, *data1, *data2, *data3}; + vst1q_f32(output_ptr, vi); + float32x4_t vin = {*data4, *data5, *data6, *data7}; + vst1q_f32(output_ptr + 4, vin); + output_ptr += col_stride; - "vst1.f32 {d16-d17}, [%[packed_output_data]]! \n" - "vst1.f32 {d18-d19}, [%[packed_output_data]]! \n" - "vst1.f32 {d20-d21}, [%[packed_output_data]]! \n" - "vst1.f32 {d22-d23}, [%[packed_output_data]]! \n" - "vst1.f32 {d24-d25}, [%[packed_output_data]]! \n" - "vst1.f32 {d26-d27}, [%[packed_output_data]]! \n" - "vst1.f32 {d28-d29}, [%[packed_output_data]]! \n" - "vst1.f32 {d30-d31}, [%[packed_output_data]]! \n" - : // outputs - [lhs_ptr] "+r"(lhs_ptr), - [rhs_ptr] "+r"(rhs_ptr), - [packed_output_data] "+r"(packed_output_data), - [r_depth_block_count] "+r"(r_depth_block_count) - : // inputs - : // clabbers - "cc", "memory", "r0", - "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + ++data0; + ++data1; + ++data2; + ++data3; + ++data4; + ++data5; + ++data6; + ++data7; + } // d + } } -#endif } -void Gemm::PackLhs(const MatrixMap &lhs, - float *packed_lhs) { +template<> +void Gemm::PackLhs(const MatrixMap &lhs, + float *packed_lhs) { #ifdef __aarch64__ Pack<8, 4>(lhs, ColMajor, packed_lhs); #else @@ -700,12 +511,15 @@ void Gemm::PackLhs(const MatrixMap &lhs, #endif } -void Gemm::PackRhs(const MatrixMap &rhs, - float *packed_rhs) { +template<> +void Gemm::PackRhs(const MatrixMap &rhs, + float *packed_rhs) { Pack<8, 4>(rhs, RowMajor, packed_rhs); } -void Gemm::UnpackOutput(const float *packed_output, MatrixMap *output) { +template<> +void Gemm::UnpackOutput(const float *packed_output, + MatrixMap *output) { #ifdef __aarch64__ Unpack<8, 8>(packed_output, output); #else @@ -714,523 +528,670 @@ void Gemm::UnpackOutput(const float *packed_output, MatrixMap *output) { } template<> -void Gemm::Pack<4, 4>(const MatrixMap &matrix, - MatrixMajor dst_major, - float *packed_matrix) { - const index_t rows = matrix.rows(); - const index_t cols = matrix.cols(); - - // use the same terminology as GemmLowp: - // depth is depth, width is the opposite dim other than depth - // lhs - index_t width = rows; - index_t depth = cols; - index_t width_stride = matrix.rows_stride(); - index_t depth_stride = matrix.cols_stride(); - if (dst_major == RowMajor) { - // rhs - std::swap(width, depth); - std::swap(width_stride, depth_stride); +void Gemm::ComputeBlock(const float *packed_lhs_data, + const float *packed_rhs_data, + const index_t depth_padded, + float *packed_output_data) { + /* Ref: + for (index_t r = 0; r < block_size; ++r) { + for (index_t c = 0; c < 
block_size; ++c) { + float sum = 0; + for (index_t d = 0; d < depth; ++d) { + // (r, d) * (d, c) + sum += packed_lhs_data[d * r_block_size + r] + * packed_rhs_data[d * c_block_size + c]; + } + packed_output_data[r * c_block_size + c] = sum; + } } - const float *data = matrix.data(); - float *packed_ptr = packed_matrix; - - const index_t block_size = 4; - const index_t depth_padded = RoundUp(depth, static_cast(4)); + */ + const float *lhs_ptr = packed_lhs_data; + const float *rhs_ptr = packed_rhs_data; - if (depth_padded > depth) { - memset(packed_ptr + depth * block_size, - 0, - sizeof(float) * (depth_padded - depth) * block_size); - } + const index_t depth_block_count = depth_padded / 4; - if (dst_major == matrix.matrix_major()) { - if (width < block_size) { - const index_t width_remain = block_size - width; - for (index_t d = 0; d < depth; ++d) { - memcpy(packed_ptr, data, sizeof(float) * width); - memset(packed_ptr + width, 0, sizeof(float) * width_remain); - data += depth_stride; - packed_ptr += block_size; - } - } else { - for (index_t d = 0; d < depth; ++d) { - float32x4_t vi = vld1q_f32(data); - vst1q_f32(packed_ptr, vi); - data += depth_stride; - packed_ptr += block_size; - } - } - } else { - if (width < block_size) { - const index_t width_remain = block_size - width; - for (index_t d = 0; d < depth; ++d) { - for (index_t w = 0; w < width; ++w) { - packed_ptr[w] = data[w * width_stride + d]; - } // w - memset(packed_ptr + width, 0, sizeof(float) * width_remain); - packed_ptr += block_size; - } // d - } else { - const float *data0 = data; - const float *data1 = data + width_stride; - const float *data2 = data1 + width_stride; - const float *data3 = data2 + width_stride; +#ifdef __aarch64__ + // Register layout: (8x4) x (4,8) + // + // +--------+--------+ + // | v8 ... | v9 ... | + // Rhs +--------+--------+ + // | v10... | v11... | + // +--------+--------+ + // | v12... | v13... | + // +--------+--------+ + // | v14... | v15... | + // +--------+--------+ + // + // Lhs + // + // +----+----+----+----+ - - +--------+--------+ + // | v0 | v2 | v4 | v6 | | v16... | v17... | + // | . | | | | | v18... | v19... | + // | . | | | | | v20... | v21... | + // | . | | | | | v22... | v23... | + // +----+----|----+----+ +--------+--------+ + // | v1 | v3 | v5 | v7 | | v24... | v25... | + // | . | | | | | v26... | v27... | + // | . | | | | | v28... | v29... | + // | . | | | | | v30... | v31... 
| + // +----+----|----+----+ +--------+--------+ + // + // Accumulator + // - const index_t depth_block = depth / 4; - const index_t depth_remain = depth - depth_block * 4; - for (index_t depth_block_idx = 0; depth_block_idx < depth_block; - ++depth_block_idx) { - float32x4_t v0 = vld1q_f32(data0); - float32x4_t v1 = vld1q_f32(data1); - float32x4_t v2 = vld1q_f32(data2); - float32x4_t v3 = vld1q_f32(data3); - float32x4x2_t v02_intertwined = vzipq_f32(v0, v2); - float32x4x2_t v13_intertwined = vzipq_f32(v1, v3); - float32x4x2_t v0123_intertwined = - vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]); - float32x4x2_t v0123n_intertwined = - vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]); + if (depth_block_count > 0) { + index_t r_depth_block_count = depth_block_count; + // just make compiler happy + MACE_UNUSED(r_depth_block_count); - vst1q_f32(packed_ptr, v0123_intertwined.val[0]); - packed_ptr += 4; + asm volatile( + "dup v16.4s, wzr \n" + "dup v17.4s, wzr \n" + "dup v18.4s, wzr \n" + "dup v19.4s, wzr \n" + "dup v20.4s, wzr \n" + "dup v21.4s, wzr \n" + "dup v22.4s, wzr \n" + "dup v23.4s, wzr \n" + "dup v24.4s, wzr \n" + "dup v25.4s, wzr \n" + "dup v26.4s, wzr \n" + "dup v27.4s, wzr \n" + "dup v28.4s, wzr \n" + "dup v29.4s, wzr \n" + "dup v30.4s, wzr \n" + "dup v31.4s, wzr \n" - vst1q_f32(packed_ptr, v0123_intertwined.val[1]); - packed_ptr += 4; + // prelogue + "ld1 {v0.4s}, [%[lhs_ptr]], #16 \n" + "ld1 {v1.4s}, [%[lhs_ptr]], #16 \n" + "ld1 {v2.4s}, [%[lhs_ptr]], #16 \n" + "ld1 {v3.4s}, [%[lhs_ptr]], #16 \n" + "ld1 {v4.4s}, [%[lhs_ptr]], #16 \n" + "ld1 {v5.4s}, [%[lhs_ptr]], #16 \n" + "ld1 {v6.4s}, [%[lhs_ptr]], #16 \n" + "ld1 {v7.4s}, [%[lhs_ptr]], #16 \n" - vst1q_f32(packed_ptr, v0123n_intertwined.val[0]); - packed_ptr += 4; + "ld1 {v8.4s}, [%[rhs_ptr]], #16 \n" + "ld1 {v9.4s}, [%[rhs_ptr]], #16 \n" + "ld1 {v10.4s}, [%[rhs_ptr]], #16 \n" + "ld1 {v11.4s}, [%[rhs_ptr]], #16 \n" + "ld1 {v12.4s}, [%[rhs_ptr]], #16 \n" + "ld1 {v13.4s}, [%[rhs_ptr]], #16 \n" + "ld1 {v14.4s}, [%[rhs_ptr]], #16 \n" + "ld1 {v15.4s}, [%[rhs_ptr]], #16 \n" - vst1q_f32(packed_ptr, v0123n_intertwined.val[1]); - packed_ptr += 4; + "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" + "beq 1f\n" - data0 += 4; - data1 += 4; - data2 += 4; - data3 += 4; - } - for (index_t d = 0; d < depth_remain; ++d) { - float32x4_t vi = {*data0, *data1, *data2, *data3}; - vst1q_f32(packed_ptr, vi); - packed_ptr += 4; + "0: \n" + "fmla v16.4s, v8.4s, v0.s[0] \n" + "fmla v17.4s, v9.4s, v0.s[0] \n" + "fmla v18.4s, v8.4s, v0.s[1] \n" + "fmla v19.4s, v9.4s, v0.s[1] \n" + "fmla v20.4s, v8.4s, v0.s[2] \n" + "fmla v21.4s, v9.4s, v0.s[2] \n" + "fmla v22.4s, v8.4s, v0.s[3] \n" + "fmla v23.4s, v9.4s, v0.s[3] \n" - ++data0; - ++data1; - ++data2; - ++data3; - } // d - } - } -} + "ld1 {v0.4s}, [%[lhs_ptr]], #16 \n" -template<> -void Gemm::Pack<8, 4>(const MatrixMap &matrix, - MatrixMajor dst_major, - float *packed_matrix) { - const index_t rows = matrix.rows(); - const index_t cols = matrix.cols(); + "fmla v24.4s, v8.4s, v1.s[0] \n" + "fmla v25.4s, v9.4s, v1.s[0] \n" + "fmla v26.4s, v8.4s, v1.s[1] \n" + "fmla v27.4s, v9.4s, v1.s[1] \n" + "fmla v28.4s, v8.4s, v1.s[2] \n" + "fmla v29.4s, v9.4s, v1.s[2] \n" + "fmla v30.4s, v8.4s, v1.s[3] \n" + "fmla v31.4s, v9.4s, v1.s[3] \n" - // use the same terminology as GemmLowp: - // depth is depth, width is the opposite dim other than depth - // lhs - index_t width = rows; - index_t depth = cols; - index_t width_stride = matrix.rows_stride(); - index_t depth_stride = matrix.cols_stride(); 
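
The Pack<4, 4>/Pack<8, 4> bodies relocated above lay a width x depth panel out depth-major: block_size contiguous floats per depth step, zero-padded in both width and depth so ComputeBlock can always issue full vld1q_f32 loads. A scalar reference of the transposed slow path, for a single panel with width <= block_size (illustrative sketch only; PackPanelRef is our name, not a MACE API):

    #include <algorithm>
    #include <vector>

    // Packs one width x depth panel from a width-major source into the
    // depth-major layout the NEON block kernel consumes: block_size
    // contiguous floats per depth step, zero-padded on both axes.
    std::vector<float> PackPanelRef(const float *data, int width, int depth,
                                    int width_stride, int block_size) {
      const int depth_padded = (depth + 3) / 4 * 4;  // depth rounded up to 4
      std::vector<float> packed(block_size * depth_padded, 0.0f);
      for (int d = 0; d < depth; ++d) {
        for (int w = 0; w < std::min(width, block_size); ++w) {
          // element (w, d) of the source panel
          packed[d * block_size + w] = data[w * width_stride + d];
        }
      }
      return packed;
    }
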
- if (dst_major == RowMajor) { - // rhs - std::swap(width, depth); - std::swap(width_stride, depth_stride); - } - const float *data = matrix.data(); - float *packed_ptr = packed_matrix; + "ld1 {v1.4s}, [%[lhs_ptr]], #16 \n" + "ld1 {v8.4s}, [%[rhs_ptr]], #16 \n" + "ld1 {v9.4s}, [%[rhs_ptr]], #16 \n" - const index_t block_size = 8; - const index_t depth_padded = RoundUp(depth, static_cast(4)); + "fmla v16.4s, v10.4s, v2.s[0] \n" + "fmla v17.4s, v11.4s, v2.s[0] \n" + "fmla v18.4s, v10.4s, v2.s[1] \n" + "fmla v19.4s, v11.4s, v2.s[1] \n" + "fmla v20.4s, v10.4s, v2.s[2] \n" + "fmla v21.4s, v11.4s, v2.s[2] \n" + "fmla v22.4s, v10.4s, v2.s[3] \n" + "fmla v23.4s, v11.4s, v2.s[3] \n" - if (depth_padded > depth) { - memset(packed_ptr + depth * block_size, - 0, - sizeof(float) * (depth_padded - depth) * block_size); + "ld1 {v2.4s}, [%[lhs_ptr]], #16 \n" + + "fmla v24.4s, v10.4s, v3.s[0] \n" + "fmla v25.4s, v11.4s, v3.s[0] \n" + "fmla v26.4s, v10.4s, v3.s[1] \n" + "fmla v27.4s, v11.4s, v3.s[1] \n" + "fmla v28.4s, v10.4s, v3.s[2] \n" + "fmla v29.4s, v11.4s, v3.s[2] \n" + "fmla v30.4s, v10.4s, v3.s[3] \n" + "fmla v31.4s, v11.4s, v3.s[3] \n" + + "ld1 {v3.4s}, [%[lhs_ptr]], #16 \n" + "ld1 {v10.4s}, [%[rhs_ptr]], #16 \n" + "ld1 {v11.4s}, [%[rhs_ptr]], #16 \n" + + "fmla v16.4s, v12.4s, v4.s[0] \n" + "fmla v17.4s, v13.4s, v4.s[0] \n" + "fmla v18.4s, v12.4s, v4.s[1] \n" + "fmla v19.4s, v13.4s, v4.s[1] \n" + "fmla v20.4s, v12.4s, v4.s[2] \n" + "fmla v21.4s, v13.4s, v4.s[2] \n" + "fmla v22.4s, v12.4s, v4.s[3] \n" + "fmla v23.4s, v13.4s, v4.s[3] \n" + + "ld1 {v4.4s}, [%[lhs_ptr]], #16 \n" + + "fmla v24.4s, v12.4s, v5.s[0] \n" + "fmla v25.4s, v13.4s, v5.s[0] \n" + "fmla v26.4s, v12.4s, v5.s[1] \n" + "fmla v27.4s, v13.4s, v5.s[1] \n" + "fmla v28.4s, v12.4s, v5.s[2] \n" + "fmla v29.4s, v13.4s, v5.s[2] \n" + "fmla v30.4s, v12.4s, v5.s[3] \n" + "fmla v31.4s, v13.4s, v5.s[3] \n" + + "ld1 {v5.4s}, [%[lhs_ptr]], #16 \n" + "ld1 {v12.4s}, [%[rhs_ptr]], #16 \n" + "ld1 {v13.4s}, [%[rhs_ptr]], #16 \n" + + "fmla v16.4s, v14.4s, v6.s[0] \n" + "fmla v17.4s, v15.4s, v6.s[0] \n" + "fmla v18.4s, v14.4s, v6.s[1] \n" + "fmla v19.4s, v15.4s, v6.s[1] \n" + "fmla v20.4s, v14.4s, v6.s[2] \n" + "fmla v21.4s, v15.4s, v6.s[2] \n" + "fmla v22.4s, v14.4s, v6.s[3] \n" + "fmla v23.4s, v15.4s, v6.s[3] \n" + + "ld1 {v6.4s}, [%[lhs_ptr]], #16 \n" + + "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" + + "fmla v24.4s, v14.4s, v7.s[0] \n" + "fmla v25.4s, v15.4s, v7.s[0] \n" + "fmla v26.4s, v14.4s, v7.s[1] \n" + "fmla v27.4s, v15.4s, v7.s[1] \n" + "fmla v28.4s, v14.4s, v7.s[2] \n" + "fmla v29.4s, v15.4s, v7.s[2] \n" + "fmla v30.4s, v14.4s, v7.s[3] \n" + "fmla v31.4s, v15.4s, v7.s[3] \n" + + "ld1 {v7.4s}, [%[lhs_ptr]], #16 \n" + "ld1 {v14.4s}, [%[rhs_ptr]], #16 \n" + "ld1 {v15.4s}, [%[rhs_ptr]], #16 \n" + + "bne 0b \n" + + // prologue + "1:\n" + "fmla v16.4s, v8.4s, v0.s[0] \n" + "fmla v17.4s, v9.4s, v0.s[0] \n" + "fmla v18.4s, v8.4s, v0.s[1] \n" + "fmla v19.4s, v9.4s, v0.s[1] \n" + "fmla v20.4s, v8.4s, v0.s[2] \n" + "fmla v21.4s, v9.4s, v0.s[2] \n" + "fmla v22.4s, v8.4s, v0.s[3] \n" + "fmla v23.4s, v9.4s, v0.s[3] \n" + + "fmla v24.4s, v8.4s, v1.s[0] \n" + "fmla v25.4s, v9.4s, v1.s[0] \n" + "fmla v26.4s, v8.4s, v1.s[1] \n" + "fmla v27.4s, v9.4s, v1.s[1] \n" + "fmla v28.4s, v8.4s, v1.s[2] \n" + "fmla v29.4s, v9.4s, v1.s[2] \n" + "fmla v30.4s, v8.4s, v1.s[3] \n" + "fmla v31.4s, v9.4s, v1.s[3] \n" + + "fmla v16.4s, v10.4s, v2.s[0] \n" + "fmla v17.4s, v11.4s, v2.s[0] \n" + "fmla v18.4s, v10.4s, v2.s[1] \n" + "fmla v19.4s, v11.4s, v2.s[1] \n" + 
"fmla v20.4s, v10.4s, v2.s[2] \n" + "fmla v21.4s, v11.4s, v2.s[2] \n" + "fmla v22.4s, v10.4s, v2.s[3] \n" + "fmla v23.4s, v11.4s, v2.s[3] \n" + + "fmla v24.4s, v10.4s, v3.s[0] \n" + "fmla v25.4s, v11.4s, v3.s[0] \n" + "fmla v26.4s, v10.4s, v3.s[1] \n" + "fmla v27.4s, v11.4s, v3.s[1] \n" + "fmla v28.4s, v10.4s, v3.s[2] \n" + "fmla v29.4s, v11.4s, v3.s[2] \n" + "fmla v30.4s, v10.4s, v3.s[3] \n" + "fmla v31.4s, v11.4s, v3.s[3] \n" + + "fmla v16.4s, v12.4s, v4.s[0] \n" + "fmla v17.4s, v13.4s, v4.s[0] \n" + "fmla v18.4s, v12.4s, v4.s[1] \n" + "fmla v19.4s, v13.4s, v4.s[1] \n" + "fmla v20.4s, v12.4s, v4.s[2] \n" + "fmla v21.4s, v13.4s, v4.s[2] \n" + "fmla v22.4s, v12.4s, v4.s[3] \n" + "fmla v23.4s, v13.4s, v4.s[3] \n" + + "fmla v24.4s, v12.4s, v5.s[0] \n" + "fmla v25.4s, v13.4s, v5.s[0] \n" + "fmla v26.4s, v12.4s, v5.s[1] \n" + "fmla v27.4s, v13.4s, v5.s[1] \n" + "fmla v28.4s, v12.4s, v5.s[2] \n" + "fmla v29.4s, v13.4s, v5.s[2] \n" + "fmla v30.4s, v12.4s, v5.s[3] \n" + "fmla v31.4s, v13.4s, v5.s[3] \n" + + "fmla v16.4s, v14.4s, v6.s[0] \n" + "fmla v17.4s, v15.4s, v6.s[0] \n" + "fmla v18.4s, v14.4s, v6.s[1] \n" + "fmla v19.4s, v15.4s, v6.s[1] \n" + "fmla v20.4s, v14.4s, v6.s[2] \n" + "fmla v21.4s, v15.4s, v6.s[2] \n" + "fmla v22.4s, v14.4s, v6.s[3] \n" + "fmla v23.4s, v15.4s, v6.s[3] \n" + + "fmla v24.4s, v14.4s, v7.s[0] \n" + "fmla v25.4s, v15.4s, v7.s[0] \n" + "fmla v26.4s, v14.4s, v7.s[1] \n" + "fmla v27.4s, v15.4s, v7.s[1] \n" + "fmla v28.4s, v14.4s, v7.s[2] \n" + "fmla v29.4s, v15.4s, v7.s[2] \n" + "fmla v30.4s, v14.4s, v7.s[3] \n" + "fmla v31.4s, v15.4s, v7.s[3] \n" + + "st1 {v16.4s}, [%[packed_output_data]], #16 \n" + "st1 {v17.4s}, [%[packed_output_data]], #16 \n" + "st1 {v18.4s}, [%[packed_output_data]], #16 \n" + "st1 {v19.4s}, [%[packed_output_data]], #16 \n" + "st1 {v20.4s}, [%[packed_output_data]], #16 \n" + "st1 {v21.4s}, [%[packed_output_data]], #16 \n" + "st1 {v22.4s}, [%[packed_output_data]], #16 \n" + "st1 {v23.4s}, [%[packed_output_data]], #16 \n" + "st1 {v24.4s}, [%[packed_output_data]], #16 \n" + "st1 {v25.4s}, [%[packed_output_data]], #16 \n" + "st1 {v26.4s}, [%[packed_output_data]], #16 \n" + "st1 {v27.4s}, [%[packed_output_data]], #16 \n" + "st1 {v28.4s}, [%[packed_output_data]], #16 \n" + "st1 {v29.4s}, [%[packed_output_data]], #16 \n" + "st1 {v30.4s}, [%[packed_output_data]], #16 \n" + "st1 {v31.4s}, [%[packed_output_data]], #16 \n" + : // outputs + [lhs_ptr] "+r"(lhs_ptr), + [rhs_ptr] "+r"(rhs_ptr), + [packed_output_data] "+r"(packed_output_data), + [r_depth_block_count] "+r"(r_depth_block_count) + : // inputs + : // clabbers + "cc", "memory", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } +#else // armeabi-v7a + // Register layout: (4x4) x (4,8) + // + // +--------+--------+ + // | q4 ... | q5 ... | + // Rhs +--------+--------+ + // | q6 ... | q7 ... | + // +--------+--------+ + // | q4 ... | q5 ... | + // +--------+--------+ + // | q6 ... | q7 ... | + // +--------+--------+ + // + // Lhs + // + // +----+----+----+----+ - - +--------+--------+ + // | q0 | q1 | q2 | q3 | | q8... | q9... | + // | . | | | | | q10... | q11... | + // | . | | | | | q12... | q13... | + // | . | | | | | q14... | q15... 
| + // +----+----+----+----+ +--------+--------+ + // + // Accumulator + // - if (dst_major == matrix.matrix_major()) { - if (width < block_size) { - const index_t width_remain = block_size - width; - for (index_t d = 0; d < depth; ++d) { - memcpy(packed_ptr, data, sizeof(float) * width); - memset(packed_ptr + width, 0, sizeof(float) * width_remain); - data += depth_stride; - packed_ptr += block_size; - } - } else { - for (index_t d = 0; d < depth; ++d) { - float32x4_t vi = vld1q_f32(data); - vst1q_f32(packed_ptr, vi); - float32x4_t vin = vld1q_f32(data + 4); - vst1q_f32(packed_ptr + 4, vin); - data += depth_stride; - packed_ptr += block_size; - } - } - } else { - if (width < block_size) { - const index_t width_remain = block_size - width; - for (index_t d = 0; d < depth; ++d) { - for (index_t w = 0; w < width; ++w) { - packed_ptr[w] = data[w * width_stride + d]; - } // w - memset(packed_ptr + width, 0, sizeof(float) * width_remain); - packed_ptr += block_size; - } // d - } else { - const float *data0 = data; - const float *data1 = data + width_stride; - const float *data2 = data1 + width_stride; - const float *data3 = data2 + width_stride; - const float *data4 = data3 + width_stride; - const float *data5 = data4 + width_stride; - const float *data6 = data5 + width_stride; - const float *data7 = data6 + width_stride; + if (depth_block_count > 0) { + index_t r_depth_block_count = depth_block_count; + // just make compiler happy + MACE_UNUSED(r_depth_block_count); - const index_t depth_block = depth / 4; - const index_t depth_remain = depth - depth_block * 4; - for (index_t depth_block_idx = 0; depth_block_idx < depth_block; - ++depth_block_idx) { - float32x4_t v0 = vld1q_f32(data0); - float32x4_t v1 = vld1q_f32(data1); - float32x4_t v2 = vld1q_f32(data2); - float32x4_t v3 = vld1q_f32(data3); - float32x4x2_t v02_intertwined = vzipq_f32(v0, v2); - float32x4x2_t v13_intertwined = vzipq_f32(v1, v3); - float32x4x2_t v0123_intertwined = - vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]); - float32x4x2_t v0123n_intertwined = - vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]); + asm volatile( + "mov r0, #0\n" + "vdup.f32 q8, r0 \n" + "vdup.f32 q9, r0 \n" + "vdup.f32 q10, r0 \n" + "vdup.f32 q11, r0 \n" + "vdup.f32 q12, r0 \n" + "vdup.f32 q13, r0 \n" + "vdup.f32 q14, r0 \n" + "vdup.f32 q15, r0 \n" - float32x4_t v4 = vld1q_f32(data4); - float32x4_t v5 = vld1q_f32(data5); - float32x4_t v6 = vld1q_f32(data6); - float32x4_t v7 = vld1q_f32(data7); - float32x4x2_t v46_intertwined = vzipq_f32(v4, v6); - float32x4x2_t v57_intertwined = vzipq_f32(v5, v7); - float32x4x2_t v4567_intertwined = - vzipq_f32(v46_intertwined.val[0], v57_intertwined.val[0]); - float32x4x2_t v4567n_intertwined = - vzipq_f32(v46_intertwined.val[1], v57_intertwined.val[1]); + // prelogue + "vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n" + "vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n" + "vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n" + "vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n" - vst1q_f32(packed_ptr, v0123_intertwined.val[0]); - packed_ptr += 4; + "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" + "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" + "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" + "vld1.f32 {d14-d15}, [%[rhs_ptr]]! 
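+        // main loop: one depth block of 4 per iteration, with the operand
+        // loads for the next block interleaved between the MAC groups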
\n" - vst1q_f32(packed_ptr, v4567_intertwined.val[0]); - packed_ptr += 4; + "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" + "beq 1f\n" - vst1q_f32(packed_ptr, v0123_intertwined.val[1]); - packed_ptr += 4; + "0: \n" - vst1q_f32(packed_ptr, v4567_intertwined.val[1]); - packed_ptr += 4; + "vmla.f32 q8, q4, d0[0] \n" + "vmla.f32 q9, q5, d0[0] \n" + "vmla.f32 q10, q4, d0[1] \n" + "vmla.f32 q11, q5, d0[1] \n" + "vmla.f32 q12, q4, d1[0] \n" + "vmla.f32 q13, q5, d1[0] \n" + "vmla.f32 q14, q4, d1[1] \n" + "vmla.f32 q15, q5, d1[1] \n" - vst1q_f32(packed_ptr, v0123n_intertwined.val[0]); - packed_ptr += 4; + "vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n" + "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" + "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" - vst1q_f32(packed_ptr, v4567n_intertwined.val[0]); - packed_ptr += 4; + "vmla.f32 q8, q6, d2[0] \n" + "vmla.f32 q9, q7, d2[0] \n" + "vmla.f32 q10, q6, d2[1] \n" + "vmla.f32 q11, q7, d2[1] \n" + "vmla.f32 q12, q6, d3[0] \n" + "vmla.f32 q13, q7, d3[0] \n" + "vmla.f32 q14, q6, d3[1] \n" + "vmla.f32 q15, q7, d3[1] \n" - vst1q_f32(packed_ptr, v0123n_intertwined.val[1]); - packed_ptr += 4; + "vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n" + "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" + "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" - vst1q_f32(packed_ptr, v4567n_intertwined.val[1]); - packed_ptr += 4; + "vmla.f32 q8, q4, d4[0] \n" + "vmla.f32 q9, q5, d4[0] \n" + "vmla.f32 q10, q4, d4[1] \n" + "vmla.f32 q11, q5, d4[1] \n" + "vmla.f32 q12, q4, d5[0] \n" + "vmla.f32 q13, q5, d5[0] \n" + "vmla.f32 q14, q4, d5[1] \n" + "vmla.f32 q15, q5, d5[1] \n" - data0 += 4; - data1 += 4; - data2 += 4; - data3 += 4; - data4 += 4; - data5 += 4; - data6 += 4; - data7 += 4; - } - for (index_t d = 0; d < depth_remain; ++d) { - float32x4_t vi = {*data0, *data1, *data2, *data3}; - vst1q_f32(packed_ptr, vi); - packed_ptr += 4; + "vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n" + "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" + "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" - float32x4_t vin = {*data4, *data5, *data6, *data7}; - vst1q_f32(packed_ptr, vin); - packed_ptr += 4; + "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" - ++data0; - ++data1; - ++data2; - ++data3; - ++data4; - ++data5; - ++data6; - ++data7; - } // d - } - } -} + "vmla.f32 q8, q6, d6[0] \n" + "vmla.f32 q9, q7, d6[0] \n" + "vmla.f32 q10, q6, d6[1] \n" + "vmla.f32 q11, q7, d6[1] \n" + "vmla.f32 q12, q6, d7[0] \n" + "vmla.f32 q13, q7, d7[0] \n" + "vmla.f32 q14, q6, d7[1] \n" + "vmla.f32 q15, q7, d7[1] \n" -template<> -void Gemm::Unpack<4, 8>(const float *packed_output, MatrixMap *output) { - const index_t rows = output->rows(); - const index_t cols = output->cols(); - index_t row_stride = output->rows_stride(); - index_t col_stride = output->cols_stride(); + "vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n" + "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" + "vld1.f32 {d14-d15}, [%[rhs_ptr]]! 
\n" - float *output_ptr = output->data(); - const float *packed_ptr = packed_output; + "bne 0b \n" - const index_t block_size = 8; + // prologue + "1:\n" + "vmla.f32 q8, q4, d0[0] \n" + "vmla.f32 q9, q5, d0[0] \n" + "vmla.f32 q10, q4, d0[1] \n" + "vmla.f32 q11, q5, d0[1] \n" + "vmla.f32 q12, q4, d1[0] \n" + "vmla.f32 q13, q5, d1[0] \n" + "vmla.f32 q14, q4, d1[1] \n" + "vmla.f32 q15, q5, d1[1] \n" - // packed_output always has row-major - if (output->matrix_major() == RowMajor) { - if (cols < block_size) { - for (index_t r = 0; r < rows; ++r) { - memcpy(output_ptr, packed_ptr, sizeof(float) * cols); - output_ptr += row_stride; - packed_ptr += block_size; - } - } else { - for (index_t r = 0; r < rows; ++r) { - float32x4_t vi = vld1q_f32(packed_ptr); - vst1q_f32(output_ptr, vi); - float32x4_t vin = vld1q_f32(packed_ptr + 4); - vst1q_f32(output_ptr + 4, vin); + "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" + "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" - output_ptr += row_stride; - packed_ptr += block_size; - } - } - } else { - // ColMajor - if (rows < block_size) { - for (index_t c = 0; c < cols; ++c) { - for (index_t r = 0; r < rows; ++r) { - output_ptr[c * col_stride + r] = packed_ptr[r * block_size + c]; - } // r - } // c - } else { - const float *data0 = packed_ptr; - const float *data1 = data0 + block_size; - const float *data2 = data1 + block_size; - const float *data3 = data2 + block_size; + "vmla.f32 q8, q6, d2[0] \n" + "vmla.f32 q9, q7, d2[0] \n" + "vmla.f32 q10, q6, d2[1] \n" + "vmla.f32 q11, q7, d2[1] \n" + "vmla.f32 q12, q6, d3[0] \n" + "vmla.f32 q13, q7, d3[0] \n" + "vmla.f32 q14, q6, d3[1] \n" + "vmla.f32 q15, q7, d3[1] \n" - index_t col_block = cols / 4; - index_t col_remain = cols - col_block * 4; - for (index_t col_block_idx = 0; col_block_idx < col_block; - ++col_block_idx) { - float32x4_t v0 = vld1q_f32(data0); - float32x4_t v1 = vld1q_f32(data1); - float32x4_t v2 = vld1q_f32(data2); - float32x4_t v3 = vld1q_f32(data3); - float32x4x2_t v02_intertwined = vzipq_f32(v0, v2); - float32x4x2_t v13_intertwined = vzipq_f32(v1, v3); - float32x4x2_t v0123_intertwined = - vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]); - float32x4x2_t v0123n_intertwined = - vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]); + "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" + "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" - vst1q_f32(output_ptr, v0123_intertwined.val[0]); - output_ptr += col_stride; + "vmla.f32 q8, q4, d4[0] \n" + "vmla.f32 q9, q5, d4[0] \n" + "vmla.f32 q10, q4, d4[1] \n" + "vmla.f32 q11, q5, d4[1] \n" + "vmla.f32 q12, q4, d5[0] \n" + "vmla.f32 q13, q5, d5[0] \n" + "vmla.f32 q14, q4, d5[1] \n" + "vmla.f32 q15, q5, d5[1] \n" - vst1q_f32(output_ptr, v0123_intertwined.val[1]); - output_ptr += col_stride; + "vmla.f32 q8, q6, d6[0] \n" + "vmla.f32 q9, q7, d6[0] \n" + "vmla.f32 q10, q6, d6[1] \n" + "vmla.f32 q11, q7, d6[1] \n" + "vmla.f32 q12, q6, d7[0] \n" + "vmla.f32 q13, q7, d7[0] \n" + "vmla.f32 q14, q6, d7[1] \n" + "vmla.f32 q15, q7, d7[1] \n" - vst1q_f32(output_ptr, v0123n_intertwined.val[0]); - output_ptr += col_stride; + "vst1.f32 {d16-d17}, [%[packed_output_data]]! \n" + "vst1.f32 {d18-d19}, [%[packed_output_data]]! \n" + "vst1.f32 {d20-d21}, [%[packed_output_data]]! \n" + "vst1.f32 {d22-d23}, [%[packed_output_data]]! \n" + "vst1.f32 {d24-d25}, [%[packed_output_data]]! \n" + "vst1.f32 {d26-d27}, [%[packed_output_data]]! \n" + "vst1.f32 {d28-d29}, [%[packed_output_data]]! \n" + "vst1.f32 {d30-d31}, [%[packed_output_data]]! 
\n" + : // outputs + [lhs_ptr] "+r"(lhs_ptr), + [rhs_ptr] "+r"(rhs_ptr), + [packed_output_data] "+r"(packed_output_data), + [r_depth_block_count] "+r"(r_depth_block_count) + : // inputs + : // clabbers + "cc", "memory", "r0", + "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + } +#endif +} - vst1q_f32(output_ptr, v0123n_intertwined.val[1]); - output_ptr += col_stride; +template<> +MaceStatus Gemm::Compute(const OpContext *context, + const Tensor *lhs, + const Tensor *rhs, + const index_t batch, + const index_t rows, + const index_t cols, + const index_t depth, + const MatrixMajor lhs_major, + const MatrixMajor rhs_major, + const MatrixMajor output_major, + const bool lhs_batched, + const bool rhs_batched, + Tensor *output) { + MACE_CHECK(output->size() == batch * rows * cols, + "Need resize output tensor before call gemm."); + Tensor::MappingGuard lhs_guard(lhs); + Tensor::MappingGuard rhs_guard(rhs); + Tensor::MappingGuard output_guard(output); + const float *lhs_data = lhs->data(); + const float *rhs_data = rhs->data(); + float *output_data = output->mutable_data(); + +#ifdef __aarch64__ + const index_t row_block_size = 8; +#else + const index_t row_block_size = 4; +#endif + const index_t col_block_size = 8; + const index_t depth_block_size = 4; + const index_t row_block_count = RoundUpDiv(rows, row_block_size); + const index_t col_block_count = RoundUpDiv(cols, col_block_size); + const index_t rows_padded = RoundUp(rows, row_block_size); + const index_t cols_padded = RoundUp(cols, col_block_size); + const index_t depth_padded = RoundUp(depth, depth_block_size); - data0 += 4; - data1 += 4; - data2 += 4; - data3 += 4; - } - for (index_t c = 0; c < col_remain; ++c) { - float32x4_t vi = {*data0, *data1, *data2, *data3}; - vst1q_f32(output_ptr, vi); - output_ptr += col_stride; + ScratchBuffer *scratch = context->device()->scratch_buffer(); - ++data0; - ++data1; - ++data2; - ++data3; - } // d + index_t packed_lhs_size = + PadAlignSize(sizeof(float) * rows_padded * depth_padded); + index_t packed_rhs_size = + PadAlignSize(sizeof(float) * depth_padded * cols_padded); + index_t packed_output_size = + PadAlignSize(sizeof(float) * rows_padded * cols_padded); + // resize to the total size of lhs & rhs & output anyway, + // in case we do not cache const tensor for saving memory + MACE_RETURN_IF_ERROR(scratch->GrowSize( + packed_lhs_size + packed_rhs_size + packed_output_size)); + float *packed_lhs_data = + scratch->Scratch(packed_lhs_size).mutable_data(); + float *packed_rhs_data = + scratch->Scratch(packed_rhs_size).mutable_data(); + float *packed_output_data = + scratch->Scratch(packed_output_size).mutable_data(); + + int cache_side = kNoCache; + if (cached_ == kCacheLhs) { + packed_lhs_data = pack_cache_.mutable_data(); + } else if (cached_ == kCacheRhs) { + packed_rhs_data = pack_cache_.mutable_data(); + } else if (should_cache_pack_) { + if (lhs->is_weight() && (!lhs_batched || batch == 1)) { + cache_side = kCacheLhs; + pack_cache_.Resize(packed_lhs_size); + packed_lhs_data = pack_cache_.mutable_data(); + } else if (rhs->is_weight() && (!rhs_batched || batch == 1)) { + cache_side = kCacheRhs; + pack_cache_.Resize(packed_rhs_size); + packed_rhs_data = pack_cache_.mutable_data(); } } -} - -template<> -void Gemm::Unpack<8, 8>(const float *packed_output, MatrixMap *output) { - const index_t rows = output->rows(); - const index_t cols = output->cols(); - index_t row_stride = output->rows_stride(); - index_t col_stride = output->cols_stride(); 
+  int cache_side = kNoCache;
+  if (cached_ == kCacheLhs) {
+    packed_lhs_data = pack_cache_.mutable_data<float>();
+  } else if (cached_ == kCacheRhs) {
+    packed_rhs_data = pack_cache_.mutable_data<float>();
+  } else if (should_cache_pack_) {
+    if (lhs->is_weight() && (!lhs_batched || batch == 1)) {
+      cache_side = kCacheLhs;
+      pack_cache_.Resize(packed_lhs_size);
+      packed_lhs_data = pack_cache_.mutable_data<float>();
+    } else if (rhs->is_weight() && (!rhs_batched || batch == 1)) {
+      cache_side = kCacheRhs;
+      pack_cache_.Resize(packed_rhs_size);
+      packed_rhs_data = pack_cache_.mutable_data<float>();
+    }
+  }
-}
-
-template<>
-void Gemm::Unpack<8, 8>(const float *packed_output, MatrixMap<float> *output) {
-  const index_t rows = output->rows();
-  const index_t cols = output->cols();
-  index_t row_stride = output->rows_stride();
-  index_t col_stride = output->cols_stride();
-  float *output_ptr = output->data();
-  const float *packed_ptr = packed_output;
+  utils::ThreadPool &thread_pool =
+      context->device()->cpu_runtime()->thread_pool();
-  const index_t block_size = 8;
+  for (index_t b = 0; b < batch; ++b) {
+    MatrixMap<const float> lhs_matrix(
+        lhs_data + static_cast<index_t>(lhs_batched) * b * rows * depth,
+        lhs_major,
+        rows,
+        depth);
+    MatrixMap<const float> rhs_matrix(
+        rhs_data + static_cast<index_t>(rhs_batched) * b * depth * cols,
+        rhs_major,
+        depth,
+        cols);
+    MatrixMap<float> output_matrix(
+        output_data + b * rows * cols, output_major, rows, cols);
-  // packed_output always has row-major
-  if (output->matrix_major() == RowMajor) {
-    if (cols < block_size) {
-      for (index_t r = 0; r < rows; ++r) {
-        memcpy(output_ptr, packed_ptr, sizeof(float) * cols);
-        output_ptr += row_stride;
-        packed_ptr += block_size;
-      }
-    } else {
-      for (index_t r = 0; r < rows; ++r) {
-        float32x4_t vi = vld1q_f32(packed_ptr);
-        vst1q_f32(output_ptr, vi);
-        float32x4_t vin = vld1q_f32(packed_ptr + 4);
-        vst1q_f32(output_ptr + 4, vin);
+    // pack lhs
+    if (cached_ != kCacheLhs) {
+      thread_pool.Compute1D([=, &lhs_matrix](index_t start,
+                                             index_t end,
+                                             index_t step) {
+        for (index_t row_block_idx = start; row_block_idx < end;
+             row_block_idx += step) {
+          const index_t start_row = row_block_idx * row_block_size;
+          const index_t row_block_len =
+              std::min(row_block_size, rows - start_row);
+          float *packed_lhs_data_block =
+              packed_lhs_data + row_block_idx * row_block_size * depth_padded;
+          PackLhs(lhs_matrix.block(start_row, 0, row_block_len, depth),
+                  packed_lhs_data_block);
+        }
+      }, 0, row_block_count, 1);
-        output_ptr += row_stride;
-        packed_ptr += block_size;
+      if (cache_side == kCacheLhs) {
+        cached_ = kCacheLhs;
+        if (lhs->UnderlyingBuffer()->OnHost()) {
+          AdviseFree(reinterpret_cast<void *>(
+                         const_cast<float *>(lhs->data<float>())),
+                     lhs->raw_size());
+        }
+      }
+    }
-      }
-    }
-  } else {
-    // ColMajor
-    if (rows < block_size) {
-      for (index_t c = 0; c < cols; ++c) {
-        for (index_t r = 0; r < rows; ++r) {
-          output_ptr[c * col_stride + r] = packed_ptr[r * block_size + c];
-        }  // r
-      }  // c
-    } else {
-      const float *data0 = packed_ptr;
-      const float *data1 = data0 + block_size;
-      const float *data2 = data1 + block_size;
-      const float *data3 = data2 + block_size;
-      const float *data4 = data3 + block_size;
-      const float *data5 = data4 + block_size;
-      const float *data6 = data5 + block_size;
-      const float *data7 = data6 + block_size;
-
-      index_t col_block = cols / 4;
-      index_t col_remain = cols - col_block * 4;
-      for (index_t col_block_idx = 0; col_block_idx < col_block;
-           ++col_block_idx) {
-        float32x4_t v0 = vld1q_f32(data0);
-        float32x4_t v1 = vld1q_f32(data1);
-        float32x4_t v2 = vld1q_f32(data2);
-        float32x4_t v3 = vld1q_f32(data3);
-        float32x4x2_t v02_intertwined = vzipq_f32(v0, v2);
-        float32x4x2_t v13_intertwined = vzipq_f32(v1, v3);
-        float32x4x2_t v0123_intertwined =
-            vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]);
-        float32x4x2_t v0123n_intertwined =
-            vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]);
-
-        float32x4_t v4 = vld1q_f32(data4);
-        float32x4_t v5 = vld1q_f32(data5);
-        float32x4_t v6 = vld1q_f32(data6);
-        float32x4_t v7 = vld1q_f32(data7);
-        float32x4x2_t v46_intertwined = vzipq_f32(v4, v6);
-        float32x4x2_t v57_intertwined = vzipq_f32(v5, v7);
-        float32x4x2_t v4567_intertwined =
-            vzipq_f32(v46_intertwined.val[0], v57_intertwined.val[0]);
-        float32x4x2_t v4567n_intertwined =
-            vzipq_f32(v46_intertwined.val[1], v57_intertwined.val[1]);
-
-        vst1q_f32(output_ptr, v0123_intertwined.val[0]);
-        vst1q_f32(output_ptr + 4, v4567_intertwined.val[0]);
-        output_ptr += col_stride;
-
-        vst1q_f32(output_ptr, v0123_intertwined.val[1]);
-        vst1q_f32(output_ptr + 4, v4567_intertwined.val[1]);
-        output_ptr += col_stride;
-
-        vst1q_f32(output_ptr, v0123n_intertwined.val[0]);
-        vst1q_f32(output_ptr + 4, v4567n_intertwined.val[0]);
-        output_ptr += col_stride;
-        vst1q_f32(output_ptr, v0123n_intertwined.val[1]);
-        vst1q_f32(output_ptr + 4, v4567n_intertwined.val[1]);
-        output_ptr += col_stride;
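+    // rhs is packed per column block, mirroring the per-row-block lhs
+    // packing above; a previously cached pack skips this step entirely.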
+    // pack rhs
+    if (cached_ != kCacheRhs) {
+      thread_pool.Compute1D([=, &rhs_matrix](index_t start,
+                                             index_t end,
+                                             index_t step) {
+        for (index_t col_block_idx = start; col_block_idx < end;
+             col_block_idx += step) {
+          const index_t start_col = col_block_idx * col_block_size;
+          const index_t col_block_len =
+              std::min(col_block_size, cols - start_col);
+          float *packed_rhs_data_block =
+              packed_rhs_data + col_block_idx * col_block_size * depth_padded;
+          PackRhs(rhs_matrix.block(0, start_col, depth, col_block_len),
+                  packed_rhs_data_block);
+        }
+      }, 0, col_block_count, 1);
-        data0 += 4;
-        data1 += 4;
-        data2 += 4;
-        data3 += 4;
-        data4 += 4;
-        data5 += 4;
-        data6 += 4;
-        data7 += 4;
+      if (cache_side == kCacheRhs) {
+        cached_ = kCacheRhs;
+        if (rhs->UnderlyingBuffer()->OnHost()) {
+          AdviseFree(reinterpret_cast<void *>(
+                         const_cast<float *>(rhs->data<float>())),
+                     rhs->raw_size());
+        }
+      }
-      for (index_t c = 0; c < col_remain; ++c) {
-        float32x4_t vi = {*data0, *data1, *data2, *data3};
-        vst1q_f32(output_ptr, vi);
-        float32x4_t vin = {*data4, *data5, *data6, *data7};
-        vst1q_f32(output_ptr + 4, vin);
-        output_ptr += col_stride;
-
-        ++data0;
-        ++data1;
-        ++data2;
-        ++data3;
-        ++data4;
-        ++data5;
-        ++data6;
-        ++data7;
-      }  // d
+    }
-    }
-  }
-}
-MaceStatus Gemm::Compute(const OpContext *context,
-                         const Tensor *lhs,
-                         const Tensor *rhs,
-                         const index_t batch,
-                         const index_t lhs_rows,
-                         const index_t lhs_cols,
-                         const index_t rhs_rows,
-                         const index_t rhs_cols,
-                         const bool transpose_lhs,
-                         const bool transpose_rhs,
-                         const bool transpose_out,
-                         const bool lhs_batched,
-                         const bool rhs_batched,
-                         Tensor *output) {
-  index_t rows = transpose_lhs ? lhs_cols : lhs_rows;
-  index_t depth = transpose_lhs ? lhs_rows : lhs_cols;
-  index_t cols = transpose_rhs ? rhs_rows : rhs_cols;
-  index_t depth2 = transpose_rhs ? rhs_cols : rhs_rows;
-  MACE_CHECK(depth == depth2,
-             "Matrices that multiply have inconsistent depth dim: ",
-             depth,
-             " vs. ",
-             depth2);
-
-  return Compute(context,
-                 lhs,
-                 rhs,
-                 batch,
-                 rows,
-                 cols,
-                 depth,
-                 transpose_lhs ? ColMajor : RowMajor,
-                 transpose_rhs ? ColMajor : RowMajor,
-                 transpose_out ? ColMajor : RowMajor,
-                 lhs_batched,
-                 rhs_batched,
-                 output);
-}
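+    // Row blocks are distributed over the thread pool: each task multiplies
+    // its packed lhs panel against every packed rhs panel, then unpacks the
+    // finished tile straight into the output matrix.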
+    // multiply lhs and rhs
+    thread_pool.Compute1D([=, &output_matrix](index_t start,
+                                              index_t end,
+                                              index_t step) {
+      for (index_t row_block_idx = start; row_block_idx < end;
+           row_block_idx += step) {
+        const index_t start_row = row_block_idx * row_block_size;
+        const index_t row_block_len =
+            std::min(row_block_size, rows - start_row);
+        const float *packed_lhs_data_block =
+            packed_lhs_data + row_block_idx * row_block_size * depth_padded;
+
+        for (index_t col_block_idx = 0; col_block_idx < col_block_count;
+             ++col_block_idx) {
+          const index_t start_col = col_block_idx * col_block_size;
+          const index_t col_block_len =
+              std::min(col_block_size, cols - start_col);
+          const float *packed_rhs_data_block =
+              packed_rhs_data + col_block_idx * col_block_size * depth_padded;
+          float *packed_output_data_block =
+              packed_output_data + row_block_idx * row_block_size * cols_padded
+              + col_block_idx * col_block_size;
+          ComputeBlock(packed_lhs_data_block,
+                       packed_rhs_data_block,
+                       depth_padded,
+                       packed_output_data_block);
+          MatrixMap<float> output_block = output_matrix.block(start_row,
+                                                              start_col,
+                                                              row_block_len,
+                                                              col_block_len);
+          UnpackOutput(packed_output_data_block, &output_block);
+        }  // col_block_idx
+      }  // row_block_idx
+    }, 0, row_block_count, 1);
+  }  // b
-void RegisterGemmDelegator(OpDelegatorRegistry *registry) {
-  MACE_REGISTER_DELEGATOR(
-      registry, Gemm, delegator::GemmParam,
-      MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float, ImplType::NEON));
+
+  return MaceStatus::MACE_SUCCESS;
 }
-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/arm/fp32/gemv.cc b/mace/ops/arm/fp32/gemv.cc
index 57f2f248ebbbf738793bd3df1cc509f88ffcf3e6..257b1665a39c68c76016630c2f5394f03542cd15 100644
--- a/mace/ops/arm/fp32/gemv.cc
+++ b/mace/ops/arm/fp32/gemv.cc
@@ -12,12 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
-#include "mace/ops/arm/fp32/gemv.h"
-
 #include <arm_neon.h>
 #include <algorithm>
+#include "mace/ops/arm/base/gemv.h"
 #include "mace/utils/math.h"
 
 #if !defined(__aarch64__)
@@ -34,18 +32,18 @@ float vaddvq_f32(float32x4_t v) {
 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {
-
-MaceStatus Gemv::Compute(const OpContext *context,
-                         const Tensor *lhs,
-                         const Tensor *rhs,
-                         const Tensor *bias,
-                         const index_t batch,
-                         const index_t lhs_height,
-                         const index_t lhs_width,
-                         const bool lhs_batched,
-                         const bool rhs_batched,
-                         Tensor *output) {
+
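+// float specialization of the Gemv delegator template now shared through
+// mace/ops/arm/base/gemv.h; its registration moves out of this file.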
- -#include "mace/ops/arm/fp32/gemv.h" - #include #include +#include "mace/ops/arm/base/gemv.h" #include "mace/utils/math.h" #if !defined(__aarch64__) @@ -34,18 +32,18 @@ float vaddvq_f32(float32x4_t v) { namespace mace { namespace ops { namespace arm { -namespace fp32 { - -MaceStatus Gemv::Compute(const OpContext *context, - const Tensor *lhs, - const Tensor *rhs, - const Tensor *bias, - const index_t batch, - const index_t lhs_height, - const index_t lhs_width, - const bool lhs_batched, - const bool rhs_batched, - Tensor *output) { + +template<> +MaceStatus Gemv::Compute(const OpContext *context, + const Tensor *lhs, + const Tensor *rhs, + const Tensor *bias, + const index_t batch, + const index_t lhs_height, + const index_t lhs_width, + const bool lhs_batched, + const bool rhs_batched, + Tensor *output) { MACE_UNUSED(context); MACE_CHECK(output->size() == batch * lhs_height, @@ -378,13 +376,6 @@ MaceStatus Gemv::Compute(const OpContext *context, #undef vaddvq_f32 #endif -void RegisterGemvDelegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, Gemv, DelegatorParam, - MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, float, ImplType::NEON)); -} - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/registry/op_delegators_registry.cc b/mace/ops/registry/op_delegators_registry.cc index 4aac7282edae65c83211a50b16bfb641c18c7881..9ef615d584a1277c02a1507041e477f616d6e716 100644 --- a/mace/ops/registry/op_delegators_registry.cc +++ b/mace/ops/registry/op_delegators_registry.cc @@ -38,13 +38,15 @@ extern void RegisterGemvDelegator(OpDelegatorRegistry *registry); #ifdef MACE_ENABLE_NEON namespace arm { namespace fp32 { +extern void RegisterConv2dK3x3WinogradDelegator(OpDelegatorRegistry *registry); +} // namespace fp32 + extern void RegisterActivationDelegator(OpDelegatorRegistry *registry); extern void RegisterBiasAddDelegator(OpDelegatorRegistry *registry); extern void RegisterConv2dK1x1Delegator(OpDelegatorRegistry *registry); extern void RegisterConv2dK1xNDelegator(OpDelegatorRegistry *registry); extern void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry); -extern void RegisterConv2dK3x3WinogradDelegator(OpDelegatorRegistry *registry); extern void RegisterConv2dK5x5Delegator(OpDelegatorRegistry *registry); extern void RegisterConv2dK7x7Delegator(OpDelegatorRegistry *registry); extern void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry); @@ -69,7 +71,6 @@ extern void RegisterGroupDeconv2dGeneralDelegator( extern void RegisterGemmDelegator(OpDelegatorRegistry *registry); extern void RegisterGemvDelegator(OpDelegatorRegistry *registry); -} // namespace fp32 #ifdef MACE_ENABLE_QUANTIZE namespace q8 { @@ -97,32 +98,33 @@ void RegisterAllOpDelegators(OpDelegatorRegistry *registry) { #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_NEON - arm::fp32::RegisterActivationDelegator(registry); - arm::fp32::RegisterBiasAddDelegator(registry); - - arm::fp32::RegisterConv2dK1x1Delegator(registry); - arm::fp32::RegisterConv2dK1xNDelegator(registry); - arm::fp32::RegisterConv2dK3x3Delegator(registry); arm::fp32::RegisterConv2dK3x3WinogradDelegator(registry); - arm::fp32::RegisterConv2dK5x5Delegator(registry); - arm::fp32::RegisterConv2dK7x7Delegator(registry); - arm::fp32::RegisterConv2dGeneralDelegator(registry); - - arm::fp32::RegisterDeconv2dK2x2Delegator(registry); - arm::fp32::RegisterDeconv2dK3x3Delegator(registry); - arm::fp32::RegisterDeconv2dK4x4Delegator(registry); - 
+extern void RegisterConv2dK3x3WinogradDelegator(OpDelegatorRegistry *registry);
+}  // namespace fp32
+
 extern void RegisterActivationDelegator(OpDelegatorRegistry *registry);
 extern void RegisterBiasAddDelegator(OpDelegatorRegistry *registry);
 extern void RegisterConv2dK1x1Delegator(OpDelegatorRegistry *registry);
 extern void RegisterConv2dK1xNDelegator(OpDelegatorRegistry *registry);
 extern void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry);
-extern void RegisterConv2dK3x3WinogradDelegator(OpDelegatorRegistry *registry);
 extern void RegisterConv2dK5x5Delegator(OpDelegatorRegistry *registry);
 extern void RegisterConv2dK7x7Delegator(OpDelegatorRegistry *registry);
 extern void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry);
@@ -69,7 +71,6 @@ extern void RegisterGroupDeconv2dGeneralDelegator(
 extern void RegisterGemmDelegator(OpDelegatorRegistry *registry);
 extern void RegisterGemvDelegator(OpDelegatorRegistry *registry);
 
-}  // namespace fp32
 
 #ifdef MACE_ENABLE_QUANTIZE
 namespace q8 {
@@ -97,32 +98,33 @@ void RegisterAllOpDelegators(OpDelegatorRegistry *registry) {
 #endif  // MACE_ENABLE_QUANTIZE
 
 #ifdef MACE_ENABLE_NEON
-  arm::fp32::RegisterActivationDelegator(registry);
-  arm::fp32::RegisterBiasAddDelegator(registry);
-
-  arm::fp32::RegisterConv2dK1x1Delegator(registry);
-  arm::fp32::RegisterConv2dK1xNDelegator(registry);
-  arm::fp32::RegisterConv2dK3x3Delegator(registry);
   arm::fp32::RegisterConv2dK3x3WinogradDelegator(registry);
-  arm::fp32::RegisterConv2dK5x5Delegator(registry);
-  arm::fp32::RegisterConv2dK7x7Delegator(registry);
-  arm::fp32::RegisterConv2dGeneralDelegator(registry);
-
-  arm::fp32::RegisterDeconv2dK2x2Delegator(registry);
-  arm::fp32::RegisterDeconv2dK3x3Delegator(registry);
-  arm::fp32::RegisterDeconv2dK4x4Delegator(registry);
-  arm::fp32::RegisterDeconv2dGeneralDelegator(registry);
-
-  arm::fp32::RegisterDepthwiseConv2dK3x3Delegator(registry);
-  arm::fp32::RegisterDepthwiseDeconv2dK3x3Delegator(registry);
-  arm::fp32::RegisterGroupDeconv2dK3x3Delegator(registry);
-  arm::fp32::RegisterDepthwiseDeconv2dK4x4Delegator(registry);
-  arm::fp32::RegisterGroupDeconv2dK4x4Delegator(registry);
-  arm::fp32::RegisterDepthwiseDeconv2dGeneralDelegator(registry);
-  arm::fp32::RegisterGroupDeconv2dGeneralDelegator(registry);
-
-  arm::fp32::RegisterGemmDelegator(registry);
-  arm::fp32::RegisterGemvDelegator(registry);
+
+  arm::RegisterActivationDelegator(registry);
+  arm::RegisterBiasAddDelegator(registry);
+
+  arm::RegisterConv2dK1x1Delegator(registry);
+  arm::RegisterConv2dK1xNDelegator(registry);
+  arm::RegisterConv2dK3x3Delegator(registry);
+  arm::RegisterConv2dK5x5Delegator(registry);
+  arm::RegisterConv2dK7x7Delegator(registry);
+  arm::RegisterConv2dGeneralDelegator(registry);
+
+  arm::RegisterDeconv2dK2x2Delegator(registry);
+  arm::RegisterDeconv2dK3x3Delegator(registry);
+  arm::RegisterDeconv2dK4x4Delegator(registry);
+  arm::RegisterDeconv2dGeneralDelegator(registry);
+
+  arm::RegisterDepthwiseConv2dK3x3Delegator(registry);
+  arm::RegisterDepthwiseDeconv2dK3x3Delegator(registry);
+  arm::RegisterGroupDeconv2dK3x3Delegator(registry);
+  arm::RegisterDepthwiseDeconv2dK4x4Delegator(registry);
+  arm::RegisterGroupDeconv2dK4x4Delegator(registry);
+  arm::RegisterDepthwiseDeconv2dGeneralDelegator(registry);
+  arm::RegisterGroupDeconv2dGeneralDelegator(registry);
+
+  arm::RegisterGemmDelegator(registry);
+  arm::RegisterGemvDelegator(registry);
 
 #ifdef MACE_ENABLE_QUANTIZE
   arm::q8::RegisterEltwiseDelegator(registry);